Compare commits

..

No commits in common. "4e90275507c4af12acbdde9fc4259ae71ff1b449" and "0d98fc5193fa1c03e864ba45b1ca947ad0cdb576" have entirely different histories.

View File

@@ -1,35 +1,34 @@
#!/usr/bin/env python
import re
import sys
from datetime import datetime
import scrapy
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class TvShowItem(scrapy.Item):
    """Scraped record for one TV-show article on a listing page.

    The field names match the keys the spider assigns when it walks the
    articles of a page. Each field is declared exactly once; redeclaring a
    field only builds a throwaway ``Field`` that is immediately overwritten.
    """

    # Value of the article element's ``id`` attribute.
    article_id = scrapy.Field()
    # Text of the link inside the article's ``h1.entry-title``.
    article_title = scrapy.Field()
    # Bold text taken from the entry summary.
    title = scrapy.Field()
    # Posting date taken from the article's meta header text.
    date = scrapy.Field()
    # Child nodes of the ``entry-summary`` div.
    summary = scrapy.Field()
    # ``src`` of an image inside the entry summary.
    image_url = scrapy.Field()
    # ``href`` of a rapidgator link inside the entry summary.
    download_url = scrapy.Field()
class TvShow(CrawlSpider):
name: str = "rlsb_tvshow"
allowed_domains: list[str] = ["rlsbb.ru"]
start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"]
name = "rlsb_tvshow"
allowed_domains = ["rlsbb.ru"]
start_urls = ["https://rlsbb.ru/category/tv-shows/"]
custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'AUTOTHROTTLE_ENABLED': True,
'DOWNLOAD_DELAY': 10,
}
rules: list[Rule] = [
rules = [
Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True)
]
@@ -39,20 +38,16 @@ class TvShow(CrawlSpider):
item['article_id'] = article.attrib['id'],
item['article_title'] = article.css('h1.entry-title > a::text').get(),
item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(),
item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip()),
item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
yield item
def parse_date(self, formatted_date: str):
    """Parse a "Posted on <Month> <day><suffix>, <year> at <HH:MM> in" string.

    Args:
        formatted_date: Header text such as
            ``"Posted on January 1st, 2023 at 10:30 in"``.

    Returns:
        A naive ``datetime`` built from the month, day, year, hour and
        minute found in the text.

    Raises:
        ValueError: If the text, once ordinal suffixes are removed, does
            not match the expected layout.
    """
    # strptime has no directive for "1st"/"2nd"/"3rd"/"4th", so the suffix
    # directly following a digit is dropped before parsing.
    ordinal_after_digit = re.compile(r'(\d)(st|nd|rd|th)')
    plain_text = ordinal_after_digit.sub(r'\1', formatted_date)

    layout = "Posted on %B %d, %Y at %H:%M in"
    return datetime.strptime(plain_text, layout)
def main():
    """Run the TvShow spider once in a new crawler process.

    Returns:
        Always 0 (presumably consumed as the process exit status; confirm
        against the caller, which is outside this view).
    """
    process = CrawlerProcess()
    # Schedule the spider exactly once: a second crawl() call with the same
    # spider class would start a second crawler and duplicate every request
    # and scraped item. The returned Deferred is not needed here.
    process.crawl(TvShow)
    process.start()
    return 0