Compare commits

..

No commits in common. "4e90275507c4af12acbdde9fc4259ae71ff1b449" and "0d98fc5193fa1c03e864ba45b1ca947ad0cdb576" have entirely different histories.

View File

@ -1,35 +1,34 @@
#!/usr/bin/env python #!/usr/bin/env python
import re
import sys import sys
from datetime import datetime
import scrapy import scrapy
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor from scrapy.linkextractors import LinkExtractor
class TvShowItem(scrapy.Item): class TvShowItem(scrapy.Item):
article_id: scrapy.Field = scrapy.Field() article_id = scrapy.Field()
article_title: scrapy.Field = scrapy.Field() article_title = scrapy.Field()
title:scrapy.Field = scrapy.Field() title = scrapy.Field()
date: scrapy.Field = scrapy.Field() date = scrapy.Field()
summary: scrapy.Field = scrapy.Field() summary = scrapy.Field()
image_url: scrapy.Field = scrapy.Field() image_url = scrapy.Field()
download_url: scrapy.Field = scrapy.Field() download_url = scrapy.Field()
class TvShow(CrawlSpider): class TvShow(CrawlSpider):
name: str = "rlsb_tvshow" name = "rlsb_tvshow"
allowed_domains: list[str] = ["rlsbb.ru"] allowed_domains = ["rlsbb.ru"]
start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"] start_urls = ["https://rlsbb.ru/category/tv-shows/"]
custom_settings = { custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_ENABLED': True,
'DOWNLOAD_DELAY': 10, 'DOWNLOAD_DELAY': 10,
} }
rules: list[Rule] = [ rules = [
Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True) Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True)
] ]
@ -39,20 +38,16 @@ class TvShow(CrawlSpider):
item['article_id'] = article.attrib['id'], item['article_id'] = article.attrib['id'],
item['article_title'] = article.css('h1.entry-title > a::text').get(), item['article_title'] = article.css('h1.entry-title > a::text').get(),
item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(), item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(),
item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip()), item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(), item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(), item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get() item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
yield item yield item
def parse_date(self, formatted_date: str):
formatted_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in")
def main(): def main():
process = CrawlerProcess() process = CrawlerProcess()
_ = process.crawl(TvShow) process.crawl(TvShow)
process.start() process.start()
return 0 return 0