Compare commits

...

5 Commits

SHA1 Message Date
4e90275507 Complying to pyright 2025-12-29 21:40:42 +01:00
d099cd262d Extracting the date from the post 2025-12-29 21:40:34 +01:00
3f6d1bfb4f Adding type annotation to TvShow 2025-12-29 21:39:31 +01:00
15877e456a Removing useless import 2025-12-29 21:38:39 +01:00
3dff072840 Adding type annotation 2025-12-29 21:38:19 +01:00
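
Taken together, the five commits add type annotations to the item and spider classes so the code passes pyright, and route the post date through a new parsing helper. A minimal sketch of the annotation pattern they converge on, using a hypothetical ExampleItem rather than the spider's own classes:

import scrapy

# Hypothetical item illustrating the annotation style applied in the diff
# below: each field is declared with an explicit scrapy.Field annotation,
# so pyright sees a typed class attribute instead of an untyped assignment.
class ExampleItem(scrapy.Item):
    title: scrapy.Field = scrapy.Field()
    date: scrapy.Field = scrapy.Field()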


@@ -1,34 +1,35 @@
#!/usr/bin/env python
import re
import sys
from datetime import datetime
import scrapy
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class TvShowItem(scrapy.Item):
-    article_id = scrapy.Field()
-    article_title = scrapy.Field()
-    title = scrapy.Field()
-    date = scrapy.Field()
-    summary = scrapy.Field()
-    image_url = scrapy.Field()
-    download_url = scrapy.Field()
+    article_id: scrapy.Field = scrapy.Field()
+    article_title: scrapy.Field = scrapy.Field()
+    title:scrapy.Field = scrapy.Field()
+    date: scrapy.Field = scrapy.Field()
+    summary: scrapy.Field = scrapy.Field()
+    image_url: scrapy.Field = scrapy.Field()
+    download_url: scrapy.Field = scrapy.Field()

class TvShow(CrawlSpider):
-    name = "rlsb_tvshow"
-    allowed_domains = ["rlsbb.ru"]
-    start_urls = ["https://rlsbb.ru/category/tv-shows/"]
+    name: str = "rlsb_tvshow"
+    allowed_domains: list[str] = ["rlsbb.ru"]
+    start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'AUTOTHROTTLE_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
    }
-    rules = [
+    rules: list[Rule] = [
        Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True)
    ]
@@ -38,16 +39,20 @@ class TvShow(CrawlSpider):
        item['article_id'] = article.attrib['id'],
        item['article_title'] = article.css('h1.entry-title > a::text').get(),
        item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(),
-        item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
+        item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip()),
        item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
        item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
        item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
        yield item

+    def parse_date(self, formatted_date: str):
+        formatted_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
+        return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in")

def main():
    process = CrawlerProcess()
-    process.crawl(TvShow)
+    _ = process.crawl(TvShow)
    process.start()
    return 0
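
For context, a standalone sketch of what the new parse_date helper does to the site's post header, outside of the spider; the sample string is an assumption modeled on the '.entry-meta-header-before' text the spider scrapes:

# Standalone sketch of the date normalization introduced above: strip the
# ordinal suffix after the day number ("29th" -> "29"), then parse the
# remaining "Posted on ... in" string with strptime.
import re
from datetime import datetime

def parse_date(formatted_date: str) -> datetime:
    formatted_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
    return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in")

if __name__ == "__main__":
    # Hypothetical post header, assumed to match the format the spider sees.
    sample = "Posted on December 29th, 2025 at 21:40 in"
    print(parse_date(sample))  # -> 2025-12-29 21:40:00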