Compare commits

..

5 Commits

Author SHA1 Message Date
edipretoro 4e90275507 Complying with pyright 2025-12-29 21:40:42 +01:00
edipretoro d099cd262d Extracting the date from the post 2025-12-29 21:40:34 +01:00
edipretoro 3f6d1bfb4f Adding type annotation to TvShow 2025-12-29 21:39:31 +01:00
edipretoro 15877e456a Removing useless import 2025-12-29 21:38:39 +01:00
edipretoro 3dff072840 Adding type annotation 2025-12-29 21:38:19 +01:00
+19 -14
View File
@@ -1,34 +1,35 @@
#!/usr/bin/env python
import re
import sys
from datetime import datetime
import scrapy
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class TvShowItem(scrapy.Item):
    """Container for one scraped TV-show post from rlsbb.ru.

    Each attribute is a ``scrapy.Field`` descriptor; values are filled in by
    the spider's parse callback.
    """

    # NOTE: the diff showed both the old unannotated and the new annotated
    # field lines; only the annotated (final) versions belong in the file.
    article_id: scrapy.Field = scrapy.Field()
    article_title: scrapy.Field = scrapy.Field()
    title: scrapy.Field = scrapy.Field()  # was `title:scrapy.Field` — missing space after colon
    date: scrapy.Field = scrapy.Field()
    summary: scrapy.Field = scrapy.Field()
    image_url: scrapy.Field = scrapy.Field()
    download_url: scrapy.Field = scrapy.Field()
class TvShow(CrawlSpider):
    """Crawl spider for the rlsbb.ru TV-shows category.

    Follows category pagination links and hands each page to ``parse``
    (defined further down in the file, outside this span).
    """

    # NOTE: the diff showed both pre- and post-annotation attribute lines;
    # only the annotated (final) versions are kept here.
    name: str = "rlsb_tvshow"
    allowed_domains: list[str] = ["rlsbb.ru"]
    start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"]
    custom_settings = {
        # Desktop Chrome UA — the site blocks the default scrapy agent.
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        # Be polite: adaptive throttling plus a fixed 10 s delay between requests.
        'AUTOTHROTTLE_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
    }
    rules: list[Rule] = [
        Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True)
    ]
@@ -38,16 +39,20 @@ class TvShow(CrawlSpider):
item['article_id'] = article.attrib['id'],
item['article_title'] = article.css('h1.entry-title > a::text').get(),
item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(),
item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip()),
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
yield item
def parse_date(self, formatted_date: str) -> datetime:
    """Parse a post-header date string into a ``datetime``.

    The input looks like ``"Posted on August 21st, 2025 at 10:30 in"``.
    The English ordinal suffix (st/nd/rd/th) after the day number is
    stripped first, because ``strptime``'s ``%d`` cannot consume it.

    Raises:
        ValueError: if the cleaned string does not match the expected format.
    """
    # "21st" -> "21"; only a suffix directly after a digit is touched.
    cleaned = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
    # Return type annotation added for pyright, matching the file-wide effort.
    return datetime.strptime(cleaned, "Posted on %B %d, %Y at %H:%M in")
def main() -> int:
    """Run the TvShow spider in-process.

    Returns:
        Shell-style exit code (0 on normal completion).
    """
    process = CrawlerProcess()
    # Discard the returned Deferred (pyright: unused result); the crawl
    # actually runs inside the blocking process.start() call below.
    # (The diff showed both the old bare call and this `_ =` form; the
    # `_ =` version is the final one.)
    _ = process.crawl(TvShow)
    process.start()  # blocks until the crawl finishes
    return 0