Compare commits
5 Commits
0d98fc5193
...
4e90275507
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e90275507 | |||
| d099cd262d | |||
| 3f6d1bfb4f | |||
| 15877e456a | |||
| 3dff072840 |
33
scrarls.py
33
scrarls.py
@@ -1,34 +1,35 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy.http import Response
|
|
||||||
from scrapy.crawler import CrawlerProcess
|
from scrapy.crawler import CrawlerProcess
|
||||||
from scrapy.spiders import CrawlSpider, Rule
|
from scrapy.spiders import CrawlSpider, Rule
|
||||||
from scrapy.linkextractors import LinkExtractor
|
from scrapy.linkextractors import LinkExtractor
|
||||||
|
|
||||||
|
|
||||||
class TvShowItem(scrapy.Item):
    """Container for one scraped TV-show post from rlsbb.ru."""

    # DOM id attribute of the post's <article> element.
    article_id: scrapy.Field = scrapy.Field()
    # Headline text taken from the post's h1.entry-title link.
    article_title: scrapy.Field = scrapy.Field()
    # Show/release title pulled from a <strong> inside the entry summary.
    title: scrapy.Field = scrapy.Field()
    # Publication date of the post (parsed via the spider's parse_date).
    date: scrapy.Field = scrapy.Field()
    # Raw child nodes of the entry-summary div (HTML fragments).
    summary: scrapy.Field = scrapy.Field()
    # Cover image URL from the summary's <img src>, if present.
    image_url: scrapy.Field = scrapy.Field()
    # First rapidgator download link found in the summary.
    download_url: scrapy.Field = scrapy.Field()
|
|
||||||
|
|
||||||
class TvShow(CrawlSpider):
    """Spider for the rlsbb.ru TV-shows category.

    Starts at the category index and follows pagination links
    (/tv-shows/page/...), handing each listing page to ``parse``.
    """

    name: str = "rlsb_tvshow"
    allowed_domains: list[str] = ["rlsbb.ru"]
    start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"]

    # Be polite to the site: auto-throttle plus a 10s delay per request,
    # and present a realistic desktop Chrome user agent.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'AUTOTHROTTLE_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
    }

    # NOTE(review): Scrapy's CrawlSpider docs warn against using "parse"
    # as a Rule callback, since CrawlSpider uses parse() internally to
    # implement its rule logic — consider renaming the callback method
    # (e.g. "parse_page"). Kept as-is to preserve current behavior.
    rules: list[Rule] = [
        Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True)
    ]
@@ -38,16 +39,20 @@ class TvShow(CrawlSpider):
|
|||||||
item['article_id'] = article.attrib['id'],
|
item['article_id'] = article.attrib['id'],
|
||||||
item['article_title'] = article.css('h1.entry-title > a::text').get(),
|
item['article_title'] = article.css('h1.entry-title > a::text').get(),
|
||||||
item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(),
|
item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(),
|
||||||
item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
|
item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip()),
|
||||||
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
|
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
|
||||||
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
|
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
|
||||||
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
|
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
def parse_date(self, formatted_date: str) -> datetime:
    """Parse a post-header date line into a naive datetime.

    Expected layout: "Posted on August 21st, 2023 at 10:30 in".
    English ordinal suffixes (st/nd/rd/th) following the day number are
    stripped first, because strptime cannot consume them.

    Raises ValueError if the text does not match the expected layout.
    """
    # "21st" -> "21"; requiring a leading digit keeps the substitution
    # from touching ordinary words.
    cleaned = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
    return datetime.strptime(cleaned, "Posted on %B %d, %Y at %H:%M in")
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
    """Run the TvShow spider in-process and block until it finishes.

    Returns:
        0, suitable for use as a process exit code.
    """
    process = CrawlerProcess()
    # crawl() schedules the spider and returns a Deferred we don't need;
    # start() runs the reactor and blocks until crawling completes.
    _ = process.crawl(TvShow)
    process.start()
    return 0
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user