Adding a first working iteration of a spider for magazines in RlsBB

2024-06-05 22:33:04 +02:00 · 2024-06-05 22:33:04 +02:00 · 95c7a6cf12
commit 95c7a6cf12
parent 6998d60cc0
1 changed file with 22 additions and 0 deletions
--- a/rslbb_scraper/spiders/rlsbb.py
+++ b/rslbb_scraper/spiders/rlsbb.py
@@ -0,0 +1,22 @@
+import scrapy
+
+
+class RlsBBMagsSpider(scrapy.Spider):
+    name = "rlsbb_mags"
+    start_urls = [
+        "https://rlsbb.ru/category/magazines/"
+    ]
+    custom_settings = {
+        'AUTOTHROTTLE_ENABLED': True,
+    }
+
+    def parse(self, response):
+        for article in response.css("article"):
+            yield {
+                'id': article.attrib['id'],
+                'article_title': article.css('h1.entry-title > a::text').get(),
+                'title': article.css('.entry-summary > p > strong::text').get(),
+                'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
+                'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
+                'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
+            }