Compare commits

...

8 Commits

12 changed files with 67 additions and 47 deletions

3
.gitignore vendored
View File

@ -1,3 +1,6 @@
*.json
*.csv
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
scrapy

16
rlsbb_scraper/items.py Normal file
View File

@ -0,0 +1,16 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Field, Item
class RlsbbScraperItem(Item):
    """Structured container for one magazine post scraped from rlsbb.ru."""

    # Identity of the post as it appears on the site.
    article_id = Field()
    article_title = Field()

    # Magazine metadata pulled from the entry summary.
    title = Field()
    date = Field()
    summary = Field()

    # Media and download links.
    image_url = Field()
    download_url = Field()

View File

@ -9,7 +9,7 @@ from scrapy import signals
from itemadapter import is_item, ItemAdapter
class RslbbScraperSpiderMiddleware:
class RlsbbScraperSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@ -56,7 +56,7 @@ class RslbbScraperSpiderMiddleware:
spider.logger.info("Spider opened: %s" % spider.name)
class RslbbScraperDownloaderMiddleware:
class RlsbbScraperDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.

View File

@ -8,6 +8,6 @@
from itemadapter import ItemAdapter
class RslbbScraperPipeline:
class RlsbbScraperPipeline:
def process_item(self, item, spider):
return item

View File

@ -1,4 +1,4 @@
# Scrapy settings for rslbb_scraper project
# Scrapy settings for rlsbb_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
@ -7,14 +7,14 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "rslbb_scraper"
BOT_NAME = "rlsbb_scraper"
SPIDER_MODULES = ["rslbb_scraper.spiders"]
NEWSPIDER_MODULE = "rslbb_scraper.spiders"
SPIDER_MODULES = ["rlsbb_scraper.spiders"]
NEWSPIDER_MODULE = "rlsbb_scraper.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "rslbb_scraper (+http://www.yourdomain.com)"
#USER_AGENT = "rlsbb_scraper (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
@ -45,13 +45,13 @@ ROBOTSTXT_OBEY = True
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "rslbb_scraper.middlewares.RslbbScraperSpiderMiddleware": 543,
# "rlsbb_scraper.middlewares.RlsbbScraperSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "rslbb_scraper.middlewares.RslbbScraperDownloaderMiddleware": 543,
# "rlsbb_scraper.middlewares.RlsbbScraperDownloaderMiddleware": 543,
#}
# Enable or disable extensions
@ -63,7 +63,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "rslbb_scraper.pipelines.RslbbScraperPipeline": 300,
# "rlsbb_scraper.pipelines.RlsbbScraperPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@ -0,0 +1,34 @@
import scrapy
from rlsbb_scraper.items import RlsbbScraperItem
class RlsBBMagsSpider(scrapy.Spider):
    """Spider for the magazines category of rlsbb.ru.

    Crawls listing pages ``start_page`` through ``end_page`` (inclusive)
    and yields one :class:`RlsbbScraperItem` per ``<article>`` element.
    """

    name = "rlsbb_mags"
    custom_settings = {
        # Be polite: let Scrapy adapt the request rate to server load.
        'AUTOTHROTTLE_ENABLED': True,
    }

    def __init__(self, start_page=1, end_page=10, *args, **kwargs):
        # BUGFIX: the original __init__ never called super().__init__(),
        # so spider arguments passed by the crawler were dropped.
        super().__init__(*args, **kwargs)
        self.start_page = int(start_page)
        self.end_page = int(end_page)

    def start_requests(self):
        # Page 1 lives at the category root; later pages use /page/<n>/.
        for page in range(self.start_page, self.end_page + 1):
            if page == 1:
                url = "https://rlsbb.ru/category/magazines/"
            else:
                url = f"https://rlsbb.ru/category/magazines/page/{page}/"
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one item per article on a listing page."""
        for article in response.css("article"):
            item = RlsbbScraperItem()
            # BUGFIX: the original assignments ended in trailing commas,
            # which silently wrapped each value in a one-element tuple.
            item['article_id'] = article.attrib['id']
            item['article_title'] = article.css('h1.entry-title > a::text').get()
            item['title'] = article.css('.entry-summary > p > strong::text').get()
            # NOTE(review): assumes at least two text nodes in the meta
            # header; index 1 will raise IndexError otherwise — confirm.
            item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip()
            item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract()
            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
            yield item

View File

@ -1,12 +0,0 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class RslbbScraperItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

View File

@ -1,22 +0,0 @@
import scrapy
class RlsBBMagsSpider(scrapy.Spider):
name = "rlsbb_mags"
start_urls = [
"https://rlsbb.ru/category/magazines/"
]
custom_settings = {
'AUTOTHROTTLE_ENABLED': True,
}
def parse(self, response):
for article in response.css("article"):
yield {
'id': article.attrib['id'],
'article_title': article.css('h1.entry-title > a::text').get(),
'title': article.css('.entry-summary > p > strong::text').get(),
'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
}

View File

@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = rslbb_scraper.settings
default = rlsbb_scraper.settings
[deploy]
#url = http://localhost:6800/
project = rslbb_scraper
project = rlsbb_scraper