Refactoring the scraper to use our RlsbbScraperItem

Implementing a scrapy.Item to collect our data
Adding some logic to get the number of pages to crawl from the command line
2024-12-09 10:20:20 +01:00 · 2024-12-09 10:19:42 +01:00 · 2024-12-09 10:18:50 +01:00 · 2024-12-09 10:16:07 +01:00
3 changed files with 35 additions and 16 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 *.json
 *.csv
 # Created by https://www.toptal.com/developers/gitignore/api/python
 # Edit at https://www.toptal.com/developers/gitignore?templates=python
--- a/rlsbb_scraper/items.py
+++ b/rlsbb_scraper/items.py
@@ -3,10 +3,14 @@
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/items.html
-import scrapy
+from scrapy.item import Field, Item
-class RlsbbScraperItem(scrapy.Item):
+class RlsbbScraperItem(Item):
-    # define the fields for your item here like:
+    article_id = Field()
-    # name = scrapy.Field()
+    article_title = Field()
-    pass
+    title = Field()
    date = Field()
    summary = Field()
    image_url = Field()
    download_url = Field()
--- a/rlsbb_scraper/spiders/rlsbb.py
+++ b/rlsbb_scraper/spiders/rlsbb.py
@@ -1,22 +1,34 @@
 import scrapy
 from rlsbb_scraper.items import RlsbbScraperItem
 class RlsBBMagsSpider(scrapy.Spider):
    name = "rlsbb_mags"
-    start_urls = [
+    
        "https://rlsbb.ru/category/magazines/"
    ]
    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
    }
    def __init__(self, start_page=1, end_page=10):
        self.start_page = int(start_page)
        self.end_page = int(end_page)
    def start_requests(self):
        for i in range(self.start_page, self.end_page + 1):
            if i == 1:
                yield scrapy.Request(url="https://rlsbb.ru/category/magazines/", callback=self.parse)
            else:
                yield scrapy.Request(url=f"https://rlsbb.ru/category/magazines/page/{i}/", callback=self.parse)
    def parse(self, response):
        for article in response.css("article"):
-            yield {
+            item = RlsbbScraperItem()
-                'id': article.attrib['id'],
+            item['article_id'] = article.attrib['id'],
-                'article_title': article.css('h1.entry-title > a::text').get(),
+            item['article_title'] = article.css('h1.entry-title > a::text').get(),
-                'title': article.css('.entry-summary > p > strong::text').get(),
+            item['title'] = article.css('.entry-summary > p > strong::text').get(),
-                'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
+            item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
-                'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
+            item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
-                'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
+            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
-            }
+            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
            yield item
Author	SHA1	Message	Date
edipretoro	7d4dd8edbf	Refactoring the scraper to use our RlsbbScraperItem	2024-12-09 10:20:20 +01:00
edipretoro	1f4601b8cd	Implementing a scrapy.Item to collect our data	2024-12-09 10:19:42 +01:00
edipretoro	515f9ca361	Adding some logic to get the number of pages to crawl from the command line	2024-12-09 10:18:50 +01:00
edipretoro	ad50fe8224	Adding rules to ignore CSV & JSON files for git	2024-12-09 10:16:07 +01:00