From 515f9ca361eb3651ff467d9c23cbc06f900ce1f3 Mon Sep 17 00:00:00 2001
From: edipretoro
Date: Mon, 9 Dec 2024 10:18:50 +0100
Subject: [PATCH] Adding some logic to get the number of pages to crawl from
 the command line

---
 rlsbb_scraper/spiders/rlsbb.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/rlsbb_scraper/spiders/rlsbb.py b/rlsbb_scraper/spiders/rlsbb.py
index 3b071df..0e57807 100644
--- a/rlsbb_scraper/spiders/rlsbb.py
+++ b/rlsbb_scraper/spiders/rlsbb.py
@@ -3,13 +3,23 @@ import scrapy
 
 class RlsBBMagsSpider(scrapy.Spider):
     name = "rlsbb_mags"
-    start_urls = [
-        "https://rlsbb.ru/category/magazines/"
-    ]
     custom_settings = {
         'AUTOTHROTTLE_ENABLED': True,
     }
+
+    def __init__(self, start_page=1, end_page=10, *args, **kwargs):
+        # Forward extra args so scrapy.Spider.__init__ still runs its
+        # bookkeeping (name/kwargs handling); skipping it breaks spiders
+        # instantiated through from_crawler.
+        super().__init__(*args, **kwargs)
+        self.start_page = int(start_page)
+        self.end_page = int(end_page)
+
+    def start_requests(self):
+        for i in range(self.start_page, self.end_page + 1):
+            if i == 1:
+                yield scrapy.Request(url="https://rlsbb.ru/category/magazines/", callback=self.parse)
+            else:
+                yield scrapy.Request(url=f"https://rlsbb.ru/category/magazines/page/{i}/", callback=self.parse)
 
     def parse(self, response):
         for article in response.css("article"):
             yield {