Adding logic to get the range of pages to crawl from the command line

edipretoro 2024-12-09 10:18:50 +01:00
parent ad50fe8224
commit 515f9ca361


@@ -3,13 +3,22 @@ import scrapy
 
 class RlsBBMagsSpider(scrapy.Spider):
     name = "rlsbb_mags"
-    start_urls = [
-        "https://rlsbb.ru/category/magazines/"
-    ]
     custom_settings = {
         'AUTOTHROTTLE_ENABLED': True,
     }
 
+    def __init__(self, start_page=1, end_page=10, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.start_page = int(start_page)
+        self.end_page = int(end_page)
+
+    def start_requests(self):
+        for i in range(self.start_page, self.end_page + 1):
+            if i == 1:
+                yield scrapy.Request(url="https://rlsbb.ru/category/magazines/", callback=self.parse)
+            else:
+                yield scrapy.Request(url=f"https://rlsbb.ru/category/magazines/page/{i}/", callback=self.parse)
+
     def parse(self, response):
         for article in response.css("article"):
             yield {
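
With this change, the page range is passed as spider arguments rather than hard-coded in start_urls. A minimal usage sketch, assuming the project's default settings (-a is Scrapy's standard flag for passing keyword arguments to a spider's __init__):

    scrapy crawl rlsbb_mags -a start_page=2 -a end_page=5

Spider arguments arrive as strings, which is why __init__ casts them with int(). Page 1 gets the bare category URL because the listing is unpaginated there; every other page uses the /page/{i}/ pattern.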