Adding some logic to get the number of pages to crawl from the command line
parent ad50fe8224
commit 515f9ca361
@@ -3,13 +3,22 @@ import scrapy
 
 
 class RlsBBMagsSpider(scrapy.Spider):
     name = "rlsbb_mags"
-    start_urls = [
-        "https://rlsbb.ru/category/magazines/"
-    ]
     custom_settings = {
         'AUTOTHROTTLE_ENABLED': True,
     }
+
+    def __init__(self, start_page=1, end_page=10):
+        self.start_page = int(start_page)
+        self.end_page = int(end_page)
+
+    def start_requests(self):
+        for i in range(self.start_page, self.end_page + 1):
+            if i == 1:
+                yield scrapy.Request(url="https://rlsbb.ru/category/magazines/", callback=self.parse)
+            else:
+                yield scrapy.Request(url=f"https://rlsbb.ru/category/magazines/page/{i}/", callback=self.parse)
+
     def parse(self, response):
         for article in response.css("article"):
             yield {
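
Usage note: Scrapy forwards -a name=value options on the scrapy crawl command line as keyword arguments to the spider's __init__, which is how the page range added in this commit is selected per run. A minimal usage sketch (the page numbers below are only illustrative; with no -a options the defaults of start_page=1 and end_page=10 apply):

    scrapy crawl rlsbb_mags -a start_page=1 -a end_page=5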