Adding a first working iteration of a spider for magazines in RlsBB
This commit is contained in:
parent
6998d60cc0
commit
db7d19761e
22
rslbb_scraper/spiders/rlsbb.py
Normal file
22
rslbb_scraper/spiders/rlsbb.py
Normal file
@ -0,0 +1,22 @@
|
||||
import scrapy
|
||||
|
||||
|
||||
class RlsBBMagsSpider(scrapy.Spider):
    """Spider that scrapes magazine posts from the RlsBB magazines category.

    Every ``<article>`` element found on a parsed page becomes one item dict.
    ``parse`` yields items only; it does not schedule any follow-up requests,
    so just the pages listed in ``start_urls`` are visited.
    """

    name = "rlsbb_mags"
    start_urls = [
        "https://rlsbb.ru/category/magazines/"
    ]
    custom_settings = {
        # Turn on Scrapy's AutoThrottle extension for this spider.
        'AUTOTHROTTLE_ENABLED': True,
    }

    def parse(self, response):
        """Yield one item dict per ``<article>`` element in *response*.

        Each item has the keys ``id``, ``article_title``, ``title``, ``date``,
        ``image_url`` and ``download_url``. Any field whose source node is
        absent from the markup is emitted as ``None`` rather than raising, so
        a single malformed article does not abort the rest of the page.

        Args:
            response: The Scrapy response for a category listing page.

        Yields:
            dict: The extracted fields for one article.
        """
        for article in response.css("article"):
            # The date is taken from the second text node of the meta header
            # (presumably the first one is a label/whitespace -- confirm
            # against the live markup). Guard the index so an article with a
            # shorter header yields ``None`` instead of raising IndexError.
            meta_texts = article.css('.entry-meta-header-before::text').getall()
            date = meta_texts[1].strip() if len(meta_texts) > 1 else None

            yield {
                # ``.get`` avoids a KeyError for an <article> without an id.
                'id': article.attrib.get('id'),
                'article_title': article.css('h1.entry-title > a::text').get(),
                'title': article.css('.entry-summary > p > strong::text').get(),
                'date': date,
                'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
                # Only the first link whose href starts with the rapidgator
                # prefix is kept.
                'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get(),
            }
|
||||
Loading…
x
Reference in New Issue
Block a user