Compare commits

...

4 Commits

3 changed files with 35 additions and 16 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,6 @@
*.json
*.csv
# Created by https://www.toptal.com/developers/gitignore/api/python # Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python # Edit at https://www.toptal.com/developers/gitignore?templates=python

View File

@@ -3,10 +3,14 @@
# See documentation in: # See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy from scrapy.item import Field, Item
class RlsbbScraperItem(scrapy.Item): class RlsbbScraperItem(Item):
# define the fields for your item here like: article_id = Field()
# name = scrapy.Field() article_title = Field()
pass title = Field()
date = Field()
summary = Field()
image_url = Field()
download_url = Field()

View File

@@ -1,22 +1,34 @@
import scrapy import scrapy
from rlsbb_scraper.items import RlsbbScraperItem
class RlsBBMagsSpider(scrapy.Spider): class RlsBBMagsSpider(scrapy.Spider):
name = "rlsbb_mags" name = "rlsbb_mags"
start_urls = [
"https://rlsbb.ru/category/magazines/"
]
custom_settings = { custom_settings = {
'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_ENABLED': True,
} }
def __init__(self, start_page=1, end_page=10):
self.start_page = int(start_page)
self.end_page = int(end_page)
def start_requests(self):
for i in range(self.start_page, self.end_page + 1):
if i == 1:
yield scrapy.Request(url="https://rlsbb.ru/category/magazines/", callback=self.parse)
else:
yield scrapy.Request(url=f"https://rlsbb.ru/category/magazines/page/{i}/", callback=self.parse)
def parse(self, response): def parse(self, response):
for article in response.css("article"): for article in response.css("article"):
yield { item = RlsbbScraperItem()
'id': article.attrib['id'], item['article_id'] = article.attrib['id'],
'article_title': article.css('h1.entry-title > a::text').get(), item['article_title'] = article.css('h1.entry-title > a::text').get(),
'title': article.css('.entry-summary > p > strong::text').get(), item['title'] = article.css('.entry-summary > p > strong::text').get(),
'date': article.css('.entry-meta-header-before::text').getall()[1].strip(), item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
'image_url': article.css('.entry-summary > p > img::attr(src)').get(), item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get() item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
} item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
yield item