Compare commits
No commits in common. "7d4dd8edbf3d439c36a8abebff20e72cbd27b434" and "6fa55f572a8d95a40ac015afd1931f426b4cd855" have entirely different histories.
7d4dd8edbf
...
6fa55f572a
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,6 +1,3 @@
|
|||||||
*.json
|
|
||||||
*.csv
|
|
||||||
|
|
||||||
# Created by https://www.toptal.com/developers/gitignore/api/python
|
# Created by https://www.toptal.com/developers/gitignore/api/python
|
||||||
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
||||||
|
|
||||||
|
|||||||
@@ -3,14 +3,10 @@
|
|||||||
# See documentation in:
|
# See documentation in:
|
||||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
from scrapy.item import Field, Item
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
class RlsbbScraperItem(Item):
|
class RlsbbScraperItem(scrapy.Item):
|
||||||
article_id = Field()
|
# define the fields for your item here like:
|
||||||
article_title = Field()
|
# name = scrapy.Field()
|
||||||
title = Field()
|
pass
|
||||||
date = Field()
|
|
||||||
summary = Field()
|
|
||||||
image_url = Field()
|
|
||||||
download_url = Field()
|
|
||||||
|
|||||||
@@ -1,34 +1,22 @@
|
|||||||
import scrapy
|
import scrapy
|
||||||
|
|
||||||
from rlsbb_scraper.items import RlsbbScraperItem
|
|
||||||
|
|
||||||
|
|
||||||
class RlsBBMagsSpider(scrapy.Spider):
|
class RlsBBMagsSpider(scrapy.Spider):
|
||||||
name = "rlsbb_mags"
|
name = "rlsbb_mags"
|
||||||
|
start_urls = [
|
||||||
|
"https://rlsbb.ru/category/magazines/"
|
||||||
|
]
|
||||||
custom_settings = {
|
custom_settings = {
|
||||||
'AUTOTHROTTLE_ENABLED': True,
|
'AUTOTHROTTLE_ENABLED': True,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, start_page=1, end_page=10):
|
|
||||||
self.start_page = int(start_page)
|
|
||||||
self.end_page = int(end_page)
|
|
||||||
|
|
||||||
def start_requests(self):
|
|
||||||
for i in range(self.start_page, self.end_page + 1):
|
|
||||||
if i == 1:
|
|
||||||
yield scrapy.Request(url="https://rlsbb.ru/category/magazines/", callback=self.parse)
|
|
||||||
else:
|
|
||||||
yield scrapy.Request(url=f"https://rlsbb.ru/category/magazines/page/{i}/", callback=self.parse)
|
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
for article in response.css("article"):
|
for article in response.css("article"):
|
||||||
item = RlsbbScraperItem()
|
yield {
|
||||||
item['article_id'] = article.attrib['id'],
|
'id': article.attrib['id'],
|
||||||
item['article_title'] = article.css('h1.entry-title > a::text').get(),
|
'article_title': article.css('h1.entry-title > a::text').get(),
|
||||||
item['title'] = article.css('.entry-summary > p > strong::text').get(),
|
'title': article.css('.entry-summary > p > strong::text').get(),
|
||||||
item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip(),
|
'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
|
||||||
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(),
|
'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
|
||||||
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(),
|
'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
|
||||||
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
|
}
|
||||||
yield item
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user