Compare commits

...

8 Commits

12 changed files with 67 additions and 47 deletions

3
.gitignore vendored
View File

@ -1,3 +1,6 @@
*.json
*.csv
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
scrapy

16
rlsbb_scraper/items.py Normal file
View File

@ -0,0 +1,16 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Field, Item
class RlsbbScraperItem(Item):
    """Structured container for one magazine post scraped from rlsbb.ru."""

    # Identity of the post as it appears on the site.
    article_id = Field()
    article_title = Field()

    # Magazine metadata pulled from the entry summary.
    title = Field()
    date = Field()
    summary = Field()

    # Media and download links.
    image_url = Field()
    download_url = Field()

View File

@ -9,7 +9,7 @@ from scrapy import signals
from itemadapter import is_item, ItemAdapter
class RslbbScraperSpiderMiddleware:
class RlsbbScraperSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@ -56,7 +56,7 @@ class RslbbScraperSpiderMiddleware:
spider.logger.info("Spider opened: %s" % spider.name)
class RslbbScraperDownloaderMiddleware:
class RlsbbScraperDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.

View File

@ -8,6 +8,6 @@
from itemadapter import ItemAdapter
class RslbbScraperPipeline:
class RlsbbScraperPipeline:
def process_item(self, item, spider):
return item

View File

@ -1,4 +1,4 @@
# Scrapy settings for rslbb_scraper project
# Scrapy settings for rlsbb_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
@ -7,14 +7,14 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "rslbb_scraper"
BOT_NAME = "rlsbb_scraper"
SPIDER_MODULES = ["rslbb_scraper.spiders"]
NEWSPIDER_MODULE = "rslbb_scraper.spiders"
SPIDER_MODULES = ["rlsbb_scraper.spiders"]
NEWSPIDER_MODULE = "rlsbb_scraper.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "rslbb_scraper (+http://www.yourdomain.com)"
#USER_AGENT = "rlsbb_scraper (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
@ -45,13 +45,13 @@ ROBOTSTXT_OBEY = True
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "rslbb_scraper.middlewares.RslbbScraperSpiderMiddleware": 543,
# "rlsbb_scraper.middlewares.RlsbbScraperSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "rslbb_scraper.middlewares.RslbbScraperDownloaderMiddleware": 543,
# "rlsbb_scraper.middlewares.RlsbbScraperDownloaderMiddleware": 543,
#}
# Enable or disable extensions
@ -63,7 +63,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "rslbb_scraper.pipelines.RslbbScraperPipeline": 300,
# "rlsbb_scraper.pipelines.RlsbbScraperPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@ -0,0 +1,34 @@
import scrapy
from rlsbb_scraper.items import RlsbbScraperItem
class RlsBBMagsSpider(scrapy.Spider):
    """Spider for the magazines category of rlsbb.ru.

    Crawls listing pages ``start_page`` through ``end_page`` (inclusive)
    and yields one :class:`RlsbbScraperItem` per ``<article>`` element.
    """

    name = "rlsbb_mags"
    custom_settings = {
        # Be polite: let Scrapy adapt the request rate to server load.
        'AUTOTHROTTLE_ENABLED': True,
    }

    def __init__(self, start_page=1, end_page=10, *args, **kwargs):
        # BUGFIX: the original __init__ never called super().__init__(),
        # so spider arguments passed by the crawler were dropped.
        super().__init__(*args, **kwargs)
        self.start_page = int(start_page)
        self.end_page = int(end_page)

    def start_requests(self):
        # Page 1 lives at the category root; later pages use /page/<n>/.
        for page in range(self.start_page, self.end_page + 1):
            if page == 1:
                url = "https://rlsbb.ru/category/magazines/"
            else:
                url = f"https://rlsbb.ru/category/magazines/page/{page}/"
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one item per article on a listing page."""
        for article in response.css("article"):
            item = RlsbbScraperItem()
            # BUGFIX: the original assignments ended in trailing commas,
            # which silently wrapped each value in a one-element tuple.
            item['article_id'] = article.attrib['id']
            item['article_title'] = article.css('h1.entry-title > a::text').get()
            item['title'] = article.css('.entry-summary > p > strong::text').get()
            # NOTE(review): assumes at least two text nodes in the meta
            # header; index 1 will raise IndexError otherwise — confirm.
            item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip()
            item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract()
            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
            yield item

View File

@ -1,12 +0,0 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class RslbbScraperItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

View File

@ -1,22 +0,0 @@
import scrapy
class RlsBBMagsSpider(scrapy.Spider):
name = "rlsbb_mags"
start_urls = [
"https://rlsbb.ru/category/magazines/"
]
custom_settings = {
'AUTOTHROTTLE_ENABLED': True,
}
def parse(self, response):
for article in response.css("article"):
yield {
'id': article.attrib['id'],
'article_title': article.css('h1.entry-title > a::text').get(),
'title': article.css('.entry-summary > p > strong::text').get(),
'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
}

View File

@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = rslbb_scraper.settings
default = rlsbb_scraper.settings
[deploy]
#url = http://localhost:6800/
project = rslbb_scraper
project = rlsbb_scraper