Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7d4dd8edbf | |||
| 1f4601b8cd | |||
| 515f9ca361 | |||
| ad50fe8224 | |||
| 6fa55f572a | |||
| 54f600f50d | |||
| 86607d90cd | |||
| 4cc5438e5c |
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,6 @@
|
|||||||
|
*.json
|
||||||
|
*.csv
|
||||||
|
|
||||||
# Created by https://www.toptal.com/developers/gitignore/api/python
|
# Created by https://www.toptal.com/developers/gitignore/api/python
|
||||||
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
||||||
|
|
||||||
|
|||||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
scrapy
|
||||||
16
rlsbb_scraper/items.py
Normal file
16
rlsbb_scraper/items.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
from scrapy.item import Field, Item
|
||||||
|
|
||||||
|
|
||||||
|
class RlsbbScraperItem(Item):
    """Fields collected for a single ``<article>`` element on a listing page.

    The notes below describe how the ``rlsbb_mags`` spider fills each field;
    any selector may yield ``None`` when the element is absent.
    """

    # Value of the article element's ``id`` attribute.
    article_id = Field()
    # Text of the link inside the article's ``h1.entry-title`` heading.
    article_title = Field()
    # Text of the first ``<strong>`` found in an entry-summary paragraph.
    title = Field()
    # Stripped text taken from the ``.entry-meta-header-before`` element.
    date = Field()
    # Serialized child nodes of the ``entry-summary`` div.
    summary = Field()
    # ``src`` of the first image found in an entry-summary paragraph.
    image_url = Field()
    # First entry-summary link whose href starts with "https://rapidgator".
    download_url = Field()
|
||||||
@ -9,7 +9,7 @@ from scrapy import signals
|
|||||||
from itemadapter import is_item, ItemAdapter
|
from itemadapter import is_item, ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
class RslbbScraperSpiderMiddleware:
|
class RlsbbScraperSpiderMiddleware:
|
||||||
# Not all methods need to be defined. If a method is not defined,
|
# Not all methods need to be defined. If a method is not defined,
|
||||||
# scrapy acts as if the spider middleware does not modify the
|
# scrapy acts as if the spider middleware does not modify the
|
||||||
# passed objects.
|
# passed objects.
|
||||||
@ -56,7 +56,7 @@ class RslbbScraperSpiderMiddleware:
|
|||||||
spider.logger.info("Spider opened: %s" % spider.name)
|
spider.logger.info("Spider opened: %s" % spider.name)
|
||||||
|
|
||||||
|
|
||||||
class RslbbScraperDownloaderMiddleware:
|
class RlsbbScraperDownloaderMiddleware:
|
||||||
# Not all methods need to be defined. If a method is not defined,
|
# Not all methods need to be defined. If a method is not defined,
|
||||||
# scrapy acts as if the downloader middleware does not modify the
|
# scrapy acts as if the downloader middleware does not modify the
|
||||||
# passed objects.
|
# passed objects.
|
||||||
@ -8,6 +8,6 @@
|
|||||||
from itemadapter import ItemAdapter
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
class RslbbScraperPipeline:
|
class RlsbbScraperPipeline:
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
return item
|
return item
|
||||||
@ -1,4 +1,4 @@
|
|||||||
# Scrapy settings for rslbb_scraper project
|
# Scrapy settings for rlsbb_scraper project
|
||||||
#
|
#
|
||||||
# For simplicity, this file contains only settings considered important or
|
# For simplicity, this file contains only settings considered important or
|
||||||
# commonly used. You can find more settings consulting the documentation:
|
# commonly used. You can find more settings consulting the documentation:
|
||||||
@ -7,14 +7,14 @@
|
|||||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
BOT_NAME = "rslbb_scraper"
|
BOT_NAME = "rlsbb_scraper"
|
||||||
|
|
||||||
SPIDER_MODULES = ["rslbb_scraper.spiders"]
|
SPIDER_MODULES = ["rlsbb_scraper.spiders"]
|
||||||
NEWSPIDER_MODULE = "rslbb_scraper.spiders"
|
NEWSPIDER_MODULE = "rlsbb_scraper.spiders"
|
||||||
|
|
||||||
|
|
||||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
#USER_AGENT = "rslbb_scraper (+http://www.yourdomain.com)"
|
#USER_AGENT = "rlsbb_scraper (+http://www.yourdomain.com)"
|
||||||
|
|
||||||
# Obey robots.txt rules
|
# Obey robots.txt rules
|
||||||
ROBOTSTXT_OBEY = True
|
ROBOTSTXT_OBEY = True
|
||||||
@ -45,13 +45,13 @@ ROBOTSTXT_OBEY = True
|
|||||||
# Enable or disable spider middlewares
|
# Enable or disable spider middlewares
|
||||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
#SPIDER_MIDDLEWARES = {
|
#SPIDER_MIDDLEWARES = {
|
||||||
# "rslbb_scraper.middlewares.RslbbScraperSpiderMiddleware": 543,
|
#    "rlsbb_scraper.middlewares.RlsbbScraperSpiderMiddleware": 543,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
# Enable or disable downloader middlewares
|
# Enable or disable downloader middlewares
|
||||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
#DOWNLOADER_MIDDLEWARES = {
|
#DOWNLOADER_MIDDLEWARES = {
|
||||||
# "rslbb_scraper.middlewares.RslbbScraperDownloaderMiddleware": 543,
|
#    "rlsbb_scraper.middlewares.RlsbbScraperDownloaderMiddleware": 543,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
# Enable or disable extensions
|
# Enable or disable extensions
|
||||||
@ -63,7 +63,7 @@ ROBOTSTXT_OBEY = True
|
|||||||
# Configure item pipelines
|
# Configure item pipelines
|
||||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
#ITEM_PIPELINES = {
|
#ITEM_PIPELINES = {
|
||||||
# "rslbb_scraper.pipelines.RslbbScraperPipeline": 300,
|
#    "rlsbb_scraper.pipelines.RlsbbScraperPipeline": 300,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||||
34
rlsbb_scraper/spiders/rlsbb.py
Normal file
34
rlsbb_scraper/spiders/rlsbb.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
from rlsbb_scraper.items import RlsbbScraperItem
|
||||||
|
|
||||||
|
|
||||||
|
class RlsBBMagsSpider(scrapy.Spider):
    """Crawl the rlsbb.ru magazines category and yield one item per article.

    Listing pages ``start_page`` through ``end_page`` (inclusive) are
    requested, and every ``<article>`` element found on a page is turned
    into one ``RlsbbScraperItem``.
    """

    name = "rlsbb_mags"

    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
    }

    def __init__(self, start_page=1, end_page=10, *args, **kwargs):
        """Store the inclusive page range to crawl.

        Args:
            start_page: First listing page to request. Converted with
                ``int()`` so string values (e.g. from ``-a``) are accepted.
            end_page: Last listing page to request (inclusive); also
                converted with ``int()``.
            *args, **kwargs: Forwarded unchanged to ``scrapy.Spider`` so
                additional spider arguments do not raise ``TypeError``.

        Raises:
            ValueError: If either bound cannot be converted to ``int``.
        """
        super().__init__(*args, **kwargs)
        self.start_page = int(start_page)
        self.end_page = int(end_page)

    def start_requests(self):
        """Yield one request per listing page in the configured range."""
        for page in range(self.start_page, self.end_page + 1):
            # Page 1 is requested via the bare category URL, without a
            # "/page/N/" suffix.
            if page == 1:
                url = "https://rlsbb.ru/category/magazines/"
            else:
                url = f"https://rlsbb.ru/category/magazines/page/{page}/"
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield a ``RlsbbScraperItem`` for each ``<article>`` in ``response``.

        Fields whose selector matches nothing are set to ``None`` (or an
        empty list for ``summary``).

        Raises:
            KeyError: If an ``<article>`` element has no ``id`` attribute.
        """
        for article in response.css("article"):
            # The date is read from the second text node of the meta header;
            # guard the index so one malformed article does not abort the
            # remaining articles on the page.
            meta_texts = article.css('.entry-meta-header-before::text').getall()

            item = RlsbbScraperItem()
            # NOTE: plain assignments on purpose -- a trailing comma here
            # would store each value as a 1-tuple instead of the value itself.
            item['article_id'] = article.attrib['id']
            item['article_title'] = article.css('h1.entry-title > a::text').get()
            item['title'] = article.css('.entry-summary > p > strong::text').get()
            item['date'] = meta_texts[1].strip() if len(meta_texts) > 1 else None
            item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').getall()
            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
            yield item
|
||||||
@ -1,12 +0,0 @@
|
|||||||
# Define here the models for your scraped items
|
|
||||||
#
|
|
||||||
# See documentation in:
|
|
||||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
|
||||||
|
|
||||||
import scrapy
|
|
||||||
|
|
||||||
|
|
||||||
class RslbbScraperItem(scrapy.Item):
|
|
||||||
# define the fields for your item here like:
|
|
||||||
# name = scrapy.Field()
|
|
||||||
pass
|
|
||||||
@ -1,22 +0,0 @@
|
|||||||
import scrapy
|
|
||||||
|
|
||||||
|
|
||||||
class RlsBBMagsSpider(scrapy.Spider):
|
|
||||||
name = "rlsbb_mags"
|
|
||||||
start_urls = [
|
|
||||||
"https://rlsbb.ru/category/magazines/"
|
|
||||||
]
|
|
||||||
custom_settings = {
|
|
||||||
'AUTOTHROTTLE_ENABLED': True,
|
|
||||||
}
|
|
||||||
|
|
||||||
def parse(self, response):
|
|
||||||
for article in response.css("article"):
|
|
||||||
yield {
|
|
||||||
'id': article.attrib['id'],
|
|
||||||
'article_title': article.css('h1.entry-title > a::text').get(),
|
|
||||||
'title': article.css('.entry-summary > p > strong::text').get(),
|
|
||||||
'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
|
|
||||||
'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
|
|
||||||
'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
|
|
||||||
}
|
|
||||||
@ -4,8 +4,8 @@
|
|||||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||||
|
|
||||||
[settings]
|
[settings]
|
||||||
default = rslbb_scraper.settings
|
default = rlsbb_scraper.settings
|
||||||
|
|
||||||
[deploy]
|
[deploy]
|
||||||
#url = http://localhost:6800/
|
#url = http://localhost:6800/
|
||||||
project = rslbb_scraper
|
project = rlsbb_scraper
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user