scrarls/scrarls.py
2025-12-26 12:13:27 +01:00

48 lines
1.1 KiB
Python

#!/usr/bin/env python
import sys
import scrapy
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class TvShowItem(scrapy.Item):
    """Scrapy item declaring the fields collected for one TV-show entry.

    Every attribute is a plain ``scrapy.Field()`` with no metadata.  The
    meaning of each field is implied by its name only; the code visible in
    this file does not populate any of them.
    """

    # Identification of the source article.
    article_id = scrapy.Field()
    article_title = scrapy.Field()

    # Details of the entry itself.
    title = scrapy.Field()
    date = scrapy.Field()
    summary = scrapy.Field()

    # Associated links.
    image_url = scrapy.Field()
    download_url = scrapy.Field()
class TvShow(CrawlSpider):
    """Crawl spider for the paginated TV-shows category of rlsbb.ru.

    Crawling starts at the category landing page.  Every extracted link whose
    URL matches ``/tv-shows/page/`` is followed, and the response for each
    such link is passed to :meth:`parse_page`.
    """

    name = "rlsb_tvshow"
    allowed_domains = ["rlsbb.ru"]
    start_urls = ["https://rlsbb.ru/category/tv-shows/"]

    # Per-spider settings: browser-like user agent, automatic throttling and
    # a fixed delay between downloads.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'AUTOTHROTTLE_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
    }

    # The callback is deliberately NOT named "parse": the Scrapy documentation
    # warns that CrawlSpider uses ``parse`` for its own rule handling, so
    # overriding it (or using it as a rule callback) can stop the rules from
    # being applied.
    rules = (
        Rule(
            LinkExtractor(allow=r"/tv-shows/page/"),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_page(self, response: Response):
        """Handle one listing page matched by the pagination rule.

        Args:
            response: The downloaded page for a ``/tv-shows/page/`` URL.

        Returns:
            None.  No items or requests are produced yet.
        """
        # TODO: extract TvShowItem data from ``response``; the extraction
        # logic has not been written yet.
        return None
def main():
    """Run the ``TvShow`` spider in a new ``CrawlerProcess``.

    Returns:
        int: Always ``0``, intended to be used as the process exit status.
    """
    crawler_process = CrawlerProcess()
    # Schedule the spider, then start the crawl.
    crawler_process.crawl(TvShow)
    crawler_process.start()
    return 0
# Script entry point: run the crawl and hand main()'s return value to the
# interpreter as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)