scrarls/scrarls.py

#!/usr/bin/env python
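"""Scrapy crawler for the rlsbb.ru TV-show category.

Scraped posts are stored in a local SQLite database (``tvshows.db``) through
SQLAlchemy 2.0 models: one row per post plus one row per download link.
"""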
import re
import sys
from datetime import datetime
from multiprocessing import Process
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from sqlalchemy import ForeignKey, create_engine, func, select, Engine, Integer, Boolean, String, Text, DateTime
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
from sqlalchemy.orm import sessionmaker


class Base(DeclarativeBase):
    """Declarative base for SQLAlchemy 2.0."""
    pass


class TvShowDB(Base):
    """Model for storing episodes (SQLAlchemy 2.0)."""
    __tablename__: str = "tvshows"

    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True
    )
    post_id: Mapped[str] = mapped_column(
        String(length=255),
        nullable=False,
        unique=True,
        index=True
    )
    post_title: Mapped[str] = mapped_column(
        String(255),
        nullable=False
    )
    title: Mapped[str] = mapped_column(
        String(255),
        nullable=False,
        index=True
    )
    date: Mapped[datetime] = mapped_column(
        DateTime,
        nullable=False,
        index=True
    )
    summary: Mapped[str | None] = mapped_column(
        Text,
        nullable=True
    )
    image_url: Mapped[str | None] = mapped_column(
        String(255),
        nullable=True
    )
    # datetime('now') is SQLite-specific; the pipeline below uses a SQLite engine.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        server_default=func.datetime('now'),
        nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        server_default=func.datetime('now'),
        onupdate=func.datetime('now'),
        nullable=False
    )

    links: Mapped[list["LinkDB"]] = relationship(back_populates="show")


class LinkDB(Base):
    """Model for storing download links (SQLAlchemy 2.0)."""
    __tablename__: str = "links"

    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True
    )
    link: Mapped[str] = mapped_column(
        String(255),
        nullable=False
    )
    is_downloaded: Mapped[bool] = mapped_column(
        Boolean,
        default=False
    )
    show_id: Mapped[int] = mapped_column(ForeignKey("tvshows.id"))
    show: Mapped["TvShowDB"] = relationship(back_populates="links")


class TvShowItem(scrapy.Item):
    post_id: scrapy.Field = scrapy.Field()
    post_title: scrapy.Field = scrapy.Field()
    title: scrapy.Field = scrapy.Field()
    date: scrapy.Field = scrapy.Field()
    summary: scrapy.Field = scrapy.Field()
    image_url: scrapy.Field = scrapy.Field()
    download_url: scrapy.Field = scrapy.Field()


class SQLAlchemyPipeline:
    """Item pipeline that upserts scraped shows into a local SQLite database."""

    def __init__(self):
        self.engine: Engine = create_engine('sqlite:///tvshows.db', echo=True)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def process_item(self, item, spider):
        session = self.Session()
        try:
            stmt = select(TvShowDB).where(TvShowDB.post_id == item["post_id"])
            show = session.scalars(stmt).first()
            spider.logger.debug(f"{show=}")
            if not show:
                # New post: create the show together with its download links.
                show = TvShowDB(
                    post_id=item["post_id"],
                    post_title=item["post_title"],
                    title=item["title"],
                    date=item["date"],
                    summary=item["summary"],
                    image_url=item["image_url"],
                    links=[LinkDB(link=url) for url in item["download_url"]]
                )
                session.add(show)
            else:
                # Existing post: refresh its fields but keep the stored links.
                for key, value in item.items():
                    if key != "download_url":
                        setattr(show, key, value)
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
        return item


class TvShow(CrawlSpider):
    """Crawls the TV-show category of rlsbb.ru and yields one item per post."""

    name: str = "rlsb_tvshow"
    allowed_domains: list[str] = ["rlsbb.ru"]
    start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'AUTOTHROTTLE_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
        'ITEM_PIPELINES': {
            '__main__.SQLAlchemyPipeline': 300,
        },
    }
    # CrawlSpider reserves `parse` for its own rule handling, so the callback
    # must use a different name.
    rules: list[Rule] = [
        Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse_item", follow=True)
    ]

    def parse_item(self, response):
        for article in response.css("article"):
            item = TvShowItem()
            item['post_id'] = article.attrib['id']
            item['post_title'] = article.css('h1.entry-title > a::text').get()
            item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get()
            item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip())
            item['summary'] = "".join(article.xpath('.//div[@class="entry-summary"]/node()').extract())
            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').extract()
            yield item

    def parse_start_url(self, response):
        # The first category page is not matched by the pagination rule, so parse it too.
        return self.parse_item(response)

    def parse_date(self, formatted_date: str):
        """Parse dates like 'Posted on March 1st, 2023 at 12:30 in'."""
        # Drop ordinal suffixes (1st, 2nd, 3rd, 4th, ...) before strptime.
        formatted_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
        return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in")


def run_scrapy():
    process = CrawlerProcess()
    _ = process.crawl(TvShow)
    process.start()


def main():
    p_scrapy = Process(target=run_scrapy)
    try:
        p_scrapy.start()
        # Wait for the crawl to finish so a Ctrl-C can be handled here.
        p_scrapy.join()
    except KeyboardInterrupt:
        print("Closing...")
        p_scrapy.terminate()
        p_scrapy.join(timeout=30)
        if p_scrapy.is_alive():
            print("⚠️ Scrapy could not shut down cleanly.")
            p_scrapy.kill()
    print("scrarls is stopped.")


if __name__ == "__main__":
    sys.exit(main())
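

# Usage sketch (assuming the `scrapy` and `sqlalchemy` dependencies are installed):
#   python scrarls/scrarls.py
# The crawl results are written to `tvshows.db` in the current working directory.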