#!/usr/bin/env python
import re
import sys
from datetime import datetime
from multiprocessing import Process

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sqlalchemy import (
    Boolean,
    DateTime,
    Engine,
    ForeignKey,
    Integer,
    String,
    Text,
    create_engine,
    func,
    select,
)
from sqlalchemy.orm import (
    DeclarativeBase,
    Mapped,
    mapped_column,
    relationship,
    sessionmaker,
)


class Base(DeclarativeBase):
    """Declarative base for SQLAlchemy 2.0."""
    pass


class TvShowDB(Base):
    """Model for storing episodes (SQLAlchemy 2.0)."""

    __tablename__: str = "tvshows"

    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # Unique identifier taken from the <article id="..."> attribute.
    post_id: Mapped[str] = mapped_column(
        String(length=255), nullable=False, unique=True, index=True
    )
    post_title: Mapped[str] = mapped_column(
        String(255), nullable=False
    )
    title: Mapped[str] = mapped_column(
        String(255), nullable=False, index=True
    )
    date: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, index=True
    )
    summary: Mapped[str | None] = mapped_column(
        Text, nullable=True
    )
    image_url: Mapped[str | None] = mapped_column(
        String(255), nullable=True
    )
    # func.datetime('now') renders as SQLite's datetime('now').
    created_at: Mapped[datetime] = mapped_column(
        DateTime, server_default=func.datetime('now'), nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        server_default=func.datetime('now'),
        onupdate=func.datetime('now'),
        nullable=False
    )
    links: Mapped[list["LinkDB"]] = relationship(back_populates="show")


class LinkDB(Base):
    """Model for storing download links (SQLAlchemy 2.0)."""

    __tablename__: str = "links"

    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    link: Mapped[str] = mapped_column(
        String(255), nullable=False
    )
    is_downloaded: Mapped[bool] = mapped_column(
        Boolean, default=False
    )
    show_id: Mapped[int] = mapped_column(ForeignKey("tvshows.id"))
    show: Mapped["TvShowDB"] = relationship(back_populates="links")


class TvShowItem(scrapy.Item):
    """Scraped fields for a single post."""

    post_id = scrapy.Field()
    post_title = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    summary = scrapy.Field()
    image_url = scrapy.Field()
    download_url = scrapy.Field()


class SQLAlchemyPipeline:
    """Upserts scraped items into the SQLite database."""

    def __init__(self):
        self.engine: Engine = create_engine('sqlite:///tvshows.db', echo=True)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def process_item(self, item, spider):
        session = self.Session()
        try:
            stmt = select(TvShowDB).where(TvShowDB.post_id == item["post_id"])
            show = session.scalars(stmt).first()
            spider.logger.debug(f"{show=}")
            if not show:
                # New post: insert the show together with its links.
                show = TvShowDB(
                    post_id=item["post_id"],
                    post_title=item["post_title"],
                    title=item["title"],
                    date=item["date"],
                    summary=item["summary"],
                    image_url=item["image_url"],
                    links=[LinkDB(link=url) for url in item["download_url"]]
                )
                session.add(show)
            else:
                # Known post: refresh its fields but keep the existing links
                # so the is_downloaded state is not lost.
                for key, value in item.items():
                    if key != "download_url":
                        setattr(show, key, value)
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
        return item
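# Illustrative sketch (not part of the pipeline): once a crawl has run, the
# stored links can be queried with a plain SQLAlchemy session. Assumes the
# default sqlite:///tvshows.db database created above.
#
#     from sqlalchemy import create_engine, select
#     from sqlalchemy.orm import Session
#
#     engine = create_engine("sqlite:///tvshows.db")
#     with Session(engine) as session:
#         stmt = select(LinkDB).where(LinkDB.is_downloaded == False)  # noqa: E712
#         for pending in session.scalars(stmt):
#             print(pending.show.title, pending.link)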
class TvShow(CrawlSpider):
    name: str = "rlsb_tvshow"
    allowed_domains: list[str] = ["rlsbb.ru"]
    start_urls: list[str] = ["https://rlsbb.ru/category/tv-shows/"]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'AUTOTHROTTLE_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
        'ITEM_PIPELINES': {
            # The pipeline is defined in this module, hence the __main__ path.
            '__main__.SQLAlchemyPipeline': 300,
        },
    }
    # CrawlSpider uses parse() internally to drive the rules, so the rule
    # callback must not be named "parse"; parse_page avoids that pitfall.
    rules: list[Rule] = [
        Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse_page", follow=True)
    ]

    def parse_start_url(self, response):
        # The first category page is handled outside the rules, so extract
        # its items here as well.
        return self.parse_page(response)

    def parse_page(self, response):
        for article in response.css("article"):
            item = TvShowItem()
            item['post_id'] = article.attrib['id']
            item['post_title'] = article.css('h1.entry-title > a::text').get()
            item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get()
            item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip())
            item['summary'] = "".join(article.xpath('.//div[@class="entry-summary"]/node()').extract())
            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').extract()
            yield item

    def parse_date(self, formatted_date: str) -> datetime:
        # Strip English ordinal suffixes ("1st" -> "1") so strptime can
        # parse strings like "Posted on March 1, 2024 at 12:30 in".
        formatted_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', formatted_date)
        return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in")


def run_scrapy():
    process = CrawlerProcess()
    process.crawl(TvShow)
    process.start()


def main():
    p_scrapy = Process(target=run_scrapy)
    p_scrapy.start()
    try:
        # join() must sit inside the try block so Ctrl+C during the crawl is
        # caught; start() returns immediately and would never raise here.
        p_scrapy.join()
    except KeyboardInterrupt:
        print("Closing...")
        p_scrapy.terminate()
        p_scrapy.join(timeout=30)
        if p_scrapy.is_alive():
            print("⚠️ Scrapy could not shut down cleanly.")
            p_scrapy.kill()
    print("scrarls stopped.")


if __name__ == "__main__":
    sys.exit(main())
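# Usage sketch (assumptions: this file is saved as e.g. scrarls.py and the
# scrapy/sqlalchemy packages are installed):
#
#     $ python scrarls.py
#
# The crawl runs in a child process and SQLAlchemyPipeline persists every
# item to ./tvshows.db.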