Compare commits

..

2 Commits

3 changed files with 90 additions and 96 deletions
+1
View File
@@ -1 +1,2 @@
/*.db /*.db
/__pycache__/
+85
View File
@@ -0,0 +1,85 @@
from datetime import datetime
from sqlalchemy import ForeignKey, func, Integer, Boolean, String, Text, DateTime
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base class for the SQLAlchemy 2.0 models."""
class TvShowDB(Base):
    """Model for storing episodes (SQLAlchemy 2.0 typed mapping).

    Rows are kept in the ``tvshows`` table. The ``links`` relationship is the
    counterpart of ``LinkDB.show``.
    """

    __tablename__: str = "tvshows"

    # Auto-incremented integer primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required post identifier; unique and indexed.
    post_id: Mapped[str] = mapped_column(
        String(255), index=True, unique=True, nullable=False
    )

    # Required post title (not indexed).
    post_title: Mapped[str] = mapped_column(String(255), nullable=False)

    # Required, indexed title.
    title: Mapped[str] = mapped_column(String(255), index=True, nullable=False)

    # Required, indexed date.
    date: Mapped[datetime] = mapped_column(DateTime, index=True, nullable=False)

    # Optional free-form summary text.
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional image URL, limited to 255 characters.
    image_url: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Creation timestamp, defaulted on the database side.
    # NOTE(review): datetime('now') looks SQLite-specific — confirm the target database.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.datetime('now')
    )

    # Modification timestamp: same database-side default, and the same
    # expression is supplied as the ``onupdate`` value.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        nullable=False,
        server_default=func.datetime('now'),
        onupdate=func.datetime('now'),
    )

    # Collection of LinkDB rows; mirrors ``LinkDB.show``.
    links: Mapped[list["LinkDB"]] = relationship(back_populates="show")
class LinkDB(Base):
    """Model for storing download links (SQLAlchemy 2.0 typed mapping).

    Rows are kept in the ``links`` table and point at a ``tvshows`` row
    through ``show_id``.
    """

    __tablename__: str = "links"

    # Auto-incremented integer primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required link value, limited to 255 characters.
    link: Mapped[str] = mapped_column(String(255), nullable=False)

    # Download flag; defaults to False when no value is supplied.
    is_downloaded: Mapped[bool] = mapped_column(Boolean, default=False)

    # Foreign key to ``tvshows.id``.
    show_id: Mapped[int] = mapped_column(ForeignKey("tvshows.id"))

    # Owning TvShowDB row; mirrors ``TvShowDB.links``.
    show: Mapped["TvShowDB"] = relationship(back_populates="links")
+4 -96
View File
@@ -4,95 +4,17 @@ import re
import sys import sys
from datetime import datetime from datetime import datetime
from multiprocessing import Process
import scrapy import scrapy
from scrapy.crawler import CrawlerProcess from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor from scrapy.linkextractors import LinkExtractor
from sqlalchemy import ForeignKey, create_engine, func, select, Engine, Integer, Boolean, String, Text, DateTime from sqlalchemy import create_engine, select, Engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
class Base(DeclarativeBase): from models import Base, TvShowDB, LinkDB
"""Base déclarative pour SQLAlchemy 2.0."""
pass
class TvShowDB(Base):
    """Model for storing episodes (SQLAlchemy 2.0 typed mapping).

    Rows are kept in the ``tvshows`` table. The ``links`` relationship is the
    counterpart of ``LinkDB.show``.
    """

    __tablename__: str = "tvshows"

    # Auto-incremented integer primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required post identifier; unique and indexed.
    post_id: Mapped[str] = mapped_column(
        String(255), index=True, unique=True, nullable=False
    )

    # Required post title (not indexed).
    post_title: Mapped[str] = mapped_column(String(255), nullable=False)

    # Required, indexed title.
    title: Mapped[str] = mapped_column(String(255), index=True, nullable=False)

    # Required, indexed date.
    date: Mapped[datetime] = mapped_column(DateTime, index=True, nullable=False)

    # Optional free-form summary text.
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional image URL, limited to 255 characters.
    image_url: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Creation timestamp, defaulted on the database side.
    # NOTE(review): datetime('now') looks SQLite-specific — confirm the target database.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.datetime('now')
    )

    # Modification timestamp: same database-side default, and the same
    # expression is supplied as the ``onupdate`` value.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        nullable=False,
        server_default=func.datetime('now'),
        onupdate=func.datetime('now'),
    )

    # Collection of LinkDB rows; mirrors ``LinkDB.show``.
    links: Mapped[list["LinkDB"]] = relationship(back_populates="show")
class LinkDB(Base):
    """Model for storing download links (SQLAlchemy 2.0 typed mapping).

    Rows are kept in the ``links`` table and point at a ``tvshows`` row
    through ``show_id``.
    """

    __tablename__: str = "links"

    # Auto-incremented integer primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required link value, limited to 255 characters.
    link: Mapped[str] = mapped_column(String(255), nullable=False)

    # Download flag; defaults to False when no value is supplied.
    is_downloaded: Mapped[bool] = mapped_column(Boolean, default=False)

    # Foreign key to ``tvshows.id``.
    show_id: Mapped[int] = mapped_column(ForeignKey("tvshows.id"))

    # Owning TvShowDB row; mirrors ``TvShowDB.links``.
    show: Mapped["TvShowDB"] = relationship(back_populates="links")
class TvShowItem(scrapy.Item): class TvShowItem(scrapy.Item):
@@ -174,25 +96,11 @@ class TvShow(CrawlSpider):
return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in") return datetime.strptime(formatted_date, "Posted on %B %d, %Y at %H:%M in")
def run_scrapy(): def main():
process = CrawlerProcess() process = CrawlerProcess()
_ = process.crawl(TvShow) _ = process.crawl(TvShow)
process.start() process.start()
return 0
def main():
p_scrapy = Process(target=run_scrapy)
try:
p_scrapy.start()
except KeyboardInterrupt:
print(f"Closing...")
p_scrapy.terminate()
p_scrapy.join(timeout=30)
if p_scrapy.is_alive():
print("⚠️ Scrapy n'a pas pu s'arrêter proprement.")
p_scrapy.kill()
print(f"scrarls is stopped.")
if __name__ == "__main__": if __name__ == "__main__":