Compare commits

...

16 Commits

Author SHA1 Message Date
edipretoro a2dce2bba9 Refactoring: extracting SQLAlchemy-related code to its own modules 2026-01-03 11:36:25 +01:00
edipretoro edbb92ff9a Adding a rule to ignore compiled files from Python 2026-01-03 11:35:48 +01:00
edipretoro a7952bc32c Processing correctly links extracted from a post 2026-01-02 15:11:22 +01:00
edipretoro 1d0cb8ed5d Updating the SQLAlchemyPipeline to use properly TvShowDB and LinkDB 2026-01-02 15:10:56 +01:00
edipretoro d8d8109cdc Updating the imports 2026-01-02 15:10:20 +01:00
edipretoro 05866cc862 Adding a links attribute to use the relationship to the LinkDB class 2026-01-02 15:09:50 +01:00
edipretoro f0e6d73dde Removing the existing download_url attribute 2026-01-02 15:09:30 +01:00
edipretoro 4bff05bc92 Adding a LinkDB model to store different links per show 2026-01-02 15:08:47 +01:00
edipretoro 8adc0623bd Adding the SQLAlchemyPipeline to the spider pipelines 2026-01-01 21:24:48 +01:00
edipretoro 0db07013ce Creating the SQLAlchemyPipeline class 2026-01-01 21:24:27 +01:00
edipretoro eaf854c3eb Adding a rule to ignore *.db files (SQLite databases) 2026-01-01 21:23:45 +01:00
edipretoro 359b3271e4 Updating the definition of our SQLAlchemy model 2026-01-01 21:23:05 +01:00
edipretoro 15e1f837c8 Updating the extraction of fields 2026-01-01 21:22:10 +01:00
edipretoro ada99be262 Renaming some fields for the TvShowItem 2026-01-01 21:15:36 +01:00
edipretoro 177652dce1 Adding a SQLAlchemy model to store scraped posts 2025-12-31 18:12:22 +01:00
edipretoro f8bb6678f1 Adding SQLAlchemy as requirement 2025-12-31 18:06:48 +01:00
5 changed files with 192 additions and 10 deletions
+2
View File
@@ -0,0 +1,2 @@
/*.db
/__pycache__/
+85
View File
@@ -0,0 +1,85 @@
from datetime import datetime
from sqlalchemy import ForeignKey, func, Integer, Boolean, String, Text, DateTime
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base class for the SQLAlchemy 2.0 models of this module."""
class TvShowDB(Base):
    """ORM model (SQLAlchemy 2.0) storing one scraped episode post per row.

    Rows live in the ``tvshows`` table; the associated download links are
    reachable through the ``links`` relationship.
    """

    __tablename__: str = "tvshows"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required fields describing the post; ``post_id`` is unique and indexed.
    post_id: Mapped[str] = mapped_column(String(255), unique=True, index=True, nullable=False)
    post_title: Mapped[str] = mapped_column(String(255), nullable=False)
    title: Mapped[str] = mapped_column(String(255), index=True, nullable=False)
    date: Mapped[datetime] = mapped_column(DateTime, index=True, nullable=False)

    # Optional fields.
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    image_url: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Row timestamps; both default to the SQL expression datetime('now'),
    # and ``updated_at`` additionally uses it as its ``onupdate`` value.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.datetime('now')
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        nullable=False,
        server_default=func.datetime('now'),
        onupdate=func.datetime('now'),
    )

    # One-to-many side of the TvShowDB <-> LinkDB relationship.
    links: Mapped[list["LinkDB"]] = relationship(back_populates="show")
class LinkDB(Base):
    """ORM model (SQLAlchemy 2.0) storing one download link per row.

    Rows live in the ``links`` table and point back to their show through
    ``show_id`` / ``show``.
    """

    __tablename__: str = "links"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # The link itself (required).
    link: Mapped[str] = mapped_column(String(255), nullable=False)

    # Download flag; new rows start with False unless a value is supplied.
    is_downloaded: Mapped[bool] = mapped_column(Boolean, default=False)

    # Many-to-one side of the TvShowDB <-> LinkDB relationship.
    show_id: Mapped[int] = mapped_column(ForeignKey("tvshows.id"))
    show: Mapped["TvShowDB"] = relationship(back_populates="links")
+1
View File
@@ -4,6 +4,7 @@ version = "0.1.0"
requires-python = ">=3.14" requires-python = ">=3.14"
dependencies = [ dependencies = [
"scrapy>=2.13.4", "scrapy>=2.13.4",
"sqlalchemy>=2.0.45",
] ]
[tool] [tool]
+55 -9
View File
@@ -2,6 +2,7 @@
import re import re
import sys import sys
from datetime import datetime from datetime import datetime
import scrapy import scrapy
@@ -9,10 +10,16 @@ from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor from scrapy.linkextractors import LinkExtractor
from sqlalchemy import create_engine, select, Engine
from sqlalchemy.orm import sessionmaker
from models import Base, TvShowDB, LinkDB
class TvShowItem(scrapy.Item): class TvShowItem(scrapy.Item):
article_id: scrapy.Field = scrapy.Field() post_id: scrapy.Field = scrapy.Field()
article_title: scrapy.Field = scrapy.Field() post_title: scrapy.Field = scrapy.Field()
title:scrapy.Field = scrapy.Field() title:scrapy.Field = scrapy.Field()
date: scrapy.Field = scrapy.Field() date: scrapy.Field = scrapy.Field()
summary: scrapy.Field = scrapy.Field() summary: scrapy.Field = scrapy.Field()
@@ -20,6 +27,42 @@ class TvShowItem(scrapy.Item):
download_url: scrapy.Field = scrapy.Field() download_url: scrapy.Field = scrapy.Field()
class SQLAlchemyPipeline:
    """Scrapy item pipeline that stores scraped shows in a SQLite database.

    Each item is matched against the ``tvshows`` table by ``post_id``: an
    unknown post is inserted together with its download links, a known post
    has its scalar fields refreshed and any not-yet-stored links appended.
    """

    def __init__(self):
        """Create the engine, make sure the tables exist, build the session factory."""
        # NOTE(review): echo=True logs every SQL statement; kept as in the
        # original, consider disabling it outside of debugging sessions.
        self.engine: Engine = create_engine('sqlite:///tvshows.db', echo=True)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def close_spider(self, spider):
        """Dispose of the engine's connection pool once the spider is closed."""
        self.engine.dispose()

    def process_item(self, item, spider):
        """Insert or update the show described by ``item``.

        Args:
            item: Mapping-like scraped item providing ``post_id``,
                ``post_title``, ``title``, ``date``, ``summary``,
                ``image_url`` and ``download_url`` (an iterable of URLs).
            spider: The spider that produced the item (unused).

        Returns:
            The unmodified ``item`` so later pipelines can process it.

        Raises:
            Exception: Any error raised while talking to the database is
                propagated after the transaction has been rolled back.
        """
        # ``sessionmaker.begin()`` commits on success, rolls back on error
        # and always closes the session, replacing the manual
        # try/except/finally bookkeeping.
        with self.Session.begin() as session:
            stmt = select(TvShowDB).where(TvShowDB.post_id == item["post_id"])
            show = session.scalars(stmt).first()

            if show is None:
                session.add(
                    TvShowDB(
                        post_id=item["post_id"],
                        post_title=item["post_title"],
                        title=item["title"],
                        date=item["date"],
                        summary=item["summary"],
                        image_url=item["image_url"],
                        links=[LinkDB(link=url) for url in item["download_url"]],
                    )
                )
            else:
                # "download_url" is not a column; links are handled below.
                for key, value in item.items():
                    if key != "download_url":
                        setattr(show, key, value)

                # Keep links found on a re-scraped post instead of dropping
                # them, without duplicating the ones already stored.
                known_links = {stored.link for stored in show.links}
                for url in item.get("download_url") or []:
                    if url not in known_links:
                        known_links.add(url)
                        show.links.append(LinkDB(link=url))

        return item
class TvShow(CrawlSpider): class TvShow(CrawlSpider):
name: str = "rlsb_tvshow" name: str = "rlsb_tvshow"
allowed_domains: list[str] = ["rlsbb.ru"] allowed_domains: list[str] = ["rlsbb.ru"]
@@ -28,6 +71,9 @@ class TvShow(CrawlSpider):
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_ENABLED': True,
'DOWNLOAD_DELAY': 10, 'DOWNLOAD_DELAY': 10,
'ITEM_PIPELINES': {
'__main__.SQLAlchemyPipeline': 300,
},
} }
rules: list[Rule] = [ rules: list[Rule] = [
Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True) Rule(LinkExtractor(allow=r"/tv-shows/page/"), callback="parse", follow=True)
@@ -36,13 +82,13 @@ class TvShow(CrawlSpider):
def parse(self, response): def parse(self, response):
for article in response.css("article"): for article in response.css("article"):
item = TvShowItem() item = TvShowItem()
item['article_id'] = article.attrib['id'], item['post_id'] = article.attrib['id']
item['article_title'] = article.css('h1.entry-title > a::text').get(), item['post_title'] = article.css('h1.entry-title > a::text').get()
item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get(), item['title'] = article.css('.entry-summary > p:nth-child(4) > strong::text').get()
item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip()), item['date'] = self.parse_date(article.css('.entry-meta-header-before::text').getall()[1].strip())
item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract(), item['summary'] = "".join(article.xpath('.//div[@class="entry-summary"]/node()').extract())
item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get(), item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get() item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').extract()
yield item yield item
def parse_date(self, formatted_date: str): def parse_date(self, formatted_date: str):
Generated
+49 -1
View File
@@ -179,6 +179,29 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" }, { url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" },
] ]
[[package]]
name = "greenlet"
version = "3.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c7/e5/40dbda2736893e3e53d25838e0f19a2b417dfc122b9989c91918db30b5d3/greenlet-3.3.0.tar.gz", hash = "sha256:a82bb225a4e9e4d653dd2fb7b8b2d36e4fb25bc0165422a11e48b88e9e6f78fb", size = 190651, upload-time = "2025-12-04T14:49:44.05Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d7/7c/f0a6d0ede2c7bf092d00bc83ad5bafb7e6ec9b4aab2fbdfa6f134dc73327/greenlet-3.3.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:60c2ef0f578afb3c8d92ea07ad327f9a062547137afe91f38408f08aacab667f", size = 275671, upload-time = "2025-12-04T14:23:05.267Z" },
{ url = "https://files.pythonhosted.org/packages/44/06/dac639ae1a50f5969d82d2e3dd9767d30d6dbdbab0e1a54010c8fe90263c/greenlet-3.3.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a5d554d0712ba1de0a6c94c640f7aeba3f85b3a6e1f2899c11c2c0428da9365", size = 646360, upload-time = "2025-12-04T14:50:10.026Z" },
{ url = "https://files.pythonhosted.org/packages/e0/94/0fb76fe6c5369fba9bf98529ada6f4c3a1adf19e406a47332245ef0eb357/greenlet-3.3.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3a898b1e9c5f7307ebbde4102908e6cbfcb9ea16284a3abe15cab996bee8b9b3", size = 658160, upload-time = "2025-12-04T14:57:45.41Z" },
{ url = "https://files.pythonhosted.org/packages/93/79/d2c70cae6e823fac36c3bbc9077962105052b7ef81db2f01ec3b9bf17e2b/greenlet-3.3.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dcd2bdbd444ff340e8d6bdf54d2f206ccddbb3ccfdcd3c25bf4afaa7b8f0cf45", size = 671388, upload-time = "2025-12-04T15:07:15.789Z" },
{ url = "https://files.pythonhosted.org/packages/b8/14/bab308fc2c1b5228c3224ec2bf928ce2e4d21d8046c161e44a2012b5203e/greenlet-3.3.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5773edda4dc00e173820722711d043799d3adb4f01731f40619e07ea2750b955", size = 660166, upload-time = "2025-12-04T14:26:05.099Z" },
{ url = "https://files.pythonhosted.org/packages/4b/d2/91465d39164eaa0085177f61983d80ffe746c5a1860f009811d498e7259c/greenlet-3.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ac0549373982b36d5fd5d30beb8a7a33ee541ff98d2b502714a09f1169f31b55", size = 1615193, upload-time = "2025-12-04T15:04:27.041Z" },
{ url = "https://files.pythonhosted.org/packages/42/1b/83d110a37044b92423084d52d5d5a3b3a73cafb51b547e6d7366ff62eff1/greenlet-3.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d198d2d977460358c3b3a4dc844f875d1adb33817f0613f663a656f463764ccc", size = 1683653, upload-time = "2025-12-04T14:27:32.366Z" },
{ url = "https://files.pythonhosted.org/packages/7c/9a/9030e6f9aa8fd7808e9c31ba4c38f87c4f8ec324ee67431d181fe396d705/greenlet-3.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:73f51dd0e0bdb596fb0417e475fa3c5e32d4c83638296e560086b8d7da7c4170", size = 305387, upload-time = "2025-12-04T14:26:51.063Z" },
{ url = "https://files.pythonhosted.org/packages/a0/66/bd6317bc5932accf351fc19f177ffba53712a202f9df10587da8df257c7e/greenlet-3.3.0-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:d6ed6f85fae6cdfdb9ce04c9bf7a08d666cfcfb914e7d006f44f840b46741931", size = 282638, upload-time = "2025-12-04T14:25:20.941Z" },
{ url = "https://files.pythonhosted.org/packages/30/cf/cc81cb030b40e738d6e69502ccbd0dd1bced0588e958f9e757945de24404/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d9125050fcf24554e69c4cacb086b87b3b55dc395a8b3ebe6487b045b2614388", size = 651145, upload-time = "2025-12-04T14:50:11.039Z" },
{ url = "https://files.pythonhosted.org/packages/9c/ea/1020037b5ecfe95ca7df8d8549959baceb8186031da83d5ecceff8b08cd2/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:87e63ccfa13c0a0f6234ed0add552af24cc67dd886731f2261e46e241608bee3", size = 654236, upload-time = "2025-12-04T14:57:47.007Z" },
{ url = "https://files.pythonhosted.org/packages/69/cc/1e4bae2e45ca2fa55299f4e85854606a78ecc37fead20d69322f96000504/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2662433acbca297c9153a4023fe2161c8dcfdcc91f10433171cf7e7d94ba2221", size = 662506, upload-time = "2025-12-04T15:07:16.906Z" },
{ url = "https://files.pythonhosted.org/packages/57/b9/f8025d71a6085c441a7eaff0fd928bbb275a6633773667023d19179fe815/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3c6e9b9c1527a78520357de498b0e709fb9e2f49c3a513afd5a249007261911b", size = 653783, upload-time = "2025-12-04T14:26:06.225Z" },
{ url = "https://files.pythonhosted.org/packages/f6/c7/876a8c7a7485d5d6b5c6821201d542ef28be645aa024cfe1145b35c120c1/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:286d093f95ec98fdd92fcb955003b8a3d054b4e2cab3e2707a5039e7b50520fd", size = 1614857, upload-time = "2025-12-04T15:04:28.484Z" },
{ url = "https://files.pythonhosted.org/packages/4f/dc/041be1dff9f23dac5f48a43323cd0789cb798342011c19a248d9c9335536/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c10513330af5b8ae16f023e8ddbfb486ab355d04467c4679c5cfe4659975dd9", size = 1676034, upload-time = "2025-12-04T14:27:33.531Z" },
]
[[package]] [[package]]
name = "hyperlink" name = "hyperlink"
version = "21.0.0" version = "21.0.0"
@@ -450,10 +473,14 @@ version = "0.1.0"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "scrapy" }, { name = "scrapy" },
{ name = "sqlalchemy" },
] ]
[package.metadata] [package.metadata]
requires-dist = [{ name = "scrapy", specifier = ">=2.13.4" }] requires-dist = [
{ name = "scrapy", specifier = ">=2.13.4" },
{ name = "sqlalchemy", specifier = ">=2.0.45" },
]
[[package]] [[package]]
name = "service-identity" name = "service-identity"
@@ -470,6 +497,27 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/08/2c/ca6dd598b384bc1ce581e24aaae0f2bed4ccac57749d5c3befbb5e742081/service_identity-24.2.0-py3-none-any.whl", hash = "sha256:6b047fbd8a84fd0bb0d55ebce4031e400562b9196e1e0d3e0fe2b8a59f6d4a85", size = 11364, upload-time = "2024-10-26T07:21:56.302Z" }, { url = "https://files.pythonhosted.org/packages/08/2c/ca6dd598b384bc1ce581e24aaae0f2bed4ccac57749d5c3befbb5e742081/service_identity-24.2.0-py3-none-any.whl", hash = "sha256:6b047fbd8a84fd0bb0d55ebce4031e400562b9196e1e0d3e0fe2b8a59f6d4a85", size = 11364, upload-time = "2024-10-26T07:21:56.302Z" },
] ]
[[package]]
name = "sqlalchemy"
version = "2.0.45"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/be/f9/5e4491e5ccf42f5d9cfc663741d261b3e6e1683ae7812114e7636409fcc6/sqlalchemy-2.0.45.tar.gz", hash = "sha256:1632a4bda8d2d25703fdad6363058d882541bdaaee0e5e3ddfa0cd3229efce88", size = 9869912, upload-time = "2025-12-09T21:05:16.737Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/cc/64/4e1913772646b060b025d3fc52ce91a58967fe58957df32b455de5a12b4f/sqlalchemy-2.0.45-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f46ec744e7f51275582e6a24326e10c49fbdd3fc99103e01376841213028774", size = 3272404, upload-time = "2025-12-09T22:11:09.662Z" },
{ url = "https://files.pythonhosted.org/packages/b3/27/caf606ee924282fe4747ee4fd454b335a72a6e018f97eab5ff7f28199e16/sqlalchemy-2.0.45-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:883c600c345123c033c2f6caca18def08f1f7f4c3ebeb591a63b6fceffc95cce", size = 3277057, upload-time = "2025-12-09T22:13:56.213Z" },
{ url = "https://files.pythonhosted.org/packages/85/d0/3d64218c9724e91f3d1574d12eb7ff8f19f937643815d8daf792046d88ab/sqlalchemy-2.0.45-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2c0b74aa79e2deade948fe8593654c8ef4228c44ba862bb7c9585c8e0db90f33", size = 3222279, upload-time = "2025-12-09T22:11:11.1Z" },
{ url = "https://files.pythonhosted.org/packages/24/10/dd7688a81c5bc7690c2a3764d55a238c524cd1a5a19487928844cb247695/sqlalchemy-2.0.45-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8a420169cef179d4c9064365f42d779f1e5895ad26ca0c8b4c0233920973db74", size = 3244508, upload-time = "2025-12-09T22:13:57.932Z" },
{ url = "https://files.pythonhosted.org/packages/aa/41/db75756ca49f777e029968d9c9fee338c7907c563267740c6d310a8e3f60/sqlalchemy-2.0.45-cp314-cp314-win32.whl", hash = "sha256:e50dcb81a5dfe4b7b4a4aa8f338116d127cb209559124f3694c70d6cd072b68f", size = 2113204, upload-time = "2025-12-09T21:39:38.365Z" },
{ url = "https://files.pythonhosted.org/packages/89/a2/0e1590e9adb292b1d576dbcf67ff7df8cf55e56e78d2c927686d01080f4b/sqlalchemy-2.0.45-cp314-cp314-win_amd64.whl", hash = "sha256:4748601c8ea959e37e03d13dcda4a44837afcd1b21338e637f7c935b8da06177", size = 2138785, upload-time = "2025-12-09T21:39:39.503Z" },
{ url = "https://files.pythonhosted.org/packages/42/39/f05f0ed54d451156bbed0e23eb0516bcad7cbb9f18b3bf219c786371b3f0/sqlalchemy-2.0.45-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd337d3526ec5298f67d6a30bbbe4ed7e5e68862f0bf6dd21d289f8d37b7d60b", size = 3522029, upload-time = "2025-12-09T22:13:32.09Z" },
{ url = "https://files.pythonhosted.org/packages/54/0f/d15398b98b65c2bce288d5ee3f7d0a81f77ab89d9456994d5c7cc8b2a9db/sqlalchemy-2.0.45-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9a62b446b7d86a3909abbcd1cd3cc550a832f99c2bc37c5b22e1925438b9367b", size = 3475142, upload-time = "2025-12-09T22:13:33.739Z" },
{ url = "https://files.pythonhosted.org/packages/bf/e1/3ccb13c643399d22289c6a9786c1a91e3dcbb68bce4beb44926ac2c557bf/sqlalchemy-2.0.45-py3-none-any.whl", hash = "sha256:5225a288e4c8cc2308dbdd874edad6e7d0fd38eac1e9e5f23503425c8eee20d0", size = 1936672, upload-time = "2025-12-09T21:54:52.608Z" },
]
[[package]] [[package]]
name = "tldextract" name = "tldextract"
version = "5.3.0" version = "5.3.0"