From 7d4dd8edbf3d439c36a8abebff20e72cbd27b434 Mon Sep 17 00:00:00 2001
From: edipretoro
Date: Mon, 9 Dec 2024 10:20:20 +0100
Subject: [PATCH] Refactoring the scraper to use our RlsbbScraperItem

parse() now fills a RlsbbScraperItem for each <article> element instead
of yielding a plain dict. The former 'id' key is stored as 'article_id',
and a new 'summary' field holds the child nodes of the entry-summary
div.

Each field is assigned without a trailing comma: in an assignment
statement `x = value,` builds a one-element tuple, which would wrap
every scraped value in a tuple instead of storing the value itself.
---
 rlsbb_scraper/spiders/rlsbb.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/rlsbb_scraper/spiders/rlsbb.py b/rlsbb_scraper/spiders/rlsbb.py
index 0e57807..584f546 100644
--- a/rlsbb_scraper/spiders/rlsbb.py
+++ b/rlsbb_scraper/spiders/rlsbb.py
@@ -1,5 +1,7 @@
 import scrapy
 
+from rlsbb_scraper.items import RlsbbScraperItem
+
 
 class RlsBBMagsSpider(scrapy.Spider):
     name = "rlsbb_mags"
@@ -21,11 +23,12 @@ class RlsBBMagsSpider(scrapy.Spider):
 
     def parse(self, response):
         for article in response.css("article"):
-            yield {
-                'id': article.attrib['id'],
-                'article_title': article.css('h1.entry-title > a::text').get(),
-                'title': article.css('.entry-summary > p > strong::text').get(),
-                'date': article.css('.entry-meta-header-before::text').getall()[1].strip(),
-                'image_url': article.css('.entry-summary > p > img::attr(src)').get(),
-                'download_url': article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
-            }
+            item = RlsbbScraperItem()
+            item['article_id'] = article.attrib['id']
+            item['article_title'] = article.css('h1.entry-title > a::text').get()
+            item['title'] = article.css('.entry-summary > p > strong::text').get()
+            item['date'] = article.css('.entry-meta-header-before::text').getall()[1].strip()
+            item['summary'] = article.xpath('.//div[@class="entry-summary"]/node()').extract()
+            item['image_url'] = article.css('.entry-summary > p > img::attr(src)').get()
+            item['download_url'] = article.css('.entry-summary > p > a[href ^= "https://rapidgator"]::attr(href)').get()
+            yield item