From 3de4bf819077815ce0a610381e0b47a2b08e30b4 Mon Sep 17 00:00:00 2001 From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com> Date: Wed, 18 Mar 2026 08:22:21 +0100 Subject: [PATCH] refactor: extract _fetch_raw to BaseScraper (DRY) Identical _fetch_raw() implementation (URL fetch + cache + error handling) was duplicated in 4 scrapers. Moved to BaseScraper.__init__ with url param. Each scraper now passes url to super().__init__() and inherits _fetch_raw(). Eliminates ~48 lines of duplicated code. DRY audit now clean: resolve logic in common.py, scraper CLI in base_scraper, _fetch_raw in BaseScraper. Remaining duplications are justified (different list_platforms semantics, context-specific hash computation). --- scripts/scraper/base_scraper.py | 18 ++++++++++++++++++ scripts/scraper/batocera_scraper.py | 14 +------------- scripts/scraper/libretro_scraper.py | 15 +-------------- scripts/scraper/recalbox_scraper.py | 14 +------------- scripts/scraper/retrobat_scraper.py | 14 +------------- 5 files changed, 22 insertions(+), 53 deletions(-) diff --git a/scripts/scraper/base_scraper.py b/scripts/scraper/base_scraper.py index efd16f27..b0132d81 100644 --- a/scripts/scraper/base_scraper.py +++ b/scripts/scraper/base_scraper.py @@ -48,6 +48,24 @@ class ChangeSet: class BaseScraper(ABC): """Abstract base class for platform BIOS requirement scrapers.""" + def __init__(self, url: str = ""): + self.url = url + self._raw_data: str | None = None + + def _fetch_raw(self) -> str: + """Fetch raw content from source URL. Cached after first call.""" + if self._raw_data is not None: + return self._raw_data + if not self.url: + raise ValueError("No source URL configured") + try: + req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"}) + with urllib.request.urlopen(req, timeout=30) as resp: + self._raw_data = resp.read().decode("utf-8") + return self._raw_data + except urllib.error.URLError as e: + raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e + @abstractmethod def fetch_requirements(self) -> list[BiosRequirement]: """Fetch current BIOS requirements from the platform source.""" diff --git a/scripts/scraper/batocera_scraper.py b/scripts/scraper/batocera_scraper.py index 58cdd6c9..30a521de 100644 --- a/scripts/scraper/batocera_scraper.py +++ b/scripts/scraper/batocera_scraper.py @@ -89,20 +89,8 @@ class Scraper(BaseScraper): """Scraper for batocera-systems Python dict.""" def __init__(self, url: str = SOURCE_URL): - self.url = url - self._raw_data: str | None = None + super().__init__(url=url) - def _fetch_raw(self) -> str: - if self._raw_data is not None: - return self._raw_data - - try: - req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"}) - with urllib.request.urlopen(req, timeout=30) as resp: - self._raw_data = resp.read().decode("utf-8") - return self._raw_data - except urllib.error.URLError as e: - raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e def _extract_systems_dict(self, raw: str) -> dict: """Extract and parse the 'systems' dict from the Python source via ast.literal_eval.""" diff --git a/scripts/scraper/libretro_scraper.py b/scripts/scraper/libretro_scraper.py index 139944a4..76b75a5f 100644 --- a/scripts/scraper/libretro_scraper.py +++ b/scripts/scraper/libretro_scraper.py @@ -88,21 +88,8 @@ class Scraper(BaseScraper): """Scraper for libretro System.dat.""" def __init__(self, url: str = SOURCE_URL): - self.url = url - self._raw_data: str | None = None + super().__init__(url=url) - def _fetch_raw(self) -> str: - """Fetch raw DAT content from source URL.""" - if self._raw_data is not None: - return self._raw_data - - try: - req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"}) - with urllib.request.urlopen(req, timeout=30) as resp: - self._raw_data = resp.read().decode("utf-8") - return self._raw_data - except urllib.error.URLError as e: - raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e def fetch_requirements(self) -> list[BiosRequirement]: """Parse System.dat and return BIOS requirements.""" diff --git a/scripts/scraper/recalbox_scraper.py b/scripts/scraper/recalbox_scraper.py index 089be62d..cb1ba826 100644 --- a/scripts/scraper/recalbox_scraper.py +++ b/scripts/scraper/recalbox_scraper.py @@ -86,20 +86,8 @@ class Scraper(BaseScraper): """Scraper for Recalbox es_bios.xml.""" def __init__(self, url: str = SOURCE_URL): - self.url = url - self._raw_data: str | None = None + super().__init__(url=url) - def _fetch_raw(self) -> str: - if self._raw_data is not None: - return self._raw_data - - try: - req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"}) - with urllib.request.urlopen(req, timeout=30) as resp: - self._raw_data = resp.read().decode("utf-8") - return self._raw_data - except urllib.error.URLError as e: - raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e def fetch_requirements(self) -> list[BiosRequirement]: """Parse es_bios.xml and return BIOS requirements.""" diff --git a/scripts/scraper/retrobat_scraper.py b/scripts/scraper/retrobat_scraper.py index 34b9eb61..08610667 100644 --- a/scripts/scraper/retrobat_scraper.py +++ b/scripts/scraper/retrobat_scraper.py @@ -32,21 +32,9 @@ class Scraper(BaseScraper): """Scraper for RetroBat batocera-systems.json.""" def __init__(self, url: str = SOURCE_URL): - self.url = url - self._raw_data: str | None = None + super().__init__(url=url) self._parsed: dict | None = None - def _fetch_raw(self) -> str: - if self._raw_data is not None: - return self._raw_data - - try: - req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"}) - with urllib.request.urlopen(req, timeout=30) as resp: - self._raw_data = resp.read().decode("utf-8") - return self._raw_data - except urllib.error.URLError as e: - raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e def _parse_json(self) -> dict: if self._parsed is not None: