From 3de4bf819077815ce0a610381e0b47a2b08e30b4 Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Wed, 18 Mar 2026 08:22:21 +0100
Subject: [PATCH] refactor: extract _fetch_raw to BaseScraper (DRY)

Identical _fetch_raw() implementation (URL fetch + cache + error handling)
was duplicated in 4 scrapers. Moved it to BaseScraper, which now has an
__init__ that takes the url param and sets up the cache attribute.

Each scraper now passes url to super().__init__() and inherits _fetch_raw().
Eliminates ~48 lines of duplicated code.

DRY audit now clean: resolve logic in common.py, scraper CLI in base_scraper,
_fetch_raw in BaseScraper. Remaining duplications are justified (different
list_platforms semantics, context-specific hash computation).
---
 scripts/scraper/base_scraper.py     | 18 ++++++++++++++++++
 scripts/scraper/batocera_scraper.py | 14 +-------------
 scripts/scraper/libretro_scraper.py | 15 +--------------
 scripts/scraper/recalbox_scraper.py | 14 +-------------
 scripts/scraper/retrobat_scraper.py | 14 +-------------
 5 files changed, 22 insertions(+), 53 deletions(-)

diff --git a/scripts/scraper/base_scraper.py b/scripts/scraper/base_scraper.py
index efd16f27..b0132d81 100644
--- a/scripts/scraper/base_scraper.py
+++ b/scripts/scraper/base_scraper.py
@@ -48,6 +48,24 @@ class ChangeSet:
 class BaseScraper(ABC):
     """Abstract base class for platform BIOS requirement scrapers."""
 
+    def __init__(self, url: str = ""):
+        self.url = url
+        self._raw_data: str | None = None
+
+    def _fetch_raw(self) -> str:
+        """Fetch raw content from source URL. Cached after first call."""
+        if self._raw_data is not None:
+            return self._raw_data
+        if not self.url:
+            raise ValueError("No source URL configured")
+        try:
+            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                self._raw_data = resp.read().decode("utf-8")
+                return self._raw_data
+        except urllib.error.URLError as e:
+            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
+
     @abstractmethod
     def fetch_requirements(self) -> list[BiosRequirement]:
         """Fetch current BIOS requirements from the platform source."""
diff --git a/scripts/scraper/batocera_scraper.py b/scripts/scraper/batocera_scraper.py
index 58cdd6c9..30a521de 100644
--- a/scripts/scraper/batocera_scraper.py
+++ b/scripts/scraper/batocera_scraper.py
@@ -89,20 +89,8 @@ class Scraper(BaseScraper):
     """Scraper for batocera-systems Python dict."""
 
     def __init__(self, url: str = SOURCE_URL):
-        self.url = url
-        self._raw_data: str | None = None
+        super().__init__(url=url)
 
-    def _fetch_raw(self) -> str:
-        if self._raw_data is not None:
-            return self._raw_data
-
-        try:
-            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
-            with urllib.request.urlopen(req, timeout=30) as resp:
-                self._raw_data = resp.read().decode("utf-8")
-                return self._raw_data
-        except urllib.error.URLError as e:
-            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
 
     def _extract_systems_dict(self, raw: str) -> dict:
         """Extract and parse the 'systems' dict from the Python source via ast.literal_eval."""
diff --git a/scripts/scraper/libretro_scraper.py b/scripts/scraper/libretro_scraper.py
index 139944a4..76b75a5f 100644
--- a/scripts/scraper/libretro_scraper.py
+++ b/scripts/scraper/libretro_scraper.py
@@ -88,21 +88,8 @@ class Scraper(BaseScraper):
     """Scraper for libretro System.dat."""
 
     def __init__(self, url: str = SOURCE_URL):
-        self.url = url
-        self._raw_data: str | None = None
+        super().__init__(url=url)
 
-    def _fetch_raw(self) -> str:
-        """Fetch raw DAT content from source URL."""
-        if self._raw_data is not None:
-            return self._raw_data
-
-        try:
-            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
-            with urllib.request.urlopen(req, timeout=30) as resp:
-                self._raw_data = resp.read().decode("utf-8")
-                return self._raw_data
-        except urllib.error.URLError as e:
-            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
 
     def fetch_requirements(self) -> list[BiosRequirement]:
         """Parse System.dat and return BIOS requirements."""
diff --git a/scripts/scraper/recalbox_scraper.py b/scripts/scraper/recalbox_scraper.py
index 089be62d..cb1ba826 100644
--- a/scripts/scraper/recalbox_scraper.py
+++ b/scripts/scraper/recalbox_scraper.py
@@ -86,20 +86,8 @@ class Scraper(BaseScraper):
     """Scraper for Recalbox es_bios.xml."""
 
     def __init__(self, url: str = SOURCE_URL):
-        self.url = url
-        self._raw_data: str | None = None
+        super().__init__(url=url)
 
-    def _fetch_raw(self) -> str:
-        if self._raw_data is not None:
-            return self._raw_data
-
-        try:
-            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
-            with urllib.request.urlopen(req, timeout=30) as resp:
-                self._raw_data = resp.read().decode("utf-8")
-                return self._raw_data
-        except urllib.error.URLError as e:
-            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
 
     def fetch_requirements(self) -> list[BiosRequirement]:
         """Parse es_bios.xml and return BIOS requirements."""
diff --git a/scripts/scraper/retrobat_scraper.py b/scripts/scraper/retrobat_scraper.py
index 34b9eb61..08610667 100644
--- a/scripts/scraper/retrobat_scraper.py
+++ b/scripts/scraper/retrobat_scraper.py
@@ -32,21 +32,9 @@ class Scraper(BaseScraper):
     """Scraper for RetroBat batocera-systems.json."""
 
     def __init__(self, url: str = SOURCE_URL):
-        self.url = url
-        self._raw_data: str | None = None
+        super().__init__(url=url)
         self._parsed: dict | None = None
 
-    def _fetch_raw(self) -> str:
-        if self._raw_data is not None:
-            return self._raw_data
-
-        try:
-            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
-            with urllib.request.urlopen(req, timeout=30) as resp:
-                self._raw_data = resp.read().decode("utf-8")
-                return self._raw_data
-        except urllib.error.URLError as e:
-            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
 
     def _parse_json(self) -> dict:
         if self._parsed is not None: