Add TRS-80, RX-78, Sega AI entries; refactor tools

Add many MAME/MESS BIOS entries (TRS-80 family, Bandai RX-78, Sega AI) and update docs/navigation counts (README, mkdocs). Remove empty supplemental file references from database.json and update generated timestamps and totals. Harden and refactor tooling: add a MAX_RESPONSE_SIZE-limited reader in base_scraper, make the target scraper base class abstract, narrow exception handling in the Batocera targets parser, and switch generate_pack.py and verify.py to use build_target_cores_cache (simplifies target config loading and error handling). verify.py also loads supplemental cross-reference names and accepts them through verify_platform. Update tests to import from the updated modules (validation/truth). Misc: small bugfix for the case-insensitive path conflict check.
2026-04-13 12:22:33 -05:00 · 2026-03-29 23:04:30 +02:00
parent a08c730805
commit 0c5cde83e1
11 changed files with 658 additions and 92 deletions
--- a/scripts/scraper/base_scraper.py
+++ b/scripts/scraper/base_scraper.py
@@ -47,6 +47,24 @@ class ChangeSet:
        return ", ".join(parts) if parts else "no changes"


+MAX_RESPONSE_SIZE = 50 * 1024 * 1024  # 50 MB
+
+
+def _read_limited(resp: object, max_bytes: int = MAX_RESPONSE_SIZE) -> bytes:
+    """Read an HTTP response with a size limit to prevent OOM."""
+    chunks: list[bytes] = []
+    total = 0
+    while True:
+        chunk = resp.read(65536)  # type: ignore[union-attr]
+        if not chunk:
+            break
+        total += len(chunk)
+        if total > max_bytes:
+            raise ValueError(f"Response exceeds {max_bytes} byte limit")
+        chunks.append(chunk)
+    return b"".join(chunks)
+
+
 class BaseScraper(ABC):
    """Abstract base class for platform BIOS requirement scrapers."""

@@ -63,7 +81,7 @@ class BaseScraper(ABC):
        try:
            req = urllib.request.Request(self.url, headers={"User-Agent": "retrobios-scraper/1.0"})
            with urllib.request.urlopen(req, timeout=30) as resp:
-                self._raw_data = resp.read().decode("utf-8")
+                self._raw_data = _read_limited(resp).decode("utf-8")
                return self._raw_data
        except urllib.error.URLError as e:
            raise ConnectionError(f"Failed to fetch {self.url}: {e}") from e
--- a/scripts/scraper/targets/__init__.py
+++ b/scripts/scraper/targets/__init__.py
@@ -6,18 +6,20 @@ from __future__ import annotations

 import importlib
 import pkgutil
+from abc import ABC, abstractmethod
 from pathlib import Path


-class BaseTargetScraper:
+class BaseTargetScraper(ABC):
    """Base class for target scrapers."""

    def __init__(self, url: str = ""):
        self.url = url

+    @abstractmethod
    def fetch_targets(self) -> dict:
        """Fetch targets and their core lists. Returns dict matching target YAML format."""
-        raise NotImplementedError
+        ...

    def write_output(self, data: dict, output_path: str) -> None:
        """Write target data to YAML file."""
--- a/scripts/scraper/targets/batocera_targets_scraper.py
+++ b/scripts/scraper/targets/batocera_targets_scraper.py
@@ -152,7 +152,7 @@ def _condition_holds(condition: str, active: frozenset[str]) -> bool:
    try:
        result, _ = _check_condition(tokens, 0, active)
        return result
-    except Exception:
+    except (IndexError, ValueError, TypeError):
        return True  # conservative: include on parse failure