refactor: security hardening + mame arcade bios updates

Security fixes:
- Zip-slip protection in _extract_zip_to_archive (sanitize paths)
- Hash verification for large file downloads (cache + post-download)
- Sanitize YAML destination fields against path traversal
- Size limit on ZIP entry reads (512MB cap, prevents zip bombs)
- Download size limits in auto_fetch (100MB cap)
- Reject hashless external downloads
- Sanitize filenames in place_file with basename()

MAME arcade updates from Batocera v38 pack:
- Updated naomi, naomi2, naomigd, awbios, airlbios, hod2bios, hikaru
- Old versions preserved in .variants/ for RetroBat compatibility

Batocera 675/680 (+9); all other platforms unchanged at 0 missing
This commit is contained in:
Abdessamad Derraz
2026-03-17 15:32:14 +01:00
parent af74fffa14
commit 5ab82a7898
26 changed files with 338 additions and 269 deletions

View File

@@ -38,6 +38,8 @@ DEFAULT_DB = "database.json"
DEFAULT_PLATFORMS_DIR = "platforms"
DEFAULT_BIOS_DIR = "bios"
MAX_DOWNLOAD = 100 * 1024 * 1024 # 100MB per file
LEGACY_BRANCHES = ["libretro", "RetroArch", "RetroPie", "Recalbox", "batocera", "Other"]
PUBLIC_REPOS = [
@@ -106,6 +108,14 @@ def verify_content(data: bytes, expected: dict) -> bool:
return False
def _read_limited(resp, limit: int = MAX_DOWNLOAD) -> bytes | None:
    """Read a response body while enforcing a byte ceiling.

    Requests one byte beyond *limit* purely to detect overflow:
    returns the body as bytes when it fits, or None when the
    response exceeds *limit* bytes.
    """
    body = resp.read(limit + 1)
    return None if len(body) > limit else body
def step1_crossref_db(entry: dict, db: dict) -> str | None:
"""Check if file exists under different name/path in database."""
sha1 = entry.get("sha1")
@@ -166,7 +176,9 @@ def step3_search_public_repos(entry: dict) -> bytes | None:
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios-fetch/1.0"})
with urllib.request.urlopen(req, timeout=30) as resp:
data = resp.read()
data = _read_limited(resp)
if data is None:
continue
if verify_content(data, entry):
return data
except (urllib.error.URLError, urllib.error.HTTPError):
@@ -177,7 +189,9 @@ def step3_search_public_repos(entry: dict) -> bytes | None:
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios-fetch/1.0"})
with urllib.request.urlopen(req, timeout=30) as resp:
data = resp.read()
data = _read_limited(resp)
if data is None:
continue
if verify_content(data, entry):
return data
except (urllib.error.URLError, urllib.error.HTTPError):
@@ -196,7 +210,9 @@ def step4_search_archive_org(entry: dict) -> bytes | None:
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios-fetch/1.0"})
with urllib.request.urlopen(req, timeout=30) as resp:
data = resp.read()
data = _read_limited(resp)
if data is None:
continue
if verify_content(data, entry):
return data
except (urllib.error.URLError, urllib.error.HTTPError):
@@ -223,8 +239,8 @@ def step4_search_archive_org(entry: dict) -> bytes | None:
try:
req2 = urllib.request.Request(dl_url, headers={"User-Agent": "retrobios-fetch/1.0"})
with urllib.request.urlopen(req2, timeout=30) as resp2:
data = resp2.read()
if verify_content(data, entry):
data = _read_limited(resp2)
if data is not None and verify_content(data, entry):
return data
except (urllib.error.URLError, urllib.error.HTTPError):
pass
@@ -236,7 +252,7 @@ def step4_search_archive_org(entry: dict) -> bytes | None:
def place_file(data: bytes, entry: dict, bios_dir: str, db: dict) -> str:
"""Place a fetched BIOS file in the correct location."""
name = entry["name"]
name = os.path.basename(entry["name"])
system = entry["system"]
dest_dir = Path(bios_dir)

View File

@@ -39,12 +39,35 @@ DEFAULT_BIOS_DIR = "bios"
LARGE_FILES_RELEASE = "large-files"
LARGE_FILES_REPO = "Abdess/retrobios"
MAX_ENTRY_SIZE = 512 * 1024 * 1024 # 512MB
def fetch_large_file(name: str, dest_dir: str = ".cache/large") -> str | None:
def _verify_file_hash(path: str, expected_sha1: str = "",
expected_md5: str = "") -> bool:
"""Compute and compare hash of a local file."""
if not expected_sha1 and not expected_md5:
return True
h = hashlib.sha1() if expected_sha1 else hashlib.md5()
with open(path, "rb") as f:
while True:
chunk = f.read(65536)
if not chunk:
break
h.update(chunk)
return h.hexdigest() == (expected_sha1 or expected_md5)
def fetch_large_file(name: str, dest_dir: str = ".cache/large",
expected_sha1: str = "", expected_md5: str = "") -> str | None:
"""Download a large file from the 'large-files' GitHub release if not cached."""
cached = os.path.join(dest_dir, name)
if os.path.exists(cached):
return cached
if expected_sha1 or expected_md5:
if _verify_file_hash(cached, expected_sha1, expected_md5):
return cached
os.unlink(cached)
else:
return cached
encoded_name = urllib.request.quote(name)
url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
@@ -58,10 +81,22 @@ def fetch_large_file(name: str, dest_dir: str = ".cache/large") -> str | None:
if not chunk:
break
f.write(chunk)
return cached
except (urllib.error.URLError, urllib.error.HTTPError):
return None
if expected_sha1 or expected_md5:
if not _verify_file_hash(cached, expected_sha1, expected_md5):
os.unlink(cached)
return None
return cached
def _sanitize_path(raw: str) -> str:
"""Strip path traversal components from a relative path."""
raw = raw.replace("\\", "/")
parts = [p for p in raw.split("/") if p and p != ".."]
return "/".join(parts)
def resolve_file(file_entry: dict, db: dict, bios_dir: str,
zip_contents: dict | None = None) -> tuple[str | None, str]:
@@ -110,7 +145,7 @@ def resolve_file(file_entry: dict, db: dict, bios_dir: str,
return local_path, "zip_exact"
# Release assets override local files (authoritative large files)
cached = fetch_large_file(name)
cached = fetch_large_file(name, expected_sha1=sha1 or "", expected_md5=md5 or "")
if cached:
return cached, "release_asset"
@@ -146,6 +181,8 @@ def build_zip_contents_index(db: dict) -> dict:
for info in zf.infolist():
if info.is_dir():
continue
if info.file_size > MAX_ENTRY_SIZE:
continue
data = zf.read(info.filename)
inner_md5 = hashlib.md5(data).hexdigest()
index[inner_md5] = sha1
@@ -160,6 +197,14 @@ def download_external(file_entry: dict, dest_path: str) -> bool:
if not url:
return False
sha256 = file_entry.get("sha256")
sha1 = file_entry.get("sha1")
md5 = file_entry.get("md5")
if not (sha256 or sha1 or md5):
print(f" WARNING: no hash for {file_entry['name']}, skipping unverifiable download")
return False
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack-gen/1.0"})
with urllib.request.urlopen(req, timeout=120) as resp:
@@ -168,11 +213,6 @@ def download_external(file_entry: dict, dest_path: str) -> bool:
print(f" WARNING: Failed to download {url}: {e}")
return False
# Verify hash
sha256 = file_entry.get("sha256")
sha1 = file_entry.get("sha1")
md5 = file_entry.get("md5")
if sha256:
actual = hashlib.sha256(data).hexdigest()
if actual != sha256:
@@ -228,7 +268,9 @@ def generate_pack(
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
for sys_id, system in sorted(config.get("systems", {}).items()):
for file_entry in system.get("files", []):
dest = file_entry.get("destination", file_entry["name"])
dest = _sanitize_path(file_entry.get("destination", file_entry["name"]))
if not dest:
continue
if base_dest:
full_dest = f"{base_dest}/{dest}"
else:
@@ -316,8 +358,11 @@ def _extract_zip_to_archive(source_zip: str, dest_prefix: str, target_zf: zipfil
for info in src.infolist():
if info.is_dir():
continue
clean_name = _sanitize_path(info.filename)
if not clean_name:
continue
data = src.read(info.filename)
target_path = f"{dest_prefix}/{info.filename}" if dest_prefix else info.filename
target_path = f"{dest_prefix}/{clean_name}" if dest_prefix else clean_name
target_zf.writestr(target_path, data)