refactor: move fetch_large_file to common, auto-download on db rebuild

This commit is contained in:
Abdessamad Derraz
2026-03-25 13:19:12 +01:00
parent 910428c6f1
commit dbc26b11c1
4 changed files with 88 additions and 74 deletions

View File

@@ -9,6 +9,8 @@ from __future__ import annotations
import hashlib
import json
import os
import urllib.error
import urllib.parse
import urllib.request
import zipfile
import zlib
from pathlib import Path
@@ -694,6 +696,59 @@ def filter_files_by_mode(files: list[dict], standalone: bool) -> list[dict]:
return result
LARGE_FILES_RELEASE = "large-files"
LARGE_FILES_REPO = "Abdess/retrobios"
LARGE_FILES_CACHE = ".cache/large"
def fetch_large_file(name: str, dest_dir: str = LARGE_FILES_CACHE,
expected_sha1: str = "", expected_md5: str = "") -> str | None:
"""Download a large file from the 'large-files' GitHub release if not cached."""
cached = os.path.join(dest_dir, name)
if os.path.exists(cached):
if expected_sha1 or expected_md5:
hashes = compute_hashes(cached)
if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
os.unlink(cached)
elif expected_md5:
md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
if hashes["md5"].lower() not in md5_list:
os.unlink(cached)
else:
return cached
else:
return cached
else:
return cached
encoded_name = urllib.request.quote(name)
url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios/1.0"})
with urllib.request.urlopen(req, timeout=300) as resp:
os.makedirs(dest_dir, exist_ok=True)
with open(cached, "wb") as f:
while True:
chunk = resp.read(65536)
if not chunk:
break
f.write(chunk)
except (urllib.error.URLError, urllib.error.HTTPError):
return None
if expected_sha1 or expected_md5:
hashes = compute_hashes(cached)
if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
os.unlink(cached)
return None
if expected_md5:
md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
if hashes["md5"].lower() not in md5_list:
os.unlink(cached)
return None
return cached
def safe_extract_zip(zip_path: str, dest_dir: str) -> None:
"""Extract a ZIP file safely, preventing zip-slip path traversal."""
dest = os.path.realpath(dest_dir)

View File

@@ -216,17 +216,18 @@ def save_cache(cache_path: str, cache: dict):
json.dump(cache, f)
def _load_gitignored_bios_paths() -> set[str]:
"""Read .gitignore and return bios/ paths that are listed (large files)."""
def _load_gitignored_large_files() -> dict[str, str]:
"""Read .gitignore and return {filename: bios_path} for large files."""
gitignore = Path(".gitignore")
if not gitignore.exists():
return set()
paths = set()
return {}
entries = {}
for line in gitignore.read_text().splitlines():
line = line.strip()
if line.startswith("bios/") and not line.startswith("#"):
paths.add(line)
return paths
name = Path(line).name
entries[name] = line
return entries
def _preserve_large_file_entries(files: dict, db_path: str) -> int:
@@ -234,14 +235,14 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
Large files (>50 MB) are stored as GitHub release assets and listed
in .gitignore. When generate_db runs locally without them, their
entries would be lost. This reads the existing database and re-adds
entries whose paths match .gitignore bios/ entries.
If the file exists in .cache/large/, the path is updated so that
resolve_local_file can find it for verify and pack generation.
entries would be lost. This reads the existing database, downloads
missing files from the release, and re-adds entries with paths
pointing to the local cache.
"""
gitignored = _load_gitignored_bios_paths()
if not gitignored:
from common import fetch_large_file
large_files = _load_gitignored_large_files()
if not large_files:
return 0
try:
@@ -250,18 +251,24 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
except (FileNotFoundError, json.JSONDecodeError):
return 0
cache_dir = Path(".cache/large")
count = 0
for sha1, entry in existing_db.get("files", {}).items():
if sha1 in files:
continue
name = entry.get("name", "")
path = entry.get("path", "")
if path in gitignored and sha1 not in files:
# Point to cached copy if available
name = entry.get("name", "")
cached = cache_dir / name
if cached.exists():
entry = {**entry, "path": str(cached)}
files[sha1] = entry
count += 1
# Match by gitignored bios/ path OR by filename of a known large file
if path not in large_files.values() and name not in large_files:
continue
cached = fetch_large_file(
name,
expected_sha1=entry.get("sha1", ""),
expected_md5=entry.get("md5", ""),
)
if cached:
entry = {**entry, "path": cached}
files[sha1] = entry
count += 1
return count

View File

@@ -26,7 +26,7 @@ from pathlib import Path
sys.path.insert(0, os.path.dirname(__file__))
from common import (
_build_validation_index, build_zip_contents_index, check_file_validation,
check_inside_zip, compute_hashes, filter_files_by_mode,
check_inside_zip, compute_hashes, fetch_large_file, filter_files_by_mode,
group_identical_platforms, list_emulator_profiles, list_system_ids,
load_database, load_data_dir_registry, load_emulator_profiles,
load_platform_config, md5_composite, resolve_local_file,
@@ -43,57 +43,9 @@ DEFAULT_PLATFORMS_DIR = "platforms"
DEFAULT_DB_FILE = "database.json"
DEFAULT_OUTPUT_DIR = "dist"
DEFAULT_BIOS_DIR = "bios"
# GitHub release tag and repository that host files too large to commit.
LARGE_FILES_RELEASE = "large-files"
LARGE_FILES_REPO = "Abdess/retrobios"
# Upper bound on a single entry's size (512MB); presumably enforced when
# packing entries — usage is outside this chunk, TODO confirm.
MAX_ENTRY_SIZE = 512 * 1024 * 1024  # 512MB
def _verify_file_hash(path: str, expected_sha1: str = "",
expected_md5: str = "") -> bool:
if not expected_sha1 and not expected_md5:
return True
hashes = compute_hashes(path)
if expected_sha1:
return hashes["sha1"].lower() == expected_sha1.lower()
md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
return hashes["md5"].lower() in md5_list
def fetch_large_file(name: str, dest_dir: str = ".cache/large",
expected_sha1: str = "", expected_md5: str = "") -> str | None:
"""Download a large file from the 'large-files' GitHub release if not cached."""
cached = os.path.join(dest_dir, name)
if os.path.exists(cached):
if expected_sha1 or expected_md5:
if _verify_file_hash(cached, expected_sha1, expected_md5):
return cached
os.unlink(cached)
else:
return cached
encoded_name = urllib.request.quote(name)
url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack/1.0"})
with urllib.request.urlopen(req, timeout=300) as resp:
os.makedirs(dest_dir, exist_ok=True)
with open(cached, "wb") as f:
while True:
chunk = resp.read(65536)
if not chunk:
break
f.write(chunk)
except (urllib.error.URLError, urllib.error.HTTPError):
return None
if expected_sha1 or expected_md5:
if not _verify_file_hash(cached, expected_sha1, expected_md5):
os.unlink(cached)
return None
return cached
def _find_candidate_satisfying_both(
file_entry: dict,
db: dict,