refactor: move fetch_large_file to common, auto-download on db rebuild

2026-06-13 02:25:28 -05:00 · 2026-03-25 13:19:12 +01:00
parent 910428c6f1
commit dbc26b11c1
4 changed files with 88 additions and 74 deletions
@@ -1,5 +1,5 @@
 {
-  "generated_at": "2026-03-25T11:51:38Z",
+  "generated_at": "2026-03-25T12:18:43Z",
  "total_files": 6733,
  "total_size": 5288644732,
  "files": {
@@ -67274,7 +67274,7 @@
      "adler32": "701e6531"
    },
    "ac4b78d53c7a97da2451ca35498395d8dd1e3024": {
-      "path": "bios/Arcade/Arcade/Firmware.19.0.0.zip",
+      "path": ".cache/large/Firmware.19.0.0.zip",
      "name": "Firmware.19.0.0.zip",
      "size": 338076508,
      "sha1": "ac4b78d53c7a97da2451ca35498395d8dd1e3024",
@@ -67284,7 +67284,7 @@
      "adler32": "471a3291"
    },
    "add40c002084e8e25768671877b2aa603aaf5cb1": {
-      "path": "bios/Arcade/Arcade/maclc3.zip",
+      "path": ".cache/large/maclc3.zip",
      "name": "maclc3.zip",
      "size": 189428461,
      "sha1": "add40c002084e8e25768671877b2aa603aaf5cb1",
@@ -9,6 +9,8 @@ from __future__ import annotations
 import hashlib
 import json
 import os
 import urllib.error
 import urllib.request
 import zipfile
 import zlib
 from pathlib import Path
@@ -694,6 +696,59 @@ def filter_files_by_mode(files: list[dict], standalone: bool) -> list[dict]:
    return result
 LARGE_FILES_RELEASE = "large-files"
 LARGE_FILES_REPO = "Abdess/retrobios"
 LARGE_FILES_CACHE = ".cache/large"
 def fetch_large_file(name: str, dest_dir: str = LARGE_FILES_CACHE,
                     expected_sha1: str = "", expected_md5: str = "") -> str | None:
    """Download a large file from the 'large-files' GitHub release if not cached."""
    cached = os.path.join(dest_dir, name)
    if os.path.exists(cached):
        if expected_sha1 or expected_md5:
            hashes = compute_hashes(cached)
            if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
                os.unlink(cached)
            elif expected_md5:
                md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
                if hashes["md5"].lower() not in md5_list:
                    os.unlink(cached)
                else:
                    return cached
            else:
                return cached
        else:
            return cached
    encoded_name = urllib.request.quote(name)
    url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "retrobios/1.0"})
        with urllib.request.urlopen(req, timeout=300) as resp:
            os.makedirs(dest_dir, exist_ok=True)
            with open(cached, "wb") as f:
                while True:
                    chunk = resp.read(65536)
                    if not chunk:
                        break
                    f.write(chunk)
    except (urllib.error.URLError, urllib.error.HTTPError):
        return None
    if expected_sha1 or expected_md5:
        hashes = compute_hashes(cached)
        if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
            os.unlink(cached)
            return None
        if expected_md5:
            md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
            if hashes["md5"].lower() not in md5_list:
                os.unlink(cached)
                return None
    return cached
 def safe_extract_zip(zip_path: str, dest_dir: str) -> None:
    """Extract a ZIP file safely, preventing zip-slip path traversal."""
    dest = os.path.realpath(dest_dir)
@@ -216,17 +216,18 @@ def save_cache(cache_path: str, cache: dict):
        json.dump(cache, f)
-def _load_gitignored_bios_paths() -> set[str]:
+def _load_gitignored_large_files() -> dict[str, str]:
-    """Read .gitignore and return bios/ paths that are listed (large files)."""
+    """Read .gitignore and return {filename: bios_path} for large files."""
    gitignore = Path(".gitignore")
    if not gitignore.exists():
-        return set()
+        return {}
-    paths = set()
+    entries = {}
    for line in gitignore.read_text().splitlines():
        line = line.strip()
        if line.startswith("bios/") and not line.startswith("#"):
-            paths.add(line)
+            name = Path(line).name
-    return paths
+            entries[name] = line
    return entries
 def _preserve_large_file_entries(files: dict, db_path: str) -> int:
@@ -234,14 +235,14 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
    Large files (>50 MB) are stored as GitHub release assets and listed
    in .gitignore. When generate_db runs locally without them, their
-    entries would be lost. This reads the existing database and re-adds
+    entries would be lost. This reads the existing database, downloads
-    entries whose paths match .gitignore bios/ entries.
+    missing files from the release, and re-adds entries with paths
-
+    pointing to the local cache.
    If the file exists in .cache/large/, the path is updated so that
    resolve_local_file can find it for verify and pack generation.
    """
-    gitignored = _load_gitignored_bios_paths()
+    from common import fetch_large_file
-    if not gitignored:
+
    large_files = _load_gitignored_large_files()
    if not large_files:
        return 0
    try:
@@ -250,18 +251,24 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
    except (FileNotFoundError, json.JSONDecodeError):
        return 0
    cache_dir = Path(".cache/large")
    count = 0
    for sha1, entry in existing_db.get("files", {}).items():
        if sha1 in files:
            continue
        name = entry.get("name", "")
        path = entry.get("path", "")
-        if path in gitignored and sha1 not in files:
+        # Match by gitignored bios/ path OR by filename of a known large file
-            # Point to cached copy if available
+        if path not in large_files.values() and name not in large_files:
-            name = entry.get("name", "")
+            continue
-            cached = cache_dir / name
+        cached = fetch_large_file(
-            if cached.exists():
+            name,
-                entry = {**entry, "path": str(cached)}
+            expected_sha1=entry.get("sha1", ""),
-            files[sha1] = entry
+            expected_md5=entry.get("md5", ""),
-            count += 1
+        )
        if cached:
            entry = {**entry, "path": cached}
        files[sha1] = entry
        count += 1
    return count
@@ -26,7 +26,7 @@ from pathlib import Path
 sys.path.insert(0, os.path.dirname(__file__))
 from common import (
    _build_validation_index, build_zip_contents_index, check_file_validation,
-    check_inside_zip, compute_hashes, filter_files_by_mode,
+    check_inside_zip, compute_hashes, fetch_large_file, filter_files_by_mode,
    group_identical_platforms, list_emulator_profiles, list_system_ids,
    load_database, load_data_dir_registry, load_emulator_profiles,
    load_platform_config, md5_composite, resolve_local_file,
@@ -43,57 +43,9 @@ DEFAULT_PLATFORMS_DIR = "platforms"
 # Default file and directory names — presumably fallbacks for command-line
 # options; confirm against the argument parser.
 DEFAULT_DB_FILE = "database.json"
 DEFAULT_OUTPUT_DIR = "dist"
 DEFAULT_BIOS_DIR = "bios"
 # GitHub release tag and "owner/repo" slug that fetch_large_file uses to
 # build the asset download URL.
 LARGE_FILES_RELEASE = "large-files"
 LARGE_FILES_REPO = "Abdess/retrobios"
 MAX_ENTRY_SIZE = 512 * 1024 * 1024  # 512MB
 def _verify_file_hash(path: str, expected_sha1: str = "",
                      expected_md5: str = "") -> bool:
    if not expected_sha1 and not expected_md5:
        return True
    hashes = compute_hashes(path)
    if expected_sha1:
        return hashes["sha1"].lower() == expected_sha1.lower()
    md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
    return hashes["md5"].lower() in md5_list
 def fetch_large_file(name: str, dest_dir: str = ".cache/large",
                     expected_sha1: str = "", expected_md5: str = "") -> str | None:
    """Download a large file from the 'large-files' GitHub release if not cached."""
    cached = os.path.join(dest_dir, name)
    if os.path.exists(cached):
        if expected_sha1 or expected_md5:
            if _verify_file_hash(cached, expected_sha1, expected_md5):
                return cached
            os.unlink(cached)
        else:
            return cached
    encoded_name = urllib.request.quote(name)
    url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack/1.0"})
        with urllib.request.urlopen(req, timeout=300) as resp:
            os.makedirs(dest_dir, exist_ok=True)
            with open(cached, "wb") as f:
                while True:
                    chunk = resp.read(65536)
                    if not chunk:
                        break
                    f.write(chunk)
    except (urllib.error.URLError, urllib.error.HTTPError):
        return None
    if expected_sha1 or expected_md5:
        if not _verify_file_hash(cached, expected_sha1, expected_md5):
            os.unlink(cached)
            return None
    return cached
 def _find_candidate_satisfying_both(
    file_entry: dict,
    db: dict,