From dbc26b11c1d83ce02fddef664c1bc5151d3e4f94 Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Wed, 25 Mar 2026 13:19:12 +0100
Subject: [PATCH] refactor: move fetch_large_file to common, auto-download on
 db rebuild

---
 database.json            |  6 ++---
 scripts/common.py        | 55 ++++++++++++++++++++++++++++++++++++++++
 scripts/generate_db.py   | 51 +++++++++++++++++++++----------------
 scripts/generate_pack.py | 50 +-----------------------------------
 4 files changed, 88 insertions(+), 74 deletions(-)

diff --git a/database.json b/database.json
index 4fd621aa..288ff017 100644
--- a/database.json
+++ b/database.json
@@ -1,5 +1,5 @@
 {
-  "generated_at": "2026-03-25T11:51:38Z",
+  "generated_at": "2026-03-25T12:18:43Z",
   "total_files": 6733,
   "total_size": 5288644732,
   "files": {
@@ -67274,7 +67274,7 @@
       "adler32": "701e6531"
     },
     "ac4b78d53c7a97da2451ca35498395d8dd1e3024": {
-      "path": "bios/Arcade/Arcade/Firmware.19.0.0.zip",
+      "path": ".cache/large/Firmware.19.0.0.zip",
       "name": "Firmware.19.0.0.zip",
       "size": 338076508,
       "sha1": "ac4b78d53c7a97da2451ca35498395d8dd1e3024",
@@ -67284,7 +67284,7 @@
       "adler32": "471a3291"
     },
     "add40c002084e8e25768671877b2aa603aaf5cb1": {
-      "path": "bios/Arcade/Arcade/maclc3.zip",
+      "path": ".cache/large/maclc3.zip",
       "name": "maclc3.zip",
       "size": 189428461,
       "sha1": "add40c002084e8e25768671877b2aa603aaf5cb1",
diff --git a/scripts/common.py b/scripts/common.py
index 656958cb..23fe7664 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -9,6 +9,8 @@ from __future__ import annotations
 import hashlib
 import json
 import os
+import urllib.error
+import urllib.request
 import zipfile
 import zlib
 from pathlib import Path
@@ -694,6 +696,59 @@ def filter_files_by_mode(files: list[dict], standalone: bool) -> list[dict]:
     return result
 
 
+LARGE_FILES_RELEASE = "large-files"
+LARGE_FILES_REPO = "Abdess/retrobios"
+LARGE_FILES_CACHE = ".cache/large"
+
+
+def fetch_large_file(name: str, dest_dir: str = LARGE_FILES_CACHE,
+                     expected_sha1: str = "", expected_md5: str = "") -> str | None:
+    """Return the local path of a 'large-files' release asset, downloading it if needed.
+
+    A cached copy in dest_dir is reused only when it matches every expected
+    hash supplied (expected_md5 may hold several comma-separated values);
+    otherwise it is deleted and fetched again. Returns None when the download
+    fails or the downloaded data does not match the expected hashes.
+    """
+    def _hashes_ok(path: str) -> bool:
+        if not (expected_sha1 or expected_md5):
+            return True
+        hashes = compute_hashes(path)
+        if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
+            return False
+        md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
+        return not expected_md5 or hashes["md5"].lower() in md5_list
+
+    cached = os.path.join(dest_dir, name)
+    if os.path.exists(cached):
+        if _hashes_ok(cached):
+            return cached
+        os.unlink(cached)  # stale or corrupt cache entry: download it again
+
+    # Stream into a side file so an interrupted transfer can never be mistaken
+    # for a valid cache entry by a later call made without expected hashes.
+    partial = cached + ".part"
+    encoded_name = urllib.request.quote(name)
+    url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
+    try:
+        req = urllib.request.Request(url, headers={"User-Agent": "retrobios/1.0"})
+        with urllib.request.urlopen(req, timeout=300) as resp:
+            os.makedirs(dest_dir, exist_ok=True)
+            with open(partial, "wb") as f:
+                for chunk in iter(lambda: resp.read(65536), b""):
+                    f.write(chunk)
+        if not _hashes_ok(partial):
+            return None
+        os.replace(partial, cached)
+        return cached
+    except OSError:
+        # Any OSError (URLError/HTTPError, timeout, reset, disk error): no file.
+        return None
+    finally:
+        if os.path.exists(partial):
+            os.unlink(partial)
+
+
 def safe_extract_zip(zip_path: str, dest_dir: str) -> None:
     """Extract a ZIP file safely, preventing zip-slip path traversal."""
     dest = os.path.realpath(dest_dir)
diff --git a/scripts/generate_db.py b/scripts/generate_db.py
index d853f320..001149e5 100644
--- a/scripts/generate_db.py
+++ b/scripts/generate_db.py
@@ -216,17 +216,18 @@ def save_cache(cache_path: str, cache: dict):
         json.dump(cache, f)
 
 
-def _load_gitignored_bios_paths() -> set[str]:
-    """Read .gitignore and return bios/ paths that are listed (large files)."""
+def _load_gitignored_large_files() -> dict[str, str]:
+    """Return {basename: full .gitignore line} for each .gitignore line starting with bios/."""
     gitignore = Path(".gitignore")
     if not gitignore.exists():
-        return set()
-    paths = set()
+        return {}
+    entries = {}
     for line in gitignore.read_text().splitlines():
         line = line.strip()
         if line.startswith("bios/") and not line.startswith("#"):
-            paths.add(line)
-    return paths
+            name = Path(line).name
+            entries[name] = line  # last entry wins when two paths share a basename
+    return entries
 
 
 def _preserve_large_file_entries(files: dict, db_path: str) -> int:
@@ -234,14 +235,14 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
 
     Large files (>50 MB) are stored as GitHub release assets and listed
     in .gitignore. When generate_db runs locally without them, their
-    entries would be lost. This reads the existing database and re-adds
-    entries whose paths match .gitignore bios/ entries.
-
-    If the file exists in .cache/large/, the path is updated so that
-    resolve_local_file can find it for verify and pack generation.
+    entries would be lost. This reads the existing database, downloads
+    missing files from the release, and re-adds entries with paths
+    pointing to the local cache.
     """
-    gitignored = _load_gitignored_bios_paths()
-    if not gitignored:
+    from common import fetch_large_file
+
+    large_files = _load_gitignored_large_files()
+    if not large_files:
         return 0
 
     try:
@@ -250,18 +251,24 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
     except (FileNotFoundError, json.JSONDecodeError):
         return 0
 
-    cache_dir = Path(".cache/large")
     count = 0
     for sha1, entry in existing_db.get("files", {}).items():
+        if sha1 in files:
+            continue
+        name = entry.get("name", "")
         path = entry.get("path", "")
-        if path in gitignored and sha1 not in files:
-            # Point to cached copy if available
-            name = entry.get("name", "")
-            cached = cache_dir / name
-            if cached.exists():
-                entry = {**entry, "path": str(cached)}
-            files[sha1] = entry
-            count += 1
+        # Match by gitignored bios/ path OR by filename of a known large file
+        if path not in large_files.values() and name not in large_files:
+            continue
+        cached = fetch_large_file(
+            name,
+            expected_sha1=entry.get("sha1", ""),
+            expected_md5=entry.get("md5", ""),
+        )
+        if cached:
+            entry = {**entry, "path": cached}
+        files[sha1] = entry
+        count += 1
     return count
 
 
diff --git a/scripts/generate_pack.py b/scripts/generate_pack.py
index 3523146d..0e43ce6d 100644
--- a/scripts/generate_pack.py
+++ b/scripts/generate_pack.py
@@ -26,7 +26,7 @@ from pathlib import Path
 sys.path.insert(0, os.path.dirname(__file__))
 from common import (
     _build_validation_index, build_zip_contents_index, check_file_validation,
-    check_inside_zip, compute_hashes, filter_files_by_mode,
+    check_inside_zip, compute_hashes, fetch_large_file, filter_files_by_mode,
     group_identical_platforms, list_emulator_profiles, list_system_ids,
     load_database, load_data_dir_registry, load_emulator_profiles,
     load_platform_config, md5_composite, resolve_local_file,
@@ -43,57 +43,9 @@ DEFAULT_PLATFORMS_DIR = "platforms"
 DEFAULT_DB_FILE = "database.json"
 DEFAULT_OUTPUT_DIR = "dist"
 DEFAULT_BIOS_DIR = "bios"
-LARGE_FILES_RELEASE = "large-files"
-LARGE_FILES_REPO = "Abdess/retrobios"
-
 MAX_ENTRY_SIZE = 512 * 1024 * 1024  # 512MB
 
 
-def _verify_file_hash(path: str, expected_sha1: str = "",
-                      expected_md5: str = "") -> bool:
-    if not expected_sha1 and not expected_md5:
-        return True
-    hashes = compute_hashes(path)
-    if expected_sha1:
-        return hashes["sha1"].lower() == expected_sha1.lower()
-    md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
-    return hashes["md5"].lower() in md5_list
-
-
-def fetch_large_file(name: str, dest_dir: str = ".cache/large",
-                     expected_sha1: str = "", expected_md5: str = "") -> str | None:
-    """Download a large file from the 'large-files' GitHub release if not cached."""
-    cached = os.path.join(dest_dir, name)
-    if os.path.exists(cached):
-        if expected_sha1 or expected_md5:
-            if _verify_file_hash(cached, expected_sha1, expected_md5):
-                return cached
-            os.unlink(cached)
-        else:
-            return cached
-
-    encoded_name = urllib.request.quote(name)
-    url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
-    try:
-        req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack/1.0"})
-        with urllib.request.urlopen(req, timeout=300) as resp:
-            os.makedirs(dest_dir, exist_ok=True)
-            with open(cached, "wb") as f:
-                while True:
-                    chunk = resp.read(65536)
-                    if not chunk:
-                        break
-                    f.write(chunk)
-    except (urllib.error.URLError, urllib.error.HTTPError):
-        return None
-
-    if expected_sha1 or expected_md5:
-        if not _verify_file_hash(cached, expected_sha1, expected_md5):
-            os.unlink(cached)
-            return None
-    return cached
-
-
 def _find_candidate_satisfying_both(
     file_entry: dict,
     db: dict,