From dbc26b11c1d83ce02fddef664c1bc5151d3e4f94 Mon Sep 17 00:00:00 2001 From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com> Date: Wed, 25 Mar 2026 13:19:12 +0100 Subject: [PATCH] refactor: move fetch_large_file to common, auto-download on db rebuild --- database.json | 6 ++--- scripts/common.py | 55 ++++++++++++++++++++++++++++++++++++++++ scripts/generate_db.py | 51 +++++++++++++++++++++---------------- scripts/generate_pack.py | 50 +----------------------------------- 4 files changed, 88 insertions(+), 74 deletions(-) diff --git a/database.json b/database.json index 4fd621aa..288ff017 100644 --- a/database.json +++ b/database.json @@ -1,5 +1,5 @@ { - "generated_at": "2026-03-25T11:51:38Z", + "generated_at": "2026-03-25T12:18:43Z", "total_files": 6733, "total_size": 5288644732, "files": { @@ -67274,7 +67274,7 @@ "adler32": "701e6531" }, "ac4b78d53c7a97da2451ca35498395d8dd1e3024": { - "path": "bios/Arcade/Arcade/Firmware.19.0.0.zip", + "path": ".cache/large/Firmware.19.0.0.zip", "name": "Firmware.19.0.0.zip", "size": 338076508, "sha1": "ac4b78d53c7a97da2451ca35498395d8dd1e3024", @@ -67284,7 +67284,7 @@ "adler32": "471a3291" }, "add40c002084e8e25768671877b2aa603aaf5cb1": { - "path": "bios/Arcade/Arcade/maclc3.zip", + "path": ".cache/large/maclc3.zip", "name": "maclc3.zip", "size": 189428461, "sha1": "add40c002084e8e25768671877b2aa603aaf5cb1", diff --git a/scripts/common.py b/scripts/common.py index 656958cb..23fe7664 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -9,6 +9,8 @@ from __future__ import annotations import hashlib import json import os +import urllib.error +import urllib.request import zipfile import zlib from pathlib import Path @@ -694,6 +696,59 @@ def filter_files_by_mode(files: list[dict], standalone: bool) -> list[dict]: return result +LARGE_FILES_RELEASE = "large-files" +LARGE_FILES_REPO = "Abdess/retrobios" +LARGE_FILES_CACHE = ".cache/large" + + +def fetch_large_file(name: str, dest_dir: str = LARGE_FILES_CACHE, + expected_sha1: str = "", expected_md5: str = "") -> str | None: + """Download a large file from the 'large-files' GitHub release if not cached.""" + cached = os.path.join(dest_dir, name) + if os.path.exists(cached): + if expected_sha1 or expected_md5: + hashes = compute_hashes(cached) + if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower(): + os.unlink(cached) + elif expected_md5: + md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()] + if hashes["md5"].lower() not in md5_list: + os.unlink(cached) + else: + return cached + else: + return cached + else: + return cached + + encoded_name = urllib.request.quote(name) + url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}" + try: + req = urllib.request.Request(url, headers={"User-Agent": "retrobios/1.0"}) + with urllib.request.urlopen(req, timeout=300) as resp: + os.makedirs(dest_dir, exist_ok=True) + with open(cached, "wb") as f: + while True: + chunk = resp.read(65536) + if not chunk: + break + f.write(chunk) + except (urllib.error.URLError, urllib.error.HTTPError): + return None + + if expected_sha1 or expected_md5: + hashes = compute_hashes(cached) + if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower(): + os.unlink(cached) + return None + if expected_md5: + md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()] + if hashes["md5"].lower() not in md5_list: + os.unlink(cached) + return None + return cached + + def safe_extract_zip(zip_path: str, dest_dir: str) -> None: """Extract a ZIP file safely, preventing zip-slip path traversal.""" dest = os.path.realpath(dest_dir) diff --git a/scripts/generate_db.py b/scripts/generate_db.py index d853f320..001149e5 100644 --- a/scripts/generate_db.py +++ b/scripts/generate_db.py @@ -216,17 +216,18 @@ def save_cache(cache_path: str, cache: dict): json.dump(cache, f) -def _load_gitignored_bios_paths() -> set[str]: - """Read .gitignore and return bios/ paths that are listed (large files).""" +def _load_gitignored_large_files() -> dict[str, str]: + """Read .gitignore and return {filename: bios_path} for large files.""" gitignore = Path(".gitignore") if not gitignore.exists(): - return set() - paths = set() + return {} + entries = {} for line in gitignore.read_text().splitlines(): line = line.strip() if line.startswith("bios/") and not line.startswith("#"): - paths.add(line) - return paths + name = Path(line).name + entries[name] = line + return entries def _preserve_large_file_entries(files: dict, db_path: str) -> int: @@ -234,14 +235,14 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int: Large files (>50 MB) are stored as GitHub release assets and listed in .gitignore. When generate_db runs locally without them, their - entries would be lost. This reads the existing database and re-adds - entries whose paths match .gitignore bios/ entries. - - If the file exists in .cache/large/, the path is updated so that - resolve_local_file can find it for verify and pack generation. + entries would be lost. This reads the existing database, downloads + missing files from the release, and re-adds entries with paths + pointing to the local cache. """ - gitignored = _load_gitignored_bios_paths() - if not gitignored: + from common import fetch_large_file + + large_files = _load_gitignored_large_files() + if not large_files: return 0 try: @@ -250,18 +251,24 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int: except (FileNotFoundError, json.JSONDecodeError): return 0 - cache_dir = Path(".cache/large") count = 0 for sha1, entry in existing_db.get("files", {}).items(): + if sha1 in files: + continue + name = entry.get("name", "") path = entry.get("path", "") - if path in gitignored and sha1 not in files: - # Point to cached copy if available - name = entry.get("name", "") - cached = cache_dir / name - if cached.exists(): - entry = {**entry, "path": str(cached)} - files[sha1] = entry - count += 1 + # Match by gitignored bios/ path OR by filename of a known large file + if path not in large_files.values() and name not in large_files: + continue + cached = fetch_large_file( + name, + expected_sha1=entry.get("sha1", ""), + expected_md5=entry.get("md5", ""), + ) + if cached: + entry = {**entry, "path": cached} + files[sha1] = entry + count += 1 return count diff --git a/scripts/generate_pack.py b/scripts/generate_pack.py index 3523146d..0e43ce6d 100644 --- a/scripts/generate_pack.py +++ b/scripts/generate_pack.py @@ -26,7 +26,7 @@ from pathlib import Path sys.path.insert(0, os.path.dirname(__file__)) from common import ( _build_validation_index, build_zip_contents_index, check_file_validation, - check_inside_zip, compute_hashes, filter_files_by_mode, + check_inside_zip, compute_hashes, fetch_large_file, filter_files_by_mode, group_identical_platforms, list_emulator_profiles, list_system_ids, load_database, load_data_dir_registry, load_emulator_profiles, load_platform_config, md5_composite, resolve_local_file, @@ -43,57 +43,9 @@ DEFAULT_PLATFORMS_DIR = "platforms" DEFAULT_DB_FILE = "database.json" DEFAULT_OUTPUT_DIR = "dist" DEFAULT_BIOS_DIR = "bios" -LARGE_FILES_RELEASE = "large-files" -LARGE_FILES_REPO = "Abdess/retrobios" - MAX_ENTRY_SIZE = 512 * 1024 * 1024 # 512MB -def _verify_file_hash(path: str, expected_sha1: str = "", - expected_md5: str = "") -> bool: - if not expected_sha1 and not expected_md5: - return True - hashes = compute_hashes(path) - if expected_sha1: - return hashes["sha1"].lower() == expected_sha1.lower() - md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()] - return hashes["md5"].lower() in md5_list - - -def fetch_large_file(name: str, dest_dir: str = ".cache/large", - expected_sha1: str = "", expected_md5: str = "") -> str | None: - """Download a large file from the 'large-files' GitHub release if not cached.""" - cached = os.path.join(dest_dir, name) - if os.path.exists(cached): - if expected_sha1 or expected_md5: - if _verify_file_hash(cached, expected_sha1, expected_md5): - return cached - os.unlink(cached) - else: - return cached - - encoded_name = urllib.request.quote(name) - url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}" - try: - req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack/1.0"}) - with urllib.request.urlopen(req, timeout=300) as resp: - os.makedirs(dest_dir, exist_ok=True) - with open(cached, "wb") as f: - while True: - chunk = resp.read(65536) - if not chunk: - break - f.write(chunk) - except (urllib.error.URLError, urllib.error.HTTPError): - return None - - if expected_sha1 or expected_md5: - if not _verify_file_hash(cached, expected_sha1, expected_md5): - os.unlink(cached) - return None - return cached - - def _find_candidate_satisfying_both( file_entry: dict, db: dict,