refactor: move fetch_large_file to common, auto-download on db rebuild

This commit is contained in:
Abdessamad Derraz
2026-03-25 13:19:12 +01:00
parent 910428c6f1
commit dbc26b11c1
4 changed files with 88 additions and 74 deletions

View File

@@ -1,5 +1,5 @@
{ {
"generated_at": "2026-03-25T11:51:38Z", "generated_at": "2026-03-25T12:18:43Z",
"total_files": 6733, "total_files": 6733,
"total_size": 5288644732, "total_size": 5288644732,
"files": { "files": {
@@ -67274,7 +67274,7 @@
"adler32": "701e6531" "adler32": "701e6531"
}, },
"ac4b78d53c7a97da2451ca35498395d8dd1e3024": { "ac4b78d53c7a97da2451ca35498395d8dd1e3024": {
"path": "bios/Arcade/Arcade/Firmware.19.0.0.zip", "path": ".cache/large/Firmware.19.0.0.zip",
"name": "Firmware.19.0.0.zip", "name": "Firmware.19.0.0.zip",
"size": 338076508, "size": 338076508,
"sha1": "ac4b78d53c7a97da2451ca35498395d8dd1e3024", "sha1": "ac4b78d53c7a97da2451ca35498395d8dd1e3024",
@@ -67284,7 +67284,7 @@
"adler32": "471a3291" "adler32": "471a3291"
}, },
"add40c002084e8e25768671877b2aa603aaf5cb1": { "add40c002084e8e25768671877b2aa603aaf5cb1": {
"path": "bios/Arcade/Arcade/maclc3.zip", "path": ".cache/large/maclc3.zip",
"name": "maclc3.zip", "name": "maclc3.zip",
"size": 189428461, "size": 189428461,
"sha1": "add40c002084e8e25768671877b2aa603aaf5cb1", "sha1": "add40c002084e8e25768671877b2aa603aaf5cb1",

View File

@@ -9,6 +9,8 @@ from __future__ import annotations
import hashlib import hashlib
import json import json
import os import os
import urllib.error
import urllib.request
import zipfile import zipfile
import zlib import zlib
from pathlib import Path from pathlib import Path
@@ -694,6 +696,59 @@ def filter_files_by_mode(files: list[dict], standalone: bool) -> list[dict]:
return result return result
LARGE_FILES_RELEASE = "large-files"  # release tag used in the download URL
LARGE_FILES_REPO = "Abdess/retrobios"  # GitHub "owner/repo" used in the download URL
LARGE_FILES_CACHE = ".cache/large"  # default directory for downloaded files


def _large_file_hashes_match(path: str, expected_sha1: str,
                             expected_md5: str) -> bool:
    """Return True if the file at *path* satisfies every supplied digest.

    An empty ``expected_sha1`` / ``expected_md5`` means "do not check that
    digest". ``expected_md5`` may be a comma-separated list of acceptable
    values. Comparison is case-insensitive.
    """
    if not (expected_sha1 or expected_md5):
        return True
    hashes = compute_hashes(path)
    if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
        return False
    if expected_md5:
        md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
        if hashes["md5"].lower() not in md5_list:
            return False
    return True


def fetch_large_file(name: str, dest_dir: str = LARGE_FILES_CACHE,
                     expected_sha1: str = "", expected_md5: str = "") -> str | None:
    """Download a large file from the 'large-files' GitHub release if not cached.

    Args:
        name: Asset file name; also used as the file name inside *dest_dir*.
        dest_dir: Directory holding cached downloads (created on demand).
        expected_sha1: Optional SHA1 the file must match.
        expected_md5: Optional MD5, or comma-separated list of accepted MD5s.

    Returns:
        Path of the cached file, or ``None`` when the download fails or the
        downloaded data does not match the expected digests.
    """
    import http.client  # local import: only needed for HTTPException below

    cached = os.path.join(dest_dir, name)
    if os.path.exists(cached):
        if _large_file_hashes_match(cached, expected_sha1, expected_md5):
            return cached
        # Stale or corrupt cache entry: drop it and download again.
        os.unlink(cached)

    encoded_name = urllib.request.quote(name)
    url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"

    # Stream into a sibling ".part" file so that an interrupted transfer can
    # never be mistaken for a valid cache entry by a later call.
    partial = cached + ".part"
    complete = False
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "retrobios/1.0"})
        with urllib.request.urlopen(req, timeout=300) as resp:
            os.makedirs(dest_dir, exist_ok=True)
            with open(partial, "wb") as f:
                for chunk in iter(lambda: resp.read(65536), b""):
                    f.write(chunk)
        complete = True
    except (urllib.error.URLError, http.client.HTTPException,
            TimeoutError, ConnectionError):
        # Network-level failure (including one in the middle of the body).
        # Local filesystem errors are deliberately not swallowed here.
        return None
    finally:
        # Runs on every exit path, including the early return above.
        if not complete and os.path.exists(partial):
            os.unlink(partial)

    if not _large_file_hashes_match(partial, expected_sha1, expected_md5):
        os.unlink(partial)
        return None

    # Promote the verified download to its final name in one step.
    os.replace(partial, cached)
    return cached
def safe_extract_zip(zip_path: str, dest_dir: str) -> None: def safe_extract_zip(zip_path: str, dest_dir: str) -> None:
"""Extract a ZIP file safely, preventing zip-slip path traversal.""" """Extract a ZIP file safely, preventing zip-slip path traversal."""
dest = os.path.realpath(dest_dir) dest = os.path.realpath(dest_dir)

View File

@@ -216,17 +216,18 @@ def save_cache(cache_path: str, cache: dict):
json.dump(cache, f) json.dump(cache, f)
def _load_gitignored_large_files() -> dict[str, str]:
    """Read .gitignore and return {filename: bios_path} for large files.

    Only lines that start with ``bios/`` (after stripping surrounding
    whitespace) are considered. The key is the final path component of the
    line and the value is the stripped line itself. When two lines share the
    same file name, the later one wins. Returns an empty dict when
    ``.gitignore`` does not exist in the current working directory.
    """
    try:
        # Explicit encoding so the result does not depend on the locale.
        text = Path(".gitignore").read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    stripped = (raw.strip() for raw in text.splitlines())
    # A line starting with "bios/" can never be a "#" comment, so no
    # separate comment check is needed.
    return {Path(line).name: line for line in stripped if line.startswith("bios/")}
def _preserve_large_file_entries(files: dict, db_path: str) -> int: def _preserve_large_file_entries(files: dict, db_path: str) -> int:
@@ -234,14 +235,14 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
Large files (>50 MB) are stored as GitHub release assets and listed Large files (>50 MB) are stored as GitHub release assets and listed
in .gitignore. When generate_db runs locally without them, their in .gitignore. When generate_db runs locally without them, their
entries would be lost. This reads the existing database and re-adds entries would be lost. This reads the existing database, downloads
entries whose paths match .gitignore bios/ entries. missing files from the release, and re-adds entries with paths
pointing to the local cache.
If the file exists in .cache/large/, the path is updated so that
resolve_local_file can find it for verify and pack generation.
""" """
gitignored = _load_gitignored_bios_paths() from common import fetch_large_file
if not gitignored:
large_files = _load_gitignored_large_files()
if not large_files:
return 0 return 0
try: try:
@@ -250,18 +251,24 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
except (FileNotFoundError, json.JSONDecodeError): except (FileNotFoundError, json.JSONDecodeError):
return 0 return 0
cache_dir = Path(".cache/large")
count = 0 count = 0
for sha1, entry in existing_db.get("files", {}).items(): for sha1, entry in existing_db.get("files", {}).items():
if sha1 in files:
continue
name = entry.get("name", "")
path = entry.get("path", "") path = entry.get("path", "")
if path in gitignored and sha1 not in files: # Match by gitignored bios/ path OR by filename of a known large file
# Point to cached copy if available if path not in large_files.values() and name not in large_files:
name = entry.get("name", "") continue
cached = cache_dir / name cached = fetch_large_file(
if cached.exists(): name,
entry = {**entry, "path": str(cached)} expected_sha1=entry.get("sha1", ""),
files[sha1] = entry expected_md5=entry.get("md5", ""),
count += 1 )
if cached:
entry = {**entry, "path": cached}
files[sha1] = entry
count += 1
return count return count

View File

@@ -26,7 +26,7 @@ from pathlib import Path
sys.path.insert(0, os.path.dirname(__file__)) sys.path.insert(0, os.path.dirname(__file__))
from common import ( from common import (
_build_validation_index, build_zip_contents_index, check_file_validation, _build_validation_index, build_zip_contents_index, check_file_validation,
check_inside_zip, compute_hashes, filter_files_by_mode, check_inside_zip, compute_hashes, fetch_large_file, filter_files_by_mode,
group_identical_platforms, list_emulator_profiles, list_system_ids, group_identical_platforms, list_emulator_profiles, list_system_ids,
load_database, load_data_dir_registry, load_emulator_profiles, load_database, load_data_dir_registry, load_emulator_profiles,
load_platform_config, md5_composite, resolve_local_file, load_platform_config, md5_composite, resolve_local_file,
@@ -43,57 +43,9 @@ DEFAULT_PLATFORMS_DIR = "platforms"
DEFAULT_DB_FILE = "database.json" DEFAULT_DB_FILE = "database.json"
DEFAULT_OUTPUT_DIR = "dist" DEFAULT_OUTPUT_DIR = "dist"
DEFAULT_BIOS_DIR = "bios" DEFAULT_BIOS_DIR = "bios"
LARGE_FILES_RELEASE = "large-files"
LARGE_FILES_REPO = "Abdess/retrobios"
MAX_ENTRY_SIZE = 512 * 1024 * 1024 # 512MB MAX_ENTRY_SIZE = 512 * 1024 * 1024 # 512MB
def _verify_file_hash(path: str, expected_sha1: str = "",
                      expected_md5: str = "") -> bool:
    """Check the file at *path* against an expected SHA1 or MD5.

    With no expected digest the file is accepted without being read. When a
    SHA1 is supplied it alone decides the result; otherwise the file's MD5
    must be one of the comma-separated values in *expected_md5*.
    Comparisons ignore letter case.
    """
    if not (expected_sha1 or expected_md5):
        return True

    digests = compute_hashes(path)
    if not expected_sha1:
        accepted = {
            candidate.strip().lower()
            for candidate in expected_md5.split(",")
            if candidate.strip()
        }
        return digests["md5"].lower() in accepted
    return expected_sha1.lower() == digests["sha1"].lower()
def fetch_large_file(name: str, dest_dir: str = ".cache/large",
                     expected_sha1: str = "", expected_md5: str = "") -> str | None:
    """Download a large file from the 'large-files' GitHub release if not cached.

    Args:
        name: Asset file name; also used as the file name inside *dest_dir*.
        dest_dir: Directory holding cached downloads (created on demand).
        expected_sha1: Optional SHA1 passed to ``_verify_file_hash``.
        expected_md5: Optional MD5 (or comma-separated list) passed to
            ``_verify_file_hash``.

    Returns:
        Path of the cached file, or ``None`` when the download fails or the
        downloaded data fails verification.
    """
    import http.client  # local import: only needed for HTTPException below

    cached = os.path.join(dest_dir, name)
    must_verify = bool(expected_sha1 or expected_md5)
    if os.path.exists(cached):
        if not must_verify or _verify_file_hash(cached, expected_sha1, expected_md5):
            return cached
        # Cached copy does not match: discard it and download again.
        os.unlink(cached)

    encoded_name = urllib.request.quote(name)
    url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"

    # Stream into a sibling ".part" file so that an interrupted transfer can
    # never be mistaken for a valid cache entry by a later call.
    partial = cached + ".part"
    complete = False
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack/1.0"})
        with urllib.request.urlopen(req, timeout=300) as resp:
            os.makedirs(dest_dir, exist_ok=True)
            with open(partial, "wb") as f:
                for chunk in iter(lambda: resp.read(65536), b""):
                    f.write(chunk)
        complete = True
    except (urllib.error.URLError, http.client.HTTPException,
            TimeoutError, ConnectionError):
        # Network-level failure (including one in the middle of the body).
        return None
    finally:
        # Runs on every exit path, including the early return above.
        if not complete and os.path.exists(partial):
            os.unlink(partial)

    if must_verify and not _verify_file_hash(partial, expected_sha1, expected_md5):
        os.unlink(partial)
        return None

    # Promote the verified download to its final name in one step.
    os.replace(partial, cached)
    return cached
def _find_candidate_satisfying_both( def _find_candidate_satisfying_both(
file_entry: dict, file_entry: dict,
db: dict, db: dict,