refactor: move fetch_large_file to common, auto-download on db rebuild

This commit is contained in:
Abdessamad Derraz
2026-03-25 13:19:12 +01:00
parent 910428c6f1
commit dbc26b11c1
4 changed files with 88 additions and 74 deletions

View File

@@ -9,6 +9,8 @@ from __future__ import annotations
import hashlib
import json
import os
import urllib.error
import urllib.parse
import urllib.request
import zipfile
import zlib
from pathlib import Path
@@ -694,6 +696,59 @@ def filter_files_by_mode(files: list[dict], standalone: bool) -> list[dict]:
return result
LARGE_FILES_RELEASE = "large-files"
LARGE_FILES_REPO = "Abdess/retrobios"
LARGE_FILES_CACHE = ".cache/large"
def fetch_large_file(name: str, dest_dir: str = LARGE_FILES_CACHE,
expected_sha1: str = "", expected_md5: str = "") -> str | None:
"""Download a large file from the 'large-files' GitHub release if not cached."""
cached = os.path.join(dest_dir, name)
if os.path.exists(cached):
if expected_sha1 or expected_md5:
hashes = compute_hashes(cached)
if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
os.unlink(cached)
elif expected_md5:
md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
if hashes["md5"].lower() not in md5_list:
os.unlink(cached)
else:
return cached
else:
return cached
else:
return cached
encoded_name = urllib.request.quote(name)
url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios/1.0"})
with urllib.request.urlopen(req, timeout=300) as resp:
os.makedirs(dest_dir, exist_ok=True)
with open(cached, "wb") as f:
while True:
chunk = resp.read(65536)
if not chunk:
break
f.write(chunk)
except (urllib.error.URLError, urllib.error.HTTPError):
return None
if expected_sha1 or expected_md5:
hashes = compute_hashes(cached)
if expected_sha1 and hashes["sha1"].lower() != expected_sha1.lower():
os.unlink(cached)
return None
if expected_md5:
md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
if hashes["md5"].lower() not in md5_list:
os.unlink(cached)
return None
return cached
def safe_extract_zip(zip_path: str, dest_dir: str) -> None:
"""Extract a ZIP file safely, preventing zip-slip path traversal."""
dest = os.path.realpath(dest_dir)

View File

@@ -216,17 +216,18 @@ def save_cache(cache_path: str, cache: dict):
json.dump(cache, f)
def _load_gitignored_bios_paths() -> set[str]:
"""Read .gitignore and return bios/ paths that are listed (large files)."""
def _load_gitignored_large_files() -> dict[str, str]:
"""Read .gitignore and return {filename: bios_path} for large files."""
gitignore = Path(".gitignore")
if not gitignore.exists():
return set()
paths = set()
return {}
entries = {}
for line in gitignore.read_text().splitlines():
line = line.strip()
if line.startswith("bios/") and not line.startswith("#"):
paths.add(line)
return paths
name = Path(line).name
entries[name] = line
return entries
def _preserve_large_file_entries(files: dict, db_path: str) -> int:
@@ -234,14 +235,14 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
Large files (>50 MB) are stored as GitHub release assets and listed
in .gitignore. When generate_db runs locally without them, their
entries would be lost. This reads the existing database and re-adds
entries whose paths match .gitignore bios/ entries.
If the file exists in .cache/large/, the path is updated so that
resolve_local_file can find it for verify and pack generation.
entries would be lost. This reads the existing database, downloads
missing files from the release, and re-adds entries with paths
pointing to the local cache.
"""
gitignored = _load_gitignored_bios_paths()
if not gitignored:
from common import fetch_large_file
large_files = _load_gitignored_large_files()
if not large_files:
return 0
try:
@@ -250,18 +251,24 @@ def _preserve_large_file_entries(files: dict, db_path: str) -> int:
except (FileNotFoundError, json.JSONDecodeError):
return 0
cache_dir = Path(".cache/large")
count = 0
for sha1, entry in existing_db.get("files", {}).items():
if sha1 in files:
continue
name = entry.get("name", "")
path = entry.get("path", "")
if path in gitignored and sha1 not in files:
# Point to cached copy if available
name = entry.get("name", "")
cached = cache_dir / name
if cached.exists():
entry = {**entry, "path": str(cached)}
files[sha1] = entry
count += 1
# Match by gitignored bios/ path OR by filename of a known large file
if path not in large_files.values() and name not in large_files:
continue
cached = fetch_large_file(
name,
expected_sha1=entry.get("sha1", ""),
expected_md5=entry.get("md5", ""),
)
if cached:
entry = {**entry, "path": cached}
files[sha1] = entry
count += 1
return count

View File

@@ -26,7 +26,7 @@ from pathlib import Path
sys.path.insert(0, os.path.dirname(__file__))
from common import (
_build_validation_index, build_zip_contents_index, check_file_validation,
check_inside_zip, compute_hashes, filter_files_by_mode,
check_inside_zip, compute_hashes, fetch_large_file, filter_files_by_mode,
group_identical_platforms, list_emulator_profiles, list_system_ids,
load_database, load_data_dir_registry, load_emulator_profiles,
load_platform_config, md5_composite, resolve_local_file,
@@ -43,57 +43,9 @@ DEFAULT_PLATFORMS_DIR = "platforms"
DEFAULT_DB_FILE = "database.json"
DEFAULT_OUTPUT_DIR = "dist"
DEFAULT_BIOS_DIR = "bios"
# GitHub release tag and repository that host files too large to commit.
LARGE_FILES_RELEASE = "large-files"
LARGE_FILES_REPO = "Abdess/retrobios"
# Upper bound on a single entry's size (512MB); presumably enforced when
# packing entries — usage is outside this chunk, TODO confirm.
MAX_ENTRY_SIZE = 512 * 1024 * 1024  # 512MB
def _verify_file_hash(path: str, expected_sha1: str = "",
expected_md5: str = "") -> bool:
if not expected_sha1 and not expected_md5:
return True
hashes = compute_hashes(path)
if expected_sha1:
return hashes["sha1"].lower() == expected_sha1.lower()
md5_list = [m.strip().lower() for m in expected_md5.split(",") if m.strip()]
return hashes["md5"].lower() in md5_list
def fetch_large_file(name: str, dest_dir: str = ".cache/large",
expected_sha1: str = "", expected_md5: str = "") -> str | None:
"""Download a large file from the 'large-files' GitHub release if not cached."""
cached = os.path.join(dest_dir, name)
if os.path.exists(cached):
if expected_sha1 or expected_md5:
if _verify_file_hash(cached, expected_sha1, expected_md5):
return cached
os.unlink(cached)
else:
return cached
encoded_name = urllib.request.quote(name)
url = f"https://github.com/{LARGE_FILES_REPO}/releases/download/{LARGE_FILES_RELEASE}/{encoded_name}"
try:
req = urllib.request.Request(url, headers={"User-Agent": "retrobios-pack/1.0"})
with urllib.request.urlopen(req, timeout=300) as resp:
os.makedirs(dest_dir, exist_ok=True)
with open(cached, "wb") as f:
while True:
chunk = resp.read(65536)
if not chunk:
break
f.write(chunk)
except (urllib.error.URLError, urllib.error.HTTPError):
return None
if expected_sha1 or expected_md5:
if not _verify_file_hash(cached, expected_sha1, expected_md5):
os.unlink(cached)
return None
return cached
def _find_candidate_satisfying_both(
file_entry: dict,
db: dict,