#!/usr/bin/env python3
"""Auto-fetch missing BIOS files from multiple sources.

Pipeline:
  1. Cross-reference database.json (already exists under different name/path?)
  2. Scan old branches (git show origin/branch:path)
  3. Search public BIOS repos on GitHub
  4. Search archive.org collections
  5. Create GitHub Issue for community help

Usage:
    python scripts/auto_fetch.py --platform retroarch [--dry-run]
    python scripts/auto_fetch.py --all [--dry-run]
"""

from __future__ import annotations

import argparse
import hashlib
import http.client
import json
import os
import subprocess
import sys
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from common import list_registered_platforms, load_database, load_platform_config, require_yaml

yaml = require_yaml()

DEFAULT_DB = "database.json"
DEFAULT_PLATFORMS_DIR = "platforms"
DEFAULT_BIOS_DIR = "bios"

MAX_DOWNLOAD = 100 * 1024 * 1024  # 100MB per file
USER_AGENT = "retrobios-fetch/1.0"
REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request

LEGACY_BRANCHES = ["libretro", "RetroArch", "RetroPie", "Recalbox", "batocera", "Other"]

PUBLIC_REPOS = [
    # archtaurus/RetroPieBIOS - most complete verified collection
    "https://raw.githubusercontent.com/archtaurus/RetroPieBIOS/master/BIOS/{name}",
    "https://raw.githubusercontent.com/archtaurus/RetroPieBIOS/master/BIOS/pcsx2/bios/{name}",
    "https://raw.githubusercontent.com/archtaurus/RetroPieBIOS/master/BIOS/ep128emu/roms/{name}",
    "https://raw.githubusercontent.com/archtaurus/RetroPieBIOS/master/BIOS/fuse/{name}",
    # prefetchnta/retroarch-bios - alternative verified collection
    "https://raw.githubusercontent.com/prefetchnta/retroarch-bios/main/system/{name}",
    "https://raw.githubusercontent.com/prefetchnta/retroarch-bios/main/system/pcsx2/bios/{name}",
    # BatoceraPLUS - Batocera-specific
    "https://raw.githubusercontent.com/BatoceraPLUS/Batocera.PLUS-bios/main/{name}",
]

ARCHIVE_ORG_COLLECTIONS = [
    "RetroarchSystemFiles",
    "retroarch_bios",
    "retroarch-ultimate-bios-pack_20250824",
    "system_20240621",
    "full-pack-bios-batocera-39",
]


def _split_hashes(value) -> list[str]:
    """Split a possibly comma-separated hash field into stripped entries."""
    if not value:
        return []
    return [part.strip() for part in str(value).split(",") if part.strip()]


def find_missing(config: dict, db: dict) -> list[dict]:
    """Find BIOS files required by platform but not in database.

    Only entries whose ``storage`` is ``"embedded"`` (the default) are
    considered. An entry counts as present when its SHA1 is a key of
    ``db["files"]`` or, failing that, when any of its (comma-separated) MD5
    values is a key of ``db["indexes"]["by_md5"]``.

    Returns one dict per missing file with the keys ``name``, ``system``,
    ``sha1``, ``md5``, ``size`` and ``destination``.
    """
    known_files = db.get("files", {})
    by_md5 = db.get("indexes", {}).get("by_md5", {})
    missing = []

    # ``or {}`` / ``or []`` tolerate keys that are present but null.
    for sys_id, system in (config.get("systems") or {}).items():
        for file_entry in system.get("files") or []:
            if file_entry.get("storage", "embedded") != "embedded":
                continue

            sha1 = file_entry.get("sha1")
            md5 = file_entry.get("md5")
            name = file_entry.get("name", "")

            if sha1 and sha1 in known_files:
                continue
            if md5 and any(m in by_md5 for m in _split_hashes(md5)):
                continue

            missing.append({
                "name": name,
                "system": sys_id,
                "sha1": sha1,
                "md5": md5,
                "size": file_entry.get("size"),
                "destination": file_entry.get("destination", name),
            })

    return missing


def verify_content(data: bytes, expected: dict) -> bool:
    """Verify downloaded content matches expected hashes.

    SHA1 takes precedence when present; otherwise the MD5 field is used and
    may hold several comma-separated digests, any of which is accepted.
    Comparison is case-insensitive. Returns False when no hash is available.
    """
    sha1 = expected.get("sha1")
    if sha1:
        return hashlib.sha1(data).hexdigest() == str(sha1).strip().lower()

    md5_candidates = [m.lower() for m in _split_hashes(expected.get("md5"))]
    if md5_candidates:
        return hashlib.md5(data).hexdigest() in md5_candidates

    return False


def _read_limited(resp, limit: int = MAX_DOWNLOAD) -> bytes | None:
    """Read response up to limit bytes. Returns None if exceeded."""
    # Ask for one extra byte so an over-limit body can be told apart from a
    # body of exactly ``limit`` bytes.
    data = resp.read(limit + 1)
    if len(data) > limit:
        return None
    return data


def _download(url: str, limit: int = MAX_DOWNLOAD) -> bytes | None:
    """Fetch *url* and return its body, or None on any failure.

    Failures include HTTP/URL errors, socket timeouts while reading,
    malformed URLs or truncated responses (``http.client.HTTPException``),
    and bodies larger than *limit* bytes.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
            return _read_limited(resp, limit)
    except (OSError, http.client.HTTPException):
        # URLError/HTTPError and timeouts are OSError subclasses.
        return None


def _download_verified(url: str, entry: dict) -> bytes | None:
    """Download *url* and return the data only if it matches *entry*'s hashes."""
    data = _download(url)
    if data is not None and verify_content(data, entry):
        return data
    return None


def step1_crossref_db(entry: dict, db: dict) -> str | None:
    """Check if file exists under different name/path in database.

    Returns the stored ``path`` of the matching database record, or None.
    """
    files = db.get("files", {})

    sha1 = entry.get("sha1")
    if sha1 and sha1 in files:
        return files[sha1]["path"]

    by_md5 = db.get("indexes", {}).get("by_md5", {})
    for md5 in _split_hashes(entry.get("md5")):
        sha1_match = by_md5.get(md5)
        if sha1_match and sha1_match in files:
            return files[sha1_match]["path"]

    return None


def step2_scan_branches(entry: dict) -> bytes | None:
    """Search old git branches for the file by hash.

    For every ``origin/<branch>`` in LEGACY_BRANCHES that exists, each file
    whose basename equals the entry's name is read with ``git show`` and
    returned if its content verifies. Returns None if nothing matches or if
    ``git`` cannot be executed.
    """
    target = os.path.basename(entry["name"])

    for branch in LEGACY_BRANCHES:
        ref = f"origin/{branch}"
        try:
            subprocess.run(
                ["git", "rev-parse", "--verify", ref],
                capture_output=True, check=True,
            )
            listing = subprocess.run(
                ["git", "ls-tree", "-r", "--name-only", ref],
                capture_output=True, text=True,
            )
        except subprocess.CalledProcessError:
            continue  # branch does not exist
        except OSError:
            return None  # git itself is unavailable; no branch can be scanned

        for filepath in listing.stdout.strip().split("\n"):
            if os.path.basename(filepath) != target:
                continue
            try:
                blob = subprocess.run(
                    ["git", "show", f"{ref}:{filepath}"],
                    capture_output=True, check=True,
                )
            except (subprocess.CalledProcessError, OSError):
                continue
            if verify_content(blob.stdout, entry):
                return blob.stdout

    return None


def step3_search_public_repos(entry: dict) -> bytes | None:
    """Search public GitHub BIOS repos.

    Each template in PUBLIC_REPOS is tried with the entry's name and, when the
    destination contains a sub-directory, with the destination path as well.
    """
    name = entry["name"]
    destination = entry.get("destination") or name

    candidates = [name]
    if "/" in destination and destination != name:
        candidates.append(destination)

    for url_template in PUBLIC_REPOS:
        for candidate in candidates:
            # quote() keeps "/" but escapes spaces and other unsafe characters.
            url = url_template.format(name=urllib.parse.quote(candidate))
            data = _download_verified(url, entry)
            if data is not None:
                return data

    return None


def step4_search_archive_org(entry: dict) -> bytes | None:
    """Search archive.org firmware collections by direct download.

    Tries ``<name>``, ``system/<name>`` and ``bios/<name>`` in every item of
    ARCHIVE_ORG_COLLECTIONS, then falls back to an advanced-search query by
    SHA1 and attempts to download the file from the first hit.
    """
    quoted_name = urllib.parse.quote(entry["name"])

    for collection_id in ARCHIVE_ORG_COLLECTIONS:
        for path in (quoted_name, f"system/{quoted_name}", f"bios/{quoted_name}"):
            url = f"https://archive.org/download/{collection_id}/{path}"
            data = _download_verified(url, entry)
            if data is not None:
                return data

    sha1 = entry.get("sha1")
    if not sha1:
        return None

    search_url = (
        f"https://archive.org/advancedsearch.php?"
        f"q=sha1:{sha1}&output=json&rows=1"
    )
    raw = _download(search_url)
    if raw is None:
        return None
    try:
        result = json.loads(raw)
    except ValueError:  # JSONDecodeError and UnicodeDecodeError
        return None
    if not isinstance(result, dict):
        return None

    docs = result.get("response", {}).get("docs", [])
    identifier = docs[0].get("identifier") if docs else None
    if not identifier:
        return None

    dl_url = (
        f"https://archive.org/download/"
        f"{urllib.parse.quote(str(identifier))}/{quoted_name}"
    )
    return _download_verified(dl_url, entry)


def _find_console_dir(bios_root: Path, keywords: list[str]) -> Path | None:
    """Return the first ``<manufacturer>/<console>`` directory matching a keyword.

    Directories are visited in sorted order so the choice is deterministic.
    """
    if not keywords or not bios_root.is_dir():
        return None

    for manufacturer_dir in sorted(bios_root.iterdir()):
        if not manufacturer_dir.is_dir():
            continue
        for console_dir in sorted(manufacturer_dir.iterdir()):
            if not console_dir.is_dir():
                continue
            dir_path = f"{manufacturer_dir.name}/{console_dir.name}".lower()
            if any(keyword in dir_path for keyword in keywords):
                return console_dir

    return None


def place_file(data: bytes, entry: dict, bios_dir: str, db: dict) -> str:
    """Place a fetched BIOS file in the correct location.

    The file goes into the first ``<bios_dir>/<manufacturer>/<console>``
    directory whose lowercased path contains one of the dash-separated parts
    (longer than two characters) of the entry's system id. If none matches,
    it is written to ``<bios_dir>/Unknown/<system>/``.

    ``db`` is accepted for interface compatibility and is not used.
    Returns the path written, as a string. Raises OSError if writing fails.
    """
    name = os.path.basename(entry["name"])
    system = entry["system"]
    dest_dir = Path(bios_dir)

    # Lowercase the keywords because they are matched against a lowercased path.
    keywords = [part.lower() for part in system.split("-") if len(part) > 2]
    console_dir = _find_console_dir(dest_dir, keywords)

    dest = (console_dir if console_dir is not None else dest_dir / "Unknown" / system) / name
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(data)
    return str(dest)


def fetch_missing(
    missing: list[dict],
    db: dict,
    bios_dir: str,
    dry_run: bool = False,
) -> dict:
    """Run the 5-step auto-fetch pipeline for missing files.

    Step 1 (database cross-reference) always runs. With ``dry_run`` the
    remaining steps are skipped and the entry is reported as not found.

    Returns ``{"stats": {...}, "still_missing": [...]}`` where ``stats``
    counts ``found``, ``not_found`` and ``errors`` (files that were located
    but could not be written to disk).
    """
    stats = {"found": 0, "not_found": 0, "errors": 0}
    still_missing = []

    # (step label, success message, search function), tried in order.
    search_steps = (
        ("2", "Found in branch", step2_scan_branches),
        ("3", "Found in public repo", step3_search_public_repos),
        ("4", "Found on archive.org", step4_search_archive_org),
    )

    for entry in missing:
        name = entry["name"]
        print(f"\n Searching: {name} ({entry['system']})")

        existing = step1_crossref_db(entry, db)
        if existing:
            print(f" [1] Found in database at: {existing}")
            stats["found"] += 1
            continue

        if dry_run:
            print(" [DRY RUN] Would search branches, repos, archive.org")
            still_missing.append(entry)
            stats["not_found"] += 1
            continue

        for label, description, search in search_steps:
            data = search(entry)
            if data is None:
                continue
            try:
                path = place_file(data, entry, bios_dir, db)
            except OSError as exc:
                # The file was located but could not be stored; keep going
                # with the remaining entries instead of aborting the run.
                print(f" [{label}] {description}, but saving failed: {exc}", file=sys.stderr)
                stats["errors"] += 1
                still_missing.append(entry)
            else:
                print(f" [{label}] {description}, saved to: {path}")
                stats["found"] += 1
            break
        else:
            print(" [5] Not found - needs community contribution")
            still_missing.append(entry)
            stats["not_found"] += 1

    return {"stats": stats, "still_missing": still_missing}


def _short_hash(value, length: int = 12) -> str:
    """Abbreviate a hash for display; returns "N/A" when the hash is absent."""
    if not value:
        return "N/A"
    text = str(value)
    return f"{text[:length]}..." if len(text) > length else text


def generate_issue_body(missing: list[dict], platform: str) -> str:
    """Generate a GitHub Issue body for missing BIOS files.

    Produces Markdown with one table row per missing entry (hashes shortened
    to their first 12 characters) followed by contribution instructions.
    """
    lines = [
        f"## Missing BIOS Files for {platform}",
        "",
        "The following BIOS files are required but not available in the repository.",
        "If you have any of these files, please submit a Pull Request!",
        "",
        "| File | System | SHA1 | MD5 |",
        "|------|--------|------|-----|",
    ]

    lines.extend(
        f"| `{entry['name']}` | {entry['system']} "
        f"| `{_short_hash(entry.get('sha1'))}` | `{_short_hash(entry.get('md5'))}` |"
        for entry in missing
    )

    lines.extend([
        "",
        "### How to Contribute",
        "",
        "1. Fork this repository",
        "2. Add the BIOS file to `bios/Manufacturer/Console/`",
        "3. Create a Pull Request - checksums are verified automatically",
    ])

    return "\n".join(lines)


def main():
    """Parse command-line arguments and run the fetch pipeline per platform."""
    parser = argparse.ArgumentParser(description="Auto-fetch missing BIOS files")
    parser.add_argument("--platform", "-p", help="Platform to check")
    parser.add_argument("--all", action="store_true", help="Check all platforms")
    parser.add_argument("--dry-run", action="store_true", help="Don't download, just report")
    parser.add_argument("--db", default=DEFAULT_DB)
    parser.add_argument("--platforms-dir", default=DEFAULT_PLATFORMS_DIR)
    parser.add_argument("--bios-dir", default=DEFAULT_BIOS_DIR)
    parser.add_argument("--create-issues", action="store_true", help="Output GitHub Issue bodies")
    args = parser.parse_args()

    if not os.path.exists(args.db):
        print(f"Error: {args.db} not found. Run generate_db.py first.", file=sys.stderr)
        sys.exit(1)

    db = load_database(args.db)

    if args.all:
        platforms = list_registered_platforms(
            args.platforms_dir, include_archived=True,
        )
    elif args.platform:
        platforms = [args.platform]
    else:
        parser.error("Specify --platform or --all")  # exits the process

    all_still_missing = {}

    for platform in sorted(platforms):
        print(f"\n{'='*60}")
        print(f"Platform: {platform}")
        print(f"{'='*60}")

        try:
            config = load_platform_config(platform, args.platforms_dir)
        except FileNotFoundError:
            print(" Config not found, skipping")
            continue

        missing = find_missing(config, db)
        if not missing:
            print(" All BIOS files present!")
            continue

        print(f" {len(missing)} missing files")
        result = fetch_missing(missing, db, args.bios_dir, args.dry_run)

        if result["still_missing"]:
            all_still_missing[platform] = result["still_missing"]

        stats = result["stats"]
        summary = f"\n Results: {stats['found']} found, {stats['not_found']} not found"
        if stats["errors"]:
            summary += f", {stats['errors']} errors"
        print(summary)

    if args.create_issues and all_still_missing:
        print(f"\n{'='*60}")
        print("GitHub Issue Bodies")
        print(f"{'='*60}")
        for platform, missing in all_still_missing.items():
            print(f"\n--- Issue for {platform} ---\n")
            print(generate_issue_body(missing, platform))


if __name__ == "__main__":
    main()