Files
libretro/scripts/scraper/mame_hash_scraper.py
Abdessamad Derraz 75e34898ee feat: add MAME/FBNeo hash auto-fetch scrapers
sparse clone upstream repos, parse BIOS root sets from C source,
cache as JSON, merge into emulator profiles with backup.
covers macro expansion, version detection, subset profile protection.
2026-03-30 19:11:26 +02:00

323 lines
9.7 KiB
Python

"""Fetch MAME BIOS hashes from mamedev/mame source and merge into profiles.
Sparse clones the MAME repo, parses the source tree for BIOS root sets,
caches results to data/mame-hashes.json, and optionally merges into
emulator profiles that reference mamedev/mame upstream.
"""
from __future__ import annotations
import argparse
import json
import logging
import shutil
import subprocess
import sys
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
from .mame_parser import parse_mame_source_tree
from ._hash_merge import compute_diff, merge_mame_profile
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Project root: three directory levels above this file (presumably the
# libretro/ tree root, since this lives in scripts/scraper/ — TODO confirm).
_ROOT = Path(__file__).resolve().parents[2]
# JSON cache of parsed BIOS hash data (see _load_cache/_write_cache).
_CACHE_PATH = _ROOT / 'data' / 'mame-hashes.json'
# Throwaway sparse-clone location; removed again by _cleanup().
_CLONE_DIR = _ROOT / 'tmp' / 'mame'
# Directory scanned for *.yml emulator profiles.
_EMULATORS_DIR = _ROOT / 'emulators'
# Upstream repository cloned by _sparse_clone().
_REPO_URL = 'https://github.com/mamedev/mame.git'
# Cache entries older than this many hours are considered stale.
_STALE_HOURS = 24
# ── Cache ────────────────────────────────────────────────────────────
def _load_cache() -> dict[str, Any] | None:
    """Read the JSON cache from _CACHE_PATH; None if missing or unparseable."""
    try:
        raw = _CACHE_PATH.read_text(encoding='utf-8')
    except OSError:
        # Missing file or unreadable cache — treat both as "no cache".
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None
def _is_stale(cache: dict[str, Any] | None) -> bool:
    """Return True when *cache* is absent, malformed, or older than _STALE_HOURS."""
    if cache is None:
        return True
    stamp = cache.get('fetched_at')
    if not stamp:
        return True
    try:
        # Subtraction stays inside the try: a naive timestamp raises
        # TypeError here, which also means "treat as stale".
        elapsed = datetime.now(timezone.utc) - datetime.fromisoformat(stamp)
    except (ValueError, TypeError):
        return True
    return elapsed.total_seconds() > _STALE_HOURS * 3600
def _write_cache(data: dict[str, Any]) -> None:
    """Serialize *data* as pretty-printed JSON to _CACHE_PATH, creating dirs."""
    _CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    _CACHE_PATH.write_text(payload, encoding='utf-8')
    log.info('cache written to %s', _CACHE_PATH)
# ── Git operations ───────────────────────────────────────────────────
def _run_git(args: list[str], cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
    """Run `git <args>` in *cwd*, capturing text output; raises on failure."""
    cmd = ['git'] + args
    return subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
def _sparse_clone() -> None:
    """Shallow, blob-less sparse clone of mamedev/mame into _CLONE_DIR.

    Any existing clone directory is removed first; only src/mame and
    src/devices are checked out, which is where BIOS root sets live.
    """
    if _CLONE_DIR.exists():
        shutil.rmtree(_CLONE_DIR)
    _CLONE_DIR.parent.mkdir(parents=True, exist_ok=True)
    log.info('sparse cloning mamedev/mame into %s', _CLONE_DIR)
    clone_args = [
        'clone',
        '--depth', '1',
        '--filter=blob:none',
        '--sparse',
        _REPO_URL,
        str(_CLONE_DIR),
    ]
    _run_git(clone_args)
    _run_git(['sparse-checkout', 'set', 'src/mame', 'src/devices'], cwd=_CLONE_DIR)
def _get_version() -> str:
    """Resolve the current MAME version via the GitHub releases API.

    version.cpp is generated at build time and is not in the repo, so the
    latest release tag is used instead. Returns 'unknown' on any network
    or parse failure.
    """
    url = 'https://api.github.com/repos/mamedev/mame/releases/latest'
    headers = {
        'User-Agent': 'retrobios-scraper/1.0',
        'Accept': 'application/vnd.github.v3+json',
    }
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=10) as resp:
            payload = json.loads(resp.read())
    except (urllib.error.URLError, json.JSONDecodeError, OSError):
        return 'unknown'
    tag = payload.get('tag_name', '')
    return _parse_version_tag(tag) if tag else 'unknown'
def _parse_version_tag(tag: str) -> str:
prefix = 'mame'
raw = tag.removeprefix(prefix) if tag.startswith(prefix) else tag
if raw.isdigit() and len(raw) >= 4:
return f'{raw[0]}.{raw[1:]}'
return raw
def _get_commit() -> str:
    """Return the HEAD commit hash of the sparse clone, or '' on git failure."""
    try:
        out = _run_git(['rev-parse', 'HEAD'], cwd=_CLONE_DIR).stdout
    except subprocess.CalledProcessError:
        return ''
    return out.strip()
def _cleanup() -> None:
    """Remove the temporary clone directory, if present."""
    if not _CLONE_DIR.exists():
        return
    log.info('cleaning up %s', _CLONE_DIR)
    shutil.rmtree(_CLONE_DIR)
# ── Profile discovery ────────────────────────────────────────────────
def _find_mame_profiles() -> list[Path]:
    """List emulator profiles whose upstream is the live mamedev/mame repo.

    '*.old.yml' backups are skipped, as are frozen snapshots whose upstream
    points at a tree URL like 'mamedev/mame/tree/mame0139'.
    """
    matches: list[Path] = []
    for candidate in sorted(_EMULATORS_DIR.glob('*.yml')):
        if candidate.name.endswith('.old.yml'):
            continue
        try:
            with open(candidate, encoding='utf-8') as fh:
                doc = yaml.safe_load(fh)
                if not isinstance(doc, dict):
                    continue
                upstream = doc.get('upstream', '')
                # Exact match (ignoring a trailing slash) keeps snapshot
                # profiles out of the merge set.
                if isinstance(upstream, str) and upstream.rstrip('/') == 'https://github.com/mamedev/mame':
                    matches.append(candidate)
        except (yaml.YAMLError, OSError):
            continue
    return matches
# ── Diff formatting ──────────────────────────────────────────────────
def _format_diff(
profile_path: Path,
diff: dict[str, Any],
hashes: dict[str, Any],
show_added: bool = True,
) -> list[str]:
lines: list[str] = []
name = profile_path.stem
added = diff.get('added', [])
updated = diff.get('updated', [])
removed = diff.get('removed', [])
unchanged = diff.get('unchanged', 0)
if not added and not updated and not removed:
lines.append(f' {name}:')
lines.append(' no changes')
return lines
lines.append(f' {name}:')
if show_added:
bios_sets = hashes.get('bios_sets', {})
for set_name in added:
rom_count = len(bios_sets.get(set_name, {}).get('roms', []))
source_file = bios_sets.get(set_name, {}).get('source_file', '')
source_line = bios_sets.get(set_name, {}).get('source_line', '')
ref = f'{source_file}:{source_line}' if source_file else ''
lines.append(f' + {set_name}.zip ({ref}, {rom_count} ROMs)')
elif added:
lines.append(f' + {len(added)} new sets available (main profile only)')
for set_name in updated:
lines.append(f' ~ {set_name}.zip (contents changed)')
oos = diff.get('out_of_scope', 0)
lines.append(f' = {unchanged} unchanged')
if oos:
lines.append(f' . {oos} out of scope (not BIOS root sets)')
return lines
# ── Main ─────────────────────────────────────────────────────────────
def _fetch_hashes(force: bool) -> dict[str, Any]:
    """Return BIOS hash data, refreshing from upstream when the cache is stale.

    A fresh cache is reused unless *force* is True. On refresh, the sparse
    clone directory is always removed afterwards, even if parsing fails.
    """
    cached = _load_cache()
    if not force and not _is_stale(cached):
        log.info('using cached data from %s', cached.get('fetched_at', ''))
        return cached  # type: ignore[return-value]
    try:
        _sparse_clone()
        # Keep the side-effect order: parse the tree, then query version
        # and commit while the clone still exists.
        bios_sets = parse_mame_source_tree(str(_CLONE_DIR))
        version = _get_version()
        commit = _get_commit()
        result: dict[str, Any] = {
            'source': 'mamedev/mame',
            'version': version,
            'commit': commit,
            'fetched_at': datetime.now(timezone.utc).isoformat(),
            'bios_sets': bios_sets,
        }
        _write_cache(result)
        return result
    finally:
        _cleanup()
def _run(args: argparse.Namespace) -> None:
    """Fetch hashes, print a per-profile diff, and merge unless --dry-run.

    With --json, dumps the raw cache payload to stdout and exits early.
    New sets are only auto-added to the main profile (mame.yml); subset
    profiles only receive updates to sets they already contain.
    """
    hashes = _fetch_hashes(args.force)
    total_sets = len(hashes.get('bios_sets', {}))
    version = hashes.get('version', 'unknown')
    commit = hashes.get('commit', '')[:12]
    if args.json:
        json.dump(hashes, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write('\n')
        return
    print(f'mame-hashes: {total_sets} BIOS root sets from mamedev/mame'
          f' @ {version} ({commit})')
    print()
    profiles = _find_mame_profiles()
    if not profiles:
        print(' no profiles with mamedev/mame upstream found')
        return
    for profile_path in profiles:
        is_main = profile_path.name == 'mame.yml'
        diff = compute_diff(str(profile_path), str(_CACHE_PATH), mode='mame')
        for line in _format_diff(profile_path, diff, hashes, show_added=is_main):
            print(line)
        if not args.dry_run:
            has_updates = bool(diff.get('updated', []))
            has_additions = is_main and bool(diff.get('added', []))
            if has_updates or has_additions:
                merge_mame_profile(
                    str(profile_path),
                    str(_CACHE_PATH),
                    write=True,
                    add_new=is_main,
                )
                log.info('merged into %s', profile_path.name)
        print()
    if args.dry_run:
        print('(dry run, no files modified)')
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser with its three boolean flags."""
    parser = argparse.ArgumentParser(
        prog='mame_hash_scraper',
        description='Fetch MAME BIOS hashes from source and merge into profiles.',
    )
    flags = (
        ('--dry-run', 'show diff only, do not modify profiles'),
        ('--json', 'output raw JSON to stdout'),
        ('--force', 're-fetch even if cache is fresh'),
    )
    for flag, help_text in flags:
        parser.add_argument(flag, action='store_true', help=help_text)
    return parser
def main() -> None:
    """CLI entry point: set up logging, parse arguments, run the scraper."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(levelname)s: %(message)s',
    )
    _run(build_parser().parse_args())
# Allow direct invocation (python mame_hash_scraper.py) as well as -m use.
if __name__ == '__main__':
    main()