mirror of
https://github.com/Abdess/retroarch_system.git
synced 2026-04-13 12:22:33 -05:00
feat: add MAME/FBNeo hash auto-fetch scrapers
sparse clone upstream repos, parse BIOS root sets from C source, cache as JSON, merge into emulator profiles with backup. covers macro expansion, version detection, subset profile protection.
This commit is contained in:
322
scripts/scraper/mame_hash_scraper.py
Normal file
322
scripts/scraper/mame_hash_scraper.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""Fetch MAME BIOS hashes from mamedev/mame source and merge into profiles.
|
||||
|
||||
Sparse clones the MAME repo, parses the source tree for BIOS root sets,
|
||||
caches results to data/mame-hashes.json, and optionally merges into
|
||||
emulator profiles that reference mamedev/mame upstream.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from .mame_parser import parse_mame_source_tree
|
||||
from ._hash_merge import compute_diff, merge_mame_profile
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Repository root: the third ancestor (parents[2]) of this file's resolved path.
_ROOT = Path(__file__).resolve().parents[2]

# Locations used by the scraper, all anchored at the repository root.
_CACHE_PATH = _ROOT.joinpath('data', 'mame-hashes.json')  # JSON cache of parsed hashes
_CLONE_DIR = _ROOT.joinpath('tmp', 'mame')  # temporary sparse clone of upstream
_EMULATORS_DIR = _ROOT.joinpath('emulators')  # emulator profile YAML files

# Upstream repository that gets cloned.
_REPO_URL = 'https://github.com/mamedev/mame.git'

# Cache age, in hours, beyond which the cached data is considered stale.
_STALE_HOURS = 24
|
||||
|
||||
|
||||
# ── Cache ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _load_cache() -> dict[str, Any] | None:
|
||||
if not _CACHE_PATH.exists():
|
||||
return None
|
||||
try:
|
||||
with open(_CACHE_PATH, encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return None
|
||||
|
||||
|
||||
def _is_stale(cache: dict[str, Any] | None) -> bool:
|
||||
if cache is None:
|
||||
return True
|
||||
fetched_at = cache.get('fetched_at')
|
||||
if not fetched_at:
|
||||
return True
|
||||
try:
|
||||
ts = datetime.fromisoformat(fetched_at)
|
||||
age = datetime.now(timezone.utc) - ts
|
||||
return age.total_seconds() > _STALE_HOURS * 3600
|
||||
except (ValueError, TypeError):
|
||||
return True
|
||||
|
||||
|
||||
def _write_cache(data: dict[str, Any]) -> None:
    """Write *data* to ``_CACHE_PATH`` as indented UTF-8 JSON.

    The parent directory is created if needed. Non-ASCII characters are
    written as-is rather than escaped (``ensure_ascii=False``).
    """
    target = _CACHE_PATH
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
    log.info('cache written to %s', target)
|
||||
|
||||
|
||||
# ── Git operations ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _run_git(args: list[str], cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
    """Run ``git`` with the given arguments and return the finished process.

    Args:
        args: Arguments placed after the ``git`` executable name.
        cwd: Working directory for the command; ``None`` keeps the current one.

    Returns:
        The completed process, with stdout and stderr captured as text.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ['git']
    command.extend(args)
    return subprocess.run(
        command,
        cwd=cwd,
        capture_output=True,
        text=True,
        check=True,
    )
|
||||
|
||||
|
||||
def _sparse_clone() -> None:
    """Clone the MAME repository into ``_CLONE_DIR`` with a sparse checkout.

    Any existing clone directory is deleted first. The clone is made with
    ``--depth 1 --filter=blob:none --sparse``, and the sparse checkout is
    then set to ``src/mame`` and ``src/devices``.

    Raises:
        subprocess.CalledProcessError: If either git command fails.
    """
    # Always start from an empty target so a leftover clone cannot interfere.
    if _CLONE_DIR.exists():
        shutil.rmtree(_CLONE_DIR)
    _CLONE_DIR.parent.mkdir(parents=True, exist_ok=True)

    log.info('sparse cloning mamedev/mame into %s', _CLONE_DIR)

    clone_args = [
        'clone',
        '--depth', '1',
        '--filter=blob:none',
        '--sparse',
        _REPO_URL,
        str(_CLONE_DIR),
    ]
    checkout_args = ['sparse-checkout', 'set', 'src/mame', 'src/devices']

    _run_git(clone_args)
    _run_git(checkout_args, cwd=_CLONE_DIR)
|
||||
|
||||
|
||||
def _get_version() -> str:
    """Return the version of the latest MAME release, or ``'unknown'``.

    The tag name of the latest release is requested from the GitHub API and
    converted with ``_parse_version_tag``. This is best-effort: any network,
    HTTP, decoding or payload-shape problem is logged as a warning and
    results in ``'unknown'`` instead of an exception.
    """
    # Local import keeps this block self-contained; urllib.request already
    # loads http.client, so this adds no real cost.
    import http.client

    # version.cpp is generated at build time, not in the repo.
    # Use GitHub API to get the latest release tag.
    req = urllib.request.Request(
        'https://api.github.com/repos/mamedev/mame/releases/latest',
        headers={'User-Agent': 'retrobios-scraper/1.0',
                 'Accept': 'application/vnd.github.v3+json'},
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers urllib.error.URLError and timeouts; ValueError covers
        # json.JSONDecodeError and UnicodeDecodeError; HTTPException covers
        # protocol-level failures such as an incomplete read.
        log.warning('could not determine MAME version: %s', exc)
        return 'unknown'

    # Only a JSON object with a non-empty string tag is usable.
    tag = data.get('tag_name', '') if isinstance(data, dict) else ''
    if isinstance(tag, str) and tag:
        return _parse_version_tag(tag)

    log.warning('could not determine MAME version: no tag_name in API response')
    return 'unknown'
|
||||
|
||||
|
||||
def _parse_version_tag(tag: str) -> str:
    """Turn a release tag into a version string.

    A leading ``mame`` is dropped. If what remains consists only of digits
    and is at least four characters long, a dot is inserted after the first
    digit (``mame0264`` becomes ``0.264``). Anything else is returned as it
    is after the prefix removal.
    """
    # str.removeprefix leaves the string untouched when the prefix is absent.
    remainder = tag.removeprefix('mame')
    if len(remainder) < 4 or not remainder.isdigit():
        return remainder
    major, rest = remainder[:1], remainder[1:]
    return f'{major}.{rest}'
|
||||
|
||||
|
||||
|
||||
|
||||
def _get_commit() -> str:
    """Return the HEAD commit hash of the clone in ``_CLONE_DIR``.

    Returns an empty string when ``git rev-parse HEAD`` exits with an error.
    """
    try:
        proc = _run_git(['rev-parse', 'HEAD'], cwd=_CLONE_DIR)
    except subprocess.CalledProcessError:
        return ''
    else:
        return proc.stdout.strip()
|
||||
|
||||
|
||||
def _cleanup() -> None:
    """Delete the temporary clone directory if it exists."""
    if not _CLONE_DIR.exists():
        return
    log.info('cleaning up %s', _CLONE_DIR)
    shutil.rmtree(_CLONE_DIR)
|
||||
|
||||
|
||||
# ── Profile discovery ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _find_mame_profiles() -> list[Path]:
    """Return the emulator profiles that track current upstream MAME.

    Every ``*.yml`` file directly inside ``_EMULATORS_DIR`` is examined in
    sorted order, except files ending in ``.old.yml`` (presumably backups of
    earlier profiles; confirm against the merge helper). A profile is
    selected when it parses to a mapping whose ``upstream`` value, ignoring
    trailing slashes, is exactly ``https://github.com/mamedev/mame``.

    Profiles that cannot be read or parsed are skipped with a warning rather
    than aborting the scan.
    """
    wanted = 'https://github.com/mamedev/mame'
    profiles: list[Path] = []

    for path in sorted(_EMULATORS_DIR.glob('*.yml')):
        if path.name.endswith('.old.yml'):
            continue

        # Keep the guarded region to the read/parse step only.
        try:
            with open(path, encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc:
            # UnicodeDecodeError is raised for files that are not UTF-8 and
            # is neither a YAMLError nor an OSError.
            log.warning('skipping unreadable profile %s: %s', path.name, exc)
            continue

        if not isinstance(data, dict):
            continue

        upstream = data.get('upstream', '')
        # Only match profiles tracking current MAME (not frozen snapshots
        # which have upstream like "mamedev/mame/tree/mame0139")
        if isinstance(upstream, str) and upstream.rstrip('/') == wanted:
            profiles.append(path)

    return profiles
|
||||
|
||||
|
||||
# ── Diff formatting ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _format_diff(
    profile_path: Path,
    diff: dict[str, Any],
    hashes: dict[str, Any],
    show_added: bool = True,
) -> list[str]:
    """Render the diff for one profile as printable report lines.

    Args:
        profile_path: Path of the profile; its stem is used as the heading.
        diff: Diff mapping. The keys read here are ``added``, ``updated``
            and ``removed`` (iterables of set names), plus ``unchanged`` and
            ``out_of_scope`` (counts).
        hashes: Hash data; ``hashes['bios_sets'][name]`` may provide
            ``roms``, ``source_file`` and ``source_line`` for added sets.
        show_added: When true, each added set is listed individually;
            otherwise only the number of added sets is reported.

    Returns:
        The report lines, always starting with the profile heading. When
        nothing was added, updated or removed, the only other line is
        ``no changes``.
    """
    name = profile_path.stem

    added = diff.get('added', [])
    updated = diff.get('updated', [])
    removed = diff.get('removed', [])
    unchanged = diff.get('unchanged', 0)

    lines: list[str] = [f' {name}:']

    if not added and not updated and not removed:
        lines.append(' no changes')
        return lines

    if show_added:
        bios_sets = hashes.get('bios_sets', {})
        for set_name in added:
            # Look the set up once instead of once per field.
            entry = bios_sets.get(set_name, {})
            rom_count = len(entry.get('roms', []))
            source_file = entry.get('source_file', '')
            source_line = entry.get('source_line', '')

            # Include the source reference only when one is known, so the
            # line never renders as "(, N ROMs)".
            details = [f'{rom_count} ROMs']
            if source_file:
                details.insert(0, f'{source_file}:{source_line}')
            detail_text = ', '.join(details)
            lines.append(f' + {set_name}.zip ({detail_text})')
    elif added:
        lines.append(f' + {len(added)} new sets available (main profile only)')

    for set_name in updated:
        lines.append(f' ~ {set_name}.zip (contents changed)')

    # Removed sets already suppress the "no changes" message above, so they
    # are reported too instead of being silently omitted.
    for set_name in removed:
        lines.append(f' - {set_name}.zip (removed)')

    oos = diff.get('out_of_scope', 0)
    lines.append(f' = {unchanged} unchanged')
    if oos:
        lines.append(f' . {oos} out of scope (not BIOS root sets)')
    return lines
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _fetch_hashes(force: bool) -> dict[str, Any]:
    """Return the BIOS hash data, from cache or freshly parsed from upstream.

    Args:
        force: When true, the cache is ignored and upstream is always fetched.

    Returns:
        A mapping with the keys ``source``, ``version``, ``commit``,
        ``fetched_at`` and ``bios_sets`` when freshly built, or the cached
        mapping when the cache is still fresh.

    On a fresh fetch the repository is sparse-cloned, parsed with
    ``parse_mame_source_tree``, and the result is written to the cache. The
    clone directory is removed afterwards whether or not the fetch succeeded.
    """
    cached = _load_cache()
    if not (force or _is_stale(cached)):
        log.info('using cached data from %s', cached.get('fetched_at', ''))
        return cached  # type: ignore[return-value]

    try:
        _sparse_clone()
        parsed_sets = parse_mame_source_tree(str(_CLONE_DIR))

        # Dict values are evaluated top to bottom: version, commit, timestamp.
        payload: dict[str, Any] = {
            'source': 'mamedev/mame',
            'version': _get_version(),
            'commit': _get_commit(),
            'fetched_at': datetime.now(timezone.utc).isoformat(),
            'bios_sets': parsed_sets,
        }
        _write_cache(payload)
        return payload
    finally:
        _cleanup()
|
||||
|
||||
|
||||
def _run(args: argparse.Namespace) -> None:
    """Execute the scraper for the parsed command-line arguments.

    Reads ``args.force``, ``args.json`` and ``args.dry_run``. With
    ``args.json`` the hash data is dumped to stdout as JSON and nothing else
    happens. Otherwise a summary and a per-profile diff are printed, and,
    unless ``args.dry_run`` is set, profiles with updated sets (or, for
    ``mame.yml`` only, added sets) are merged via ``merge_mame_profile``.
    """
    hashes = _fetch_hashes(args.force)

    set_count = len(hashes.get('bios_sets', {}))
    version = hashes.get('version', 'unknown')
    short_commit = hashes.get('commit', '')[:12]

    if args.json:
        json.dump(hashes, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write('\n')
        return

    summary = (
        f'mame-hashes: {set_count} BIOS root sets from mamedev/mame'
        f' @ {version} ({short_commit})'
    )
    print(summary)
    print()

    profiles = _find_mame_profiles()
    if not profiles:
        print(' no profiles with mamedev/mame upstream found')
        return

    cache_file = str(_CACHE_PATH)
    for profile_path in profiles:
        # Only the main profile lists and receives newly added sets.
        is_main = profile_path.name == 'mame.yml'
        diff = compute_diff(str(profile_path), cache_file, mode='mame')

        for text in _format_diff(profile_path, diff, hashes, show_added=is_main):
            print(text)

        if args.dry_run:
            continue

        has_updates = bool(diff.get('updated', []))
        has_additions = is_main and bool(diff.get('added', []))
        if has_additions or has_updates:
            merge_mame_profile(
                str(profile_path),
                cache_file,
                write=True,
                add_new=is_main,
            )
            log.info('merged into %s', profile_path.name)

    print()
    if args.dry_run:
        print('(dry run, no files modified)')
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the scraper.

    The parser accepts three boolean switches, all off by default:
    ``--dry-run``, ``--json`` and ``--force``.
    """
    parser = argparse.ArgumentParser(
        prog='mame_hash_scraper',
        description='Fetch MAME BIOS hashes from source and merge into profiles.',
    )

    # (flag, help text) pairs; every switch is a plain store_true option.
    switches = (
        ('--dry-run', 'show diff only, do not modify profiles'),
        ('--json', 'output raw JSON to stdout'),
        ('--force', 're-fetch even if cache is fresh'),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action='store_true', help=help_text)

    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: set up logging, parse arguments, and run."""
    logging.basicConfig(
        format='%(levelname)s: %(message)s',
        level=logging.INFO,
    )
    cli_args = build_parser().parse_args()
    _run(cli_args)
|
||||
|
||||
|
||||
# Run the command-line interface only when this module is executed as the
# main program, not when it is imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user