feat: add bizhawk scraper for FirmwareDatabase.cs

This commit is contained in:
Abdessamad Derraz
2026-03-28 09:41:52 +01:00
parent b75f2b2a43
commit 2738a4d326
2 changed files with 473 additions and 0 deletions

View File

@@ -0,0 +1,379 @@
#!/usr/bin/env python3
"""Scraper for BizHawk BIOS requirements.
Source: https://github.com/TASEmulators/BizHawk
Format: C# source (FirmwareDatabase.cs)
Hash: SHA1 primary
BizHawk declares firmware in FirmwareDatabase.cs using four patterns:
File(sha1, size, name, desc, isBad?) - file definition
Firmware(system, id, desc) - firmware slot declaration
Option(system, id, in fileref, status?) - binds file to slot
FirmwareAndOption(sha1, size, sys, id, ...) - combined one-liner
Variable assignments (var x = File(...)) let Option() reference files
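by name. Multiple options per firmware slot are ranked by status;
the highest-ranked option whose file is not flagged bad (Ideal first) is
selected as canonical. If every option is flagged bad, the highest-ranked
bad option is used instead.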
"""
from __future__ import annotations
import re
import sys
try:
from .base_scraper import (
BaseScraper,
BiosRequirement,
fetch_github_latest_tag,
scraper_cli,
)
except ImportError:
from base_scraper import (
BaseScraper,
BiosRequirement,
fetch_github_latest_tag,
scraper_cli,
)
# Platform key for BizHawk. NOTE(review): not referenced by the code visible
# in this module — presumably consumed by the scraper framework; confirm.
PLATFORM_NAME = "bizhawk"
# Raw URL of FirmwareDatabase.cs on BizHawk's master branch. Passed to
# BaseScraper as the fetch URL and emitted as "source" in the platform YAML.
SOURCE_URL = (
"https://raw.githubusercontent.com/TASEmulators/BizHawk/"
"master/src/BizHawk.Emulation.Common/Database/FirmwareDatabase.cs"
)
# owner/repo slug handed to fetch_github_latest_tag() for the YAML "version".
GITHUB_REPO = "TASEmulators/BizHawk"
# Rank of each FirmwareOptionStatus name; when a firmware slot has several
# options the parser keeps the one with the highest rank. Status names that
# are missing from this table are ranked 2 (same as "Unknown") by the parser.
STATUS_RANK = {
"Bad": 0,
"Unacceptable": 1,
"Unknown": 2,
"Acceptable": 3,
"Ideal": 4,
}
# BizHawk system ids / file names that appear to be game data rather than
# firmware. NOTE(review): neither set is referenced by the code visible in
# this module — presumably intended for filtering; confirm or wire in.
GAME_DATA_SYSTEMS = {"BSX", "Doom"}
GAME_DATA_FILES = {"VEC_Minestorm.vec"}
# BizHawk system id -> project system slug. Ids missing from this map fall
# back to the lower-cased BizHawk id in Scraper.fetch_requirements().
# Some ids share one slug (GG/GGL, DS/NDS).
SYSTEM_ID_MAP: dict[str, str] = {
"32X": "sega-32x",
"3DO": "3do",
"3DS": "nintendo-3ds",
"A26": "atari-2600",
"A78": "atari-7800",
"Amiga": "commodore-amiga",
"AmstradCPC": "amstrad-cpc",
"AppleII": "apple-ii",
"BSX": "nintendo-bsx",
"C64": "commodore-c64",
"ChannelF": "fairchild-channel-f",
"Coleco": "coleco-colecovision",
"Doom": "doom",
"DS": "nintendo-ds",
"FDS": "nintendo-fds",
"G7400": "philips-videopac-plus",
"GB": "nintendo-gb",
"GBA": "nintendo-gba",
"GBC": "nintendo-gbc",
"GEN": "sega-mega-drive",
"GG": "sega-game-gear",
"GGL": "sega-game-gear",
"INTV": "mattel-intellivision",
"Jaguar": "atari-jaguar",
"Lynx": "atari-lynx",
"MAME": "arcade",
"MSX": "microsoft-msx",
"N64": "nintendo-64",
"N64DD": "nintendo-64dd",
"NDS": "nintendo-ds",
"NES": "nintendo-nes",
"NGP": "snk-neo-geo-pocket",
"O2": "philips-videopac",
"PCECD": "nec-pc-engine-cd",
"PCFX": "nec-pc-fx",
"PS2": "sony-playstation-2",
"PSX": "sony-playstation",
"SAT": "sega-saturn",
"SGB": "nintendo-super-game-boy",
"SGX": "nec-supergrafx",
"SMS": "sega-master-system",
"SNES": "nintendo-snes",
"TI83": "texas-instruments-ti-83",
"UZE": "uzebox",
"VEC": "gce-vectrex",
"WSWAN": "bandai-wonderswan",
"ZXSpectrum": "sinclair-zx-spectrum",
}
# Cores that overlap with BizHawk's system coverage
# Emitted unchanged as the "cores" list of the generated platform YAML.
BIZHAWK_CORES = [
"gambatte", "mgba", "sameboy", "melonds", "snes9x", "bsnes",
"beetle_psx", "beetle_saturn", "beetle_pce", "beetle_pcfx",
"beetle_wswan", "beetle_vb", "beetle_ngp", "opera", "stella",
"picodrive", "ppsspp", "handy", "quicknes", "genesis_plus_gx",
"ares", "mupen64plus_next", "puae", "prboom", "virtualjaguar",
"vice_x64", "mame",
]
def _safe_arithmetic(expr: str) -> int:
"""Compute simple integer arithmetic (+ and *) without code execution.
Handles: plain integers, multiplication chains (4 * 1024 * 1024),
addition of products (128 + 64 * 1024).
"""
expr = expr.strip()
total = 0
for addend in expr.split("+"):
factors = addend.strip().split("*")
product = 1
for f in factors:
product *= int(f.strip())
total += product
return total
def _strip_comments(source: str) -> str:
"""Remove block comments and #if false blocks."""
source = re.sub(r"/\*.*?\*/", "", source, flags=re.DOTALL)
source = re.sub(
r"#if\s+false\b.*?#endif", "", source, flags=re.DOTALL
)
return source
def parse_firmware_database(
    source: str,
) -> tuple[list[dict], dict[str, dict]]:
    """Parse BizHawk FirmwareDatabase.cs source into firmware records.

    Comment removal is delegated to ``_strip_comments`` before any pattern
    is applied.

    Args:
        source: Text of FirmwareDatabase.cs (or a fragment of it).

    Returns:
        ``(records, files_by_hash)``. Each record is a dict with keys
        ``system``, ``firmware_id``, ``sha1``, ``name``, ``size``,
        ``description`` and ``status``; ``sha1`` is ``None`` for
        ``SHA1Checksum.Dummy`` entries. Records built from
        ``FirmwareAndOption()`` come first, followed by one record per
        firmware slot that has at least one resolvable ``Option()``.
        ``files_by_hash`` maps each matched ``File()`` call's SHA1 (or
        ``dummy_<name>`` when it has no hash) to its parsed entry.

    Raises:
        ValueError: If a size expression cannot be evaluated by
            ``_safe_arithmetic``.
    """
    source = _strip_comments(source)

    # ── Pass 1: collect File() definitions ────────────────────────
    files_by_hash: dict[str, dict] = {}
    var_to_hash: dict[str, str] = {}
    file_re = re.compile(
        r'(?:var\s+(\w+)\s*=\s*)?'
        r'File\(\s*'
        r'(?:"([A-Fa-f0-9]+)"|SHA1Checksum\.Dummy)\s*,\s*'
        r'([^,]+?)\s*,\s*'
        r'"([^"]+)"\s*,\s*'
        r'"([^"]*)"'
        r'(?:\s*,\s*isBad:\s*(true|false))?'
        r'\s*\)'
    )
    for m in file_re.finditer(source):
        var_name = m.group(1)
        sha1 = m.group(2)  # None for SHA1Checksum.Dummy
        name = m.group(4)
        file_entry = {
            "sha1": sha1,
            "size": _safe_arithmetic(m.group(3)),
            "name": name,
            "description": m.group(5),
            "is_bad": m.group(6) == "true",
        }
        # Hash-less files are keyed by name so they stay addressable.
        key = sha1 if sha1 else f"dummy_{name}"
        files_by_hash[key] = file_entry
        if var_name:
            var_to_hash[var_name] = key

    # ── Pass 2: collect firmware slots and options ────────────────
    # FirmwareAndOption one-liner
    fao_re = re.compile(
        r'FirmwareAndOption\(\s*'
        r'(?:"([A-Fa-f0-9]+)"|SHA1Checksum\.Dummy)\s*,\s*'
        r'([^,]+?)\s*,\s*'
        r'"([^"]+)"\s*,\s*'
        r'"([^"]+)"\s*,\s*'
        r'"([^"]+)"\s*,\s*'
        r'"([^"]*)"'
        r'(?:\s*,\s*FirmwareOptionStatus\.(\w+))?'
        r'\s*\)'
    )
    # Firmware(system, id, desc)
    firmware_re = re.compile(
        r'Firmware\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]*)"\s*\)'
    )
    # Option(system, id, in varref|File(...), status?)
    option_re = re.compile(
        r'Option\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*'
        r'(?:in\s+(\w+)'
        r'|File\(\s*"([A-Fa-f0-9]+)"\s*,\s*([^,]+?)\s*,\s*"([^"]+)"\s*,\s*"([^"]*)"\s*\))'
        r'(?:\s*,\s*FirmwareOptionStatus\.(\w+))?'
        r'\s*\)'
    )

    # Collect firmware slots: (system, id) -> slot description
    firmware_slots: dict[tuple[str, str], str] = {}
    for m in firmware_re.finditer(source):
        firmware_slots[(m.group(1), m.group(2))] = m.group(3)

    # Collect options per slot: list of (file_entry, status)
    slot_options: dict[tuple[str, str], list[tuple[dict, str]]] = {}
    for m in option_re.finditer(source):
        var_ref = m.group(3)
        inline_sha1 = m.group(4)
        status = m.group(8) or "Acceptable"
        if var_ref:
            key = var_to_hash.get(var_ref)
            if not key or key not in files_by_hash:
                # Best effort: an option whose variable was never matched
                # as a File() definition is ignored.
                continue
            file_entry = files_by_hash[key]
        elif inline_sha1:
            file_entry = {
                "sha1": inline_sha1,
                "size": _safe_arithmetic(m.group(5)),
                "name": m.group(6),
                "description": m.group(7),
                "is_bad": False,
            }
        else:
            continue
        slot_key = (m.group(1), m.group(2))
        slot_options.setdefault(slot_key, []).append((file_entry, status))

    # Build records from FirmwareAndOption one-liners
    records: list[dict] = []
    for m in fao_re.finditer(source):
        records.append({
            "system": m.group(3),
            "firmware_id": m.group(4),
            "sha1": m.group(1),
            "name": m.group(5),
            "size": _safe_arithmetic(m.group(2)),
            "description": m.group(6),
            "status": m.group(7) or "Acceptable",
        })

    # Build records from Firmware+Option pairs, picking best option
    for (system, fw_id), options in slot_options.items():
        slot_desc = firmware_slots.get((system, fw_id), "")
        # Prefer files not flagged bad; fall back to all options only when
        # nothing else is available.
        viable = [(f, s) for f, s in options if not f.get("is_bad")]
        if not viable:
            viable = options
        # max() returns the first maximal element, so ties keep
        # declaration order. Unlisted statuses rank like "Unknown".
        best_file, best_status = max(
            viable, key=lambda opt: STATUS_RANK.get(opt[1], 2)
        )
        records.append({
            "system": system,
            "firmware_id": fw_id,
            "sha1": best_file["sha1"],
            "name": best_file["name"],
            "size": best_file["size"],
            # File entries always carry a "description" key, so a
            # dict.get() default would never apply; fall back to the slot
            # description when the file's own description is empty.
            "description": best_file.get("description") or slot_desc,
            "status": best_status,
        })
    return records, files_by_hash
class Scraper(BaseScraper):
    """Scraper that turns BizHawk's FirmwareDatabase.cs into BIOS requirements."""

    def __init__(self):
        super().__init__(url=SOURCE_URL)

    def validate_format(self, raw_data: str) -> bool:
        """Return True when *raw_data* contains both expected marker names."""
        markers = ("FirmwareDatabase", "FirmwareAndOption")
        return all(marker in raw_data for marker in markers)

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Fetch the firmware database and convert it to requirements.

        Raises:
            ValueError: If the fetched text fails ``validate_format``.
        """
        raw = self._fetch_raw()
        if not self.validate_format(raw):
            raise ValueError("unexpected FirmwareDatabase.cs format")

        records = parse_firmware_database(raw)[0]
        return [
            BiosRequirement(
                name=rec["name"],
                # Unmapped BizHawk ids fall back to their lower-cased form.
                system=SYSTEM_ID_MAP.get(rec["system"], rec["system"].lower()),
                sha1=rec["sha1"],
                # A falsy size (e.g. 0) is reported as "no size".
                size=rec["size"] or None,
                required=rec.get("status") != "Bad",
            )
            for rec in records
        ]

    def generate_platform_yaml(self) -> dict:
        """Generate a platform YAML config dict from scraped data."""
        systems: dict[str, dict] = {}
        for req in self.fetch_requirements():
            file_entry: dict = {
                "name": req.name,
                "destination": req.name,
                "required": req.required,
            }
            # sha1 and size are only emitted when they are truthy.
            if req.sha1:
                file_entry["sha1"] = req.sha1
            if req.size:
                file_entry["size"] = req.size
            bucket = systems.setdefault(req.system, {"files": []})
            bucket["files"].append(file_entry)

        version = fetch_github_latest_tag(GITHUB_REPO) or ""
        return {
            "platform": "BizHawk",
            "version": version,
            "homepage": "https://tasvideos.org/BizHawk",
            "source": SOURCE_URL,
            "base_destination": "Firmware",
            "hash_type": "sha1",
            "verification_mode": "sha1",
            "cores": BIZHAWK_CORES,
            "systems": systems,
        }
def main():
    """Script entry point: hand the Scraper class and a short description
    string to scraper_cli()."""
    scraper_cli(Scraper, "Scrape BizHawk BIOS requirements")
# Only run the CLI when this module is executed directly, not on import.
if __name__ == "__main__":
    main()

View File

@@ -2113,6 +2113,100 @@ class TestE2E(unittest.TestCase):
# --platform + --system is a valid combination
self.assertTrue(has_platform and has_system)
# ── BizHawk scraper tests ──────────────────────────────────────
def test_150_bizhawk_scraper_parse_firmware_and_option(self):
    """Each FirmwareAndOption() one-liner yields one record."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
FirmwareAndOption("DBEBD76A448447CB6E524AC3CB0FD19FC065D944", 256, "32X", "G", "32X_G_BIOS.BIN", "32x 68k BIOS");
FirmwareAndOption("1E5B0B2441A4979B6966D942B20CC76C413B8C5E", 2048, "32X", "M", "32X_M_BIOS.BIN", "32x SH2 MASTER BIOS");
'''
    parsed, _files = parse_firmware_database(cs_source)
    self.assertEqual(len(parsed), 2)
    # Field-by-field check of the first declared entry.
    first = parsed[0]
    self.assertEqual(first["system"], "32X")
    self.assertEqual(first["firmware_id"], "G")
    self.assertEqual(first["sha1"], "DBEBD76A448447CB6E524AC3CB0FD19FC065D944")
    self.assertEqual(first["name"], "32X_G_BIOS.BIN")
    self.assertEqual(first["size"], 256)
def test_151_bizhawk_scraper_parse_variable_refs(self):
    """An Option() that names a ``var x = File(...)`` resolves to that file."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
var gbaNormal = File("300C20DF6731A33952DED8C436F7F186D25D3492", 16384, "GBA_bios.rom", "Bios (World)");
Firmware("GBA", "Bios", "Bios");
Option("GBA", "Bios", in gbaNormal, FirmwareOptionStatus.Ideal);
'''
    parsed, _files = parse_firmware_database(cs_source)
    self.assertEqual(len(parsed), 1)
    only = parsed[0]
    self.assertEqual(only["system"], "GBA")
    self.assertEqual(only["sha1"], "300C20DF6731A33952DED8C436F7F186D25D3492")
    self.assertEqual(only["name"], "GBA_bios.rom")
    self.assertEqual(only["status"], "Ideal")
def test_152_bizhawk_scraper_skips_comments(self):
    """Declarations inside a /* ... */ block (PS2 here) produce no records."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
FirmwareAndOption("DBEBD76A448447CB6E524AC3CB0FD19FC065D944", 256, "32X", "G", "32X_G_BIOS.BIN", "32x 68k BIOS");
/*
Firmware("PS2", "BIOS", "PS2 Bios");
Option("PS2", "BIOS", File("FBD54BFC020AF34008B317DCB80B812DD29B3759", 4194304, "ps2.bin", "PS2 Bios"));
*/
'''
    parsed, _files = parse_firmware_database(cs_source)
    seen_systems = {rec["system"] for rec in parsed}
    self.assertNotIn("PS2", seen_systems)
    # Only the live 32X entry remains.
    self.assertEqual(len(parsed), 1)
def test_153_bizhawk_scraper_arithmetic_size(self):
    """A size written as ``4 * 1024 * 1024`` is evaluated to an integer."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
FirmwareAndOption("BF861922DCB78C316360E3E742F4F70FF63C9BC3", 4 * 1024 * 1024, "N64DD", "IPL_JPN", "64DD_IPL.bin", "N64DD JPN IPL");
'''
    parsed = parse_firmware_database(cs_source)[0]
    self.assertEqual(parsed[0]["size"], 4194304)
def test_154_bizhawk_scraper_dummy_hash(self):
    """SHA1Checksum.Dummy entries yield a record whose sha1 is None."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
FirmwareAndOption(SHA1Checksum.Dummy, 0, "3DS", "aes_keys", "aes_keys.txt", "AES Keys");
'''
    parsed = parse_firmware_database(cs_source)[0]
    self.assertEqual(len(parsed), 1)
    self.assertIsNone(parsed[0]["sha1"])
def test_155_bizhawk_scraper_multi_option_picks_ideal(self):
    """With several options on one slot, the Ideal one becomes the record."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
var ss_100_j = File("2B8CB4F87580683EB4D760E4ED210813D667F0A2", 524288, "SAT_1.00-(J).bin", "Bios v1.00 (J)");
var ss_101_j = File("DF94C5B4D47EB3CC404D88B33A8FDA237EAF4720", 524288, "SAT_1.01-(J).bin", "Bios v1.01 (J)");
Firmware("SAT", "J", "Bios (J)");
Option("SAT", "J", in ss_100_j);
Option("SAT", "J", in ss_101_j, FirmwareOptionStatus.Ideal);
'''
    parsed = parse_firmware_database(cs_source)[0]
    # Two options, one slot: exactly one canonical record.
    self.assertEqual(len(parsed), 1)
    chosen = parsed[0]
    self.assertEqual(chosen["sha1"], "DF94C5B4D47EB3CC404D88B33A8FDA237EAF4720")
    self.assertEqual(chosen["name"], "SAT_1.01-(J).bin")
def test_156_bizhawk_scraper_is_bad_excluded(self):
    """A file declared with ``isBad: true`` is not picked as canonical."""
    from scraper.bizhawk_scraper import parse_firmware_database

    cs_source = '''
var good = File("AAAA", 100, "good.bin", "Good");
var bad = File("BBBB", 100, "bad.bin", "Bad", isBad: true);
Firmware("TEST", "X", "Test");
Option("TEST", "X", in bad);
Option("TEST", "X", in good, FirmwareOptionStatus.Ideal);
'''
    parsed = parse_firmware_database(cs_source)[0]
    chosen = parsed[0]
    self.assertEqual(chosen["name"], "good.bin")
# Run the unittest runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()