diff --git a/scripts/scraper/bizhawk_scraper.py b/scripts/scraper/bizhawk_scraper.py new file mode 100644 index 00000000..1c866647 --- /dev/null +++ b/scripts/scraper/bizhawk_scraper.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +"""Scraper for BizHawk BIOS requirements. + +Source: https://github.com/TASEmulators/BizHawk +Format: C# source (FirmwareDatabase.cs) +Hash: SHA1 primary + +BizHawk declares firmware in FirmwareDatabase.cs using four patterns: + File(sha1, size, name, desc, isBad?) - file definition + Firmware(system, id, desc) - firmware slot declaration + Option(system, id, in fileref, status?) - binds file to slot + FirmwareAndOption(sha1, size, sys, id, ...) - combined one-liner + +Variable assignments (var x = File(...)) let Option() reference files +by name. Multiple options per firmware slot are ranked by status; +the Ideal non-bad option is selected as canonical. +""" + +from __future__ import annotations + +import re +import sys + +try: + from .base_scraper import ( + BaseScraper, + BiosRequirement, + fetch_github_latest_tag, + scraper_cli, + ) +except ImportError: + from base_scraper import ( + BaseScraper, + BiosRequirement, + fetch_github_latest_tag, + scraper_cli, + ) + +PLATFORM_NAME = "bizhawk" + +SOURCE_URL = ( + "https://raw.githubusercontent.com/TASEmulators/BizHawk/" + "master/src/BizHawk.Emulation.Common/Database/FirmwareDatabase.cs" +) + +GITHUB_REPO = "TASEmulators/BizHawk" + +STATUS_RANK = { + "Bad": 0, + "Unacceptable": 1, + "Unknown": 2, + "Acceptable": 3, + "Ideal": 4, +} + +GAME_DATA_SYSTEMS = {"BSX", "Doom"} +GAME_DATA_FILES = {"VEC_Minestorm.vec"} + +SYSTEM_ID_MAP: dict[str, str] = { + "32X": "sega-32x", + "3DO": "3do", + "3DS": "nintendo-3ds", + "A26": "atari-2600", + "A78": "atari-7800", + "Amiga": "commodore-amiga", + "AmstradCPC": "amstrad-cpc", + "AppleII": "apple-ii", + "BSX": "nintendo-bsx", + "C64": "commodore-c64", + "ChannelF": "fairchild-channel-f", + "Coleco": "coleco-colecovision", + "Doom": "doom", + "DS": "nintendo-ds", + "FDS": "nintendo-fds", + "G7400": "philips-videopac-plus", + "GB": "nintendo-gb", + "GBA": "nintendo-gba", + "GBC": "nintendo-gbc", + "GEN": "sega-mega-drive", + "GG": "sega-game-gear", + "GGL": "sega-game-gear", + "INTV": "mattel-intellivision", + "Jaguar": "atari-jaguar", + "Lynx": "atari-lynx", + "MAME": "arcade", + "MSX": "microsoft-msx", + "N64": "nintendo-64", + "N64DD": "nintendo-64dd", + "NDS": "nintendo-ds", + "NES": "nintendo-nes", + "NGP": "snk-neo-geo-pocket", + "O2": "philips-videopac", + "PCECD": "nec-pc-engine-cd", + "PCFX": "nec-pc-fx", + "PS2": "sony-playstation-2", + "PSX": "sony-playstation", + "SAT": "sega-saturn", + "SGB": "nintendo-super-game-boy", + "SGX": "nec-supergrafx", + "SMS": "sega-master-system", + "SNES": "nintendo-snes", + "TI83": "texas-instruments-ti-83", + "UZE": "uzebox", + "VEC": "gce-vectrex", + "WSWAN": "bandai-wonderswan", + "ZXSpectrum": "sinclair-zx-spectrum", +} + +# Cores that overlap with BizHawk's system coverage +BIZHAWK_CORES = [ + "gambatte", "mgba", "sameboy", "melonds", "snes9x", "bsnes", + "beetle_psx", "beetle_saturn", "beetle_pce", "beetle_pcfx", + "beetle_wswan", "beetle_vb", "beetle_ngp", "opera", "stella", + "picodrive", "ppsspp", "handy", "quicknes", "genesis_plus_gx", + "ares", "mupen64plus_next", "puae", "prboom", "virtualjaguar", + "vice_x64", "mame", +] + + +def _safe_arithmetic(expr: str) -> int: + """Compute simple integer arithmetic (+ and *) without code execution. + + Handles: plain integers, multiplication chains (4 * 1024 * 1024), + addition of products (128 + 64 * 1024). + """ + expr = expr.strip() + total = 0 + for addend in expr.split("+"): + factors = addend.strip().split("*") + product = 1 + for f in factors: + product *= int(f.strip()) + total += product + return total + + +def _strip_comments(source: str) -> str: + """Remove block comments and #if false blocks.""" + source = re.sub(r"/\*.*?\*/", "", source, flags=re.DOTALL) + source = re.sub( + r"#if\s+false\b.*?#endif", "", source, flags=re.DOTALL + ) + return source + + +def parse_firmware_database( + source: str, +) -> tuple[list[dict], dict[str, dict]]: + """Parse BizHawk FirmwareDatabase.cs source into firmware records. + + Returns (records, files_by_hash) where each record is a dict with keys: + system, firmware_id, sha1, name, size, description, status + """ + source = _strip_comments(source) + + # ── Pass 1: collect File() definitions ──────────────────────── + files_by_hash: dict[str, dict] = {} + var_to_hash: dict[str, str] = {} + + file_re = re.compile( + r'(?:var\s+(\w+)\s*=\s*)?' + r'File\(\s*' + r'(?:"([A-Fa-f0-9]+)"|SHA1Checksum\.Dummy)\s*,\s*' + r'([^,]+?)\s*,\s*' + r'"([^"]+)"\s*,\s*' + r'"([^"]*)"' + r'(?:\s*,\s*isBad:\s*(true|false))?' + r'\s*\)' + ) + + for m in file_re.finditer(source): + var_name = m.group(1) + sha1 = m.group(2) # None for SHA1Checksum.Dummy + size_expr = m.group(3) + name = m.group(4) + desc = m.group(5) + is_bad = m.group(6) == "true" + + size = _safe_arithmetic(size_expr) + file_entry = { + "sha1": sha1, + "size": size, + "name": name, + "description": desc, + "is_bad": is_bad, + } + + key = sha1 if sha1 else f"dummy_{name}" + files_by_hash[key] = file_entry + if var_name: + var_to_hash[var_name] = key + + # ── Pass 2: collect firmware slots and options ──────────────── + + # FirmwareAndOption one-liner + fao_re = re.compile( + r'FirmwareAndOption\(\s*' + r'(?:"([A-Fa-f0-9]+)"|SHA1Checksum\.Dummy)\s*,\s*' + r'([^,]+?)\s*,\s*' + r'"([^"]+)"\s*,\s*' + r'"([^"]+)"\s*,\s*' + r'"([^"]+)"\s*,\s*' + r'"([^"]*)"' + r'(?:\s*,\s*FirmwareOptionStatus\.(\w+))?' + r'\s*\)' + ) + + # Firmware(system, id, desc) + firmware_re = re.compile( + r'Firmware\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]*)"\s*\)' + ) + + # Option(system, id, in varref|File(...), status?) + option_re = re.compile( + r'Option\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*' + r'(?:in\s+(\w+)' + r'|File\(\s*"([A-Fa-f0-9]+)"\s*,\s*([^,]+?)\s*,\s*"([^"]+)"\s*,\s*"([^"]*)"\s*\))' + r'(?:\s*,\s*FirmwareOptionStatus\.(\w+))?' + r'\s*\)' + ) + + # Collect firmware slots + firmware_slots: dict[tuple[str, str], str] = {} + for m in firmware_re.finditer(source): + system, fw_id, desc = m.group(1), m.group(2), m.group(3) + firmware_slots[(system, fw_id)] = desc + + # Collect options per slot: list of (file_entry, status) + slot_options: dict[tuple[str, str], list[tuple[dict, str]]] = {} + + for m in option_re.finditer(source): + system, fw_id = m.group(1), m.group(2) + var_ref = m.group(3) + inline_sha1 = m.group(4) + status = m.group(8) or "Acceptable" + + if var_ref: + key = var_to_hash.get(var_ref) + if key and key in files_by_hash: + file_entry = files_by_hash[key] + else: + continue + elif inline_sha1: + size_expr = m.group(5) + name = m.group(6) + desc = m.group(7) + file_entry = { + "sha1": inline_sha1, + "size": _safe_arithmetic(size_expr), + "name": name, + "description": desc, + "is_bad": False, + } + else: + continue + + slot_key = (system, fw_id) + slot_options.setdefault(slot_key, []).append((file_entry, status)) + + # Build records from FirmwareAndOption one-liners + records: list[dict] = [] + + for m in fao_re.finditer(source): + sha1 = m.group(1) + size_expr = m.group(2) + system = m.group(3) + fw_id = m.group(4) + name = m.group(5) + desc = m.group(6) + status = m.group(7) or "Acceptable" + + records.append({ + "system": system, + "firmware_id": fw_id, + "sha1": sha1, + "name": name, + "size": _safe_arithmetic(size_expr), + "description": desc, + "status": status, + }) + + # Build records from Firmware+Option pairs, picking best option + for (system, fw_id), options in slot_options.items(): + desc = firmware_slots.get((system, fw_id), "") + + # Filter out bad files, then pick highest-ranked status + viable = [(f, s) for f, s in options if not f.get("is_bad")] + if not viable: + viable = options + + viable.sort(key=lambda x: STATUS_RANK.get(x[1], 2), reverse=True) + best_file, best_status = viable[0] + + records.append({ + "system": system, + "firmware_id": fw_id, + "sha1": best_file["sha1"], + "name": best_file["name"], + "size": best_file["size"], + "description": best_file.get("description", desc), + "status": best_status, + }) + + return records, files_by_hash + + +class Scraper(BaseScraper): + """BizHawk firmware database scraper.""" + + def __init__(self): + super().__init__(url=SOURCE_URL) + + def validate_format(self, raw_data: str) -> bool: + return "FirmwareDatabase" in raw_data and "FirmwareAndOption" in raw_data + + def fetch_requirements(self) -> list[BiosRequirement]: + raw = self._fetch_raw() + if not self.validate_format(raw): + raise ValueError("unexpected FirmwareDatabase.cs format") + + records, _ = parse_firmware_database(raw) + requirements: list[BiosRequirement] = [] + + for rec in records: + system_id = SYSTEM_ID_MAP.get(rec["system"], rec["system"].lower()) + + req = BiosRequirement( + name=rec["name"], + system=system_id, + sha1=rec["sha1"], + size=rec["size"] if rec["size"] else None, + required=rec.get("status") != "Bad", + ) + requirements.append(req) + + return requirements + + def generate_platform_yaml(self) -> dict: + """Generate a platform YAML config dict from scraped data.""" + requirements = self.fetch_requirements() + + systems: dict[str, dict] = {} + for req in requirements: + if req.system not in systems: + systems[req.system] = {"files": []} + + entry: dict = { + "name": req.name, + "destination": req.name, + "required": req.required, + } + if req.sha1: + entry["sha1"] = req.sha1 + if req.size: + entry["size"] = req.size + + systems[req.system]["files"].append(entry) + + version = fetch_github_latest_tag(GITHUB_REPO) or "" + + return { + "platform": "BizHawk", + "version": version, + "homepage": "https://tasvideos.org/BizHawk", + "source": SOURCE_URL, + "base_destination": "Firmware", + "hash_type": "sha1", + "verification_mode": "sha1", + "cores": BIZHAWK_CORES, + "systems": systems, + } + + +def main(): + scraper_cli(Scraper, "Scrape BizHawk BIOS requirements") + + +if __name__ == "__main__": + main() diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 86bdfdcb..40e5fad1 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -2113,6 +2113,100 @@ class TestE2E(unittest.TestCase): # --platform + --system is a valid combination self.assertTrue(has_platform and has_system) + # ── BizHawk scraper tests ────────────────────────────────────── + + def test_150_bizhawk_scraper_parse_firmware_and_option(self): + """Parse FirmwareAndOption() one-liner pattern.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + FirmwareAndOption("DBEBD76A448447CB6E524AC3CB0FD19FC065D944", 256, "32X", "G", "32X_G_BIOS.BIN", "32x 68k BIOS"); + FirmwareAndOption("1E5B0B2441A4979B6966D942B20CC76C413B8C5E", 2048, "32X", "M", "32X_M_BIOS.BIN", "32x SH2 MASTER BIOS"); + ''' + records, files = parse_firmware_database(fragment) + self.assertEqual(len(records), 2) + self.assertEqual(records[0]["system"], "32X") + self.assertEqual(records[0]["firmware_id"], "G") + self.assertEqual(records[0]["sha1"], "DBEBD76A448447CB6E524AC3CB0FD19FC065D944") + self.assertEqual(records[0]["name"], "32X_G_BIOS.BIN") + self.assertEqual(records[0]["size"], 256) + + def test_151_bizhawk_scraper_parse_variable_refs(self): + """Parse var = File() + Firmware() + Option() pattern.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + var gbaNormal = File("300C20DF6731A33952DED8C436F7F186D25D3492", 16384, "GBA_bios.rom", "Bios (World)"); + Firmware("GBA", "Bios", "Bios"); + Option("GBA", "Bios", in gbaNormal, FirmwareOptionStatus.Ideal); + ''' + records, files = parse_firmware_database(fragment) + self.assertEqual(len(records), 1) + self.assertEqual(records[0]["system"], "GBA") + self.assertEqual(records[0]["sha1"], "300C20DF6731A33952DED8C436F7F186D25D3492") + self.assertEqual(records[0]["name"], "GBA_bios.rom") + self.assertEqual(records[0]["status"], "Ideal") + + def test_152_bizhawk_scraper_skips_comments(self): + """Commented-out blocks (PS2) are skipped.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + FirmwareAndOption("DBEBD76A448447CB6E524AC3CB0FD19FC065D944", 256, "32X", "G", "32X_G_BIOS.BIN", "32x 68k BIOS"); + /* + Firmware("PS2", "BIOS", "PS2 Bios"); + Option("PS2", "BIOS", File("FBD54BFC020AF34008B317DCB80B812DD29B3759", 4194304, "ps2.bin", "PS2 Bios")); + */ + ''' + records, files = parse_firmware_database(fragment) + systems = {r["system"] for r in records} + self.assertNotIn("PS2", systems) + self.assertEqual(len(records), 1) + + def test_153_bizhawk_scraper_arithmetic_size(self): + """Size expressions like 4 * 1024 * 1024 are evaluated.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + FirmwareAndOption("BF861922DCB78C316360E3E742F4F70FF63C9BC3", 4 * 1024 * 1024, "N64DD", "IPL_JPN", "64DD_IPL.bin", "N64DD JPN IPL"); + ''' + records, _ = parse_firmware_database(fragment) + self.assertEqual(records[0]["size"], 4194304) + + def test_154_bizhawk_scraper_dummy_hash(self): + """SHA1Checksum.Dummy entries get no sha1 field.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + FirmwareAndOption(SHA1Checksum.Dummy, 0, "3DS", "aes_keys", "aes_keys.txt", "AES Keys"); + ''' + records, _ = parse_firmware_database(fragment) + self.assertEqual(len(records), 1) + self.assertIsNone(records[0]["sha1"]) + + def test_155_bizhawk_scraper_multi_option_picks_ideal(self): + """When multiple options exist, Ideal is selected as canonical.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + var ss_100_j = File("2B8CB4F87580683EB4D760E4ED210813D667F0A2", 524288, "SAT_1.00-(J).bin", "Bios v1.00 (J)"); + var ss_101_j = File("DF94C5B4D47EB3CC404D88B33A8FDA237EAF4720", 524288, "SAT_1.01-(J).bin", "Bios v1.01 (J)"); + Firmware("SAT", "J", "Bios (J)"); + Option("SAT", "J", in ss_100_j); + Option("SAT", "J", in ss_101_j, FirmwareOptionStatus.Ideal); + ''' + records, _ = parse_firmware_database(fragment) + self.assertEqual(len(records), 1) + self.assertEqual(records[0]["sha1"], "DF94C5B4D47EB3CC404D88B33A8FDA237EAF4720") + self.assertEqual(records[0]["name"], "SAT_1.01-(J).bin") + + def test_156_bizhawk_scraper_is_bad_excluded(self): + """Files with isBad: true are not selected as canonical.""" + from scraper.bizhawk_scraper import parse_firmware_database + fragment = ''' + var good = File("AAAA", 100, "good.bin", "Good"); + var bad = File("BBBB", 100, "bad.bin", "Bad", isBad: true); + Firmware("TEST", "X", "Test"); + Option("TEST", "X", in bad); + Option("TEST", "X", in good, FirmwareOptionStatus.Ideal); + ''' + records, _ = parse_firmware_database(fragment) + self.assertEqual(records[0]["name"], "good.bin") + if __name__ == "__main__": unittest.main()