# File: scripts/scraper/mame_parser.py
"""Parser for MAME C source files.

Extracts BIOS root sets and ROM definitions from MAME driver sources.
Handles GAME/GAMEL/SYST/COMP/CONS macros with the MACHINE_IS_BIOS_ROOT flag,
ROM_START/ROM_END blocks, ROM_LOAD variants, ROM_REGION variants,
ROM_SYSTEM_BIOS, NO_DUMP filtering, and BAD_DUMP flagging.

C/C++ comments are blanked out before any matching, so commented-out
macros are ignored.  ROM statements are read call-by-call (from the macro
name to its matching closing paren) rather than line-by-line, so a
statement wrapped over several lines is still parsed as one unit.
"""

from __future__ import annotations

import os
import re
from pathlib import Path

__all__ = [
    'find_bios_root_sets',
    'parse_mame_source_tree',
    'parse_rom_block',
]

# Flag that marks a machine entry as a BIOS root set.
_BIOS_ROOT_FLAG = 'MACHINE_IS_BIOS_ROOT'

# File suffixes scanned by parse_mame_source_tree.
_SOURCE_SUFFIXES = ('.cpp', '.c', '.h', '.hxx')

# Macros that declare a machine entry (GAMEL is GAME plus a layout argument).
_MACHINE_MACROS = re.compile(r'\b(GAMEL?|SYST|COMP|CONS)\s*\(')

# ROM block boundaries
_ROM_START = re.compile(r'\bROM_START\s*\(\s*(\w+)\s*\)')
_ROM_END = re.compile(r'\bROM_END\b')

# Start of any statement inside a ROM block that the parser acts on.
_ROM_STATEMENT = re.compile(
    r'\b(ROM_REGION\w*|ROM_SYSTEM_BIOS|ROMX?_LOAD\w*)\s*\(',
)

# ROM_REGION( size, tag, flags ) and its width/endianness variants
# (ROM_REGION16_BE, ROM_REGION32_LE, ...).
_ROM_REGION = re.compile(
    r'\bROM_REGION\w*\s*\('
    r'\s*([^,]+?)\s*,'          # size
    r'\s*"([^"]+)"\s*,',        # tag
)

# ROM_SYSTEM_BIOS( index, label, description )
_ROM_SYSTEM_BIOS = re.compile(
    r'\bROM_SYSTEM_BIOS\s*\('
    r'\s*(\d+)\s*,'                     # index
    r'\s*"([^"]+)"\s*,'                 # label
    r'\s*"((?:\\.|[^"\\])+)"\s*\)',     # description (may hold \" escapes)
)

# All ROM_LOAD variants that take (name, offset, size, hash...):
# ROM_LOAD, ROMX_LOAD, ROM_LOAD16_BYTE, ROM_LOAD32_DWORD, ROM_LOAD_OPTIONAL...
# Variants whose first argument is not a quoted name simply do not match.
_ROM_LOAD = re.compile(
    r'\b(ROMX?_LOAD\w*)\s*\('
    r'\s*"([^"]+)"\s*,'                     # name
    r'\s*(0[xX][\da-fA-F]+|\d+)\s*,'        # offset
    r'\s*(0[xX][\da-fA-F]+|\d+)\s*,',       # size
)

# CRC32 and SHA1 are matched independently so that either may be absent
# and no particular spacing between them is required.
_CRC = re.compile(r'\bCRC\s*\(\s*([0-9a-fA-F]+)\s*\)')
_SHA1 = re.compile(r'\bSHA1\s*\(\s*([0-9a-fA-F]+)\s*\)')

_NO_DUMP = re.compile(r'\bNO_DUMP\b')
_BAD_DUMP = re.compile(r'\bBAD_DUMP\b')
_ROM_BIOS = re.compile(r'\bROM_BIOS\s*\(\s*(\d+)\s*\)')

# Comments and literals are matched together so that comment markers
# inside a string/char literal are not mistaken for a comment.
_COMMENT_OR_LITERAL = re.compile(
    r'//[^\n]*'                     # line comment
    r'|/\*.*?\*/'                   # block comment
    r'|"(?:\\.|[^"\\\n])*"'         # string literal
    r"|'(?:\\.|[^'\\\n])*'",        # character literal
    re.DOTALL,
)
_NON_NEWLINE = re.compile(r'[^\n]')

# Tokens that matter when balancing parens / splitting arguments; a whole
# string literal is one token, so parens and commas inside it are skipped.
_PAREN_TOKEN = re.compile(r'"(?:\\.|[^"\\\n])*"|[()]')
_ARG_TOKEN = re.compile(r'"(?:\\.|[^"\\\n])*"|[(),]')


def find_bios_root_sets(source: str, filename: str) -> dict[str, dict]:
    """Find machine entries flagged as BIOS root sets.

    Scans for GAME/GAMEL/SYST/COMP/CONS macro calls whose argument list
    contains MACHINE_IS_BIOS_ROOT.  Macro calls inside C/C++ comments are
    ignored.

    Args:
        source: Full text of one source file.
        filename: Value stored as ``source_file`` for every hit.

    Returns:
        Mapping of set name to ``{'source_file': filename,
        'source_line': <1-based line of the macro>}``.
    """
    results: dict[str, dict] = {}

    # Cheap rejection: most files never mention the flag at all.
    if _BIOS_ROOT_FLAG not in source:
        return results

    # Blanking keeps every offset and newline, so line numbers stay valid.
    code = _blank_comments(source)

    for match in _MACHINE_MACROS.finditer(code):
        open_idx = match.end() - 1  # position of the opening paren
        close_idx = _find_closing_paren(code, open_idx)
        if close_idx == -1:
            continue

        call = code[open_idx:close_idx + 1]
        if _BIOS_ROOT_FLAG not in call:
            continue

        # GAME(YEAR, NAME, PARENT, MACHINE, INPUT, CLASS, INIT, MONITOR,
        #      COMPANY, FULLNAME, FLAGS)
        # CONS/COMP/SYST(YEAR, NAME, PARENT, COMPAT, MACHINE, INPUT, CLASS,
        #      INIT, COMPANY, FULLNAME, FLAGS)
        # In every form the set name is the second argument (index 1).
        args = _split_macro_args(call[1:])
        if len(args) < 2 or not args[1]:
            continue

        results[args[1]] = {
            'source_file': filename,
            'source_line': code.count('\n', 0, match.start()) + 1,
        }

    return results


def parse_rom_block(source: str, set_name: str) -> list[dict]:
    """Parse ROM definitions for a given set name.

    Finds the first ROM_START(set_name)...ROM_END block outside comments
    and extracts its ROM_LOAD entries.  NO_DUMP entries are skipped and
    BAD_DUMP entries are flagged.

    Args:
        source: Full text of one source file.
        set_name: Set whose ROM block should be parsed.

    Returns:
        List of ROM entry dicts (see ``_parse_rom_entries``); empty when
        the block is missing or has no terminating ROM_END.
    """
    code = _blank_comments(source)

    for start_match in _ROM_START.finditer(code):
        if start_match.group(1) != set_name:
            continue
        end_match = _ROM_END.search(code, start_match.end())
        if end_match is None:
            return []
        return _parse_rom_entries(code[start_match.end():end_match.start()])

    return []


def parse_mame_source_tree(base_path: str) -> dict[str, dict]:
    """Walk MAME source tree and extract all BIOS root sets with ROMs.

    Scans ``src/mame/`` and ``src/devices/`` under ``base_path`` for files
    ending in .cpp, .c, .h or .hxx.  Directories and files are visited in
    sorted order so the result (including which file wins when a set name
    occurs twice) does not depend on filesystem enumeration order.

    Args:
        base_path: Root of the MAME checkout.

    Returns:
        Mapping of set name to ``{'source_file', 'source_line', 'roms'}``,
        where ``source_file`` is relative to ``base_path`` and always uses
        forward slashes.
    """
    results: dict[str, dict] = {}
    root = Path(base_path)

    for search_dir in (root / 'src' / 'mame', root / 'src' / 'devices'):
        if not search_dir.is_dir():
            continue
        for dirpath, dirnames, filenames in os.walk(search_dir):
            dirnames.sort()  # in-place: fixes os.walk's descent order
            for fname in sorted(filenames):
                if not fname.endswith(_SOURCE_SUFFIXES):
                    continue
                filepath = Path(dirpath) / fname
                rel_path = filepath.relative_to(root).as_posix()
                content = filepath.read_text(encoding='utf-8', errors='replace')

                bios_sets = find_bios_root_sets(content, rel_path)
                for set_name, info in bios_sets.items():
                    results[set_name] = {
                        'source_file': info['source_file'],
                        'source_line': info['source_line'],
                        'roms': parse_rom_block(content, set_name),
                    }

    return results


def _blank_comments(source: str) -> str:
    """Replace C/C++ comments with spaces, keeping length and newlines.

    String and character literals are left untouched.  Because every
    character position and line break is preserved, offsets and line
    numbers computed on the result are valid for the original text.
    """
    if '/' not in source:
        return source

    def _blank(match: re.Match) -> str:
        text = match.group(0)
        if text[0] != '/':  # a literal, not a comment
            return text
        return _NON_NEWLINE.sub(' ', text)

    return _COMMENT_OR_LITERAL.sub(_blank, source)


def _find_closing_paren(source: str, start: int) -> int:
    """Find the matching closing paren for source[start] which must be '('.

    Parens inside double-quoted strings (including strings containing
    escaped quotes) are ignored.  Returns the index of the closing paren,
    or -1 when the parens never balance.
    """
    depth = 0
    for token in _PAREN_TOKEN.finditer(source, start):
        text = token.group(0)
        if text == '(':
            depth += 1
        elif text == ')':
            depth -= 1
            if depth == 0:
                return token.start()
    return -1


def _split_macro_args(inner: str) -> list[str]:
    """Split macro arguments respecting nested parens and strings.

    ``inner`` is the text just after a macro's opening paren.  Splitting
    stops at the paren that closes the macro call (if present); each
    argument is returned once, stripped of surrounding whitespace.
    """
    args: list[str] = []
    depth = 0
    arg_start = 0

    for token in _ARG_TOKEN.finditer(inner):
        text = token.group(0)
        if text == '(':
            depth += 1
        elif text == ')':
            if depth == 0:
                # Closing paren of the macro itself: emit the last argument
                # exactly once and stop.
                tail = inner[arg_start:token.start()].strip()
                if tail or args:
                    args.append(tail)
                return args
            depth -= 1
        elif text == ',' and depth == 0:
            args.append(inner[arg_start:token.start()].strip())
            arg_start = token.end()

    # No closing paren seen: whatever is left is the final argument.
    tail = inner[arg_start:].strip()
    if tail:
        args.append(tail)
    return args


def _call_end(code: str, open_idx: int) -> int:
    """Return the exclusive end offset of the macro call opened at open_idx.

    Falls back to the end of the current line when the parens never
    balance, so one malformed statement cannot swallow the rest of the block.
    """
    close_idx = _find_closing_paren(code, open_idx)
    if close_idx != -1:
        return close_idx + 1
    line_end = code.find('\n', open_idx)
    return len(code) if line_end == -1 else line_end


def _parse_rom_load(call: str, region: str) -> dict | None:
    """Build a ROM entry from one ROM_LOAD-style call, or None to skip it.

    Returns None when the call does not have the (name, offset, size, ...)
    shape or is marked NO_DUMP.
    """
    load_match = _ROM_LOAD.match(call)
    if load_match is None or _NO_DUMP.search(call):
        return None

    crc_match = _CRC.search(call)
    sha1_match = _SHA1.search(call)

    entry: dict = {
        'name': load_match.group(2),
        'size': _parse_int(load_match.group(4)),
        'crc32': crc_match.group(1).lower() if crc_match else '',
        'sha1': sha1_match.group(1).lower() if sha1_match else '',
        'region': region,
        'bad_dump': _BAD_DUMP.search(call) is not None,
    }

    bios_ref = _ROM_BIOS.search(call)
    if bios_ref:
        entry['bios_index'] = int(bios_ref.group(1))

    return entry


def _parse_rom_entries(block: str) -> list[dict]:
    """Parse ROM entries from a ROM block (content between ROM_START and ROM_END).

    Each entry has ``name``, ``size``, ``crc32``, ``sha1`` (lowercase hex,
    '' when absent), ``region`` (tag of the most recent ROM_REGION, '' if
    none) and ``bad_dump``.  Entries carrying ROM_BIOS(n) additionally get
    ``bios_index``, ``bios_label`` and ``bios_description``; the label and
    description are '' when no ROM_SYSTEM_BIOS with that index exists.
    """
    code = _blank_comments(block)  # idempotent if the caller already did it
    roms: list[dict] = []
    current_region = ''
    bios_labels: dict[int, tuple[str, str]] = {}  # index -> (label, description)

    pos = 0
    while True:
        statement = _ROM_STATEMENT.search(code, pos)
        if statement is None:
            break
        pos = _call_end(code, statement.end() - 1)
        call = code[statement.start():pos]
        macro = statement.group(1)

        if macro.startswith('ROM_REGION'):
            region_match = _ROM_REGION.match(call)
            if region_match:
                current_region = region_match.group(2)
        elif macro == 'ROM_SYSTEM_BIOS':
            bios_match = _ROM_SYSTEM_BIOS.match(call)
            if bios_match:
                description = bios_match.group(3).replace('\\"', '"')
                bios_labels[int(bios_match.group(1))] = (
                    bios_match.group(2),
                    description,
                )
        else:
            entry = _parse_rom_load(call, current_region)
            if entry is not None:
                roms.append(entry)

    # Labels are attached after the scan so a ROM_SYSTEM_BIOS declared
    # after the ROM that references it is still picked up.
    for entry in roms:
        if 'bios_index' in entry:
            label, description = bios_labels.get(entry['bios_index'], ('', ''))
            entry['bios_label'] = label
            entry['bios_description'] = description

    return roms


def _parse_int(value: str) -> int:
    """Parse an integer that may be hex (0x...) or decimal."""
    text = value.strip()
    if text[:2].lower() == '0x':
        return int(text, 16)
    return int(text)
# File: tests/test_mame_parser.py
"""Tests for MAME source code parser."""

from __future__ import annotations

import os
import tempfile
import unittest

from scripts.scraper.mame_parser import (
    find_bios_root_sets,
    parse_mame_source_tree,
    parse_rom_block,
)

# Standard GAME macro with MACHINE_IS_BIOS_ROOT, multiple ROM entries, BIOS variants
NEOGEO_FIXTURE = """\
ROM_START( neogeo )
    ROM_REGION( 0x100000, "mainbios", 0 )

    ROM_SYSTEM_BIOS( 0, "euro", "Europe MVS (Ver. 2)" )
    ROMX_LOAD( "sp-s2.sp1", 0x00000, 0x020000, CRC(9036d879) SHA1(4f5ed7105b7128794654ce82b51723e16e389543), ROM_BIOS(0) )

    ROM_SYSTEM_BIOS( 1, "japan", "Japan MVS (Ver. 3)" )
    ROMX_LOAD( "vs-bios.rom", 0x00000, 0x020000, CRC(f0e8f27d) SHA1(ecf01bf6b3d6c7e4e0aae01e51e3ed4c0e1d5c2e), ROM_BIOS(1) )

    ROM_REGION( 0x10000, "audiocpu", 0 )
    ROM_LOAD( "sm1.sm1", 0x00000, 0x20000, CRC(94416d67) SHA1(42f9d7ddd6c0931fd64226a60dc73602b2819571) )
ROM_END

GAME( 1990, neogeo, 0, neogeo_noslot, neogeo, neogeo_state, init_neogeo, ROT0, "SNK", "Neo Geo", MACHINE_IS_BIOS_ROOT )
"""

# COMP macro with MACHINE_IS_BIOS_ROOT
DEVICE_FIXTURE = """\
ROM_START( bbcb )
    ROM_REGION( 0x40000, "maincpu", 0 )
    ROM_LOAD( "basic2.rom", 0x00000, 0x4000, CRC(a1b6a0e9) SHA1(6a0b9b8b7c3b3b9e6b7e8d0f2e7a6e7b8c9a0b1c) )
ROM_END

COMP( 1981, bbcb, 0, 0, bbcb, bbcb, bbc_state, init_bbc, "Acorn", "BBC Micro Model B", MACHINE_IS_BIOS_ROOT )
"""

# ROM_LOAD with NO_DUMP (should be skipped)
NODUMP_FIXTURE = """\
ROM_START( testnd )
    ROM_REGION( 0x10000, "maincpu", 0 )
    ROM_LOAD( "good.rom", 0x00000, 0x4000, CRC(aabbccdd) SHA1(1122334455667788990011223344556677889900) )
    ROM_LOAD( "missing.rom", 0x04000, 0x4000, NO_DUMP )
ROM_END

GAME( 2000, testnd, 0, testnd, testnd, test_state, init_test, ROT0, "Test", "Test ND", MACHINE_IS_BIOS_ROOT )
"""

# ROM_LOAD with BAD_DUMP
BADDUMP_FIXTURE = """\
ROM_START( testbd )
    ROM_REGION( 0x10000, "maincpu", 0 )
    ROM_LOAD( "badrom.bin", 0x00000, 0x4000, BAD_DUMP CRC(deadbeef) SHA1(0123456789abcdef0123456789abcdef01234567) )
ROM_END

GAME( 2000, testbd, 0, testbd, testbd, test_state, init_test, ROT0, "Test", "Test BD", MACHINE_IS_BIOS_ROOT )
"""

# CONS macro with ROM_LOAD16_WORD
CONS_FIXTURE = """\
ROM_START( megadriv )
    ROM_REGION( 0x400000, "maincpu", 0 )
    ROM_LOAD16_WORD( "epr-6209.ic7", 0x000000, 0x004000, CRC(cafebabe) SHA1(abcdef0123456789abcdef0123456789abcdef01) )
ROM_END

CONS( 1988, megadriv, 0, 0, megadriv, megadriv, md_state, init_megadriv, "Sega", "Mega Drive", MACHINE_IS_BIOS_ROOT )
"""

# GAME macro WITHOUT MACHINE_IS_BIOS_ROOT (should NOT be detected)
NON_BIOS_FIXTURE = """\
ROM_START( pacman )
    ROM_REGION( 0x10000, "maincpu", 0 )
    ROM_LOAD( "pacman.6e", 0x0000, 0x1000, CRC(c1e6ab10) SHA1(e87e059c5be45753f7e9f33dff851f16d6751181) )
ROM_END

GAME( 1980, pacman, 0, pacman, pacman, pacman_state, init_pacman, ROT90, "Namco", "Pac-Man", MACHINE_SUPPORTS_SAVE )
"""


def _rom_named(roms: list[dict], name: str) -> dict:
    """Return the first parsed ROM entry whose ``name`` equals *name*."""
    return next(rom for rom in roms if rom['name'] == name)


def _write_source(root: str, rel_dir: tuple[str, ...], filename: str, content: str) -> None:
    """Create ``root/rel_dir/filename`` holding *content*.

    The file is written as UTF-8, the encoding the parser reads with, so
    the tests do not depend on the platform's default locale encoding.
    """
    target_dir = os.path.join(root, *rel_dir)
    os.makedirs(target_dir)
    with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as handle:
        handle.write(content)


class TestFindBiosRootSets(unittest.TestCase):
    """Tests for find_bios_root_sets."""

    def test_detects_neogeo_from_game_macro(self) -> None:
        result = find_bios_root_sets(NEOGEO_FIXTURE, 'src/mame/snk/neogeo.cpp')
        self.assertIn('neogeo', result)
        self.assertEqual(result['neogeo']['source_file'], 'src/mame/snk/neogeo.cpp')
        self.assertIsInstance(result['neogeo']['source_line'], int)
        # The GAME macro sits on the 14th line of the fixture.
        self.assertEqual(result['neogeo']['source_line'], 14)

    def test_detects_from_comp_macro(self) -> None:
        result = find_bios_root_sets(DEVICE_FIXTURE, 'src/mame/acorn/bbc.cpp')
        self.assertIn('bbcb', result)

    def test_detects_from_cons_macro(self) -> None:
        result = find_bios_root_sets(CONS_FIXTURE, 'src/mame/sega/megadriv.cpp')
        self.assertIn('megadriv', result)

    def test_ignores_non_bios_games(self) -> None:
        result = find_bios_root_sets(NON_BIOS_FIXTURE, 'src/mame/pacman/pacman.cpp')
        self.assertEqual(result, {})

    def test_detects_from_nodump_fixture(self) -> None:
        result = find_bios_root_sets(NODUMP_FIXTURE, 'test.cpp')
        self.assertIn('testnd', result)

    def test_detects_from_baddump_fixture(self) -> None:
        result = find_bios_root_sets(BADDUMP_FIXTURE, 'test.cpp')
        self.assertIn('testbd', result)


class TestParseRomBlock(unittest.TestCase):
    """Tests for parse_rom_block."""

    def test_extracts_rom_names(self) -> None:
        roms = parse_rom_block(NEOGEO_FIXTURE, 'neogeo')
        names = [r['name'] for r in roms]
        self.assertIn('sp-s2.sp1', names)
        self.assertIn('vs-bios.rom', names)
        self.assertIn('sm1.sm1', names)

    def test_extracts_crc32_and_sha1(self) -> None:
        sp_s2 = _rom_named(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sp-s2.sp1')
        self.assertEqual(sp_s2['crc32'], '9036d879')
        self.assertEqual(sp_s2['sha1'], '4f5ed7105b7128794654ce82b51723e16e389543')

    def test_extracts_size(self) -> None:
        sp_s2 = _rom_named(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sp-s2.sp1')
        self.assertEqual(sp_s2['size'], 0x020000)

    def test_extracts_bios_metadata(self) -> None:
        sp_s2 = _rom_named(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sp-s2.sp1')
        self.assertEqual(sp_s2['bios_index'], 0)
        self.assertEqual(sp_s2['bios_label'], 'euro')
        self.assertEqual(sp_s2['bios_description'], 'Europe MVS (Ver. 2)')

    def test_non_bios_rom_has_no_bios_fields(self) -> None:
        sm1 = _rom_named(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sm1.sm1')
        self.assertNotIn('bios_index', sm1)
        self.assertNotIn('bios_label', sm1)

    def test_skips_no_dump(self) -> None:
        roms = parse_rom_block(NODUMP_FIXTURE, 'testnd')
        names = [r['name'] for r in roms]
        self.assertIn('good.rom', names)
        self.assertNotIn('missing.rom', names)

    def test_includes_bad_dump_with_flag(self) -> None:
        roms = parse_rom_block(BADDUMP_FIXTURE, 'testbd')
        self.assertEqual(len(roms), 1)
        self.assertEqual(roms[0]['name'], 'badrom.bin')
        self.assertTrue(roms[0]['bad_dump'])
        self.assertEqual(roms[0]['crc32'], 'deadbeef')
        self.assertEqual(roms[0]['sha1'], '0123456789abcdef0123456789abcdef01234567')

    def test_handles_rom_load16_word(self) -> None:
        roms = parse_rom_block(CONS_FIXTURE, 'megadriv')
        self.assertEqual(len(roms), 1)
        self.assertEqual(roms[0]['name'], 'epr-6209.ic7')
        self.assertEqual(roms[0]['crc32'], 'cafebabe')

    def test_tracks_rom_region(self) -> None:
        roms = parse_rom_block(NEOGEO_FIXTURE, 'neogeo')
        self.assertEqual(_rom_named(roms, 'sp-s2.sp1')['region'], 'mainbios')
        self.assertEqual(_rom_named(roms, 'sm1.sm1')['region'], 'audiocpu')

    def test_returns_empty_for_unknown_set(self) -> None:
        roms = parse_rom_block(NEOGEO_FIXTURE, 'nonexistent')
        self.assertEqual(roms, [])

    def test_good_rom_not_flagged_bad_dump(self) -> None:
        good = _rom_named(parse_rom_block(NODUMP_FIXTURE, 'testnd'), 'good.rom')
        self.assertFalse(good['bad_dump'])

    def test_crc32_sha1_lowercase(self) -> None:
        fixture = """\
ROM_START( upper )
    ROM_REGION( 0x10000, "maincpu", 0 )
    ROM_LOAD( "test.rom", 0x00000, 0x4000, CRC(AABBCCDD) SHA1(AABBCCDDEEFF00112233AABBCCDDEEFF00112233) )
ROM_END
"""
        roms = parse_rom_block(fixture, 'upper')
        self.assertEqual(roms[0]['crc32'], 'aabbccdd')
        self.assertEqual(roms[0]['sha1'], 'aabbccddeeff00112233aabbccddeeff00112233')


class TestParseMameSourceTree(unittest.TestCase):
    """Tests for parse_mame_source_tree."""

    def test_walks_source_tree(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            _write_source(tmpdir, ('src', 'mame', 'snk'), 'neogeo.cpp', NEOGEO_FIXTURE)

            results = parse_mame_source_tree(tmpdir)
            self.assertIn('neogeo', results)
            self.assertEqual(len(results['neogeo']['roms']), 3)
            self.assertEqual(
                results['neogeo']['source_file'],
                'src/mame/snk/neogeo.cpp',
            )

    def test_ignores_non_source_files(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # A .txt file holding valid macros must still be ignored.
            _write_source(tmpdir, ('src', 'mame'), 'notes.txt', NEOGEO_FIXTURE)

            results = parse_mame_source_tree(tmpdir)
            self.assertEqual(results, {})

    def test_scans_devices_dir(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            _write_source(tmpdir, ('src', 'devices', 'bus'), 'test.cpp', DEVICE_FIXTURE)

            results = parse_mame_source_tree(tmpdir)
            self.assertIn('bbcb', results)

    def test_empty_tree(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            results = parse_mame_source_tree(tmpdir)
            self.assertEqual(results, {})


if __name__ == '__main__':
    unittest.main()