feat: add MAME source code parser for BIOS root sets

This commit is contained in:
Abdessamad Derraz
2026-03-30 18:29:31 +02:00
parent 00d7b57884
commit 319a1d2041
2 changed files with 542 additions and 0 deletions

View File

@@ -0,0 +1,298 @@
"""Parser for MAME C source files.
Extracts BIOS root sets and ROM definitions from MAME driver sources.
Handles GAME/SYST/COMP/CONS macros with MACHINE_IS_BIOS_ROOT flag,
ROM_START/ROM_END blocks, ROM_LOAD variants, ROM_REGION, ROM_SYSTEM_BIOS,
NO_DUMP filtering, and BAD_DUMP flagging.
"""
from __future__ import annotations
import os
import re
from pathlib import Path
# Macros that declare a machine entry. Group 1 is the macro name; the match
# ends immediately after the opening parenthesis of the invocation.
_MACHINE_MACROS = re.compile(
    r'\b(GAME|SYST|COMP|CONS)\s*\(',
    re.MULTILINE,
)

# ROM block boundaries. Group 1 of _ROM_START is the set name.
_ROM_START = re.compile(r'ROM_START\s*\(\s*(\w+)\s*\)')
_ROM_END = re.compile(r'ROM_END')

# ROM_REGION( size, tag, flags ) -- only size (group 1) and tag (group 2)
# are captured; the trailing flags argument is ignored.
_ROM_REGION = re.compile(
    r'ROM_REGION\s*\('
    r'\s*(0x[\da-fA-F]+|\d+)\s*,'   # size
    r'\s*"([^"]+)"\s*,',            # tag
)

# ROM_SYSTEM_BIOS( index, label, description )
_ROM_SYSTEM_BIOS = re.compile(
    r'ROM_SYSTEM_BIOS\s*\('
    r'\s*(\d+)\s*,'                 # index
    r'\s*"([^"]+)"\s*,'             # label
    r'\s*"([^"]+)"\s*\)',           # description
)

# All name-first ROM_LOAD variants: ROM_LOAD, ROMX_LOAD, ROM_LOAD_OPTIONAL and
# the 16/32/64-bit BYTE/WORD/DWORD loaders with an optional _SWAP suffix.
# The pattern is deliberately a superset of the macros MAME defines, so new
# width/unit combinations are picked up without another edit.
# Groups: 1 = macro name, 2 = ROM file name, 3 = offset, 4 = size.
_ROM_LOAD = re.compile(
    r'(ROMX?_LOAD'
    r'(?:_OPTIONAL|(?:16|32|64)_(?:BYTE|WORD|DWORD)(?:_SWAP)?)?'
    r')\s*\('
    r'\s*"([^"]+)"\s*,'             # name
    r'\s*(0x[\da-fA-F]+|\d+)\s*,'   # offset
    r'\s*(0x[\da-fA-F]+|\d+)\s*,',  # size
)

# CRC32 and SHA1 within a ROM_LOAD line (group 1 = CRC, group 2 = SHA1).
_CRC_SHA = re.compile(
    r'CRC\s*\(\s*([0-9a-fA-F]+)\s*\)'
    r'\s+'
    r'SHA1\s*\(\s*([0-9a-fA-F]+)\s*\)',
)

_NO_DUMP = re.compile(r'\bNO_DUMP\b')
_BAD_DUMP = re.compile(r'\bBAD_DUMP\b')

# ROM_BIOS( index ) -- ties a ROM entry to a ROM_SYSTEM_BIOS index.
_ROM_BIOS = re.compile(r'ROM_BIOS\s*\(\s*(\d+)\s*\)')
def find_bios_root_sets(source: str, filename: str) -> dict[str, dict]:
    """Collect the machine sets in *source* that are flagged as BIOS roots.

    Every GAME/SYST/COMP/CONS invocation is located, its parenthesised
    argument text is extracted, and the invocation is kept only when that
    text contains ``MACHINE_IS_BIOS_ROOT``.

    Args:
        source: Full text of one MAME driver source file.
        filename: Path recorded verbatim as ``source_file`` in the result.

    Returns:
        Mapping of set name to ``{'source_file': filename,
        'source_line': <1-based line of the macro>}``. Invocations with an
        unbalanced argument list or fewer than two arguments are skipped.
    """
    found: dict[str, dict] = {}
    for macro in _MACHINE_MACROS.finditer(source):
        # The regex match ends right after '(', so step back one character.
        open_paren = macro.end() - 1
        close_paren = _find_closing_paren(source, open_paren)
        if close_paren == -1:
            continue
        call_text = source[open_paren:close_paren + 1]
        if 'MACHINE_IS_BIOS_ROOT' in call_text:
            # Drop the leading '(' before splitting into arguments.
            macro_args = _split_macro_args(call_text[1:])
            # All four macros take (year, setname, ...), so the set name is
            # always the second argument.
            if len(macro_args) >= 2:
                found[macro_args[1].strip()] = {
                    'source_file': filename,
                    'source_line': source.count('\n', 0, macro.start()) + 1,
                }
    return found
def parse_rom_block(source: str, set_name: str) -> list[dict]:
    """Return the ROM entries declared for *set_name* in *source*.

    The text between ``ROM_START(set_name)`` and the first ``ROM_END`` that
    follows it is handed to ``_parse_rom_entries``.

    Args:
        source: Full text of one MAME driver source file.
        set_name: Set whose ROM block should be parsed; matched literally.

    Returns:
        The parsed ROM entries, or an empty list when the block has no
        ``ROM_START`` for this set or no terminating ``ROM_END``.
    """
    header = re.search(
        r'ROM_START\s*\(\s*' + re.escape(set_name) + r'\s*\)',
        source,
    )
    if header is None:
        return []
    body_begin = header.end()
    footer = _ROM_END.search(source, body_begin)
    if footer is None:
        return []
    return _parse_rom_entries(source[body_begin:footer.start()])
def parse_mame_source_tree(base_path: str) -> dict[str, dict]:
    """Walk a MAME checkout and extract every BIOS root set with its ROMs.

    Scans ``src/mame/`` and ``src/devices/`` under *base_path* for files
    ending in ``.cpp``, ``.c``, ``.h`` or ``.hxx``. Missing directories are
    skipped silently.

    Args:
        base_path: Root directory of the MAME source tree.

    Returns:
        Mapping of set name to a dict with ``source_file`` (path relative to
        *base_path*, always with forward slashes), ``source_line`` and
        ``roms``. If a set name occurs in several files, the one visited
        last (in sorted traversal order) wins.
    """
    results: dict[str, dict] = {}
    root = Path(base_path)
    source_suffixes = ('.cpp', '.c', '.h', '.hxx')
    for search_dir in (root / 'src' / 'mame', root / 'src' / 'devices'):
        if not search_dir.is_dir():
            continue
        for dirpath, dirnames, filenames in os.walk(search_dir):
            # Sorting in place makes os.walk descend in a stable order, so
            # the output does not depend on filesystem enumeration order.
            dirnames.sort()
            for fname in sorted(filenames):
                if not fname.endswith(source_suffixes):
                    continue
                filepath = Path(dirpath) / fname
                content = filepath.read_text(encoding='utf-8', errors='replace')
                # Cheap pre-filter: a file without the flag cannot yield any
                # BIOS root set, so skip the macro scan entirely.
                if 'MACHINE_IS_BIOS_ROOT' not in content:
                    continue
                # as_posix() keeps the recorded path identical on Windows
                # and POSIX hosts.
                rel_path = filepath.relative_to(root).as_posix()
                bios_sets = find_bios_root_sets(content, rel_path)
                for set_name, info in bios_sets.items():
                    results[set_name] = {
                        'source_file': info['source_file'],
                        'source_line': info['source_line'],
                        'roms': parse_rom_block(content, set_name),
                    }
    return results
def _find_closing_paren(source: str, start: int) -> int:
    """Return the index of the ')' that closes the '(' at ``source[start]``.

    Parentheses inside double-quoted string literals are not counted. Inside
    a literal a backslash escapes the following character, so ``\\"`` does
    not terminate the string.

    Args:
        source: Text to scan.
        start: Index of the opening parenthesis.

    Returns:
        Index of the matching closing parenthesis, or -1 if the text ends
        before the parentheses balance.
    """
    depth = 0
    i = start
    length = len(source)
    while i < length:
        ch = source[i]
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                return i
        elif ch == '"':
            # Skip the string body; step over escaped characters in pairs so
            # an escaped quote is not mistaken for the closing one.
            i += 1
            while i < length and source[i] != '"':
                i += 2 if source[i] == '\\' else 1
        i += 1
    return -1
def _split_macro_args(inner: str) -> list[str]:
    """Split the text following a macro's '(' into its arguments.

    Commas inside nested parentheses or double-quoted strings do not split.
    Inside a string a backslash escapes the next character. Parsing stops at
    the ')' that closes the macro; anything after it is ignored.

    Args:
        inner: Macro text starting just after the opening parenthesis. The
            closing parenthesis may be present or missing.

    Returns:
        The arguments in order, each stripped of surrounding whitespace.
        An empty argument list yields ``[]``. Each argument appears exactly
        once.
    """
    args: list[str] = []
    current: list[str] = []
    depth = 0
    i = 0
    length = len(inner)
    while i < length:
        ch = inner[i]
        if ch == '"':
            # Copy the whole literal verbatim, honouring backslash escapes.
            end = i + 1
            while end < length and inner[end] != '"':
                end += 2 if inner[end] == '\\' else 1
            current.append(inner[i:end + 1])
            i = end + 1
            continue
        if ch == ')' and depth == 0:
            # Closing paren of the macro itself: flush the last argument and
            # return immediately so it cannot be appended a second time.
            last = ''.join(current).strip()
            if last or args:
                args.append(last)
            return args
        if ch == ',' and depth == 0:
            args.append(''.join(current).strip())
            current = []
        else:
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
            current.append(ch)
        i += 1
    # Input ended without a closing paren: keep whatever was collected.
    tail = ''.join(current).strip()
    if tail:
        args.append(tail)
    return args
def _strip_c_comments(text: str) -> str:
    """Remove ``//`` and ``/* ... */`` comments from C/C++ *text*.

    Double-quoted string literals are copied verbatim, so comment markers
    inside them survive. Every newline is preserved, which keeps the result
    usable for line-by-line processing.
    """
    out: list[str] = []
    i = 0
    length = len(text)
    while i < length:
        ch = text[i]
        if ch == '"':
            # A literal ends at the closing quote, or at end of line if the
            # quote is unbalanced, so one stray quote cannot swallow the rest.
            end = i + 1
            while end < length and text[end] not in '"\n':
                end += 2 if text[end] == '\\' else 1
            out.append(text[i:end + 1])
            i = end + 1
        elif text.startswith('//', i):
            newline = text.find('\n', i)
            i = length if newline == -1 else newline
        elif text.startswith('/*', i):
            close = text.find('*/', i + 2)
            stop = length if close == -1 else close + 2
            # A space keeps neighbouring tokens apart; the newlines keep the
            # line structure intact.
            out.append(' ' + '\n' * text.count('\n', i, stop))
            i = stop
        else:
            out.append(ch)
            i += 1
    return ''.join(out)


def _parse_rom_entries(block: str) -> list[dict]:
    """Parse ROM entries from the text between ROM_START and ROM_END.

    Comments are removed first, so commented-out loads are not reported and
    a ``NO_DUMP`` mentioned only in a comment does not hide a real ROM. The
    block is then processed one line at a time.

    Args:
        block: Content of a single ROM block, without its delimiters.

    Returns:
        One dict per ROM load that is not marked ``NO_DUMP``, with keys
        ``name``, ``size``, ``crc32``, ``sha1`` (lower-case hex, or ``''``
        when absent), ``region`` (tag of the most recent ROM_REGION) and
        ``bad_dump``. Entries carrying ``ROM_BIOS(n)`` additionally get
        ``bios_index``, ``bios_label`` and ``bios_description``; the latter
        two are ``''`` when no ROM_SYSTEM_BIOS with that index was seen
        earlier in the block.
    """
    roms: list[dict] = []
    current_region = ''
    bios_labels: dict[int, tuple[str, str]] = {}  # index -> (label, description)

    for line in _strip_c_comments(block).split('\n'):
        stripped = line.strip()

        # Region and BIOS declarations update state but do not end the
        # line's processing: a load may share the line with them.
        region_match = _ROM_REGION.search(stripped)
        if region_match:
            current_region = region_match.group(2)

        bios_match = _ROM_SYSTEM_BIOS.search(stripped)
        if bios_match:
            bios_labels[int(bios_match.group(1))] = (
                bios_match.group(2),
                bios_match.group(3),
            )

        load_match = _ROM_LOAD.search(stripped)
        if not load_match or _NO_DUMP.search(stripped):
            continue

        crc32 = ''
        sha1 = ''
        crc_sha_match = _CRC_SHA.search(stripped)
        if crc_sha_match:
            crc32 = crc_sha_match.group(1).lower()
            sha1 = crc_sha_match.group(2).lower()

        entry: dict = {
            'name': load_match.group(2),
            'size': _parse_int(load_match.group(4)),
            'crc32': crc32,
            'sha1': sha1,
            'region': current_region,
            'bad_dump': bool(_BAD_DUMP.search(stripped)),
        }

        bios_ref = _ROM_BIOS.search(stripped)
        if bios_ref:
            bios_index = int(bios_ref.group(1))
            bios_label, bios_description = bios_labels.get(bios_index, ('', ''))
            entry['bios_index'] = bios_index
            entry['bios_label'] = bios_label
            entry['bios_description'] = bios_description

        roms.append(entry)
    return roms
def _parse_int(value: str) -> int:
    """Convert a decimal or ``0x``/``0X``-prefixed hexadecimal literal to int.

    Surrounding whitespace is ignored. Raises ``ValueError`` when the text is
    not a valid number in the selected base.
    """
    text = value.strip()
    base = 16 if text[:2] in ('0x', '0X') else 10
    return int(text, base)

244
tests/test_mame_parser.py Normal file
View File

@@ -0,0 +1,244 @@
"""Tests for MAME source code parser."""
from __future__ import annotations
import os
import tempfile
import unittest
from scripts.scraper.mame_parser import (
find_bios_root_sets,
parse_mame_source_tree,
parse_rom_block,
)
# GAME macro flagged MACHINE_IS_BIOS_ROOT. The ROM block has two BIOS
# variants (ROM_SYSTEM_BIOS 0 and 1, each with a ROMX_LOAD tied to it via
# ROM_BIOS) in region "mainbios", plus one plain ROM_LOAD in "audiocpu".
NEOGEO_FIXTURE = """\
ROM_START( neogeo )
ROM_REGION( 0x100000, "mainbios", 0 )
ROM_SYSTEM_BIOS( 0, "euro", "Europe MVS (Ver. 2)" )
ROMX_LOAD( "sp-s2.sp1", 0x00000, 0x020000, CRC(9036d879) SHA1(4f5ed7105b7128794654ce82b51723e16e389543), ROM_BIOS(0) )
ROM_SYSTEM_BIOS( 1, "japan", "Japan MVS (Ver. 3)" )
ROMX_LOAD( "vs-bios.rom", 0x00000, 0x020000, CRC(f0e8f27d) SHA1(ecf01bf6b3d6c7e4e0aae01e51e3ed4c0e1d5c2e), ROM_BIOS(1) )
ROM_REGION( 0x10000, "audiocpu", 0 )
ROM_LOAD( "sm1.sm1", 0x00000, 0x20000, CRC(94416d67) SHA1(42f9d7ddd6c0931fd64226a60dc73602b2819571) )
ROM_END
GAME( 1990, neogeo, 0, neogeo_noslot, neogeo, neogeo_state, init_neogeo, ROT0, "SNK", "Neo Geo", MACHINE_IS_BIOS_ROOT )
"""
# COMP macro flagged MACHINE_IS_BIOS_ROOT, with a single ROM_LOAD.
# Also written under src/devices/ by the source-tree test.
DEVICE_FIXTURE = """\
ROM_START( bbcb )
ROM_REGION( 0x40000, "maincpu", 0 )
ROM_LOAD( "basic2.rom", 0x00000, 0x4000, CRC(a1b6a0e9) SHA1(6a0b9b8b7c3b3b9e6b7e8d0f2e7a6e7b8c9a0b1c) )
ROM_END
COMP( 1981, bbcb, 0, 0, bbcb, bbcb, bbc_state, init_bbc, "Acorn", "BBC Micro Model B", MACHINE_IS_BIOS_ROOT )
"""
# One good ROM_LOAD plus one ROM_LOAD marked NO_DUMP; the parser is expected
# to return only the good one.
NODUMP_FIXTURE = """\
ROM_START( testnd )
ROM_REGION( 0x10000, "maincpu", 0 )
ROM_LOAD( "good.rom", 0x00000, 0x4000, CRC(aabbccdd) SHA1(1122334455667788990011223344556677889900) )
ROM_LOAD( "missing.rom", 0x04000, 0x4000, NO_DUMP )
ROM_END
GAME( 2000, testnd, 0, testnd, testnd, test_state, init_test, ROT0, "Test", "Test ND", MACHINE_IS_BIOS_ROOT )
"""
# ROM_LOAD whose hash is prefixed with BAD_DUMP; the entry is expected to be
# kept, with its hashes, and flagged bad_dump.
BADDUMP_FIXTURE = """\
ROM_START( testbd )
ROM_REGION( 0x10000, "maincpu", 0 )
ROM_LOAD( "badrom.bin", 0x00000, 0x4000, BAD_DUMP CRC(deadbeef) SHA1(0123456789abcdef0123456789abcdef01234567) )
ROM_END
GAME( 2000, testbd, 0, testbd, testbd, test_state, init_test, ROT0, "Test", "Test BD", MACHINE_IS_BIOS_ROOT )
"""
# CONS macro flagged MACHINE_IS_BIOS_ROOT, using the ROM_LOAD16_WORD variant.
CONS_FIXTURE = """\
ROM_START( megadriv )
ROM_REGION( 0x400000, "maincpu", 0 )
ROM_LOAD16_WORD( "epr-6209.ic7", 0x000000, 0x004000, CRC(cafebabe) SHA1(abcdef0123456789abcdef0123456789abcdef01) )
ROM_END
CONS( 1988, megadriv, 0, 0, megadriv, megadriv, md_state, init_megadriv, "Sega", "Mega Drive", MACHINE_IS_BIOS_ROOT )
"""
# GAME macro WITHOUT MACHINE_IS_BIOS_ROOT (flags are MACHINE_SUPPORTS_SAVE);
# it must NOT be reported as a BIOS root set.
NON_BIOS_FIXTURE = """\
ROM_START( pacman )
ROM_REGION( 0x10000, "maincpu", 0 )
ROM_LOAD( "pacman.6e", 0x0000, 0x1000, CRC(c1e6ab10) SHA1(e87e059c5be45753f7e9f33dff851f16d6751181) )
ROM_END
GAME( 1980, pacman, 0, pacman, pacman, pacman_state, init_pacman, ROT90, "Namco", "Pac-Man", MACHINE_SUPPORTS_SAVE )
"""
class TestFindBiosRootSets(unittest.TestCase):
    """Checks which machine macros find_bios_root_sets reports."""

    def test_detects_neogeo_from_game_macro(self) -> None:
        found = find_bios_root_sets(NEOGEO_FIXTURE, 'src/mame/snk/neogeo.cpp')
        self.assertIn('neogeo', found)
        entry = found['neogeo']
        self.assertEqual(entry['source_file'], 'src/mame/snk/neogeo.cpp')
        self.assertIsInstance(entry['source_line'], int)

    def test_detects_from_comp_macro(self) -> None:
        found = find_bios_root_sets(DEVICE_FIXTURE, 'src/mame/acorn/bbc.cpp')
        self.assertIn('bbcb', found)

    def test_detects_from_cons_macro(self) -> None:
        found = find_bios_root_sets(CONS_FIXTURE, 'src/mame/sega/megadriv.cpp')
        self.assertIn('megadriv', found)

    def test_ignores_non_bios_games(self) -> None:
        # A GAME entry without the BIOS-root flag must produce nothing.
        found = find_bios_root_sets(NON_BIOS_FIXTURE, 'src/mame/pacman/pacman.cpp')
        self.assertEqual(found, {})

    def test_detects_from_nodump_fixture(self) -> None:
        found = find_bios_root_sets(NODUMP_FIXTURE, 'test.cpp')
        self.assertIn('testnd', found)

    def test_detects_from_baddump_fixture(self) -> None:
        found = find_bios_root_sets(BADDUMP_FIXTURE, 'test.cpp')
        self.assertIn('testbd', found)
class TestParseRomBlock(unittest.TestCase):
    """Checks the ROM entries parse_rom_block extracts from a ROM block."""

    @staticmethod
    def _by_name(roms: list[dict], name: str) -> dict:
        """Return the first parsed entry whose 'name' equals *name*."""
        return next(r for r in roms if r['name'] == name)

    def test_extracts_rom_names(self) -> None:
        names = [rom['name'] for rom in parse_rom_block(NEOGEO_FIXTURE, 'neogeo')]
        for expected in ('sp-s2.sp1', 'vs-bios.rom', 'sm1.sm1'):
            self.assertIn(expected, names)

    def test_extracts_crc32_and_sha1(self) -> None:
        rom = self._by_name(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sp-s2.sp1')
        self.assertEqual(rom['crc32'], '9036d879')
        self.assertEqual(rom['sha1'], '4f5ed7105b7128794654ce82b51723e16e389543')

    def test_extracts_size(self) -> None:
        rom = self._by_name(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sp-s2.sp1')
        self.assertEqual(rom['size'], 0x020000)

    def test_extracts_bios_metadata(self) -> None:
        rom = self._by_name(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sp-s2.sp1')
        self.assertEqual(rom['bios_index'], 0)
        self.assertEqual(rom['bios_label'], 'euro')
        self.assertEqual(rom['bios_description'], 'Europe MVS (Ver. 2)')

    def test_non_bios_rom_has_no_bios_fields(self) -> None:
        rom = self._by_name(parse_rom_block(NEOGEO_FIXTURE, 'neogeo'), 'sm1.sm1')
        self.assertNotIn('bios_index', rom)
        self.assertNotIn('bios_label', rom)

    def test_skips_no_dump(self) -> None:
        names = [rom['name'] for rom in parse_rom_block(NODUMP_FIXTURE, 'testnd')]
        self.assertIn('good.rom', names)
        self.assertNotIn('missing.rom', names)

    def test_includes_bad_dump_with_flag(self) -> None:
        parsed = parse_rom_block(BADDUMP_FIXTURE, 'testbd')
        self.assertEqual(len(parsed), 1)
        only = parsed[0]
        self.assertEqual(only['name'], 'badrom.bin')
        self.assertTrue(only['bad_dump'])
        self.assertEqual(only['crc32'], 'deadbeef')
        self.assertEqual(only['sha1'], '0123456789abcdef0123456789abcdef01234567')

    def test_handles_rom_load16_word(self) -> None:
        parsed = parse_rom_block(CONS_FIXTURE, 'megadriv')
        self.assertEqual(len(parsed), 1)
        only = parsed[0]
        self.assertEqual(only['name'], 'epr-6209.ic7')
        self.assertEqual(only['crc32'], 'cafebabe')

    def test_tracks_rom_region(self) -> None:
        parsed = parse_rom_block(NEOGEO_FIXTURE, 'neogeo')
        self.assertEqual(self._by_name(parsed, 'sp-s2.sp1')['region'], 'mainbios')
        self.assertEqual(self._by_name(parsed, 'sm1.sm1')['region'], 'audiocpu')

    def test_returns_empty_for_unknown_set(self) -> None:
        self.assertEqual(parse_rom_block(NEOGEO_FIXTURE, 'nonexistent'), [])

    def test_good_rom_not_flagged_bad_dump(self) -> None:
        rom = self._by_name(parse_rom_block(NODUMP_FIXTURE, 'testnd'), 'good.rom')
        self.assertFalse(rom['bad_dump'])

    def test_crc32_sha1_lowercase(self) -> None:
        # Upper-case hashes in the source must come back lower-cased.
        fixture = (
            'ROM_START( upper )\n'
            'ROM_REGION( 0x10000, "maincpu", 0 )\n'
            'ROM_LOAD( "test.rom", 0x00000, 0x4000, CRC(AABBCCDD) '
            'SHA1(AABBCCDDEEFF00112233AABBCCDDEEFF00112233) )\n'
            'ROM_END\n'
        )
        first = parse_rom_block(fixture, 'upper')[0]
        self.assertEqual(first['crc32'], 'aabbccdd')
        self.assertEqual(first['sha1'], 'aabbccddeeff00112233aabbccddeeff00112233')
class TestParseMameSourceTree(unittest.TestCase):
    """Checks directory traversal and file filtering of parse_mame_source_tree."""

    @staticmethod
    def _write(directory: str, filename: str, text: str) -> None:
        """Create *directory* and write *text* to *filename* inside it."""
        os.makedirs(directory)
        with open(os.path.join(directory, filename), 'w') as handle:
            handle.write(text)

    def test_walks_source_tree(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            self._write(
                os.path.join(tmpdir, 'src', 'mame', 'snk'),
                'neogeo.cpp',
                NEOGEO_FIXTURE,
            )
            parsed = parse_mame_source_tree(tmpdir)
            self.assertIn('neogeo', parsed)
            neogeo = parsed['neogeo']
            self.assertEqual(len(neogeo['roms']), 3)
            self.assertEqual(neogeo['source_file'], 'src/mame/snk/neogeo.cpp')

    def test_ignores_non_source_files(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            # A .txt file holding valid macros must still be ignored.
            self._write(
                os.path.join(tmpdir, 'src', 'mame'),
                'notes.txt',
                NEOGEO_FIXTURE,
            )
            self.assertEqual(parse_mame_source_tree(tmpdir), {})

    def test_scans_devices_dir(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            self._write(
                os.path.join(tmpdir, 'src', 'devices', 'bus'),
                'test.cpp',
                DEVICE_FIXTURE,
            )
            self.assertIn('bbcb', parse_mame_source_tree(tmpdir))

    def test_empty_tree(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(parse_mame_source_tree(tmpdir), {})
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()