Files
libretro/scripts/scraper/dat_parser.py
Abdessamad Derraz 0a272dc4e9 chore: lint and format entire codebase
Run ruff check --fix: remove unused imports (F401), fix f-strings
without placeholders (F541), remove unused variables (F841), fix
duplicate dict key (F601).

Run isort --profile black: normalize import ordering across all files.

Run ruff format: apply consistent formatting (black-compatible) to
all 58 Python files.

3 intentional E402 remain (imports after require_yaml() must execute
after yaml is available).
2026-04-01 13:17:55 +02:00

179 lines
4.4 KiB
Python

"""Parser for clrmamepro DAT format.
Parses files like libretro's System.dat which uses the format:
game (
name "System"
comment "Platform Name"
rom ( name filename size 12345 crc ABCD1234 md5 ... sha1 ... )
)
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass
class DatRom:
    """One ``rom ( ... )`` entry read from a clrmamepro DAT file.

    Fields are positional in the order declared below, so the order is part
    of the constructor's interface.
    """

    # Value of the entry's ``name`` field (may contain spaces or a path).
    name: str
    # Value of the ``size`` field as an integer.
    size: int
    # Value of the ``crc`` field.
    crc32: str
    # Value of the ``md5`` field.
    md5: str
    # Value of the ``sha1`` field.
    sha1: str
    # Label taken from the most recent ``comment`` line before the entry.
    system: str
@dataclass
class DatMetadata:
    """Values collected from the ``clrmamepro ( ... )`` header block.

    Every field defaults to the empty string, so a DAT file that lacks a
    given header line simply leaves that attribute empty.
    """

    # Header ``name`` line.
    name: str = ""
    # Header ``version`` line.
    version: str = ""
    # Header ``description`` line.
    description: str = ""
    # Header ``author`` line.
    author: str = ""
    # Header ``homepage`` line.
    homepage: str = ""
    # Header ``url`` line.
    url: str = ""
def parse_dat(content: str) -> list[DatRom]:
    """Collect every ROM entry found in clrmamepro DAT text.

    The text is scanned line by line. A ``comment`` line sets the system
    label applied to the ``rom`` lines that follow it; each ``rom ( ... )``
    line is handed to ``_parse_rom_line`` and kept when it parses.

    Supported input quirks:
    - Quoted filenames with spaces: name "7800 BIOS (U).rom"
    - Path filenames: name "pcsx2/bios/file.bin"
    - Unquoted filenames: name cpc464.rom
    - Inconsistent indentation (tabs vs spaces)

    Args:
        content: Full text of the DAT file.

    Returns:
        The parsed entries, in file order.
    """
    # Comment values that are boilerplate rather than a system label; they
    # must not replace the label currently in effect.
    boilerplate = {
        "System",
        "System, firmware, and BIOS files used by libretro cores.",
    }
    comment_prefix = "comment "

    entries = []
    system_label = ""
    for raw_line in content.split("\n"):
        text = raw_line.strip()

        if text.startswith(comment_prefix):
            label = text[len(comment_prefix) :].strip().strip('"')
            if label not in boilerplate:
                system_label = label
            continue

        if not text.startswith(("rom (", "rom(")):
            continue

        entry = _parse_rom_line(text, system_label)
        if entry:
            entries.append(entry)

    return entries
def parse_dat_metadata(content: str) -> DatMetadata:
    """Read the ``clrmamepro`` header block into a DatMetadata.

    Scanning starts at the first line beginning with ``clrmamepro`` and stops
    at the first line that is exactly ``)``. In between, a line of the form
    ``<field> <value>`` whose field is one of the known header keys sets the
    matching attribute; surrounding whitespace and double quotes are removed
    from the value.

    Args:
        content: Full text of the DAT file.

    Returns:
        The collected metadata; fields not present in the header keep their
        empty-string defaults.
    """
    header_keys = ("name", "version", "description", "author", "homepage", "url")

    result = DatMetadata()
    inside_header = False
    for raw_line in content.split("\n"):
        text = raw_line.strip()

        if text.startswith("clrmamepro"):
            inside_header = True
        elif inside_header:
            if text == ")":
                break
            # Split at the first space: the left part is the candidate key,
            # the right part is the raw value.
            key, separator, remainder = text.partition(" ")
            if separator and key in header_keys:
                setattr(result, key, remainder.strip().strip('"'))

    return result
def _parse_rom_line(line: str, system: str) -> DatRom | None:
    """Parse a single ``rom ( ... )`` line into a DatRom.

    Args:
        line: One DAT line, e.g.
            ``rom ( name "x (E).rom" size 4096 crc ABCD1234 md5 ... sha1 ... )``.
        system: System label to attach to the resulting entry.

    Returns:
        The parsed entry, or None when the enclosing parentheses are missing
        or out of order, or when there is no non-empty ``name`` field.
    """
    # The first "(" opens the entry and the LAST ")" closes it: filenames may
    # themselves contain parentheses like "(E).rom", so rfind is required.
    start = line.find("(")
    end = line.rfind(")")
    # A missing ")" gives end == -1, which is also covered by end <= start.
    if start == -1 or end <= start:
        return None

    tokens = _tokenize(line[start + 1 : end].strip())
    # Tokens alternate key, value. zip() drops a trailing unpaired token, and
    # a repeated key keeps its last value.
    fields = dict(zip(tokens[::2], tokens[1::2]))

    name = fields.get("name", "")
    if not name:
        return None

    try:
        size = int(fields.get("size", "0"))
    except ValueError:
        # Best effort: a malformed size does not discard the entry.
        size = 0

    # Hex digests are case-insensitive. Normalise all three to lowercase so
    # that equal hashes always compare equal (previously only crc was folded).
    return DatRom(
        name=name,
        size=size,
        crc32=fields.get("crc", "").lower(),
        md5=fields.get("md5", "").lower(),
        sha1=fields.get("sha1", "").lower(),
        system=system,
    )
def _tokenize(content: str) -> list[str]:
    """Split DAT field text into tokens, keeping quoted strings whole.

    Tokens are separated by spaces and tabs. A token that starts with a
    double quote runs to the next double quote (or to the end of the text if
    the quote is never closed) and is returned without the quotes. Any other
    token runs to the next space or tab.

    Args:
        content: Text found between the parentheses of a DAT entry.

    Returns:
        The tokens in order of appearance.
    """
    blanks = " \t"
    length = len(content)
    pieces = []
    pos = 0
    while pos < length:
        char = content[pos]

        if char in blanks:
            pos += 1
            continue

        if char == '"':
            closing = content.find('"', pos + 1)
            if closing == -1:
                # Unterminated quote: take everything up to the end.
                closing = length
            pieces.append(content[pos + 1 : closing])
            pos = closing + 1
            continue

        stop = pos
        while stop < length and content[stop] not in blanks:
            stop += 1
        pieces.append(content[pos:stop])
        pos = stop

    return pieces
def validate_dat_format(content: str) -> bool:
    """Report whether the text looks like a clrmamepro DAT file.

    This is a substring check, not a parse. All of the following must hold:
    - ``clrmamepro`` appears within the first 500 characters
    - a ``game (`` block opener appears
    - a ``comment "`` line appears
    - a ``rom (`` or ``rom(`` entry appears

    Args:
        content: Full text of the candidate DAT file.

    Returns:
        True when every marker above is present, otherwise False.
    """
    if "clrmamepro" not in content[:500]:
        return False
    if "game (" not in content:
        return False
    if 'comment "' not in content:
        return False
    return "rom (" in content or "rom(" in content