# libretro/scripts/scraper/dat_parser.py

"""Parser for clrmamepro DAT format.

Parses files like libretro's System.dat which uses the format:
    game (
        name "System"
        comment "Platform Name"
        rom ( name filename size 12345 crc ABCD1234 md5 ... sha1 ... )
    )
"""

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class DatRom:
    """One ``rom ( ... )`` record read from a clrmamepro DAT file.

    Attributes:
        name: File name given by the entry's ``name`` field.
        size: Integer value of the entry's ``size`` field.
        crc32: Text of the entry's ``crc`` field.
        md5: Text of the entry's ``md5`` field.
        sha1: Text of the entry's ``sha1`` field.
        system: Platform label the entry belongs to.
    """

    name: str
    size: int
    crc32: str
    md5: str
    sha1: str
    system: str  # Label carried over from the closest preceding comment line


@dataclass
class DatMetadata:
    """Header fields of a DAT file (the ``clrmamepro ( ... )`` block).

    Every attribute is a plain string and defaults to ``""``, so an
    instance can be created empty and filled in field by field.
    """

    name: str = ""
    version: str = ""
    description: str = ""
    author: str = ""
    homepage: str = ""
    url: str = ""


def parse_dat(content: str) -> list[DatRom]:
    """Collect every ROM entry found in clrmamepro DAT text.

    The text is scanned line by line. A ``comment`` line sets the system
    label applied to the ``rom`` lines that follow it, except for the two
    generic "System" comments, which are skipped. Each line that opens
    with ``rom (`` or ``rom(`` is handed to ``_parse_rom_line``; entries
    it rejects are dropped.

    Supported input quirks:
    - Quoted filenames with spaces: name "7800 BIOS (U).rom"
    - Path filenames: name "pcsx2/bios/file.bin"
    - Unquoted filenames: name cpc464.rom
    - Inconsistent indentation (tabs vs spaces)

    Args:
        content: Full text of the DAT file.

    Returns:
        The parsed entries, in the order they appear in ``content``.
    """
    # Comments that describe the file as a whole rather than a platform.
    generic_comments = (
        "System",
        "System, firmware, and BIOS files used by libretro cores.",
    )
    entries = []
    system = ""

    for raw_line in content.split("\n"):
        text = raw_line.strip()

        if text.startswith("comment "):
            label = text[len("comment ") :].strip().strip('"')
            if label not in generic_comments:
                system = label
            continue

        if text.startswith(("rom (", "rom(")):
            parsed = _parse_rom_line(text, system)
            if parsed:
                entries.append(parsed)

    return entries


def parse_dat_metadata(content: str) -> DatMetadata:
    """Read the header fields out of the ``clrmamepro`` block.

    Scanning starts after the first line that begins with ``clrmamepro``
    and stops at the first line consisting solely of ``)``. Within that
    range, a line of the form ``<field> <value>`` sets the matching
    attribute, with surrounding whitespace and double quotes removed from
    the value. Unknown keys are ignored.

    Args:
        content: Full text of the DAT file.

    Returns:
        A DatMetadata instance; fields not present in the header keep
        their default empty string.
    """
    known_fields = (
        "name",
        "version",
        "description",
        "author",
        "homepage",
        "url",
    )
    result = DatMetadata()
    remaining = iter(content.split("\n"))

    # Phase 1: consume lines up to and including the header opener.
    for raw_line in remaining:
        if raw_line.strip().startswith("clrmamepro"):
            break

    # Phase 2: read "key value" lines until the block closes. If no
    # opener was found the iterator is already exhausted and this is a
    # no-op.
    for raw_line in remaining:
        text = raw_line.strip()
        if text == ")":
            break
        key, separator, rest = text.partition(" ")
        if separator and key in known_fields:
            setattr(result, key, rest.strip().strip('"'))

    return result


def _parse_rom_line(line: str, system: str) -> DatRom | None:
    """Parse a single ``rom ( ... )`` line into a DatRom.

    The text between the first ``(`` and the last ``)`` is tokenized and
    read as alternating key/value pairs; a trailing key with no value is
    ignored and a repeated key keeps its last value.

    Args:
        line: One stripped line of the DAT file holding a rom entry.
        system: System label to attach to the resulting entry.

    Returns:
        The parsed entry, or ``None`` when the parentheses are missing or
        misordered, or when the entry has no non-empty ``name``. A missing
        or non-integer ``size`` becomes ``0``. The ``crc``, ``md5`` and
        ``sha1`` values are all lower-cased so that digests compare equal
        regardless of the case used in the DAT file.
    """
    # rfind because filenames may contain parentheses like "(E).rom"
    start = line.find("(")
    end = line.rfind(")")
    # ``end <= start`` also covers a missing ")" (rfind returned -1).
    if start == -1 or end <= start:
        return None

    tokens = _tokenize(line[start + 1 : end].strip())
    # Even positions are keys, odd positions their values; zip drops an
    # unpaired trailing token.
    fields = dict(zip(tokens[::2], tokens[1::2]))

    name = fields.get("name", "")
    if not name:
        return None

    try:
        size = int(fields.get("size", "0"))
    except ValueError:
        size = 0

    return DatRom(
        name=name,
        size=size,
        crc32=fields.get("crc", "").lower(),
        md5=fields.get("md5", "").lower(),
        sha1=fields.get("sha1", "").lower(),
        system=system,
    )


def _tokenize(content: str) -> list[str]:
    """Tokenize DAT content, handling quoted strings."""
    tokens = []
    i = 0
    while i < len(content):
        while i < len(content) and content[i] in (" ", "\t"):
            i += 1
        if i >= len(content):
            break

        if content[i] == '"':
            i += 1
            start = i
            while i < len(content) and content[i] != '"':
                i += 1
            tokens.append(content[start:i])
            i += 1
        else:
            start = i
            while i < len(content) and content[i] not in (" ", "\t"):
                i += 1
            tokens.append(content[start:i])

    return tokens


def validate_dat_format(content: str) -> bool:
    """Check whether text looks like a clrmamepro DAT file.

    This is a cheap substring test, not a parse. All of the following
    must be present:
    - ``clrmamepro`` within the first 500 characters (the header)
    - a game block opener, ``game (`` or ``game(``
    - a rom entry opener, ``rom (`` or ``rom(``
    - a quoted comment, ``comment "``

    Args:
        content: Full text of the candidate file.

    Returns:
        ``True`` when every marker above is found, ``False`` otherwise.
    """
    has_header = "clrmamepro" in content[:500]
    # Accept the opener with or without a space, matching how rom
    # openers are treated.
    has_game = "game (" in content or "game(" in content
    has_rom = "rom (" in content or "rom(" in content
    has_comment = 'comment "' in content

    return has_header and has_game and has_rom and has_comment