chore: deduplicate bios/ — remove 427 files, save 227 MB

True duplicates (same file in multiple dirs): removed copies, kept
canonical. MAME device clones (different names, identical content):
removed copies, created _mame_clones.json mapping for pack-time
assembly via deterministic ZIP rebuild. generate_pack.py resolves
clones transparently. 95 canonical ZIPs serve 392 clone names.
This commit is contained in:
Abdessamad Derraz
2026-03-24 21:35:50 +01:00
parent 8fcb86ba35
commit fb1007496d
451 changed files with 3088 additions and 2106 deletions

View File

@@ -1,21 +1,26 @@
#!/usr/bin/env python3
"""Deduplicate bios/ directory - keep one canonical file per unique SHA1.
"""Deduplicate bios/ directory — keep one canonical file per unique content.
Usage:
python scripts/dedup.py [--dry-run] [--bios-dir bios]
For each group of files with the same SHA1:
- Keeps the file with the shortest, most canonical path
- Removes duplicates
- Records all alternate names in database.json aliases
Two types of deduplication:
After dedup, generate_pack.py resolves files by hash and writes them
with the correct destination name - no duplicates needed on disk.
1. TRUE DUPLICATES: Same filename in different directories (e.g., naomi.zip
in both Arcade/ and Sega/Dreamcast/). Keeps one canonical copy, removes
the others. resolve_local_file finds files by hash, not path.
2. MAME DEVICE CLONES: Different filenames with identical content in the same
MAME directory (e.g., bbc_m87.zip and bbc_24bbc.zip are identical ZIPs).
These are NOT aliases — MAME loads each by its unique name. Instead of
deleting, we create a _mame_clones.json mapping so generate_pack.py can
pack all names from a single canonical file.
After dedup, run generate_db.py --force to rebuild database indexes.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from collections import defaultdict
@@ -27,9 +32,6 @@ from common import compute_hashes
DEFAULT_BIOS_DIR = "bios"
# Directories where deduplication must NOT be applied.
# RPG Maker RTP files are referenced by exact name in game scripts -
# removing a "duplicate" breaks games that reference that specific filename.
# ScummVM themes/extra also have name-dependent loading.
NODEDUP_DIRS = {
"RPG Maker",
"ScummVM",
@@ -39,29 +41,29 @@ NODEDUP_DIRS = {
def path_priority(path: str) -> tuple:
"""Lower score = better candidate to keep as canonical.
Prefers:
- Shorter paths
- Non-.variants paths
- Non-nested paths (fewer /)
- Lowercase names (more standard)
Prefers: shorter path, non-variant, non-MAME (system-specific over generic).
"""
parts = path.split("/")
is_variant = ".variants" in path
depth = len(parts)
name = os.path.basename(path)
# Prefer non-variant, shallow, short name
return (is_variant, depth, len(name), path)
parts = Path(path).parts
is_variant = ".variants" in parts
is_mame = "MAME" in parts
is_arcade = "Arcade" in parts
return (is_variant, is_mame, is_arcade, len(parts), path)
def _in_nodedup_dir(path: str) -> bool:
    """Return True when *path* lies under a directory exempt from dedup."""
    # Set intersection instead of a generator scan — same membership test.
    components = set(Path(path).parts)
    return not components.isdisjoint(NODEDUP_DIRS)
def _is_mame_dir(path: str) -> bool:
"""Check if a path is in a MAME-specific directory."""
parts = Path(path).parts
return "MAME" in parts or "Arcade" in parts
def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
"""Find all files grouped by SHA1, excluding no-dedup directories."""
sha1_to_paths = defaultdict(list)
sha1_to_paths: dict[str, list[str]] = defaultdict(list)
for root, dirs, files in os.walk(bios_dir):
for name in files:
@@ -75,7 +77,10 @@ def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
"""Remove duplicate files, keeping one canonical copy per SHA1.
"""Remove true duplicates, map MAME device clones.
True duplicates (same name, different dirs): removes copies.
MAME clones (different names, same content, same dir): creates mapping.
Returns dict of {sha1: {"canonical": path, "removed": [paths], "aliases": [names]}}
"""
@@ -83,20 +88,63 @@ def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
results = {}
total_removed = 0
total_saved = 0
mame_clones: dict[str, dict] = {} # canonical_name -> {sha1, clones: [names]}
for sha1, paths in sorted(sha1_groups.items()):
if len(paths) <= 1:
continue
paths.sort(key=path_priority)
canonical = paths[0]
duplicates = paths[1:]
all_names = set()
# Separate by filename — same name = true duplicate, different name = clone
by_name: dict[str, list[str]] = defaultdict(list)
for p in paths:
all_names.add(os.path.basename(p))
by_name[os.path.basename(p)].append(p)
# True duplicates: same filename in multiple directories
true_dupes_to_remove = []
for name, name_paths in by_name.items():
if len(name_paths) > 1:
name_paths.sort(key=path_priority)
true_dupes_to_remove.extend(name_paths[1:])
# MAME device clones: different filenames, same content, in MAME dirs
unique_names = sorted(by_name.keys())
if len(unique_names) > 1:
# Check if these are all MAME ZIPs
all_mame = all(
any(_is_mame_dir(p) for p in name_paths)
for name_paths in by_name.values()
)
if all_mame and all(n.endswith(".zip") for n in unique_names):
# Pick canonical (shortest name) and record clones
canonical_name = min(unique_names, key=len)
clone_names = sorted(n for n in unique_names if n != canonical_name)
if clone_names:
mame_clones[canonical_name] = {
"sha1": sha1,
"clones": clone_names,
"total_copies": sum(len(by_name[n]) for n in clone_names),
}
# Remove all clone copies (keep one per unique name for now,
# or remove all clones and rely on pack-time assembly)
for clone_name in clone_names:
for p in by_name[clone_name]:
true_dupes_to_remove.append(p)
if not true_dupes_to_remove:
continue
# Find the best canonical across all paths
all_paths = [p for p in paths if p not in true_dupes_to_remove]
if not all_paths:
# All copies were marked for removal — keep the best one
all_paths_sorted = sorted(paths, key=path_priority)
all_paths = [all_paths_sorted[0]]
true_dupes_to_remove = [p for p in paths if p != all_paths[0]]
canonical = sorted(all_paths, key=path_priority)[0]
canonical_name = os.path.basename(canonical)
all_names = set(os.path.basename(p) for p in paths)
alias_names = sorted(all_names - {canonical_name})
size = os.path.getsize(canonical)
@@ -107,49 +155,67 @@ def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
"aliases": alias_names,
}
for dup in duplicates:
for dup in true_dupes_to_remove:
if dup == canonical:
continue
if not os.path.exists(dup):
continue
if dry_run:
print(f" WOULD REMOVE: {dup}")
else:
os.remove(dup)
# Clean up empty .variants/ directories
parent = os.path.dirname(dup)
if os.path.basename(parent) == ".variants" and not os.listdir(parent):
os.rmdir(parent)
results[sha1]["removed"].append(dup)
total_removed += 1
total_saved += size
if alias_names:
if alias_names or true_dupes_to_remove:
action = "Would remove" if dry_run else "Removed"
print(f" {canonical_name} (keep: {canonical})")
print(f" {action} {len(duplicates)} copies, aliases: {alias_names}")
dn = os.path.basename(canonical)
print(f" {dn} (keep: {canonical})")
if true_dupes_to_remove:
print(f" {action} {len(true_dupes_to_remove)} copies")
if alias_names:
print(f" MAME clones: {alias_names}")
if not dry_run:
for root, dirs, files in os.walk(bios_dir, topdown=False):
if not files and not dirs:
try:
os.rmdir(root)
except OSError:
pass
print(f"\n{'Would remove' if dry_run else 'Removed'}: {total_removed} files")
prefix = "Would remove" if dry_run else "Removed"
print(f"\n{prefix}: {total_removed} files")
print(f"Space {'to save' if dry_run else 'saved'}: {total_saved / 1024 / 1024:.1f} MB")
# Write MAME clone mapping
if mame_clones:
clone_path = os.path.join(bios_dir, "_mame_clones.json")
if dry_run:
print(f"\nWould write MAME clone map: {clone_path}")
print(f" {len(mame_clones)} canonical ZIPs with "
f"{sum(len(v['clones']) for v in mame_clones.values())} clones")
else:
with open(clone_path, "w") as f:
json.dump(mame_clones, f, indent=2, sort_keys=True)
print(f"\nWrote MAME clone map: {clone_path}")
print(f" {len(mame_clones)} canonical ZIPs with "
f"{sum(len(v['clones']) for v in mame_clones.values())} clones")
return results
def main():
parser = argparse.ArgumentParser(description="Deduplicate BIOS files")
parser.add_argument("--dry-run", action="store_true")
def main() -> None:
parser = argparse.ArgumentParser(description="Deduplicate bios/ directory")
parser.add_argument("--dry-run", action="store_true", help="Preview without deleting")
parser.add_argument("--bios-dir", default=DEFAULT_BIOS_DIR)
args = parser.parse_args()
if not os.path.isdir(args.bios_dir):
print(f"Error: {args.bios_dir} not found", file=sys.stderr)
sys.exit(1)
print(f"Scanning {args.bios_dir}/ for duplicates...")
if args.dry_run:
print("(DRY RUN)\n")
deduplicate(args.bios_dir, args.dry_run)
deduplicate(args.bios_dir, dry_run=args.dry_run)
if not args.dry_run:
print("\nRun 'python scripts/generate_db.py --force' to rebuild database.")
if __name__ == "__main__":

View File

@@ -100,13 +100,39 @@ def _sanitize_path(raw: str) -> str:
return "/".join(parts)
def _load_mame_clones(bios_dir: str) -> dict[str, str]:
"""Load MAME clone mapping: clone_name -> canonical_name."""
clone_path = os.path.join(bios_dir, "_mame_clones.json")
if not os.path.exists(clone_path):
return {}
with open(clone_path) as f:
data = json.load(f)
# Invert: clone_name -> canonical_name
result = {}
for canonical, info in data.items():
for clone in info.get("clones", []):
result[clone] = canonical
return result
# Cache of loaded clone maps, keyed by bios_dir. Keying per directory fixes
# a stale-cache bug in the original single-slot cache: a call with a second
# --bios-dir would silently return the map loaded for the first directory.
_MAME_CLONE_MAP: dict[str, dict[str, str]] = {}


def _get_mame_clone_map(bios_dir: str) -> dict[str, str]:
    """Return the clone_name -> canonical_name map for *bios_dir*, cached.

    Loads ``_mame_clones.json`` at most once per distinct *bios_dir*;
    subsequent calls for the same directory hit the in-memory cache.
    """
    if bios_dir not in _MAME_CLONE_MAP:
        _MAME_CLONE_MAP[bios_dir] = _load_mame_clones(bios_dir)
    return _MAME_CLONE_MAP[bios_dir]
def resolve_file(file_entry: dict, db: dict, bios_dir: str,
zip_contents: dict | None = None,
dest_hint: str = "") -> tuple[str | None, str]:
"""Resolve a BIOS file with storage tiers and release asset fallback.
Wraps common.resolve_local_file() with pack-specific logic for
storage tiers (external/user_provided) and large file release assets.
storage tiers (external/user_provided), large file release assets,
and MAME clone mapping (deduped ZIPs).
"""
storage = file_entry.get("storage", "embedded")
if storage == "user_provided":
@@ -119,8 +145,17 @@ def resolve_file(file_entry: dict, db: dict, bios_dir: str,
if path:
return path, status
# Last resort: large files from GitHub release assets
# MAME clone fallback: if the file was deduped, resolve via canonical
name = file_entry.get("name", "")
clone_map = _get_mame_clone_map(bios_dir)
canonical = clone_map.get(name)
if canonical:
canonical_entry = {"name": canonical}
cpath, cstatus = resolve_local_file(canonical_entry, db, zip_contents)
if cpath:
return cpath, "mame_clone"
# Last resort: large files from GitHub release assets
sha1 = file_entry.get("sha1")
md5_raw = file_entry.get("md5", "")
md5_list = [m.strip().lower() for m in md5_raw.split(",") if m.strip()] if md5_raw else []