Files
libretro/scripts/dedup.py
Abdessamad Derraz af74fffa14 refactor: fix code review findings across all scripts
Critical: stream large file downloads (OOM fix), fix basename match
in auto_fetch, include hashes in pack grouping fingerprint, handle
not_in_zip status in verify, fix escaped quotes in batocera parser.

Important: deduplicate shared group includes, catch coreinfo network
errors, fix NODEDUP path component match, fix CI word splitting on
spaces, replace bare except Exception in 3 files.

Minor: argparse in list_platforms, specific exceptions in download.py.
2026-03-17 15:16:51 +01:00

157 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""Deduplicate bios/ directory - keep one canonical file per unique SHA1.
Usage:
python scripts/dedup.py [--dry-run] [--bios-dir bios]
For each group of files with the same SHA1:
- Keeps the file with the shortest, most canonical path
- Removes duplicates
- Records all alternate names in database.json aliases
After dedup, generate_pack.py resolves files by hash and writes them
with the correct destination name - no duplicates needed on disk.
"""
from __future__ import annotations
import argparse
import os
import sys
from collections import defaultdict
from pathlib import Path
sys.path.insert(0, os.path.dirname(__file__))
from common import compute_hashes
# Directory scanned when --bios-dir is not supplied on the command line.
DEFAULT_BIOS_DIR = "bios"

# Directory names that are exempt from deduplication.
# RPG Maker RTP files are looked up by exact file name from game scripts, so
# deleting a byte-identical "duplicate" breaks any game that references that
# particular name. ScummVM themes/extra files are likewise loaded by name.
NODEDUP_DIRS = {"RPG Maker", "ScummVM"}
def path_priority(path: str) -> tuple:
    """Return a sort key ranking *path* as a candidate canonical copy.

    Keys are compared as tuples and the smallest key wins. The components,
    in order of significance, are:

    1. whether the substring ``.variants`` occurs anywhere in the path
       (``False`` sorts first, so non-variant paths are preferred);
    2. the number of ``/``-separated segments (shallower paths first);
    3. the length of the file name (shorter names first);
    4. the path string itself, as a deterministic tie-breaker.
    """
    # Counting separators is equivalent to len(path.split("/")).
    segment_count = path.count("/") + 1
    basename_length = len(os.path.basename(path))
    return (".variants" in path, segment_count, basename_length, path)
def _in_nodedup_dir(path: str) -> bool:
    """Return True when a component of *path* is listed in NODEDUP_DIRS.

    The comparison is made against whole path components as produced by
    ``Path(path).parts`` (which includes the final file name), not against
    substrings of the path.
    """
    components = Path(path).parts
    for protected in NODEDUP_DIRS:
        if protected in components:
            return True
    return False
def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
    """Walk *bios_dir* and bucket every eligible file path by its SHA1.

    Files for which ``_in_nodedup_dir`` returns True are skipped and are
    never hashed. The returned mapping contains an entry for every SHA1
    seen, including those with only a single path; callers decide which
    groups count as duplicates.
    """
    groups = defaultdict(list)
    for folder, _subdirs, filenames in os.walk(bios_dir):
        for filename in filenames:
            file_path = os.path.join(folder, filename)
            if not _in_nodedup_dir(file_path):
                digest = compute_hashes(file_path)["sha1"]
                groups[digest].append(file_path)
    return groups
def _remove_empty_dirs(top: str) -> None:
    """Remove every directory at or below *top* that is empty on disk.

    Directories are visited bottom-up. The ``dirs``/``files`` lists yielded
    by ``os.walk(..., topdown=False)`` are captured before the walk descends,
    so they still name children that were deleted earlier in this same pass.
    Each directory is therefore re-listed on disk, which lets whole chains of
    nested empty directories collapse in a single pass.
    """
    for root, _dirs, _files in os.walk(top, topdown=False):
        try:
            if not os.listdir(root):
                os.rmdir(root)
        except OSError:
            # Best-effort cleanup: a directory that cannot be listed or
            # removed is simply left in place.
            pass


def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
    """Remove duplicate files, keeping one canonical copy per SHA1.

    For each SHA1 that maps to more than one path, the paths are ordered with
    ``path_priority``; the first is kept and the rest are deleted (or only
    reported when *dry_run* is true). After a real run, directories left
    empty are removed as well.

    Args:
        bios_dir: Root directory to scan.
        dry_run: When true, print what would be removed without touching
            the filesystem.

    Returns:
        ``{sha1: {"canonical": path, "removed": [paths], "aliases": [names]}}``
        with one entry per SHA1 that had duplicates. ``"removed"`` lists the
        duplicate paths (in a dry run, the paths that would be removed) and
        ``"aliases"`` lists the sorted file names in the group that differ
        from the canonical file's name.
    """
    sha1_groups = scan_duplicates(bios_dir)
    results = {}
    total_removed = 0
    total_saved = 0

    for sha1, paths in sorted(sha1_groups.items()):
        if len(paths) <= 1:
            continue

        paths.sort(key=path_priority)
        canonical = paths[0]
        duplicates = paths[1:]

        canonical_name = os.path.basename(canonical)
        all_names = {os.path.basename(p) for p in paths}
        alias_names = sorted(all_names - {canonical_name})

        # All files in the group share a SHA1, so the canonical file's size
        # is used as the size of each duplicate.
        size = os.path.getsize(canonical)

        results[sha1] = {
            "canonical": canonical,
            "removed": [],
            "aliases": alias_names,
        }

        for dup in duplicates:
            if dry_run:
                print(f" WOULD REMOVE: {dup}")
            else:
                os.remove(dup)
            results[sha1]["removed"].append(dup)
            total_removed += 1
            total_saved += size

        # Per-group detail is only printed when at least one duplicate had
        # a different file name than the canonical copy.
        if alias_names:
            action = "Would remove" if dry_run else "Removed"
            print(f" {canonical_name} (keep: {canonical})")
            print(f" {action} {len(duplicates)} copies, aliases: {alias_names}")

    if not dry_run:
        _remove_empty_dirs(bios_dir)

    print(f"\n{'Would remove' if dry_run else 'Removed'}: {total_removed} files")
    print(f"Space {'to save' if dry_run else 'saved'}: {total_saved / 1024 / 1024:.1f} MB")
    return results
def main():
    """Command-line entry point: parse options and run the deduplication.

    Accepts ``--dry-run`` and ``--bios-dir``. Exits with status 1 and a
    message on stderr when the BIOS directory does not exist.
    """
    cli = argparse.ArgumentParser(description="Deduplicate BIOS files")
    cli.add_argument("--dry-run", action="store_true")
    cli.add_argument("--bios-dir", default=DEFAULT_BIOS_DIR)
    options = cli.parse_args()

    target = options.bios_dir
    if not os.path.isdir(target):
        print(f"Error: {target} not found", file=sys.stderr)
        sys.exit(1)

    print(f"Scanning {target}/ for duplicates...")
    if options.dry_run:
        print("(DRY RUN)\n")

    deduplicate(target, options.dry_run)


if __name__ == "__main__":
    main()