Enhance audio verification features and improve code structure

- Added `ffmpeg` as a dependency in `flake.nix` and `package-cleanup.nix` for audio file property analysis.
- Updated `main.py` to include new functions for retrieving audio file properties using `ffprobe` and verifying audio matches with detailed confidence scoring.
- Refactored fingerprint comparison logic to improve accuracy and added logging for better traceability.
- Enhanced the `find_duplicate_singles` function to support audio verification results and confidence scores, providing clearer output for users.
This commit is contained in:
Danilo Reyes
2025-11-13 23:21:12 -06:00
parent 026c7fe0d8
commit 03e8eb6f4e
3 changed files with 450 additions and 120 deletions

View File

@@ -12,7 +12,7 @@ import os
import subprocess
import sys
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
import requests
from dotenv import load_dotenv
@@ -43,7 +43,7 @@ def get_json(
def get_trackfile_info(
base_url: str, track_file_id: int, headers: Dict[str, str]
) -> Optional[Dict]:
"""Get trackfile information including file path"""
"""Get trackfile information including file path and quality"""
try:
resp = requests.get(
f"{base_url.rstrip('/')}/api/v1/trackfile/{track_file_id}",
@@ -57,24 +57,62 @@ def get_trackfile_info(
return None
def get_track_info(
    base_url: str, track_id: int, headers: Dict[str, str]
) -> Optional[Dict]:
    """Fetch a single track record from the Lidarr API.

    The payload includes the MusicBrainz recording ID used for duplicate
    verification. Returns the decoded JSON dict, or None when the request
    fails for any reason (logged at warning level).
    """
    url = f"{base_url.rstrip('/')}/api/v1/track/{track_id}"
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logger.warning(f"Could not fetch track {track_id}: {e}")
        return None
def map_docker_path(file_path: str, docker_mount: Optional[str] = None) -> str:
    """Map a Docker container path to the corresponding host path.

    Args:
        file_path: Path as reported by the service (container-side).
        docker_mount: Mapping in ``container_path:host_path`` form, or None
            when no mapping is needed.

    Returns:
        The host-side path when ``file_path`` starts with the container
        prefix, otherwise ``file_path`` unchanged.
    """
    if not docker_mount:
        return file_path
    # Split on the first ":" only, so the host path may itself contain ":".
    container_path, host_path = docker_mount.split(":", 1)
    if not file_path.startswith(container_path):
        return file_path
    # Replace only the leading occurrence of the container prefix.
    return file_path.replace(container_path, host_path, 1)
def get_file_hash(
    file_path: str, docker_mount: Optional[str] = None, bytes_to_read: int = 1048576
) -> Optional[str]:
    """Return the MD5 hex digest of the first ``bytes_to_read`` bytes of a file.

    A partial hash is enough for a quick exact-duplicate screen without
    reading entire audio files. Returns None when the file is missing or
    cannot be read.
    """
    mapped_path = map_docker_path(file_path, docker_mount)
    if not os.path.exists(mapped_path):
        return None
    try:
        import hashlib

        with open(mapped_path, "rb") as handle:
            digest = hashlib.md5(handle.read(bytes_to_read))
        return digest.hexdigest()
    except Exception as e:
        logger.debug(f"Could not compute hash for {mapped_path}: {e}")
        return None
def get_audio_fingerprint(
file_path: str, docker_mount: Optional[str] = None
) -> Optional[str]:
"""Generate audio fingerprint using fpcalc (chromaprint)"""
) -> Optional[Tuple[str, int]]:
"""Generate audio fingerprint using fpcalc (chromaprint). Returns (fingerprint, duration_seconds)"""
mapped_path = map_docker_path(file_path, docker_mount)
logger.info(f"Generating fingerprint for: {mapped_path} (original: {file_path})")
logger.debug(f"Generating fingerprint for: {mapped_path} (original: {file_path})")
if not os.path.exists(mapped_path):
logger.warning(f"File not found: {mapped_path} (original: {file_path})")
@@ -83,7 +121,7 @@ def get_audio_fingerprint(
try:
logger.debug(f"Running fpcalc on: {mapped_path}")
result = subprocess.run(
["fpcalc", "-raw", mapped_path],
["fpcalc", "-json", "-length", "180", mapped_path],
capture_output=True,
text=True,
timeout=60,
@@ -93,15 +131,29 @@ def get_audio_fingerprint(
logger.warning(f"fpcalc failed for {mapped_path}: {result.stderr}")
return None
for line in result.stdout.split("\n"):
if line.startswith("FINGERPRINT="):
fingerprint = line.split("=", 1)[1]
logger.info(
f"Successfully generated fingerprint for {mapped_path} (length: {len(fingerprint)})"
)
return fingerprint
logger.warning(f"fpcalc output did not contain FINGERPRINT= for {mapped_path}")
return None
import json
try:
data = json.loads(result.stdout)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse fpcalc JSON output for {mapped_path}: {e}")
return None
fingerprint = data.get("fingerprint")
duration = data.get("duration")
if not fingerprint or duration is None:
logger.warning(
f"fpcalc output missing fingerprint or duration for {mapped_path}"
)
return None
# Fingerprint is already a string in JSON format, no conversion needed
logger.debug(
f"Successfully generated fingerprint for {mapped_path} (duration: {duration}s)"
)
return fingerprint, duration
except subprocess.TimeoutExpired:
logger.warning(f"fpcalc timeout for {mapped_path}")
return None
@@ -115,55 +167,64 @@ def get_audio_fingerprint(
return None
def _format_context(log_context: Optional[str]) -> str:
"""Format log context message"""
return f" ({log_context})" if log_context else ""
def compare_fingerprints(
    fp1_data: Optional[Tuple[str, int]],
    fp2_data: Optional[Tuple[str, int]],
    log_context: Optional[str] = None,
    return_message: bool = False,
) -> Union[bool, Tuple[bool, str]]:
    """Compare two audio fingerprints for similarity.

    Args:
        fp1_data: (fingerprint, duration_seconds) for the first file, or None.
        fp2_data: (fingerprint, duration_seconds) for the second file, or None.
        log_context: Optional human-readable context appended to log lines.
        return_message: When True, return (match, message) instead of a bare bool.

    Returns:
        match, or (match, message) if return_message=True.
    """
    if not fp1_data or not fp2_data:
        message = "Fingerprint comparison failed: missing fingerprint"
        logger.debug(f"{message}{_format_context(log_context)}")
        return (False, message) if return_message else False
    fp1, duration1 = fp1_data
    fp2, duration2 = fp2_data
    duration_diff = abs(duration1 - duration2)
    # Tracks differing by more than 5 seconds are assumed to be different
    # recordings (remaster, radio edit, live version, ...).
    if duration_diff > 5:
        message = f"Fingerprint comparison: duration mismatch ({duration1}s vs {duration2}s, diff: {duration_diff}s)"
        logger.debug(f"{message}{_format_context(log_context)}")
        return (False, message) if return_message else False
    if fp1 == fp2:
        message = "Fingerprint comparison: exact match"
        logger.debug(f"{message}{_format_context(log_context)}")
        return (True, message) if return_message else True
    # Fingerprints are base64-encoded strings from fpcalc -json
    # For similarity, we can use simple string similarity metrics
    try:
        # Use SequenceMatcher for string similarity
        from difflib import SequenceMatcher

        similarity = SequenceMatcher(None, fp1, fp2).ratio()
        # Adjust threshold based on duration difference: the closer the
        # durations, the more lenient the similarity requirement.
        if duration_diff <= 1:
            threshold = 0.90
        elif duration_diff <= 3:
            threshold = 0.93
        else:
            threshold = 0.95
        match = similarity >= threshold
        message = f"Fingerprint comparison: similarity={similarity:.3f}, duration_diff={duration_diff}s, threshold={threshold:.2f}, match={match}"
        logger.debug(f"{message}{_format_context(log_context)}")
        return (match, message) if return_message else match
    except Exception as e:
        message = (
            f"Fingerprint comparison failed: exception {type(e).__name__}: {str(e)}"
        )
        logger.debug(f"{message}{_format_context(log_context)}")
        return (False, message) if return_message else False
# DELETE FUNCTIONALITY COMMENTED OUT FOR SAFETY
@@ -201,14 +262,16 @@ def build_album_track_map(
for track in tracks_with_files:
title = track.get("title")
track_id = track.get("id")
track_file_id = track.get("trackFileId")
if not title or not track_file_id:
if not title or not track_file_id or not track_id:
continue
key = (artist_id, normalize_title(title))
album_track_map[key].append(
{
"album_id": album_id,
"album_title": album_title,
"track_id": track_id,
"track_file_id": track_file_id,
}
)
@@ -216,41 +279,254 @@ def build_album_track_map(
return album_track_map
def get_file_properties(
    file_path: str, docker_mount: Optional[str] = None
) -> Optional[Dict]:
    """Probe an audio file with ffprobe and return its key properties.

    Returns a dict with duration, size, bitrate, sample_rate, channels,
    codec and bit_depth, or None when the file is missing, ffprobe fails,
    or no audio stream is found.
    """
    mapped_path = map_docker_path(file_path, docker_mount)
    if not os.path.exists(mapped_path):
        return None
    command = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        "-show_streams",
        mapped_path,
    ]
    try:
        import json

        probe = subprocess.run(command, capture_output=True, text=True, timeout=30)
        if probe.returncode != 0:
            return None
        data = json.loads(probe.stdout)
        audio_stream = None
        for stream in data.get("streams", []):
            if stream.get("codec_type") == "audio":
                audio_stream = stream
                break
        if audio_stream is None:
            return None
        format_info = data.get("format", {})
        return {
            "duration": float(format_info.get("duration", 0)),
            "size": int(format_info.get("size", 0)),
            "bitrate": int(format_info.get("bit_rate", 0)),
            "sample_rate": int(audio_stream.get("sample_rate", 0)),
            "channels": int(audio_stream.get("channels", 0)),
            "codec": audio_stream.get("codec_name", ""),
            "bit_depth": int(audio_stream.get("bits_per_raw_sample", 0)),
        }
    except Exception as e:
        logger.debug(f"Could not get file properties for {mapped_path}: {e}")
        return None
def verify_audio_match(
    base_url: str,
    headers: Dict[str, str],
    single_track_id: int,
    single_track_file_id: int,
    album_track_id: int,
    album_track_file_id: int,
    docker_mount: Optional[str] = None,
) -> Tuple[bool, Optional[str], int]:
    """
    Verify that two track files contain the same audio using multiple verification methods.
    Returns (match, result_message, confidence_score)
    Confidence: 0-100, where 100 = definitely same, 0 = definitely different

    The score combines independent signals: MusicBrainz recording IDs,
    Lidarr quality, ffprobe file properties, a partial file hash, and a
    chromaprint fingerprint comparison. A total of 70+ counts as a match.
    """
    logger.debug(
        f"Verifying audio match: single trackFileId {single_track_file_id} vs album trackFileId {album_track_file_id}"
    )
    confidence_score = 0
    verification_results = []
    # Verification 1: MusicBrainz Recording ID (most reliable - 50 points)
    single_track_info = get_track_info(base_url, single_track_id, headers)
    album_track_info = get_track_info(base_url, album_track_id, headers)
    if single_track_info and album_track_info:
        single_mb_id = single_track_info.get("foreignRecordingId")
        album_mb_id = album_track_info.get("foreignRecordingId")
        if single_mb_id and album_mb_id:
            if single_mb_id == album_mb_id:
                confidence_score += 50
                verification_results.append("✓ MusicBrainz Recording ID match (+50)")
            else:
                # Different recordings are strong evidence against a duplicate.
                confidence_score -= 30
                verification_results.append(
                    "✗ Different MusicBrainz Recording IDs (-30)"
                )
        else:
            verification_results.append("⚠ MusicBrainz Recording ID unavailable")
    # Verification 2: File Properties (duration, size, bitrate, codec - 50 points)
    single_file_info = get_trackfile_info(base_url, single_track_file_id, headers)
    album_file_info = get_trackfile_info(base_url, album_track_file_id, headers)
    if not single_file_info or not album_file_info:
        logger.debug(
            f"Could not fetch track file info: single={single_file_info is not None}, album={album_file_info is not None}"
        )
        return False, "Could not fetch track file info", 0
    single_path = single_file_info.get("path")
    album_path = album_file_info.get("path")
    if not single_path or not album_path:
        logger.debug(
            f"Missing file paths: single_path={single_path is not None}, album_path={album_path is not None}"
        )
        return False, "Missing file paths", 0
    # Verification 1.5: Lidarr quality profile comparison (10 points)
    single_quality = single_file_info.get("quality", {}).get("quality", {})
    album_quality = album_file_info.get("quality", {}).get("quality", {})
    if single_quality and album_quality:
        single_quality_name = single_quality.get("name", "")
        album_quality_name = album_quality.get("name", "")
        if single_quality_name and album_quality_name:
            if single_quality_name == album_quality_name:
                confidence_score += 10
                verification_results.append(
                    f"✓ Same quality ({single_quality_name}) (+10)"
                )
            else:
                verification_results.append(
                    f"⚠ Different quality ({single_quality_name} vs {album_quality_name})"
                )
    single_props = get_file_properties(single_path, docker_mount)
    album_props = get_file_properties(album_path, docker_mount)
    if single_props and album_props:
        # Duration check (15 points)
        duration_diff = abs(single_props["duration"] - album_props["duration"])
        if duration_diff <= 1:
            confidence_score += 15
            verification_results.append(
                f"✓ Duration match ({duration_diff:.1f}s diff) (+15)"
            )
        elif duration_diff <= 3:
            confidence_score += 5
            verification_results.append(
                f"⚠ Close duration ({duration_diff:.1f}s diff) (+5)"
            )
        else:
            confidence_score -= 10
            verification_results.append(
                f"✗ Duration mismatch ({duration_diff:.1f}s diff) (-10)"
            )
        # File size check (15 points)
        size_ratio = min(single_props["size"], album_props["size"]) / max(
            single_props["size"], album_props["size"]
        )
        if size_ratio >= 0.95:
            confidence_score += 15
            verification_results.append(f"✓ File size match ({size_ratio:.2%}) (+15)")
        elif size_ratio >= 0.85:
            confidence_score += 5
            verification_results.append(f"⚠ Similar file size ({size_ratio:.2%}) (+5)")
        else:
            verification_results.append(f"⚠ Different file sizes ({size_ratio:.2%})")
        # Bitrate check (10 points)
        if single_props["bitrate"] > 0 and album_props["bitrate"] > 0:
            bitrate_ratio = min(single_props["bitrate"], album_props["bitrate"]) / max(
                single_props["bitrate"], album_props["bitrate"]
            )
            if bitrate_ratio >= 0.90:
                confidence_score += 10
                verification_results.append(
                    f"✓ Bitrate match ({bitrate_ratio:.2%}) (+10)"
                )
        # Sample rate check (5 points)
        if single_props["sample_rate"] == album_props["sample_rate"]:
            confidence_score += 5
            verification_results.append(
                f"✓ Sample rate match ({single_props['sample_rate']}Hz) (+5)"
            )
        # Codec check (5 points)
        if single_props["codec"] and album_props["codec"]:
            if single_props["codec"] == album_props["codec"]:
                confidence_score += 5
                verification_results.append(
                    f"✓ Same codec ({single_props['codec']}) (+5)"
                )
            else:
                verification_results.append(
                    f"⚠ Different codecs ({single_props['codec']} vs {album_props['codec']})"
                )
        # Channels check (5 points)
        if single_props["channels"] == album_props["channels"]:
            confidence_score += 5
            verification_results.append(
                f"✓ Same channels ({single_props['channels']}) (+5)"
            )
        else:
            verification_results.append(
                f"⚠ Different channels ({single_props['channels']} vs {album_props['channels']})"
            )
        # Bit depth check (5 points) - helps identify remasters
        if single_props["bit_depth"] > 0 and album_props["bit_depth"] > 0:
            if single_props["bit_depth"] == album_props["bit_depth"]:
                confidence_score += 5
                verification_results.append(
                    f"✓ Same bit depth ({single_props['bit_depth']}-bit) (+5)"
                )
            else:
                verification_results.append(
                    f"⚠ Different bit depths ({single_props['bit_depth']}-bit vs {album_props['bit_depth']}-bit)"
                )
    # Verification 3: File hash comparison (30 points) - quick exact duplicate check
    single_hash = get_file_hash(single_path, docker_mount)
    album_hash = get_file_hash(album_path, docker_mount)
    if single_hash and album_hash:
        if single_hash == album_hash:
            confidence_score += 30
            verification_results.append("✓ File hash match (exact duplicate) (+30)")
        else:
            verification_results.append("⚠ Different file hashes")
    # Verification 4: Chromaprint fingerprint (20 points)
    single_fp = get_audio_fingerprint(single_path, docker_mount)
    album_fp = get_audio_fingerprint(album_path, docker_mount)
    if single_fp and album_fp:
        log_context = f"single trackFileId {single_track_file_id} vs album trackFileId {album_track_file_id}"
        fp_match, fp_message = compare_fingerprints(
            single_fp, album_fp, log_context, return_message=True
        )
        if fp_match:
            confidence_score += 20
            verification_results.append("✓ Audio fingerprint match (+20)")
        else:
            verification_results.append(fp_message)
    else:
        verification_results.append("⚠ Audio fingerprint unavailable")
    # Final decision: 70+ out of 100 is considered a confirmed duplicate.
    match = confidence_score >= 70
    result_message = f"Confidence: {confidence_score}/100 | " + " | ".join(
        verification_results
    )
    return match, result_message, confidence_score
def find_duplicate_singles(
@@ -285,8 +561,9 @@ def find_duplicate_singles(
for track in tracks_with_files:
title = track.get("title")
track_id = track.get("id")
track_file_id = track.get("trackFileId")
if not title or not track_file_id:
if not title or not track_file_id or not track_id:
continue
key = (artist_id, normalize_title(title))
@@ -294,33 +571,10 @@ def find_duplicate_singles(
continue
duplicate_albums = album_track_map[key]
verified_albums = []
if not duplicate_albums:
continue
if verify_audio:
logger.info(
f"Verifying audio for '{title}' from single '{album_title}' against {len(duplicate_albums)} album track(s)..."
)
for album_track in duplicate_albums:
album_track_file_id = album_track["track_file_id"]
if verify_audio_match(
base_url,
headers,
track_file_id,
album_track_file_id,
docker_mount,
):
verified_albums.append(album_track)
logger.debug(
f"Audio match confirmed: single trackFileId {track_file_id} matches album '{album_track['album_title']}' trackFileId {album_track_file_id}"
)
else:
logger.debug(
f"Audio mismatch: single trackFileId {track_file_id} does not match album '{album_track['album_title']}' trackFileId {album_track_file_id}"
)
else:
verified_albums = duplicate_albums
if verified_albums:
if not verify_audio:
duplicates.append(
{
"artist_id": artist_id,
@@ -328,9 +582,60 @@ def find_duplicate_singles(
"single_album_title": album_title,
"track_title": title,
"single_track_file_id": track_file_id,
"duplicate_albums": verified_albums,
"duplicate_albums": duplicate_albums,
"verified_albums": duplicate_albums,
"verification_results": ["Audio verification disabled"],
"confidence_scores": [0],
}
)
continue
logger.debug(
f"Verifying audio for '{title}' from single '{album_title}' against {len(duplicate_albums)} album track(s)..."
)
verified_albums = []
verification_results = []
confidence_scores = []
for album_track in duplicate_albums:
album_track_id = album_track["track_id"]
album_track_file_id = album_track["track_file_id"]
match, result_message, confidence = verify_audio_match(
base_url,
headers,
track_id,
track_file_id,
album_track_id,
album_track_file_id,
docker_mount,
)
verification_results.append(result_message)
confidence_scores.append(confidence)
if not match:
logger.debug(
f"Audio mismatch: single trackFileId {track_file_id} does not match album '{album_track['album_title']}' trackFileId {album_track_file_id} (confidence: {confidence}/100)"
)
continue
verified_albums.append(album_track)
logger.debug(
f"Audio match confirmed: single trackFileId {track_file_id} matches album '{album_track['album_title']}' trackFileId {album_track_file_id} (confidence: {confidence}/100)"
)
duplicates.append(
{
"artist_id": artist_id,
"single_album_id": album_id,
"single_album_title": album_title,
"track_title": title,
"single_track_file_id": track_file_id,
"duplicate_albums": duplicate_albums,
"verified_albums": verified_albums,
"verification_results": verification_results,
"confidence_scores": confidence_scores,
}
)
return duplicates
@@ -416,12 +721,11 @@ def main() -> None:
artist_id = artist.get("id")
if not artist_id:
continue
artist_albums = get_json(
f"{base_url}/api/v1/album",
headers,
params={"artistId": artist_id},
albums.extend(
get_json(
f"{base_url}/api/v1/album", headers, params={"artistId": artist_id}
)
)
albums.extend(artist_albums)
if not albums:
logger.warning("No albums found in the library.")
@@ -433,20 +737,18 @@ def main() -> None:
verify_audio = not args.no_audio_verify
docker_mount = args.docker_mount if args.docker_mount else None
if verify_audio:
if docker_mount:
logger.info(
f"Scanning for duplicate singles with audio verification (Docker mount: {docker_mount})..."
)
else:
logger.info("Scanning for duplicate singles with audio verification...")
logger.info(
"NOTE: Audio verification requires 'fpcalc' (chromaprint) to be installed"
)
else:
if not verify_audio:
logger.info(
"Scanning for duplicate singles (audio verification disabled - using title matching only)..."
)
else:
mount_msg = f" (Docker mount: {docker_mount})" if docker_mount else ""
logger.info(
f"Scanning for duplicate singles with audio verification{mount_msg}..."
)
logger.info(
"NOTE: Audio verification requires 'fpcalc' (chromaprint) to be installed"
)
duplicates = find_duplicate_singles(
base_url,
@@ -461,13 +763,17 @@ def main() -> None:
logger.info("No duplicate singles found. The library appears clean.")
return
verified_count = sum(1 for dup in duplicates if dup.get("verified_albums"))
logger.info(
f"Found {len(duplicates)} single track(s) that are duplicates of album tracks:"
f"Found {len(duplicates)} single track(s) that are duplicates of album tracks ({verified_count} verified by audio fingerprint):"
)
for dup in duplicates:
artist_id = dup["artist_id"]
artist_name = artist_map.get(artist_id, f"Unknown (ID: {artist_id})")
duplicate_albums = dup["duplicate_albums"]
verified_albums = dup.get("verified_albums", duplicate_albums)
verification_results = dup.get("verification_results", [])
confidence_scores = dup.get("confidence_scores", [])
album_names = [album["album_title"] for album in duplicate_albums]
logger.info(f"Artist: {artist_name}")
@@ -475,9 +781,23 @@ def main() -> None:
logger.info(
f" Track: '{dup['track_title']}' (trackFileId: {dup['single_track_file_id']})"
)
for i, result in enumerate(verification_results):
confidence = confidence_scores[i] if i < len(confidence_scores) else 0
logger.info(f" {result}")
logger.info(
f" Already present on {len(duplicate_albums)} album(s): {', '.join(album_names)}"
)
if verify_audio:
if verified_albums:
verified_names = [album["album_title"] for album in verified_albums]
logger.info(
f" ✓ LIKELY safe to delete (audio verified on {len(verified_albums)} album(s): {', '.join(verified_names)})"
)
logger.info(
f" ⚠ CAUTION: Always check for different versions (remaster, radio edit, live, etc)"
)
else:
logger.info(f" ⚠ NOT safe to delete (audio verification failed)")
logger.info("")
# DELETE FUNCTIONALITY COMMENTED OUT FOR SAFETY