Enhance audio verification features and improve code structure
- Added `ffmpeg` as a dependency in `flake.nix` and `package-cleanup.nix` for audio file property analysis.
- Updated `main.py` to include new functions for retrieving audio file properties using `ffprobe` and verifying audio matches with detailed confidence scoring.
- Refactored fingerprint comparison logic to improve accuracy and added logging for better traceability.
- Enhanced the `find_duplicate_singles` function to support audio verification results and confidence scores, providing clearer output for users.
This commit is contained in:
@@ -52,11 +52,13 @@
|
|||||||
))
|
))
|
||||||
pkgs.black
|
pkgs.black
|
||||||
pkgs.chromaprint
|
pkgs.chromaprint
|
||||||
|
pkgs.ffmpeg
|
||||||
];
|
];
|
||||||
shellHook = ''
|
shellHook = ''
|
||||||
echo "Python environment ready!"
|
echo "Python environment ready!"
|
||||||
echo "Run: python src/main.py"
|
echo "Run: python src/main.py"
|
||||||
echo "Format code with: black src/"
|
echo "Format code with: black src/"
|
||||||
|
echo "Audio verification tools: ffprobe (ffmpeg), fpcalc (chromaprint)"
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -12,9 +12,17 @@ pkgs.python3Packages.buildPythonApplication {
|
|||||||
requests
|
requests
|
||||||
python-dotenv
|
python-dotenv
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Runtime dependencies for audio verification
|
||||||
buildInputs = [
|
buildInputs = [
|
||||||
pkgs.chromaprint
|
pkgs.chromaprint
|
||||||
|
pkgs.ffmpeg
|
||||||
];
|
];
|
||||||
|
|
||||||
|
makeWrapperArgs = [
|
||||||
|
"--prefix PATH : ${pkgs.lib.makeBinPath [ pkgs.ffmpeg pkgs.chromaprint ]}"
|
||||||
|
];
|
||||||
|
|
||||||
meta = {
|
meta = {
|
||||||
mainProgram = "lidarr-cleanup-singles";
|
mainProgram = "lidarr-cleanup-singles";
|
||||||
description = "Identify duplicate single tracks in Lidarr";
|
description = "Identify duplicate single tracks in Lidarr";
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ import os
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
@@ -43,7 +43,7 @@ def get_json(
|
|||||||
def get_trackfile_info(
|
def get_trackfile_info(
|
||||||
base_url: str, track_file_id: int, headers: Dict[str, str]
|
base_url: str, track_file_id: int, headers: Dict[str, str]
|
||||||
) -> Optional[Dict]:
|
) -> Optional[Dict]:
|
||||||
"""Get trackfile information including file path"""
|
"""Get trackfile information including file path and quality"""
|
||||||
try:
|
try:
|
||||||
resp = requests.get(
|
resp = requests.get(
|
||||||
f"{base_url.rstrip('/')}/api/v1/trackfile/{track_file_id}",
|
f"{base_url.rstrip('/')}/api/v1/trackfile/{track_file_id}",
|
||||||
@@ -57,24 +57,62 @@ def get_trackfile_info(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_track_info(
|
||||||
|
base_url: str, track_id: int, headers: Dict[str, str]
|
||||||
|
) -> Optional[Dict]:
|
||||||
|
"""Get track information including MusicBrainz recording ID"""
|
||||||
|
try:
|
||||||
|
resp = requests.get(
|
||||||
|
f"{base_url.rstrip('/')}/api/v1/track/{track_id}",
|
||||||
|
headers=headers,
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.warning(f"Could not fetch track {track_id}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def map_docker_path(file_path: str, docker_mount: Optional[str] = None) -> str:
|
def map_docker_path(file_path: str, docker_mount: Optional[str] = None) -> str:
|
||||||
"""Map Docker container path to host path"""
|
"""Map Docker container path to host path"""
|
||||||
if not docker_mount:
|
if not docker_mount:
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
container_path, host_path = docker_mount.split(":", 1)
|
container_path, host_path = docker_mount.split(":", 1)
|
||||||
if file_path.startswith(container_path):
|
if not file_path.startswith(container_path):
|
||||||
mapped_path = file_path.replace(container_path, host_path, 1)
|
return file_path
|
||||||
return mapped_path
|
|
||||||
return file_path
|
return file_path.replace(container_path, host_path, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_hash(
|
||||||
|
file_path: str, docker_mount: Optional[str] = None, bytes_to_read: int = 1048576
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""Get partial file hash (first N bytes) for quick exact duplicate detection"""
|
||||||
|
mapped_path = map_docker_path(file_path, docker_mount)
|
||||||
|
if not os.path.exists(mapped_path):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
hasher = hashlib.md5()
|
||||||
|
with open(mapped_path, "rb") as f:
|
||||||
|
chunk = f.read(bytes_to_read)
|
||||||
|
hasher.update(chunk)
|
||||||
|
return hasher.hexdigest()
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Could not compute hash for {mapped_path}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_audio_fingerprint(
|
def get_audio_fingerprint(
|
||||||
file_path: str, docker_mount: Optional[str] = None
|
file_path: str, docker_mount: Optional[str] = None
|
||||||
) -> Optional[str]:
|
) -> Optional[Tuple[str, int]]:
|
||||||
"""Generate audio fingerprint using fpcalc (chromaprint)"""
|
"""Generate audio fingerprint using fpcalc (chromaprint). Returns (fingerprint, duration_seconds)"""
|
||||||
mapped_path = map_docker_path(file_path, docker_mount)
|
mapped_path = map_docker_path(file_path, docker_mount)
|
||||||
logger.info(f"Generating fingerprint for: {mapped_path} (original: {file_path})")
|
logger.debug(f"Generating fingerprint for: {mapped_path} (original: {file_path})")
|
||||||
|
|
||||||
if not os.path.exists(mapped_path):
|
if not os.path.exists(mapped_path):
|
||||||
logger.warning(f"File not found: {mapped_path} (original: {file_path})")
|
logger.warning(f"File not found: {mapped_path} (original: {file_path})")
|
||||||
@@ -83,7 +121,7 @@ def get_audio_fingerprint(
|
|||||||
try:
|
try:
|
||||||
logger.debug(f"Running fpcalc on: {mapped_path}")
|
logger.debug(f"Running fpcalc on: {mapped_path}")
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["fpcalc", "-raw", mapped_path],
|
["fpcalc", "-json", "-length", "180", mapped_path],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=60,
|
timeout=60,
|
||||||
@@ -93,15 +131,29 @@ def get_audio_fingerprint(
|
|||||||
logger.warning(f"fpcalc failed for {mapped_path}: {result.stderr}")
|
logger.warning(f"fpcalc failed for {mapped_path}: {result.stderr}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
for line in result.stdout.split("\n"):
|
import json
|
||||||
if line.startswith("FINGERPRINT="):
|
|
||||||
fingerprint = line.split("=", 1)[1]
|
try:
|
||||||
logger.info(
|
data = json.loads(result.stdout)
|
||||||
f"Successfully generated fingerprint for {mapped_path} (length: {len(fingerprint)})"
|
except json.JSONDecodeError as e:
|
||||||
)
|
logger.warning(f"Failed to parse fpcalc JSON output for {mapped_path}: {e}")
|
||||||
return fingerprint
|
return None
|
||||||
logger.warning(f"fpcalc output did not contain FINGERPRINT= for {mapped_path}")
|
|
||||||
return None
|
fingerprint = data.get("fingerprint")
|
||||||
|
duration = data.get("duration")
|
||||||
|
|
||||||
|
if not fingerprint or duration is None:
|
||||||
|
logger.warning(
|
||||||
|
f"fpcalc output missing fingerprint or duration for {mapped_path}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Fingerprint is already a string in JSON format, no conversion needed
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"Successfully generated fingerprint for {mapped_path} (duration: {duration}s)"
|
||||||
|
)
|
||||||
|
return fingerprint, duration
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
logger.warning(f"fpcalc timeout for {mapped_path}")
|
logger.warning(f"fpcalc timeout for {mapped_path}")
|
||||||
return None
|
return None
|
||||||
@@ -115,55 +167,64 @@ def get_audio_fingerprint(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _format_context(log_context: Optional[str]) -> str:
|
||||||
|
"""Format log context message"""
|
||||||
|
return f" ({log_context})" if log_context else ""
|
||||||
|
|
||||||
|
|
||||||
def compare_fingerprints(
|
def compare_fingerprints(
|
||||||
fp1: Optional[str], fp2: Optional[str], log_context: Optional[str] = None
|
fp1_data: Optional[Tuple[str, int]],
|
||||||
) -> bool:
|
fp2_data: Optional[Tuple[str, int]],
|
||||||
"""Compare two audio fingerprints for similarity"""
|
log_context: Optional[str] = None,
|
||||||
if not fp1 or not fp2:
|
return_message: bool = False,
|
||||||
context_msg = f" ({log_context})" if log_context else ""
|
) -> Union[bool, Tuple[bool, str]]:
|
||||||
logger.debug(f"Fingerprint comparison failed: missing fingerprint{context_msg}")
|
"""Compare two audio fingerprints for similarity. Returns match or (match, message) if return_message=True"""
|
||||||
return False
|
if not fp1_data or not fp2_data:
|
||||||
|
message = "Fingerprint comparison failed: missing fingerprint"
|
||||||
|
logger.debug(f"{message}{_format_context(log_context)}")
|
||||||
|
return (False, message) if return_message else False
|
||||||
|
|
||||||
|
fp1, duration1 = fp1_data
|
||||||
|
fp2, duration2 = fp2_data
|
||||||
|
|
||||||
|
duration_diff = abs(duration1 - duration2)
|
||||||
|
if duration_diff > 5:
|
||||||
|
message = f"Fingerprint comparison: duration mismatch ({duration1}s vs {duration2}s, diff: {duration_diff}s)"
|
||||||
|
logger.debug(f"{message}{_format_context(log_context)}")
|
||||||
|
return (False, message) if return_message else False
|
||||||
|
|
||||||
if fp1 == fp2:
|
if fp1 == fp2:
|
||||||
context_msg = f" ({log_context})" if log_context else ""
|
message = "Fingerprint comparison: exact match"
|
||||||
logger.info(f"Fingerprint comparison: exact match{context_msg}")
|
logger.debug(f"{message}{_format_context(log_context)}")
|
||||||
return True
|
return (True, message) if return_message else True
|
||||||
|
|
||||||
def decode_fingerprint(fp: str) -> List[int]:
|
|
||||||
return [int(x) for x in fp.split(",") if x.strip()]
|
|
||||||
|
|
||||||
|
# Fingerprints are base64-encoded strings from fpcalc -json
|
||||||
|
# For similarity, we can use simple string similarity metrics
|
||||||
try:
|
try:
|
||||||
f1 = decode_fingerprint(fp1)
|
# Calculate similarity based on string edit distance
|
||||||
f2 = decode_fingerprint(fp2)
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
if not f1 or not f2:
|
# Use SequenceMatcher for string similarity
|
||||||
context_msg = f" ({log_context})" if log_context else ""
|
similarity = SequenceMatcher(None, fp1, fp2).ratio()
|
||||||
logger.debug(
|
|
||||||
f"Fingerprint comparison failed: empty decoded fingerprint{context_msg}"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|
||||||
min_len = min(len(f1), len(f2))
|
# Adjust threshold based on duration difference
|
||||||
if min_len == 0:
|
if duration_diff <= 1:
|
||||||
context_msg = f" ({log_context})" if log_context else ""
|
threshold = 0.90
|
||||||
logger.debug(f"Fingerprint comparison failed: zero length{context_msg}")
|
elif duration_diff <= 3:
|
||||||
return False
|
threshold = 0.93
|
||||||
|
else:
|
||||||
|
threshold = 0.95
|
||||||
|
|
||||||
matches = sum(1 for i in range(min_len) if f1[i] == f2[i])
|
match = similarity >= threshold
|
||||||
similarity = matches / min_len
|
message = f"Fingerprint comparison: similarity={similarity:.3f}, duration_diff={duration_diff}s, threshold={threshold:.2f}, match={match}"
|
||||||
match = similarity >= 0.95
|
logger.debug(f"{message}{_format_context(log_context)}")
|
||||||
|
return (match, message) if return_message else match
|
||||||
context_msg = f" ({log_context})" if log_context else ""
|
except Exception as e:
|
||||||
logger.info(
|
message = (
|
||||||
f"Fingerprint comparison: similarity={similarity:.3f}, match={match}{context_msg}"
|
f"Fingerprint comparison failed: exception {type(e).__name__}: {str(e)}"
|
||||||
)
|
)
|
||||||
return match
|
logger.debug(f"{message}{_format_context(log_context)}")
|
||||||
except (ValueError, ZeroDivisionError) as e:
|
return (False, message) if return_message else False
|
||||||
context_msg = f" ({log_context})" if log_context else ""
|
|
||||||
logger.debug(
|
|
||||||
f"Fingerprint comparison failed: exception {type(e).__name__}{context_msg}"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# DELETE FUNCTIONALITY COMMENTED OUT FOR SAFETY
|
# DELETE FUNCTIONALITY COMMENTED OUT FOR SAFETY
|
||||||
@@ -201,14 +262,16 @@ def build_album_track_map(
|
|||||||
|
|
||||||
for track in tracks_with_files:
|
for track in tracks_with_files:
|
||||||
title = track.get("title")
|
title = track.get("title")
|
||||||
|
track_id = track.get("id")
|
||||||
track_file_id = track.get("trackFileId")
|
track_file_id = track.get("trackFileId")
|
||||||
if not title or not track_file_id:
|
if not title or not track_file_id or not track_id:
|
||||||
continue
|
continue
|
||||||
key = (artist_id, normalize_title(title))
|
key = (artist_id, normalize_title(title))
|
||||||
album_track_map[key].append(
|
album_track_map[key].append(
|
||||||
{
|
{
|
||||||
"album_id": album_id,
|
"album_id": album_id,
|
||||||
"album_title": album_title,
|
"album_title": album_title,
|
||||||
|
"track_id": track_id,
|
||||||
"track_file_id": track_file_id,
|
"track_file_id": track_file_id,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -216,41 +279,254 @@ def build_album_track_map(
|
|||||||
return album_track_map
|
return album_track_map
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_properties(
|
||||||
|
file_path: str, docker_mount: Optional[str] = None
|
||||||
|
) -> Optional[Dict]:
|
||||||
|
"""Get audio file properties using ffprobe"""
|
||||||
|
mapped_path = map_docker_path(file_path, docker_mount)
|
||||||
|
if not os.path.exists(mapped_path):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import json
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
[
|
||||||
|
"ffprobe",
|
||||||
|
"-v",
|
||||||
|
"quiet",
|
||||||
|
"-print_format",
|
||||||
|
"json",
|
||||||
|
"-show_format",
|
||||||
|
"-show_streams",
|
||||||
|
mapped_path,
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
data = json.loads(result.stdout)
|
||||||
|
audio_stream = next(
|
||||||
|
(s for s in data.get("streams", []) if s.get("codec_type") == "audio"), None
|
||||||
|
)
|
||||||
|
format_info = data.get("format", {})
|
||||||
|
|
||||||
|
if not audio_stream:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return {
|
||||||
|
"duration": float(format_info.get("duration", 0)),
|
||||||
|
"size": int(format_info.get("size", 0)),
|
||||||
|
"bitrate": int(format_info.get("bit_rate", 0)),
|
||||||
|
"sample_rate": int(audio_stream.get("sample_rate", 0)),
|
||||||
|
"channels": int(audio_stream.get("channels", 0)),
|
||||||
|
"codec": audio_stream.get("codec_name", ""),
|
||||||
|
"bit_depth": int(audio_stream.get("bits_per_raw_sample", 0)),
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Could not get file properties for {mapped_path}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def verify_audio_match(
|
def verify_audio_match(
|
||||||
base_url: str,
|
base_url: str,
|
||||||
headers: Dict[str, str],
|
headers: Dict[str, str],
|
||||||
|
single_track_id: int,
|
||||||
single_track_file_id: int,
|
single_track_file_id: int,
|
||||||
|
album_track_id: int,
|
||||||
album_track_file_id: int,
|
album_track_file_id: int,
|
||||||
docker_mount: Optional[str] = None,
|
docker_mount: Optional[str] = None,
|
||||||
) -> bool:
|
) -> Tuple[bool, Optional[str], int]:
|
||||||
"""Verify that two track files contain the same audio"""
|
"""
|
||||||
logger.info(
|
Verify that two track files contain the same audio using multiple verification methods.
|
||||||
|
Returns (match, result_message, confidence_score)
|
||||||
|
Confidence: 0-100, where 100 = definitely same, 0 = definitely different
|
||||||
|
"""
|
||||||
|
logger.debug(
|
||||||
f"Verifying audio match: single trackFileId {single_track_file_id} vs album trackFileId {album_track_file_id}"
|
f"Verifying audio match: single trackFileId {single_track_file_id} vs album trackFileId {album_track_file_id}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
confidence_score = 0
|
||||||
|
verification_results = []
|
||||||
|
|
||||||
|
# Verification 1: MusicBrainz Recording ID (most reliable - 50 points)
|
||||||
|
single_track_info = get_track_info(base_url, single_track_id, headers)
|
||||||
|
album_track_info = get_track_info(base_url, album_track_id, headers)
|
||||||
|
|
||||||
|
if single_track_info and album_track_info:
|
||||||
|
single_mb_id = single_track_info.get("foreignRecordingId")
|
||||||
|
album_mb_id = album_track_info.get("foreignRecordingId")
|
||||||
|
|
||||||
|
if single_mb_id and album_mb_id:
|
||||||
|
if single_mb_id == album_mb_id:
|
||||||
|
confidence_score += 50
|
||||||
|
verification_results.append("✓ MusicBrainz Recording ID match (+50)")
|
||||||
|
else:
|
||||||
|
confidence_score -= 30
|
||||||
|
verification_results.append(
|
||||||
|
"✗ Different MusicBrainz Recording IDs (-30)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
verification_results.append("⚠ MusicBrainz Recording ID unavailable")
|
||||||
|
|
||||||
|
# Verification 2: File Properties (duration, size, bitrate, codec - 50 points)
|
||||||
single_file_info = get_trackfile_info(base_url, single_track_file_id, headers)
|
single_file_info = get_trackfile_info(base_url, single_track_file_id, headers)
|
||||||
album_file_info = get_trackfile_info(base_url, album_track_file_id, headers)
|
album_file_info = get_trackfile_info(base_url, album_track_file_id, headers)
|
||||||
|
|
||||||
if not single_file_info or not album_file_info:
|
if not single_file_info or not album_file_info:
|
||||||
logger.debug(
|
return False, "Could not fetch track file info", 0
|
||||||
f"Could not fetch track file info: single={single_file_info is not None}, album={album_file_info is not None}"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|
||||||
single_path = single_file_info.get("path")
|
single_path = single_file_info.get("path")
|
||||||
album_path = album_file_info.get("path")
|
album_path = album_file_info.get("path")
|
||||||
|
|
||||||
if not single_path or not album_path:
|
if not single_path or not album_path:
|
||||||
logger.debug(
|
return False, "Missing file paths", 0
|
||||||
f"Missing file paths: single_path={single_path is not None}, album_path={album_path is not None}"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|
||||||
logger.info(f"Fetching fingerprints: single={single_path}, album={album_path}")
|
# Verification 1.5: Lidarr quality profile comparison (10 points)
|
||||||
|
single_quality = single_file_info.get("quality", {}).get("quality", {})
|
||||||
|
album_quality = album_file_info.get("quality", {}).get("quality", {})
|
||||||
|
|
||||||
|
if single_quality and album_quality:
|
||||||
|
single_quality_name = single_quality.get("name", "")
|
||||||
|
album_quality_name = album_quality.get("name", "")
|
||||||
|
|
||||||
|
if single_quality_name and album_quality_name:
|
||||||
|
if single_quality_name == album_quality_name:
|
||||||
|
confidence_score += 10
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Same quality ({single_quality_name}) (+10)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
verification_results.append(
|
||||||
|
f"⚠ Different quality ({single_quality_name} vs {album_quality_name})"
|
||||||
|
)
|
||||||
|
|
||||||
|
single_props = get_file_properties(single_path, docker_mount)
|
||||||
|
album_props = get_file_properties(album_path, docker_mount)
|
||||||
|
|
||||||
|
if single_props and album_props:
|
||||||
|
# Duration check (15 points)
|
||||||
|
duration_diff = abs(single_props["duration"] - album_props["duration"])
|
||||||
|
if duration_diff <= 1:
|
||||||
|
confidence_score += 15
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Duration match ({duration_diff:.1f}s diff) (+15)"
|
||||||
|
)
|
||||||
|
elif duration_diff <= 3:
|
||||||
|
confidence_score += 5
|
||||||
|
verification_results.append(
|
||||||
|
f"⚠ Close duration ({duration_diff:.1f}s diff) (+5)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
confidence_score -= 10
|
||||||
|
verification_results.append(
|
||||||
|
f"✗ Duration mismatch ({duration_diff:.1f}s diff) (-10)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# File size check (15 points)
|
||||||
|
size_ratio = min(single_props["size"], album_props["size"]) / max(
|
||||||
|
single_props["size"], album_props["size"]
|
||||||
|
)
|
||||||
|
if size_ratio >= 0.95:
|
||||||
|
confidence_score += 15
|
||||||
|
verification_results.append(f"✓ File size match ({size_ratio:.2%}) (+15)")
|
||||||
|
elif size_ratio >= 0.85:
|
||||||
|
confidence_score += 5
|
||||||
|
verification_results.append(f"⚠ Similar file size ({size_ratio:.2%}) (+5)")
|
||||||
|
else:
|
||||||
|
verification_results.append(f"⚠ Different file sizes ({size_ratio:.2%})")
|
||||||
|
|
||||||
|
# Bitrate check (10 points)
|
||||||
|
if single_props["bitrate"] > 0 and album_props["bitrate"] > 0:
|
||||||
|
bitrate_ratio = min(single_props["bitrate"], album_props["bitrate"]) / max(
|
||||||
|
single_props["bitrate"], album_props["bitrate"]
|
||||||
|
)
|
||||||
|
if bitrate_ratio >= 0.90:
|
||||||
|
confidence_score += 10
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Bitrate match ({bitrate_ratio:.2%}) (+10)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Sample rate check (5 points)
|
||||||
|
if single_props["sample_rate"] == album_props["sample_rate"]:
|
||||||
|
confidence_score += 5
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Sample rate match ({single_props['sample_rate']}Hz) (+5)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Codec check (5 points)
|
||||||
|
if single_props["codec"] and album_props["codec"]:
|
||||||
|
if single_props["codec"] == album_props["codec"]:
|
||||||
|
confidence_score += 5
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Same codec ({single_props['codec']}) (+5)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
verification_results.append(
|
||||||
|
f"⚠ Different codecs ({single_props['codec']} vs {album_props['codec']})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Channels check (5 points)
|
||||||
|
if single_props["channels"] == album_props["channels"]:
|
||||||
|
confidence_score += 5
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Same channels ({single_props['channels']}) (+5)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
verification_results.append(
|
||||||
|
f"⚠ Different channels ({single_props['channels']} vs {album_props['channels']})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bit depth check (5 points) - helps identify remasters
|
||||||
|
if single_props["bit_depth"] > 0 and album_props["bit_depth"] > 0:
|
||||||
|
if single_props["bit_depth"] == album_props["bit_depth"]:
|
||||||
|
confidence_score += 5
|
||||||
|
verification_results.append(
|
||||||
|
f"✓ Same bit depth ({single_props['bit_depth']}-bit) (+5)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
verification_results.append(
|
||||||
|
f"⚠ Different bit depths ({single_props['bit_depth']}-bit vs {album_props['bit_depth']}-bit)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verification 3: File hash comparison (30 points) - quick exact duplicate check
|
||||||
|
single_hash = get_file_hash(single_path, docker_mount)
|
||||||
|
album_hash = get_file_hash(album_path, docker_mount)
|
||||||
|
|
||||||
|
if single_hash and album_hash:
|
||||||
|
if single_hash == album_hash:
|
||||||
|
confidence_score += 30
|
||||||
|
verification_results.append(f"✓ File hash match (exact duplicate) (+30)")
|
||||||
|
else:
|
||||||
|
verification_results.append(f"⚠ Different file hashes")
|
||||||
|
|
||||||
|
# Verification 4: Chromaprint fingerprint (20 points)
|
||||||
single_fp = get_audio_fingerprint(single_path, docker_mount)
|
single_fp = get_audio_fingerprint(single_path, docker_mount)
|
||||||
album_fp = get_audio_fingerprint(album_path, docker_mount)
|
album_fp = get_audio_fingerprint(album_path, docker_mount)
|
||||||
|
|
||||||
log_context = f"single trackFileId {single_track_file_id} vs album trackFileId {album_track_file_id}"
|
if single_fp and album_fp:
|
||||||
return compare_fingerprints(single_fp, album_fp, log_context)
|
log_context = f"single trackFileId {single_track_file_id} vs album trackFileId {album_track_file_id}"
|
||||||
|
fp_match, fp_message = compare_fingerprints(
|
||||||
|
single_fp, album_fp, log_context, return_message=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if fp_match:
|
||||||
|
confidence_score += 20
|
||||||
|
verification_results.append(f"✓ Audio fingerprint match (+20)")
|
||||||
|
else:
|
||||||
|
verification_results.append(f"⚠ {fp_message}")
|
||||||
|
else:
|
||||||
|
verification_results.append("⚠ Audio fingerprint unavailable")
|
||||||
|
|
||||||
|
# Final decision
|
||||||
|
match = confidence_score >= 70
|
||||||
|
result_message = f"Confidence: {confidence_score}/100 | " + " | ".join(
|
||||||
|
verification_results
|
||||||
|
)
|
||||||
|
|
||||||
|
return match, result_message, confidence_score
|
||||||
|
|
||||||
|
|
||||||
def find_duplicate_singles(
|
def find_duplicate_singles(
|
||||||
@@ -285,8 +561,9 @@ def find_duplicate_singles(
|
|||||||
|
|
||||||
for track in tracks_with_files:
|
for track in tracks_with_files:
|
||||||
title = track.get("title")
|
title = track.get("title")
|
||||||
|
track_id = track.get("id")
|
||||||
track_file_id = track.get("trackFileId")
|
track_file_id = track.get("trackFileId")
|
||||||
if not title or not track_file_id:
|
if not title or not track_file_id or not track_id:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
key = (artist_id, normalize_title(title))
|
key = (artist_id, normalize_title(title))
|
||||||
@@ -294,33 +571,10 @@ def find_duplicate_singles(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
duplicate_albums = album_track_map[key]
|
duplicate_albums = album_track_map[key]
|
||||||
verified_albums = []
|
if not duplicate_albums:
|
||||||
|
continue
|
||||||
|
|
||||||
if verify_audio:
|
if not verify_audio:
|
||||||
logger.info(
|
|
||||||
f"Verifying audio for '{title}' from single '{album_title}' against {len(duplicate_albums)} album track(s)..."
|
|
||||||
)
|
|
||||||
for album_track in duplicate_albums:
|
|
||||||
album_track_file_id = album_track["track_file_id"]
|
|
||||||
if verify_audio_match(
|
|
||||||
base_url,
|
|
||||||
headers,
|
|
||||||
track_file_id,
|
|
||||||
album_track_file_id,
|
|
||||||
docker_mount,
|
|
||||||
):
|
|
||||||
verified_albums.append(album_track)
|
|
||||||
logger.debug(
|
|
||||||
f"Audio match confirmed: single trackFileId {track_file_id} matches album '{album_track['album_title']}' trackFileId {album_track_file_id}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"Audio mismatch: single trackFileId {track_file_id} does not match album '{album_track['album_title']}' trackFileId {album_track_file_id}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
verified_albums = duplicate_albums
|
|
||||||
|
|
||||||
if verified_albums:
|
|
||||||
duplicates.append(
|
duplicates.append(
|
||||||
{
|
{
|
||||||
"artist_id": artist_id,
|
"artist_id": artist_id,
|
||||||
@@ -328,9 +582,60 @@ def find_duplicate_singles(
|
|||||||
"single_album_title": album_title,
|
"single_album_title": album_title,
|
||||||
"track_title": title,
|
"track_title": title,
|
||||||
"single_track_file_id": track_file_id,
|
"single_track_file_id": track_file_id,
|
||||||
"duplicate_albums": verified_albums,
|
"duplicate_albums": duplicate_albums,
|
||||||
|
"verified_albums": duplicate_albums,
|
||||||
|
"verification_results": ["Audio verification disabled"],
|
||||||
|
"confidence_scores": [0],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"Verifying audio for '{title}' from single '{album_title}' against {len(duplicate_albums)} album track(s)..."
|
||||||
|
)
|
||||||
|
verified_albums = []
|
||||||
|
verification_results = []
|
||||||
|
confidence_scores = []
|
||||||
|
|
||||||
|
for album_track in duplicate_albums:
|
||||||
|
album_track_id = album_track["track_id"]
|
||||||
|
album_track_file_id = album_track["track_file_id"]
|
||||||
|
match, result_message, confidence = verify_audio_match(
|
||||||
|
base_url,
|
||||||
|
headers,
|
||||||
|
track_id,
|
||||||
|
track_file_id,
|
||||||
|
album_track_id,
|
||||||
|
album_track_file_id,
|
||||||
|
docker_mount,
|
||||||
|
)
|
||||||
|
verification_results.append(result_message)
|
||||||
|
confidence_scores.append(confidence)
|
||||||
|
|
||||||
|
if not match:
|
||||||
|
logger.debug(
|
||||||
|
f"Audio mismatch: single trackFileId {track_file_id} does not match album '{album_track['album_title']}' trackFileId {album_track_file_id} (confidence: {confidence}/100)"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
verified_albums.append(album_track)
|
||||||
|
logger.debug(
|
||||||
|
f"Audio match confirmed: single trackFileId {track_file_id} matches album '{album_track['album_title']}' trackFileId {album_track_file_id} (confidence: {confidence}/100)"
|
||||||
|
)
|
||||||
|
|
||||||
|
duplicates.append(
|
||||||
|
{
|
||||||
|
"artist_id": artist_id,
|
||||||
|
"single_album_id": album_id,
|
||||||
|
"single_album_title": album_title,
|
||||||
|
"track_title": title,
|
||||||
|
"single_track_file_id": track_file_id,
|
||||||
|
"duplicate_albums": duplicate_albums,
|
||||||
|
"verified_albums": verified_albums,
|
||||||
|
"verification_results": verification_results,
|
||||||
|
"confidence_scores": confidence_scores,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return duplicates
|
return duplicates
|
||||||
|
|
||||||
@@ -416,12 +721,11 @@ def main() -> None:
|
|||||||
artist_id = artist.get("id")
|
artist_id = artist.get("id")
|
||||||
if not artist_id:
|
if not artist_id:
|
||||||
continue
|
continue
|
||||||
artist_albums = get_json(
|
albums.extend(
|
||||||
f"{base_url}/api/v1/album",
|
get_json(
|
||||||
headers,
|
f"{base_url}/api/v1/album", headers, params={"artistId": artist_id}
|
||||||
params={"artistId": artist_id},
|
)
|
||||||
)
|
)
|
||||||
albums.extend(artist_albums)
|
|
||||||
|
|
||||||
if not albums:
|
if not albums:
|
||||||
logger.warning("No albums found in the library.")
|
logger.warning("No albums found in the library.")
|
||||||
@@ -433,20 +737,18 @@ def main() -> None:
|
|||||||
verify_audio = not args.no_audio_verify
|
verify_audio = not args.no_audio_verify
|
||||||
docker_mount = args.docker_mount if args.docker_mount else None
|
docker_mount = args.docker_mount if args.docker_mount else None
|
||||||
|
|
||||||
if verify_audio:
|
if not verify_audio:
|
||||||
if docker_mount:
|
|
||||||
logger.info(
|
|
||||||
f"Scanning for duplicate singles with audio verification (Docker mount: {docker_mount})..."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.info("Scanning for duplicate singles with audio verification...")
|
|
||||||
logger.info(
|
|
||||||
"NOTE: Audio verification requires 'fpcalc' (chromaprint) to be installed"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Scanning for duplicate singles (audio verification disabled - using title matching only)..."
|
"Scanning for duplicate singles (audio verification disabled - using title matching only)..."
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
mount_msg = f" (Docker mount: {docker_mount})" if docker_mount else ""
|
||||||
|
logger.info(
|
||||||
|
f"Scanning for duplicate singles with audio verification{mount_msg}..."
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"NOTE: Audio verification requires 'fpcalc' (chromaprint) to be installed"
|
||||||
|
)
|
||||||
|
|
||||||
duplicates = find_duplicate_singles(
|
duplicates = find_duplicate_singles(
|
||||||
base_url,
|
base_url,
|
||||||
@@ -461,13 +763,17 @@ def main() -> None:
|
|||||||
logger.info("No duplicate singles found. The library appears clean.")
|
logger.info("No duplicate singles found. The library appears clean.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
verified_count = sum(1 for dup in duplicates if dup.get("verified_albums"))
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Found {len(duplicates)} single track(s) that are duplicates of album tracks:"
|
f"Found {len(duplicates)} single track(s) that are duplicates of album tracks ({verified_count} verified by audio fingerprint):"
|
||||||
)
|
)
|
||||||
for dup in duplicates:
|
for dup in duplicates:
|
||||||
artist_id = dup["artist_id"]
|
artist_id = dup["artist_id"]
|
||||||
artist_name = artist_map.get(artist_id, f"Unknown (ID: {artist_id})")
|
artist_name = artist_map.get(artist_id, f"Unknown (ID: {artist_id})")
|
||||||
duplicate_albums = dup["duplicate_albums"]
|
duplicate_albums = dup["duplicate_albums"]
|
||||||
|
verified_albums = dup.get("verified_albums", duplicate_albums)
|
||||||
|
verification_results = dup.get("verification_results", [])
|
||||||
|
confidence_scores = dup.get("confidence_scores", [])
|
||||||
album_names = [album["album_title"] for album in duplicate_albums]
|
album_names = [album["album_title"] for album in duplicate_albums]
|
||||||
|
|
||||||
logger.info(f"Artist: {artist_name}")
|
logger.info(f"Artist: {artist_name}")
|
||||||
@@ -475,9 +781,23 @@ def main() -> None:
|
|||||||
logger.info(
|
logger.info(
|
||||||
f" Track: '{dup['track_title']}' (trackFileId: {dup['single_track_file_id']})"
|
f" Track: '{dup['track_title']}' (trackFileId: {dup['single_track_file_id']})"
|
||||||
)
|
)
|
||||||
|
for i, result in enumerate(verification_results):
|
||||||
|
confidence = confidence_scores[i] if i < len(confidence_scores) else 0
|
||||||
|
logger.info(f" {result}")
|
||||||
logger.info(
|
logger.info(
|
||||||
f" Already present on {len(duplicate_albums)} album(s): {', '.join(album_names)}"
|
f" Already present on {len(duplicate_albums)} album(s): {', '.join(album_names)}"
|
||||||
)
|
)
|
||||||
|
if verify_audio:
|
||||||
|
if verified_albums:
|
||||||
|
verified_names = [album["album_title"] for album in verified_albums]
|
||||||
|
logger.info(
|
||||||
|
f" ✓ LIKELY safe to delete (audio verified on {len(verified_albums)} album(s): {', '.join(verified_names)})"
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f" ⚠ CAUTION: Always check for different versions (remaster, radio edit, live, etc)"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(f" ⚠ NOT safe to delete (audio verification failed)")
|
||||||
logger.info("")
|
logger.info("")
|
||||||
|
|
||||||
# DELETE FUNCTIONALITY COMMENTED OUT FOR SAFETY
|
# DELETE FUNCTIONALITY COMMENTED OUT FOR SAFETY
|
||||||
|
|||||||
Reference in New Issue
Block a user