# plexfin-compare/analyze_movies.py

#!/usr/bin/env python3
"""
Analyze movies missing from Plex to identify common patterns.
"""

import json
from pathlib import Path
from collections import defaultdict
import re

def analyze_missing_movies(report_file='movies_comparison_report.json', recent_year=2023):
    """Print a pattern analysis of the movies a report lists as missing from Plex.

    Loads the JSON report, reads its ``missing_from_plex`` mapping
    (title -> info dict) and prints statistics derived from each entry's
    ``path``: file extensions, naming patterns, edition/IMDB tags, path depth
    below a ``Movies`` path component, recent releases, special characters,
    unusual parent directory names and the Jellyfin status, followed by a
    list of recommendations.

    Args:
        report_file: Path to the JSON comparison report.
        recent_year: Files whose name contains a ``(YYYY)`` year greater than
            or equal to this value are reported as recent releases.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the report file cannot be opened.
        json.JSONDecodeError: If the report is not valid JSON.
        KeyError: If an entry has no ``path`` key.
    """
    # Decode explicitly as UTF-8 so non-ASCII titles/paths do not depend on
    # the platform's default locale encoding.
    with open(report_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    missing = data.get('missing_from_plex', {})

    if not missing:
        print("No missing movies found!")
        return

    total = len(missing)
    print(f"Analyzing {total} movies missing from Plex...\n")

    # Patterns are compiled once instead of being rebuilt for every file.
    special_char_re = re.compile(r'[^a-zA-Z0-9\s\-_\.\(\)\[\]\{\}]')
    year_re = re.compile(r'\((\d{4})\)')
    curly_re = re.compile(r'\{[^}]+\}')
    square_re = re.compile(r'\[[^\]]+\]')
    quality_re = re.compile(r'(1080p|2160p|720p|4K)', re.I)

    # Accumulators for the attributes reported below.
    extensions = defaultdict(int)
    has_special_chars = []
    has_edition_tag = []
    has_imdb_tag = []
    recently_released = []
    directory_names = defaultdict(int)
    path_depth = defaultdict(int)
    naming_patterns = defaultdict(int)

    for title, info in missing.items():
        path = Path(info['path'])
        filename = path.name
        filename_lower = filename.lower()

        # File extension ('' when the file has none).
        extensions[path.suffix.lower()] += 1

        # Anything outside letters, digits, whitespace, "-_." and brackets.
        if special_char_re.search(filename):
            has_special_chars.append((title, filename))

        # Edition / IMDB tags in curly braces.
        if '{edition-' in filename_lower:
            has_edition_tag.append((title, filename))
        if '{imdb-' in filename_lower:
            has_imdb_tag.append((title, filename))

        # Name of the directory that directly contains the file.
        directory_names[path.parent.name] += 1

        # Number of path components below the first 'Movies' component;
        # paths without such a component are left out of this statistic.
        parts = path.parts
        if 'Movies' in parts:
            path_depth[len(parts) - parts.index('Movies') - 1] += 1

        # Year in parentheses, e.g. "(2024)"; reused for the pattern count.
        year_match = year_re.search(filename)
        if year_match:
            year = int(year_match.group(1))
            if year >= recent_year:
                recently_released.append((title, year, filename))

        # Naming pattern analysis, e.g. "Movie (Year) {tags} - [quality]".
        # The order of these checks fixes the tie order in the printed table.
        if curly_re.search(filename):
            naming_patterns['has_curly_braces'] += 1
        if square_re.search(filename):
            naming_patterns['has_square_brackets'] += 1
        if year_match:
            naming_patterns['has_year'] += 1
        if quality_re.search(filename):
            naming_patterns['has_quality'] += 1

    # Print analysis
    print("="*80)
    print("ANALYSIS RESULTS")
    print("="*80)

    print("\n📊 FILE EXTENSIONS:")
    for ext, count in sorted(extensions.items(), key=lambda x: x[1], reverse=True):
        pct = (count / total) * 100
        print(f"   {ext or 'no extension':15} {count:4} ({pct:.1f}%)")

    print("\n🔤 NAMING PATTERNS:")
    for pattern, count in sorted(naming_patterns.items(), key=lambda x: x[1], reverse=True):
        pct = (count / total) * 100
        print(f"   {pattern:30} {count:4} ({pct:.1f}%)")

    if has_edition_tag:
        print(f"\n🏷️  EDITION TAGS: {len(has_edition_tag)}")
        print("   Movies with {edition-...} tags")
        for title, _ in has_edition_tag[:5]:
            print(f"   • {title[:60]}")
        if len(has_edition_tag) > 5:
            print(f"   ... and {len(has_edition_tag) - 5} more")

    if has_imdb_tag:
        print(f"\n🎬 IMDB TAGS: {len(has_imdb_tag)}")
        print("   Movies with {imdb-...} tags")

    print("\n📁 PATH DEPTH FROM 'Movies' FOLDER:")
    for depth, count in sorted(path_depth.items()):
        pct = (count / total) * 100
        print(f"   Depth {depth}: {count:4} ({pct:.1f}%)")

    if recently_released:
        print(f"\n📅 RECENT RELEASES ({recent_year}+): {len(recently_released)}")
        recent_sorted = sorted(recently_released, key=lambda x: x[1], reverse=True)
        for title, year, _ in recent_sorted[:10]:
            print(f"   {year} - {title[:60]}")
        if len(recently_released) > 10:
            print(f"   ... and {len(recently_released) - 10} more")

    if has_special_chars:
        print(f"\n⚠️  SPECIAL CHARACTERS: {len(has_special_chars)}")
        special_chars_found = set()
        for _, filename in has_special_chars:
            special_chars_found.update(special_char_re.findall(filename))
        print(f"   Characters found: {', '.join(repr(c) for c in sorted(special_chars_found))}")
        print("   Sample files:")
        for _, filename in has_special_chars[:5]:
            print(f"   • {filename[:75]}")

    # Check for empty/weird directories: an empty "()" pair, a blank name,
    # or a name shorter than three characters.
    print("\n📂 UNUSUAL DIRECTORY PATTERNS:")
    unusual_dirs = [
        (dirname, count)
        for dirname, count in directory_names.items()
        if '()' in dirname or dirname.strip() == '' or len(dirname) < 3
    ]

    if unusual_dirs:
        for dirname, count in unusual_dirs[:10]:
            print(f"   '{dirname}': {count} movies")
    else:
        print("   None found")

    # Find common directory patterns (only directories holding more than one
    # missing movie are printed).
    print("\n📁 DIRECTORIES WITH MOST MISSING MOVIES:")
    top_dirs = sorted(directory_names.items(), key=lambda x: x[1], reverse=True)[:10]
    for dirname, count in top_dirs:
        if count > 1:
            print(f"   {dirname}: {count} movies")

    # Jellyfin status. An entry without an 'in_jellyfin' key is counted as
    # not present in Jellyfin instead of aborting the report with KeyError.
    some_in_jellyfin = sum(1 for info in missing.values() if info.get('in_jellyfin'))
    all_in_jellyfin = some_in_jellyfin == total

    print("\n🎥 JELLYFIN STATUS:")
    print(f"   Movies also in Jellyfin: {some_in_jellyfin}/{total} ({(some_in_jellyfin/total*100):.1f}%)")

    if all_in_jellyfin:
        print("   ✓ ALL missing movies are visible in Jellyfin")
        print("   → This suggests a Plex scanning/indexing issue, not file access")
    elif some_in_jellyfin > 0:
        print(f"   ⚠ {total - some_in_jellyfin} movies not in Jellyfin either")
        print("   → These might have filesystem/permission issues")

    print("\n" + "="*80)
    print("RECOMMENDATIONS:")
    print("="*80)

    recommendations = []

    # Only flagged when more than half of the files carry {...} tags.
    if naming_patterns['has_curly_braces'] > total * 0.5:
        recommendations.append(
            "• Many files use curly braces {} in filenames\n"
            "  Plex might have trouble with certain special characters in names.\n"
            "  Consider: Plex 'Plex Dance' or manual metadata matching for these."
        )

    if has_edition_tag:
        recommendations.append(
            f"• {len(has_edition_tag)} movies have {{edition-...}} tags\n"
            "  Plex may not recognize edition tags in curly braces.\n"
            "  Consider: Use Plex's edition support or move tags to square brackets."
        )

    if unusual_dirs:
        recommendations.append(
            "• Some movies are in directories with unusual names (empty, very short)\n"
            "  Plex scanner might skip these.\n"
            "  Consider: Rename directories to proper movie names."
        )

    if recently_released:
        recommendations.append(
            f"• {len(recently_released)} movies are from {recent_year} or later\n"
            "  These might need manual metadata matching if too new for databases."
        )

    if all_in_jellyfin:
        recommendations.append(
            "• ALL missing movies are visible in Jellyfin\n"
            "  This strongly suggests the issue is with Plex's scanner/matcher, not file access.\n"
            "  Actions:\n"
            "    1. Force a full library refresh in Plex\n"
            "    2. Check Plex's 'Trash' for unmatched items\n"
            "    3. Try 'Scan Library Files' then 'Refresh All Metadata'\n"
            "    4. Check Plex server logs for scanner errors"
        )

    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. {rec}")

    if not recommendations:
        print("\n• Files appear normal. Try forcing a Plex library refresh.")

    print("\n" + "="*80)

# Script entry point: run the analysis against the default report file when
# this module is executed directly rather than imported.
if __name__ == "__main__":
    analyze_missing_movies()