Files
media-downloader/modules/media_identifier.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

536 lines
17 KiB
Python

"""
Media Identifier Module
Parses media filenames using guessit and matches them against TMDB for metadata enrichment.
Generates organized file paths for TV Shows and Movies.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import requests
from modules.universal_logger import get_logger
logger = get_logger('MediaIdentifier')

# guessit is an optional dependency. GUESSIT_AVAILABLE records whether the
# import succeeded so that MediaIdentifier.parse_filename can choose between
# the guessit-based parser and the regex fallback.
try:
    import guessit
except ImportError:
    GUESSIT_AVAILABLE = False
    logger.warning("guessit not installed - filename parsing will be limited")
else:
    GUESSIT_AVAILABLE = True
@dataclass
class ParsedMedia:
    """Metadata extracted from a media filename.

    Only ``title`` and ``media_type`` are required; every other field keeps
    its default when the parser could not determine a value.
    """

    title: str
    media_type: str  # 'movie' or 'episode' (TV)
    year: Optional[int] = None
    season: Optional[int] = None
    episode: Optional[int] = None
    quality: Optional[str] = None
    source: Optional[str] = None
    codec: Optional[str] = None
    release_group: Optional[str] = None
    original_filename: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        keys = (
            'title',
            'media_type',
            'year',
            'season',
            'episode',
            'quality',
            'source',
            'codec',
            'release_group',
            'original_filename',
        )
        return {key: getattr(self, key) for key in keys}
@dataclass
class TMDBMatch:
    """A TMDB search hit selected for a parsed media file."""

    tmdb_id: int
    title: str
    original_title: Optional[str]
    media_type: str  # 'movie' or 'tv'
    year: Optional[int] = None
    poster_path: Optional[str] = None
    overview: Optional[str] = None
    # The remaining fields are only populated for TV episodes.
    season_number: Optional[int] = None
    episode_number: Optional[int] = None
    episode_title: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        keys = (
            'tmdb_id',
            'title',
            'original_title',
            'media_type',
            'year',
            'poster_path',
            'overview',
            'season_number',
            'episode_number',
            'episode_title',
        )
        return {key: getattr(self, key) for key in keys}
class MediaIdentifier:
    """
    Identifies media from filenames and matches against TMDB.

    Uses guessit for filename parsing (with a regex fallback when guessit is
    missing or raises) and the TMDB v3 HTTP API for metadata enrichment.
    """

    TMDB_BASE_URL = "https://api.themoviedb.org/3"
    TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

    # Quality normalization: filename token (lower-case) -> canonical value.
    # Entries whose key equals their value are explicit resolutions; the rest
    # are looser aliases.
    QUALITY_MAP = {
        '2160p': '2160p',
        '4k': '2160p',
        'uhd': '2160p',
        '1080p': '1080p',
        'fullhd': '1080p',
        'fhd': '1080p',
        '720p': '720p',
        'hd': '720p',
        '480p': '480p',
        'sd': '480p',
        '360p': '360p',
    }

    def __init__(self, tmdb_api_key: str):
        """
        Initialize the MediaIdentifier.

        Args:
            tmdb_api_key: TMDB API key for lookups
        """
        self.api_key = tmdb_api_key
        self.session = requests.Session()

    # ------------------------------------------------------------------
    # Filename parsing
    # ------------------------------------------------------------------

    def parse_filename(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Parse a media filename to extract metadata.

        Args:
            filename: The filename to parse; any directory part is dropped

        Returns:
            ParsedMedia object with extracted information, or None if
            parsing fails or there is no filename to parse
        """
        if not filename:
            return None
        # Strip path if present
        filename = Path(filename).name
        if not filename:
            # e.g. "/" has no final path component left to parse.
            return None
        if GUESSIT_AVAILABLE:
            return self._parse_with_guessit(filename)
        return self._parse_fallback(filename)

    @staticmethod
    def _first_value(value: Any) -> Any:
        """Collapse a list/tuple to its first element (None when empty)."""
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    @classmethod
    def _optional_int(cls, value: Any) -> Optional[int]:
        """Return value (or its first element) as an int, else None."""
        value = cls._first_value(value)
        if value is None or isinstance(value, bool):
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @classmethod
    def _optional_str(cls, value: Any) -> Optional[str]:
        """Return value (or its first element) as a str, else None."""
        value = cls._first_value(value)
        return None if value is None else str(value)

    def _parse_with_guessit(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Parse filename using the guessit library.

        Returns None when guessit finds no title. If guessit raises, the
        error is logged and the regex fallback parser is used instead.
        """
        try:
            result = guessit.guessit(filename)

            # guessit may report several values for one property (for
            # example a multi-episode file); keep the first so downstream
            # integer formatting keeps working.
            title = self._optional_str(result.get('title'))
            if not title:
                return None

            media_type = 'episode' if result.get('type') == 'episode' else 'movie'

            quality = None
            screen_size = self._optional_str(result.get('screen_size'))
            if screen_size:
                quality = self.QUALITY_MAP.get(screen_size.lower(), screen_size)

            return ParsedMedia(
                title=title,
                media_type=media_type,
                year=self._optional_int(result.get('year')),
                season=self._optional_int(result.get('season')),
                episode=self._optional_int(result.get('episode')),
                quality=quality,
                source=self._optional_str(result.get('source')),
                codec=self._optional_str(result.get('video_codec')),
                release_group=self._optional_str(result.get('release_group')),
                original_filename=filename,
            )
        except Exception as e:
            logger.error(f"guessit parsing failed for '{filename}': {e}")
            return self._parse_fallback(filename)

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Collapse whitespace and trim stray separators around a title."""
        return ' '.join(raw.split()).strip(' -')

    def _parse_fallback(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Fallback parser when guessit is not available.

        Uses regex patterns to recognise ``Show S01E02``, ``Show 1x02`` and
        ``Movie Title (2023)`` / ``Movie.Title.2023`` style names.

        Returns:
            ParsedMedia, or None when no title can be extracted or an
            unexpected error occurs (the error is logged)
        """
        try:
            # Remove extension, then turn common separators into spaces
            name = re.sub(r'[._]', ' ', Path(filename).stem)
            quality = self._extract_quality(name)

            tv_patterns = (
                # Show Name S01E02 (episode numbers may exceed two digits)
                r'^(.+?)[\s\.]+[Ss](\d{1,2})[Ee](\d{1,4})',
                # Show Name 1x02
                r'^(.+?)[\s\.]+(\d{1,2})x(\d{1,2})',
            )
            for pattern in tv_patterns:
                tv_match = re.match(pattern, name)
                if not tv_match:
                    continue
                title = self._clean_title(tv_match.group(1))
                if not title:
                    return None
                return ParsedMedia(
                    title=title,
                    media_type='episode',
                    season=int(tv_match.group(2)),
                    episode=int(tv_match.group(3)),
                    quality=quality,
                    original_filename=filename,
                )

            # Assume movie - extract title and year.
            # The year must look like 19xx/20xx and stand alone, so that a
            # resolution such as "1080p" is never mistaken for a year.
            movie_pattern = r'^(.+?)[\s\.]+\(?((?:19|20)\d{2})\)?(?![0-9A-Za-z])'
            movie_match = re.match(movie_pattern, name)
            if movie_match:
                title = self._clean_title(movie_match.group(1))
                year = int(movie_match.group(2))
            else:
                # No year: the title is whatever precedes the first quality
                # token, or the whole name when there is none.
                title = (
                    self._clean_title(self._strip_from_quality(name))
                    or self._clean_title(name)
                )
                year = None

            if not title:
                return None
            return ParsedMedia(
                title=title,
                media_type='movie',
                year=year,
                quality=quality,
                original_filename=filename,
            )
        except Exception as e:
            logger.error(f"Fallback parsing failed for '{filename}': {e}")
            return None

    @staticmethod
    def _quality_token_regex(token: str) -> "re.Pattern":
        """
        Build a case-insensitive regex matching token as a standalone word.

        The token must not be preceded by a letter or digit nor followed by
        a letter, so 'hd' does not match inside 'Birthday' or 'HDTV'.
        """
        return re.compile(
            r'(?<![a-z0-9])' + re.escape(token) + r'(?![a-z])',
            re.IGNORECASE,
        )

    def _strip_from_quality(self, name: str) -> str:
        """Return name truncated just before its earliest quality token."""
        cut = len(name)
        for token in self.QUALITY_MAP:
            hit = self._quality_token_regex(token).search(name)
            if hit:
                cut = min(cut, hit.start())
        return name[:cut]

    def _extract_quality(self, text: str) -> Optional[str]:
        """
        Extract a normalized quality from text.

        Explicit resolutions (e.g. '480p') win over aliases (e.g. 'hd');
        among aliases the first one in QUALITY_MAP order wins.

        Returns:
            Canonical quality string from QUALITY_MAP, or None if no
            quality token is present
        """
        if not text:
            return None
        alias_hit = None
        for token, quality in self.QUALITY_MAP.items():
            if not self._quality_token_regex(token).search(text):
                continue
            if token == quality:
                return quality
            if alias_hit is None:
                alias_hit = quality
        return alias_hit

    # ------------------------------------------------------------------
    # TMDB matching
    # ------------------------------------------------------------------

    def match_tmdb(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """
        Match parsed media against TMDB.

        Args:
            parsed: ParsedMedia object from parse_filename

        Returns:
            TMDBMatch object if found, None otherwise (including on error)
        """
        if not parsed:
            return None
        try:
            if parsed.media_type == 'episode':
                return self._match_tv_show(parsed)
            return self._match_movie(parsed)
        except Exception as e:
            logger.error(f"TMDB matching failed for '{parsed.title}': {e}")
            return None

    def _redact(self, error: Any) -> str:
        """Return str(error) with the API key masked.

        Request errors quote the full URL, and the key travels in the query
        string, so it would otherwise end up in the logs.
        """
        text = str(error)
        if self.api_key:
            text = text.replace(str(self.api_key), '***')
        return text

    @staticmethod
    def _year_from_date(date_text: Optional[str]) -> Optional[int]:
        """Return the leading four-digit year of a date string, or None."""
        if date_text and len(date_text) >= 4:
            try:
                return int(date_text[:4])
            except ValueError:
                return None
        return None

    def _search_first(
        self,
        kind: str,
        query: str,
        extra_params: Dict[str, Any],
    ) -> Optional[Dict[str, Any]]:
        """
        Run a TMDB ``/search/{kind}`` query and return its first result.

        Raises whatever the HTTP session raises, including for non-2xx
        responses; callers are expected to handle that.
        """
        params = {
            'api_key': self.api_key,
            'query': query,
            'page': 1,
        }
        params.update(extra_params)
        response = self.session.get(
            f"{self.TMDB_BASE_URL}/search/{kind}", params=params, timeout=30
        )
        response.raise_for_status()
        results = response.json().get('results', [])
        # Use the first (best) result
        return results[0] if results else None

    def _fetch_episode_title(
        self,
        show_id: Any,
        season: int,
        episode: int,
    ) -> Optional[str]:
        """Best-effort episode name lookup; returns None on any failure."""
        episode_url = f"{self.TMDB_BASE_URL}/tv/{show_id}/season/{season}/episode/{episode}"
        try:
            ep_response = self.session.get(
                episode_url, params={'api_key': self.api_key}, timeout=30
            )
            if ep_response.status_code == 200:
                return ep_response.json().get('name')
        except Exception as e:
            # A missing episode title must not fail the whole match.
            logger.debug(f"TMDB episode lookup failed: {self._redact(e)}")
        return None

    def _match_tv_show(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """
        Match a TV show episode against TMDB.

        Searches for the show by title (filtered by year when known), then
        looks up the episode name when both season and episode are known.
        """
        try:
            extra = {}
            if parsed.year:
                extra['first_air_date_year'] = parsed.year
            show = self._search_first('tv', parsed.title, extra)
            if show is None:
                logger.debug(f"No TMDB results for TV show: {parsed.title}")
                return None

            show_id = show['id']

            # "is not None" so season 0 (specials) and episode 0 still get
            # an episode lookup.
            episode_title = None
            if parsed.season is not None and parsed.episode is not None:
                episode_title = self._fetch_episode_title(
                    show_id, parsed.season, parsed.episode
                )

            return TMDBMatch(
                tmdb_id=show_id,
                title=show.get('name', parsed.title),
                original_title=show.get('original_name'),
                media_type='tv',
                year=self._year_from_date(show.get('first_air_date', '')),
                poster_path=show.get('poster_path'),
                overview=show.get('overview'),
                season_number=parsed.season,
                episode_number=parsed.episode,
                episode_title=episode_title,
            )
        except Exception as e:
            logger.error(f"TMDB TV show matching failed: {self._redact(e)}")
            return None

    def _match_movie(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """Match a movie against TMDB by title, filtered by year if known."""
        try:
            extra = {}
            if parsed.year:
                extra['year'] = parsed.year
            movie = self._search_first('movie', parsed.title, extra)
            if movie is None:
                logger.debug(f"No TMDB results for movie: {parsed.title}")
                return None

            return TMDBMatch(
                tmdb_id=movie['id'],
                title=movie.get('title', parsed.title),
                original_title=movie.get('original_title'),
                media_type='movie',
                year=self._year_from_date(movie.get('release_date', '')),
                poster_path=movie.get('poster_path'),
                overview=movie.get('overview'),
            )
        except Exception as e:
            logger.error(f"TMDB movie matching failed: {self._redact(e)}")
            return None

    # ------------------------------------------------------------------
    # Path generation
    # ------------------------------------------------------------------

    def get_organized_path(
        self,
        match: "TMDBMatch",
        base_path: str,
        original_filename: str,
    ) -> str:
        """
        Generate an organized file path for the matched media.

        Layouts:
            TV:    {base}/TV Shows/{Show}/Season {XX}/{Show} - S{XX}E{XX}[ - {Episode Title}]{ext}
            Movie: {base}/Movies/{Title} ({Year})/{Title} ({Year}){ext}

        A missing season or episode number defaults to 1.

        Args:
            match: TMDBMatch object with TMDB metadata
            base_path: Base directory for media storage
            original_filename: Original filename (for extension)

        Returns:
            Full organized path for the file
        """
        base = Path(base_path)
        # Get extension from original filename
        ext = Path(original_filename).suffix
        # Sanitize title for filesystem
        safe_title = self._sanitize_filename(match.title)

        if match.media_type == 'tv':
            # Default each number on its own so the season directory and
            # the SxxEyy tag in the filename always agree.
            season = match.season_number if match.season_number is not None else 1
            episode = match.episode_number if match.episode_number is not None else 1

            season_dir = base / "TV Shows" / safe_title / f"Season {season:02d}"
            ep_part = f"S{season:02d}E{episode:02d}"
            if match.episode_title:
                safe_ep_title = self._sanitize_filename(match.episode_title)
                filename = f"{safe_title} - {ep_part} - {safe_ep_title}{ext}"
            else:
                filename = f"{safe_title} - {ep_part}{ext}"
            return str(season_dir / filename)

        if match.year:
            movie_folder = f"{safe_title} ({match.year})"
        else:
            movie_folder = safe_title
        movie_dir = base / "Movies" / movie_folder
        return str(movie_dir / f"{movie_folder}{ext}")

    def _sanitize_filename(self, name: str) -> str:
        """
        Sanitize a string for use as a filename.

        Removes characters that are invalid in filenames, strips control
        characters, collapses whitespace and limits the length to 100
        characters. Names that end up empty or consist only of dots
        ("." / "..") become "Unknown" so they can never act as a relative
        path component.
        """
        if not name:
            return "Unknown"
        # Replace problematic characters
        name = re.sub(r'[<>:"/\\|?*]', '', name)
        name = re.sub(r'\s+', ' ', name)
        # Whitespace control characters are spaces by now; drop the rest.
        name = re.sub(r'[\x00-\x1f\x7f]', '', name)
        name = name.strip()
        # Limit length
        if len(name) > 100:
            name = name[:100].strip()
        if not name.strip('.'):
            return "Unknown"
        return name

    # ------------------------------------------------------------------
    # Convenience entry point
    # ------------------------------------------------------------------

    def identify_and_match(
        self,
        filename: str,
        base_path: str = "/media",
    ) -> Dict[str, Any]:
        """
        Convenience method to parse, match, and get organized path in one call.

        Args:
            filename: The media filename to process
            base_path: Base directory for organized media

        Returns:
            Dict with keys 'success', 'filename', 'parsed', 'match',
            'organized_path' and 'error'. 'success' is True only when a
            TMDB match was found and a path was generated.
        """
        result = {
            'success': False,
            'filename': filename,
            'parsed': None,
            'match': None,
            'organized_path': None,
            'error': None,
        }
        try:
            parsed = self.parse_filename(filename)
            if not parsed:
                result['error'] = 'Failed to parse filename'
                return result
            result['parsed'] = parsed.to_dict()

            match = self.match_tmdb(parsed)
            if not match:
                result['error'] = 'No TMDB match found'
                return result

            result['match'] = match.to_dict()
            result['organized_path'] = self.get_organized_path(match, base_path, filename)
            result['success'] = True
            return result
        except Exception as e:
            result['error'] = str(e)
            logger.error(f"identify_and_match failed for '{filename}': {e}")
            return result