535
modules/media_identifier.py
Normal file
535
modules/media_identifier.py
Normal file
@@ -0,0 +1,535 @@
|
||||
"""
|
||||
Media Identifier Module

Parses media filenames using guessit and matches them against TMDB for metadata enrichment.
Generates organized file paths for TV Shows and Movies.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
# Module-wide logger; every message from this file is emitted under 'MediaIdentifier'.
logger = get_logger('MediaIdentifier')

# Try to import guessit, but gracefully handle if not installed.
# GUESSIT_AVAILABLE records the outcome so MediaIdentifier.parse_filename can
# choose between the guessit parser and the regex fallback parser.
try:
    import guessit
    GUESSIT_AVAILABLE = True
except ImportError:
    GUESSIT_AVAILABLE = False
    # Warn once at import time; parsing still works through the fallback parser.
    logger.warning("guessit not installed - filename parsing will be limited")
|
||||
|
||||
|
||||
@dataclass
class ParsedMedia:
    """Metadata extracted from a single media filename."""

    # Required: the parsed title and the kind of media
    title: str
    media_type: str  # 'movie' or 'episode' (TV)

    # Optional details; None when the filename did not provide them
    year: Optional[int] = None
    season: Optional[int] = None
    episode: Optional[int] = None
    quality: Optional[str] = None
    source: Optional[str] = None
    codec: Optional[str] = None
    release_group: Optional[str] = None

    # The filename this record was parsed from
    original_filename: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict keyed by field name."""
        exported = (
            'title',
            'media_type',
            'year',
            'season',
            'episode',
            'quality',
            'source',
            'codec',
            'release_group',
            'original_filename',
        )
        return {key: getattr(self, key) for key in exported}
|
||||
|
||||
|
||||
@dataclass
class TMDBMatch:
    """A TMDB search result paired with the parsed media it was matched to."""

    # Core identification
    tmdb_id: int
    title: str
    original_title: Optional[str]
    media_type: str  # 'movie' or 'tv'

    # Optional descriptive metadata
    year: Optional[int] = None
    poster_path: Optional[str] = None
    overview: Optional[str] = None

    # Only populated for TV episodes
    season_number: Optional[int] = None
    episode_number: Optional[int] = None
    episode_title: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict keyed by field name."""
        exported = (
            'tmdb_id',
            'title',
            'original_title',
            'media_type',
            'year',
            'poster_path',
            'overview',
            'season_number',
            'episode_number',
            'episode_title',
        )
        return {key: getattr(self, key) for key in exported}
|
||||
|
||||
|
||||
class MediaIdentifier:
    """
    Identifies media from filenames and matches against TMDB.

    Uses guessit for filename parsing (with a regex fallback when guessit is
    missing or raises) and the TMDB API for metadata enrichment.
    """

    TMDB_BASE_URL = "https://api.themoviedb.org/3"
    TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

    # Seconds to wait for each TMDB HTTP request
    REQUEST_TIMEOUT = 30

    # Quality normalization patterns (tag found in a filename -> canonical label).
    # Order matters: the first tag present in the filename wins.
    QUALITY_MAP = {
        '2160p': '2160p',
        '4k': '2160p',
        'uhd': '2160p',
        '1080p': '1080p',
        'fullhd': '1080p',
        'fhd': '1080p',
        '720p': '720p',
        'hd': '720p',
        '480p': '480p',
        'sd': '480p',
        '360p': '360p',
    }

    # Fallback TV patterns, tried in order: "Show Name S01E02", then "Show Name 1x02"
    _TV_PATTERNS = (
        re.compile(r'^(.+?)[\s\.]+[Ss](\d{1,2})[Ee](\d{1,2})'),
        re.compile(r'^(.+?)[\s\.]+(\d{1,2})x(\d{1,2})'),
    )

    # Fallback movie pattern: "Movie Title (2023)" or "Movie Title 2023".
    # The year is limited to 1900-2099 and must end the token so that
    # resolution tags such as "1080p" are not mistaken for a year.
    _MOVIE_PATTERN = re.compile(r'^(.+?)[\s\.]+\(?((?:19|20)\d{2})\)?(?![0-9A-Za-z])')

    def __init__(self, tmdb_api_key: str):
        """
        Initialize the MediaIdentifier.

        Args:
            tmdb_api_key: TMDB API key for lookups
        """
        self.api_key = tmdb_api_key
        self.session = requests.Session()

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _first(value: Any, cast: Any = None) -> Any:
        """Collapse a possibly multi-valued property to a single value.

        Lists/tuples are reduced to their first element (None when empty).
        When ``cast`` is given it is applied to the value; a failed
        conversion yields None.
        """
        if isinstance(value, (list, tuple)):
            value = value[0] if value else None
        if value is None or cast is None:
            return value
        try:
            return cast(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _year_from_date(date_value: Any) -> Optional[int]:
        """Return the leading four-digit year of a 'YYYY-MM-DD' string, or None."""
        if not isinstance(date_value, str) or len(date_value) < 4:
            return None
        try:
            return int(date_value[:4])
        except ValueError:
            return None

    def _redact(self, message: Any) -> str:
        """Return ``message`` as text with the TMDB API key masked.

        The key travels as a query parameter, so exception messages that
        include the request URL would otherwise write it to the log.
        """
        text = str(message)
        if self.api_key:
            text = text.replace(str(self.api_key), '***')
        return text

    # ------------------------------------------------------------------
    # Filename parsing
    # ------------------------------------------------------------------

    def parse_filename(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Parse a media filename to extract metadata.

        Args:
            filename: The filename to parse (a leading path is stripped)

        Returns:
            ParsedMedia object with extracted information, or None if parsing fails
        """
        if not filename:
            return None

        # Strip path if present
        filename = Path(filename).name

        if GUESSIT_AVAILABLE:
            return self._parse_with_guessit(filename)
        return self._parse_fallback(filename)

    def _parse_with_guessit(self, filename: str) -> Optional["ParsedMedia"]:
        """Parse filename using the guessit library.

        Returns None when guessit finds no title. If guessit raises, the
        error is logged and the regex fallback parser is used instead.
        """
        try:
            result = guessit.guessit(filename)

            # guessit can report several values for one property (for example
            # a multi-episode file); keep the first so downstream formatting
            # and TMDB URLs always receive a scalar.
            title = self._first(result.get('title'), str)
            if not title:
                return None

            # Anything that is not an episode is treated as a movie
            media_type = 'episode' if result.get('type') == 'episode' else 'movie'

            quality = None
            screen_size = self._first(result.get('screen_size'))
            if screen_size:
                quality = self.QUALITY_MAP.get(str(screen_size).lower(), str(screen_size))

            return ParsedMedia(
                title=title,
                media_type=media_type,
                year=self._first(result.get('year'), int),
                season=self._first(result.get('season'), int),
                episode=self._first(result.get('episode'), int),
                quality=quality,
                source=self._first(result.get('source'), str),
                codec=self._first(result.get('video_codec'), str),
                release_group=self._first(result.get('release_group'), str),
                original_filename=filename,
            )
        except Exception as e:
            logger.error(f"guessit parsing failed for '{filename}': {e}")
            return self._parse_fallback(filename)

    def _parse_fallback(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Fallback parser when guessit is not available.

        Uses regex patterns to extract common media info: first the TV
        patterns (S01E02, then 1x02), otherwise a movie title with an
        optional year.

        Returns:
            ParsedMedia, or None when no title can be extracted or an
            unexpected error occurs.
        """
        try:
            # Remove extension, then replace common separators with spaces
            name = re.sub(r'[._]', ' ', Path(filename).stem)
            quality = self._extract_quality(name)

            for pattern in self._TV_PATTERNS:
                tv_match = pattern.match(name)
                if not tv_match:
                    continue
                title = tv_match.group(1).strip()
                if not title:
                    return None
                return ParsedMedia(
                    title=title,
                    media_type='episode',
                    season=int(tv_match.group(2)),
                    episode=int(tv_match.group(3)),
                    quality=quality,
                    original_filename=filename,
                )

            # Assume movie - extract title and year
            movie_match = self._MOVIE_PATTERN.match(name)
            if movie_match:
                title = movie_match.group(1).strip()
                year = int(movie_match.group(2))
            else:
                # No year found: use the whole cleaned name as the title
                title = ' '.join(name.split())
                year = None

            # Match the guessit path: no title means the parse failed
            if not title:
                return None

            return ParsedMedia(
                title=title,
                media_type='movie',
                year=year,
                quality=quality,
                original_filename=filename,
            )
        except Exception as e:
            logger.error(f"Fallback parsing failed for '{filename}': {e}")
            return None

    def _extract_quality(self, text: str) -> Optional[str]:
        """Extract a normalized quality label from text.

        Tags are matched as whole alphanumeric tokens (case-insensitive), so
        short tags such as 'sd' or 'hd' do not fire inside unrelated words
        like "Tuesday" or "HDR".

        Returns:
            The canonical quality from QUALITY_MAP, or None if no tag is present.
        """
        if not text:
            return None
        tokens = set(re.findall(r'[a-z0-9]+', text.lower()))
        for pattern, quality in self.QUALITY_MAP.items():
            if pattern in tokens:
                return quality
        return None

    # ------------------------------------------------------------------
    # TMDB matching
    # ------------------------------------------------------------------

    def match_tmdb(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """
        Match parsed media against TMDB.

        Args:
            parsed: ParsedMedia object from parse_filename

        Returns:
            TMDBMatch object if found, None otherwise
        """
        if not parsed:
            return None

        try:
            if parsed.media_type == 'episode':
                return self._match_tv_show(parsed)
            return self._match_movie(parsed)
        except Exception as e:
            logger.error(f"TMDB matching failed for '{parsed.title}': {self._redact(e)}")
            return None

    def _fetch_episode_title(self, show_id: Any, season: int, episode: int) -> Optional[str]:
        """Best-effort lookup of an episode name; returns None on any failure."""
        episode_url = f"{self.TMDB_BASE_URL}/tv/{show_id}/season/{season}/episode/{episode}"
        try:
            ep_response = self.session.get(
                episode_url,
                params={'api_key': self.api_key},
                timeout=self.REQUEST_TIMEOUT,
            )
            if ep_response.status_code == 200:
                return ep_response.json().get('name')
        except Exception as e:
            # The episode title is optional, so a failed lookup must not
            # discard the show match; record it for diagnosis only.
            logger.debug(f"TMDB episode lookup failed for show {show_id}: {self._redact(e)}")
        return None

    def _match_tv_show(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """Match a TV show episode against TMDB.

        Searches /search/tv by title (and year when known), takes the first
        result, and looks up the episode name when both season and episode
        are known. Returns None when there are no results or on any error.
        """
        try:
            params = {
                'api_key': self.api_key,
                'query': parsed.title,
                'page': 1,
            }
            if parsed.year:
                params['first_air_date_year'] = parsed.year

            response = self.session.get(
                f"{self.TMDB_BASE_URL}/search/tv",
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            results = response.json().get('results', [])
            if not results:
                logger.debug(f"No TMDB results for TV show: {parsed.title}")
                return None

            # Use the first (best) result
            show = results[0]
            show_id = show['id']

            # Compare against None so season 0 (specials) is still looked up
            episode_title = None
            if parsed.season is not None and parsed.episode is not None:
                episode_title = self._fetch_episode_title(show_id, parsed.season, parsed.episode)

            return TMDBMatch(
                tmdb_id=show_id,
                title=show.get('name', parsed.title),
                original_title=show.get('original_name'),
                media_type='tv',
                year=self._year_from_date(show.get('first_air_date', '')),
                poster_path=show.get('poster_path'),
                overview=show.get('overview'),
                season_number=parsed.season,
                episode_number=parsed.episode,
                episode_title=episode_title,
            )
        except Exception as e:
            logger.error(f"TMDB TV show matching failed: {self._redact(e)}")
            return None

    def _match_movie(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """Match a movie against TMDB.

        Searches /search/movie by title (and year when known) and takes the
        first result. Returns None when there are no results or on any error.
        """
        try:
            params = {
                'api_key': self.api_key,
                'query': parsed.title,
                'page': 1,
            }
            if parsed.year:
                params['year'] = parsed.year

            response = self.session.get(
                f"{self.TMDB_BASE_URL}/search/movie",
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            results = response.json().get('results', [])
            if not results:
                logger.debug(f"No TMDB results for movie: {parsed.title}")
                return None

            # Use the first (best) result
            movie = results[0]

            return TMDBMatch(
                tmdb_id=movie['id'],
                title=movie.get('title', parsed.title),
                original_title=movie.get('original_title'),
                media_type='movie',
                year=self._year_from_date(movie.get('release_date', '')),
                poster_path=movie.get('poster_path'),
                overview=movie.get('overview'),
            )
        except Exception as e:
            logger.error(f"TMDB movie matching failed: {self._redact(e)}")
            return None

    # ------------------------------------------------------------------
    # Path generation
    # ------------------------------------------------------------------

    def get_organized_path(
        self,
        match: "TMDBMatch",
        base_path: str,
        original_filename: str,
    ) -> str:
        """
        Generate an organized file path for the matched media.

        Args:
            match: TMDBMatch object with TMDB metadata
            base_path: Base directory for media storage
            original_filename: Original filename (for extension)

        Returns:
            Full organized path for the file
        """
        base = Path(base_path)

        # Get extension from original filename
        ext = Path(original_filename).suffix

        # Sanitize title for filesystem
        safe_title = self._sanitize_filename(match.title)

        if match.media_type == 'tv':
            # TV: {base}/TV Shows/{Show}/Season {XX}/{Show} - S{XX}E{XX} - {Episode Title}.{ext}
            show_dir = base / "TV Shows" / safe_title

            # Unknown season falls back to "Season 01"
            if match.season_number is not None:
                season_dir = show_dir / f"Season {match.season_number:02d}"
            else:
                season_dir = show_dir / "Season 01"

            # Unknown season or episode falls back to "S01E01"
            if match.season_number is not None and match.episode_number is not None:
                ep_part = f"S{match.season_number:02d}E{match.episode_number:02d}"
            else:
                ep_part = "S01E01"

            if match.episode_title:
                safe_ep_title = self._sanitize_filename(match.episode_title)
                filename = f"{safe_title} - {ep_part} - {safe_ep_title}{ext}"
            else:
                filename = f"{safe_title} - {ep_part}{ext}"

            return str(season_dir / filename)

        # Movie: {base}/Movies/{Title} ({Year})/{Title} ({Year}).{ext}
        if match.year:
            movie_folder = f"{safe_title} ({match.year})"
        else:
            movie_folder = safe_title

        return str(base / "Movies" / movie_folder / f"{movie_folder}{ext}")

    def _sanitize_filename(self, name: str) -> str:
        """
        Sanitize a string for use as a single path component.

        Removes characters that are invalid in filenames (including control
        characters), collapses whitespace, trims surrounding spaces and dots
        (so "." or ".." can never be produced), and limits the result to 100
        characters.

        Returns:
            The cleaned name, or "Unknown" when nothing usable remains.
        """
        if not name:
            return "Unknown"

        # Drop problematic characters, then normalize whitespace
        name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
        name = re.sub(r'\s+', ' ', name)
        name = name.strip(' .')

        # Limit length
        if len(name) > 100:
            name = name[:100].strip(' .')

        return name if name else "Unknown"

    # ------------------------------------------------------------------
    # Convenience entry point
    # ------------------------------------------------------------------

    def identify_and_match(
        self,
        filename: str,
        base_path: str = "/media",
    ) -> Dict[str, Any]:
        """
        Convenience method to parse, match, and get organized path in one call.

        Args:
            filename: The media filename to process
            base_path: Base directory for organized media

        Returns:
            Dict with keys 'success', 'filename', 'parsed', 'match',
            'organized_path' and 'error'. 'success' is True only when a
            TMDB match was found and a path was generated.
        """
        result = {
            'success': False,
            'filename': filename,
            'parsed': None,
            'match': None,
            'organized_path': None,
            'error': None,
        }

        try:
            # Parse filename
            parsed = self.parse_filename(filename)
            if not parsed:
                result['error'] = 'Failed to parse filename'
                return result

            result['parsed'] = parsed.to_dict()

            # Match against TMDB
            match = self.match_tmdb(parsed)
            if match:
                result['match'] = match.to_dict()

                # Get organized path
                result['organized_path'] = self.get_organized_path(match, base_path, filename)
                result['success'] = True
            else:
                result['error'] = 'No TMDB match found'

            return result
        except Exception as e:
            result['error'] = self._redact(e)
            logger.error(f"identify_and_match failed for '{filename}': {self._redact(e)}")
            return result
|
||||
Reference in New Issue
Block a user