#!/usr/bin/env python3
"""
Filename Parser Module for Manual Import

Parses filenames based on configurable patterns to extract metadata.
"""

import copy
import functools
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


class FilenameParser:
    """
    Parse filenames using configurable patterns to extract metadata.

    Supported pattern tokens:
    - {username} - Username/source (alphanumeric, underscores, periods)
    - {YYYYMMDD} - Date as 8 digits (20251127)
    - {HHMMSS} - Time as 6 digits (172753)
    - {YYYYMMDD_HHMMSS} - Combined date_time with underscore
    - {id} - Media ID (any characters until next separator)
    - {description} - Text content (any characters until next separator)
    - {num} - Sequence number (digits)
    - {ext} - File extension (optional, auto-handled)

    Example patterns:
    - Instagram Stories: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - Instagram Posts: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - TikTok: "{YYYYMMDD}_{description}_{id}_{num}"
    """

    # Token definitions: token_name -> (regex_pattern, is_free_text).
    # Free-text tokens are always compiled as the lazy ``.+?`` so they stop at
    # the next literal separator; the ``$`` anchor still lets the final one
    # expand to the end of the name.
    TOKEN_PATTERNS = {
        'username': (r'[a-zA-Z0-9_.]+', False),
        'YYYYMMDD': (r'\d{8}', False),
        'HHMMSS': (r'\d{6}', False),
        'YYYYMMDD_HHMMSS': (r'\d{8}_\d{6}', False),
        'id': (r'.+', True),  # Free text - matches everything until separator
        'description': (r'.+', True),  # Free text
        'num': (r'\d+', False),
        'ext': (r'\.[a-zA-Z0-9]+', False),
    }

    # Matches one ``{token}`` placeholder inside a pattern string.
    _TOKEN_RE = re.compile(r'\{(\w+)\}')
    # Regex fragment for a trailing file extension such as ``.jpg``.
    _TRAILING_EXT = r'\.[a-zA-Z0-9]+'

    def __init__(self, pattern: str):
        """
        Initialize parser with a filename pattern.

        Args:
            pattern: Pattern string like "{username}-{YYYYMMDD}_{HHMMSS}-{id}"

        Raises:
            ValueError: If the pattern cannot be compiled to a regex.
        """
        self.pattern = pattern
        self.regex, self.token_order = self._compile_pattern(pattern)

        body, _ = self._build_regex_body(pattern)
        # Strict variant used on the extension-less stem: no optional
        # extension, so a dotted tail of the last token is not thrown away.
        self._stem_regex = re.compile(f'^{body}$')
        # Same language as ``self.regex`` but the trailing extension is
        # captured, so parse() can report what was really treated as one.
        self._full_regex = re.compile(
            f'^{body}(?P<trailing_ext>{self._TRAILING_EXT})?$'
        )

    @classmethod
    def _build_regex_body(cls, pattern: str) -> Tuple[str, List[str]]:
        """
        Translate a pattern string into an un-anchored regex body.

        Literal text between tokens is escaped; every token becomes one
        capturing group, in order of appearance.

        Returns:
            Tuple of (regex_body, list_of_token_names)
        """
        pieces: List[str] = []
        tokens: List[str] = []
        cursor = 0
        for found in cls._TOKEN_RE.finditer(pattern):
            pieces.append(re.escape(pattern[cursor:found.start()]))
            name = found.group(1)
            tokens.append(name)
            # Unknown tokens are treated as lazy free text.
            token_regex, is_free_text = cls.TOKEN_PATTERNS.get(name, (r'.+?', False))
            if is_free_text:
                token_regex = r'.+?'
            pieces.append(f'({token_regex})')
            cursor = found.end()
        pieces.append(re.escape(pattern[cursor:]))
        return ''.join(pieces), tokens

    def _compile_pattern(self, pattern: str) -> Tuple['re.Pattern', List[str]]:
        """
        Convert pattern string to compiled regex.

        The regex is anchored at both ends and tolerates one optional
        trailing file extension.

        Returns:
            Tuple of (compiled_regex, list_of_token_names)

        Raises:
            ValueError: If the generated regex does not compile.
        """
        body, tokens = self._build_regex_body(pattern)
        regex_pattern = f'^{body}(?:{self._TRAILING_EXT})?$'
        try:
            compiled = re.compile(regex_pattern)
        except re.error as e:
            raise ValueError(f"Invalid pattern '{pattern}': {e}") from e
        return compiled, tokens

    def parse(self, filename: str) -> Dict[str, Any]:
        """
        Parse a filename and extract metadata.

        Matching is attempted in this order:
        1. the stem (name without its suffix) against the strict regex,
        2. the stem against the extension-tolerant regex,
        3. the filename exactly as given against the extension-tolerant regex.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            Dictionary with extracted metadata:
            - username: str or None (lower-cased)
            - datetime: datetime object or None
            - media_id: str or None
            - description: str or None
            - num: int or None
            - extension: str or None (lower-cased, with leading dot)
            - valid: bool
            - error: str or None (if valid is False)
            - raw_values: dict of token name -> matched text
        """
        result: Dict[str, Any] = {
            'username': None,
            'datetime': None,
            'media_id': None,
            'description': None,
            'num': None,
            'extension': None,
            'valid': False,
            'error': None,
            'raw_values': {}
        }

        path = Path(filename)
        result['extension'] = path.suffix.lower() if path.suffix else None
        stem = path.stem

        # The stem already has its extension removed, so try the strict regex
        # first; the tolerant regex stays as a fallback so every name that
        # used to match still matches.
        match = self._stem_regex.match(stem) or self.regex.match(stem)
        matched_full_name = False
        if not match:
            match = self._full_regex.match(filename)
            matched_full_name = match is not None

        if not match:
            result['error'] = f"Filename doesn't match pattern: {self.pattern}"
            return result

        # zip() stops at the token list, ignoring the extra trailing_ext group.
        for token, value in zip(self.token_order, match.groups()):
            result['raw_values'][token] = value

            if token == 'username':
                result['username'] = value.lower()
            elif token == 'id':
                result['media_id'] = value
            elif token == 'description':
                result['description'] = value
            elif token == 'num':
                try:
                    result['num'] = int(value)
                except ValueError:
                    result['num'] = value

        if matched_full_name:
            # The stem did not match, so part of Path.suffix belonged to a
            # token. Report only what the regex treated as the extension:
            # the trailing group, or an explicit {ext} token.
            ext = match.group('trailing_ext') or result['raw_values'].get('ext')
            result['extension'] = ext.lower() if ext else None

        result['datetime'] = self._parse_datetime(result['raw_values'])
        result['valid'] = True
        return result

    def _parse_datetime(self, raw_values: Dict[str, str]) -> Optional[datetime]:
        """
        Parse datetime from extracted raw values.

        Supports:
        - YYYYMMDD_HHMMSS combined
        - YYYYMMDD + HHMMSS separate
        - YYYYMMDD only (time defaults to 00:00:00)

        Returns:
            A naive datetime, or None when no date token was captured or the
            digits do not form a valid date/time.
        """
        try:
            if 'YYYYMMDD_HHMMSS' in raw_values:
                return datetime.strptime(raw_values['YYYYMMDD_HHMMSS'], '%Y%m%d_%H%M%S')

            if 'YYYYMMDD' in raw_values:
                date_str = raw_values['YYYYMMDD']
                if 'HHMMSS' in raw_values:
                    return datetime.strptime(
                        f"{date_str}_{raw_values['HHMMSS']}", '%Y%m%d_%H%M%S'
                    )
                # Date only, no time
                return datetime.strptime(date_str, '%Y%m%d')
        except ValueError:
            return None
        return None

    def validate_pattern(self) -> Tuple[bool, Optional[str]]:
        """
        Validate the pattern is properly formed.

        Re-reads ``self.pattern``, so it reflects the attribute's current value.

        Returns:
            Tuple of (is_valid: bool, error_message: str or None)
        """
        try:
            tokens = self._TOKEN_RE.findall(self.pattern)
        except TypeError as e:
            # self.pattern was replaced with something that is not a string.
            return False, str(e)

        if not tokens:
            return False, "Pattern must contain at least one token"

        unknown_tokens = [t for t in tokens if t not in self.TOKEN_PATTERNS]
        if unknown_tokens:
            return False, f"Unknown tokens: {', '.join(unknown_tokens)}"

        return True, None


def create_parser(pattern: str) -> FilenameParser:
    """
    Factory function to create a FilenameParser.

    Args:
        pattern: Pattern string

    Returns:
        FilenameParser instance
    """
    return FilenameParser(pattern)


@functools.lru_cache(maxsize=256)
def _get_cached_parser(pattern: str) -> FilenameParser:
    """Return a shared parser for *pattern*; parse() does not mutate it."""
    return FilenameParser(pattern)


def parse_with_fallbacks(filename: str, patterns: List[str]) -> Dict[str, Any]:
    """
    Try parsing a filename with multiple patterns, return first successful match.

    Args:
        filename: Filename to parse
        patterns: List of pattern strings to try in order

    Returns:
        Dictionary with extracted metadata (same as FilenameParser.parse).
        On success it also carries 'matched_pattern'. On failure 'valid' is
        False and 'error' holds the last error seen.
    """
    last_error = None

    for pattern in patterns:
        try:
            # Parsers are cached so a batch import does not rebuild the same
            # regexes for every file.
            result = _get_cached_parser(pattern).parse(filename)
        except Exception as e:
            # Best effort: one bad pattern must not abort the remaining ones.
            last_error = str(e)
            continue
        if result['valid']:
            result['matched_pattern'] = pattern
            return result
        last_error = result.get('error')

    suffix = Path(filename).suffix
    return {
        'username': None,
        'datetime': None,
        'media_id': None,
        'description': None,
        'num': None,
        'extension': suffix.lower() if suffix else None,
        'valid': False,
        'error': last_error or f"Filename doesn't match any of {len(patterns)} patterns",
        'raw_values': {}
    }


# Instagram has many filename formats from different download sources.
# Order matters: the first matching pattern wins, and {id} absorbs any
# trailing text, so each "_n" variant must come BEFORE its generic counterpart.
INSTAGRAM_PATTERNS = [
    # Standard gallery-dl formats (with and without the _n suffix some scrapers add)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # underscores + _n suffix
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}',  # gallery-dl default (underscores)
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}_n',  # dashes + _n suffix
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}',  # alternative format (dashes around date)
    # Formats with hl=en language parameter (imginn/instaloader variants)
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # language tag + _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}',  # language tag, no _n suffix
    # Formats with leading underscore (some scrapers prefix underscore)
    # NOTE(review): {username} accepts a leading underscore, so the patterns
    # above match these names first and keep the underscore in the username.
    # Confirm whether these two should take priority before moving them up.
    '_{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # leading underscore + _n suffix
    '_{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # leading underscore + lang + _n
    # Formats with media shortcode before date (some browser extensions / save tools)
    '{username}-video-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-video-shortcode-date_hash
    '{username}-photo-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-photo-shortcode-date_hash
    '{username}-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-shortcode-date_hash (no type prefix, must be last)
]

# Predefined patterns for common platforms
PRESET_PATTERNS = {
    'instagram_stories': {
        'name': 'Instagram Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_AQOGOcCUbrMy...',
        'platform': 'instagram',
        'content_type': 'stories'
    },
    'instagram_posts': {
        'name': 'Instagram Posts',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'posts'
    },
    'instagram_reels': {
        'name': 'Instagram Reels',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'reels'
    },
    'tiktok_videos': {
        'name': 'TikTok Videos',
        'pattern': '{YYYYMMDD}_{description}_{id}_{num}',
        'example': '20251127_beautiful_sunset_1234567890_1',
        'platform': 'tiktok',
        'content_type': 'videos'
    },
    'snapchat_stories': {
        'name': 'Snapchat Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'example': 'username_20251127_172753_story123',
        'platform': 'snapchat',
        'content_type': 'stories'
    },
    'youtube_videos': {
        'name': 'YouTube Videos',
        'pattern': '{id}',
        'example': 'dQw4w9WgXcQ',
        'platform': 'youtube',
        'content_type': 'videos',
        'use_ytdlp': True
    }
}


def get_preset_patterns() -> Dict[str, Dict]:
    """
    Get all predefined filename patterns.

    Returns:
        A deep copy of PRESET_PATTERNS, so callers may modify a preset (or
        its 'alt_patterns' list) without altering the module-level data.
    """
    return copy.deepcopy(PRESET_PATTERNS)


def _demo() -> None:
    """Print the parse results for two sample filenames."""
    # Test with the user's example
    test_pattern = '{username}-{YYYYMMDD}_{HHMMSS}-{id}'
    test_filename = 'tiannahcgarcia-20251127_172753-AQOGOcCUbrMyAL0VXcQjnpHr6aY6U25C1SbaREqFJv7_MVXNVUvBd290MwlNFmwOTK5PuLx6DtK9cYoot0c5Y6a4vuDtOaug2heLank.jpg'

    parser = FilenameParser(test_pattern)
    result = parser.parse(test_filename)

    print(f"Pattern: {test_pattern}")
    print(f"Filename: {test_filename}")
    print(f"Result: {result}")
    print()

    # Test Instagram post format
    test_pattern2 = '{username}_{YYYYMMDD}_{HHMMSS}_{id}'
    test_filename2 = 'evalongoria_20251027_155842_18538674661006538.jpg'

    parser2 = FilenameParser(test_pattern2)
    result2 = parser2.parse(test_filename2)

    print(f"Pattern: {test_pattern2}")
    print(f"Filename: {test_filename2}")
    print(f"Result: {result2}")


# Test/demo function
if __name__ == '__main__':
    _demo()