Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

382
modules/filename_parser.py Normal file
View File

@@ -0,0 +1,382 @@
#!/usr/bin/env python3
"""
Filename Parser Module for Manual Import
Parses filenames based on configurable patterns to extract metadata
"""
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
class FilenameParser:
    """
    Parse filenames using configurable patterns to extract metadata.

    Supported pattern tokens:
    - {username} - Username/source (alphanumeric, underscores, periods)
    - {YYYYMMDD} - Date as 8 digits (20251127)
    - {HHMMSS} - Time as 6 digits (172753)
    - {YYYYMMDD_HHMMSS} - Combined date_time with underscore
    - {id} - Media ID (any characters, matched lazily)
    - {description} - Text content (any characters, matched lazily)
    - {num} - Sequence number (digits)
    - {ext} - File extension including the leading dot

    Any other {name} token is accepted and treated as a lazy wildcard; use
    validate_pattern() to reject patterns containing unknown tokens.

    Example patterns:
    - Instagram Stories: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - Instagram Posts: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - TikTok: "{YYYYMMDD}_{description}_{id}_{num}"
    """

    # Token definitions: token_name -> (regex_pattern, is_wildcard).
    # Tokens flagged True are free-text wildcards; _compile_pattern always
    # emits them as the lazy ``.+?`` so they stop at the earliest position
    # from which the rest of the pattern can still match.
    TOKEN_PATTERNS = {
        'username': (r'[a-zA-Z0-9_.]+', False),
        'YYYYMMDD': (r'\d{8}', False),
        'HHMMSS': (r'\d{6}', False),
        'YYYYMMDD_HHMMSS': (r'\d{8}_\d{6}', False),
        'id': (r'.+', True),
        'description': (r'.+', True),
        'num': (r'\d+', False),
        'ext': (r'\.[a-zA-Z0-9]+', False),
    }

    # Matches one "{token}" placeholder inside a pattern string.
    _TOKEN_RE = re.compile(r'\{(\w+)\}')

    # Optional trailing file extension tolerated by the public ``regex``.
    _EXTENSION_SUFFIX = r'(?:\.[a-zA-Z0-9]+)?'

    def __init__(self, pattern: str):
        """
        Initialize parser with a filename pattern.

        Args:
            pattern: Pattern string like "{username}-{YYYYMMDD}_{HHMMSS}-{id}"

        Raises:
            ValueError: If the pattern cannot be compiled to a regex.
        """
        self.pattern = pattern
        self.regex, self.token_order = self._compile_pattern(pattern)
        # Extension-free variant used on the stem, where the real extension
        # has already been removed and must not be stripped a second time.
        self._stem_regex, _ = self._compile_pattern(pattern, allow_extension=False)

    def _compile_pattern(self, pattern: str, allow_extension: bool = True) -> tuple:
        """
        Convert pattern string to compiled regex.

        The pattern is walked once: literal text between tokens is escaped
        and each token is replaced by a capturing group. No temporary
        placeholder text is inserted, so literal text in the pattern can
        never be mistaken for a token.

        Args:
            pattern: Pattern string containing {token} placeholders.
            allow_extension: When True (default), the regex tolerates one
                optional ".ext" suffix after the pattern.

        Returns:
            Tuple of (compiled_regex, list_of_token_names)

        Raises:
            ValueError: If the generated regex fails to compile.
        """
        tokens = []
        pieces = ['^']
        cursor = 0
        for found in self._TOKEN_RE.finditer(pattern):
            # Literal separator text preceding this token.
            pieces.append(re.escape(pattern[cursor:found.start()]))
            token = found.group(1)
            tokens.append(token)
            # Unknown tokens fall back to a lazy wildcard.
            token_pattern, is_wildcard = self.TOKEN_PATTERNS.get(token, (r'.+?', False))
            if is_wildcard:
                token_pattern = r'.+?'
            pieces.append(f'({token_pattern})')
            cursor = found.end()
        pieces.append(re.escape(pattern[cursor:]))
        if allow_extension:
            pieces.append(self._EXTENSION_SUFFIX)
        pieces.append('$')
        regex_pattern = ''.join(pieces)

        try:
            compiled = re.compile(regex_pattern)
        except re.error as e:
            raise ValueError(f"Invalid pattern '{pattern}': {e}") from e
        return compiled, tokens

    def parse(self, filename: str) -> Dict[str, Any]:
        """
        Parse a filename and extract metadata.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            Dictionary with extracted metadata:
            - username: str (lower-cased) or None
            - datetime: datetime object or None
            - media_id: str or None
            - description: str or None
            - num: int or None
            - extension: str (lower-cased, with leading dot) or None
            - valid: bool
            - error: str or None (if valid is False)
            - raw_values: dict mapping each token name to its matched text
        """
        result = {
            'username': None,
            'datetime': None,
            'media_id': None,
            'description': None,
            'num': None,
            'extension': None,
            'valid': False,
            'error': None,
            'raw_values': {}
        }

        # Extract extension
        path = Path(filename)
        basename = path.stem
        result['extension'] = path.suffix.lower() if path.suffix else None

        # Match the stem strictly first so a dotted tail such as "v1.2" is
        # kept in the final token instead of being consumed as an extension.
        # The two looser attempts preserve the historical fallbacks.
        match = (
            self._stem_regex.match(basename)
            or self.regex.match(basename)
            or self.regex.match(filename)
        )
        if not match:
            result['error'] = f"Filename doesn't match pattern: {self.pattern}"
            return result

        # One capture group exists per token, in pattern order.
        for token, value in zip(self.token_order, match.groups()):
            result['raw_values'][token] = value
            # Map tokens to result fields
            if token == 'username':
                result['username'] = value.lower()
            elif token == 'id':
                result['media_id'] = value
            elif token == 'description':
                result['description'] = value
            elif token == 'num':
                try:
                    result['num'] = int(value)
                except ValueError:
                    result['num'] = value

        # Parse datetime from date/time tokens
        result['datetime'] = self._parse_datetime(result['raw_values'])
        result['valid'] = True
        return result

    def _parse_datetime(self, raw_values: Dict[str, str]) -> Optional[datetime]:
        """
        Parse datetime from extracted raw values.

        Supports:
        - YYYYMMDD_HHMMSS combined
        - YYYYMMDD + HHMMSS separate
        - YYYYMMDD only (time defaults to 00:00:00)

        Returns:
            A naive datetime, or None when no date token is present or the
            digits do not form a real calendar date/time.
        """
        try:
            if 'YYYYMMDD_HHMMSS' in raw_values:
                return datetime.strptime(raw_values['YYYYMMDD_HHMMSS'], '%Y%m%d_%H%M%S')
            if 'YYYYMMDD' in raw_values:
                date_str = raw_values['YYYYMMDD']
                if 'HHMMSS' in raw_values:
                    time_str = raw_values['HHMMSS']
                    return datetime.strptime(f'{date_str}_{time_str}', '%Y%m%d_%H%M%S')
                # Date only, no time
                return datetime.strptime(date_str, '%Y%m%d')
            return None
        except ValueError:
            # Digits matched the token regex but are not a valid date/time.
            return None

    def validate_pattern(self) -> tuple:
        """
        Validate the pattern is properly formed.

        Returns:
            Tuple of (is_valid: bool, error_message: str or None)
        """
        tokens = self._TOKEN_RE.findall(self.pattern)
        if not tokens:
            return False, "Pattern must contain at least one token"
        # Check all tokens are recognized
        unknown_tokens = [t for t in tokens if t not in self.TOKEN_PATTERNS]
        if unknown_tokens:
            return False, f"Unknown tokens: {', '.join(unknown_tokens)}"
        return True, None
def create_parser(pattern: str) -> FilenameParser:
    """
    Build a FilenameParser for the given pattern.

    Thin convenience wrapper around the FilenameParser constructor.

    Args:
        pattern: Pattern string containing {token} placeholders

    Returns:
        A new FilenameParser compiled from the pattern
    """
    parser = FilenameParser(pattern)
    return parser
def parse_with_fallbacks(filename: str, patterns: List[str]) -> Dict[str, Any]:
    """
    Parse a filename against several patterns and return the first match.

    Patterns are tried in list order. The first one that yields a valid
    result wins and is recorded under the extra key 'matched_pattern'.
    A pattern that raises while being compiled or applied is skipped and
    its message is remembered as the most recent error.

    Args:
        filename: Filename to parse
        patterns: Pattern strings to try, in priority order

    Returns:
        Dictionary shaped like the result of FilenameParser.parse. When
        nothing matches, 'valid' is False and 'error' carries the error
        from the last pattern tried (or a generic message if there is none).
    """
    failure_reason = None
    for candidate in patterns:
        try:
            outcome = FilenameParser(candidate).parse(filename)
        except Exception as exc:
            # Best effort: a broken pattern must not stop the remaining ones.
            failure_reason = str(exc)
            continue
        if outcome['valid']:
            outcome['matched_pattern'] = candidate
            return outcome
        failure_reason = outcome.get('error')

    # Nothing matched: report the failure in the same shape as a parse result.
    suffix = Path(filename).suffix
    if not failure_reason:
        failure_reason = f"Filename doesn't match any of {len(patterns)} patterns"
    return {
        'username': None,
        'datetime': None,
        'media_id': None,
        'description': None,
        'num': None,
        'extension': suffix.lower() if suffix else None,
        'valid': False,
        'error': failure_reason,
        'raw_values': {}
    }
# Instagram has many filename formats from different download sources.
# The list is ordered: PRESET_PATTERNS exposes it as 'alt_patterns', and
# parse_with_fallbacks returns the first pattern that matches, so earlier
# entries take priority over later ones.
#
# NOTE(review): {username} accepts underscores and {id} is a lazy wildcard
# that runs to the end of the name, so when tried in list order some entries
# look unreachable - confirm whether this is intended:
#   - the "_n suffix" and "leading underscore + _n" underscore formats are
#     also matched by the first entry (id then keeps the trailing "_n",
#     username keeps the leading "_");
#   - the "dashes + _n" format is also matched by the second entry;
#   - the "leading underscore + lang + _n" format is also matched by the
#     earlier "language tag + _n" entry.
INSTAGRAM_PATTERNS = [
    # Standard gallery-dl formats
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}', # gallery-dl default (underscores)
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}', # alternative format (dashes around date)
    # Formats with _n suffix (common from some scrapers)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}_n', # with _n suffix
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}_n', # dashes + _n suffix
    # Formats with hl=en language parameter (imginn/instaloader variants)
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n', # language tag + _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}', # language tag, no _n suffix
    # Formats with leading underscore (some scrapers prefix underscore)
    '_{username}_{YYYYMMDD}_{HHMMSS}_{id}_n', # leading underscore + _n suffix
    '_{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n', # leading underscore + lang + _n
    # Formats with media shortcode before date (some browser extensions / save tools)
    '{username}-video-{id}-{YYYYMMDD}_{HHMMSS}_{description}', # username-video-shortcode-date_hash
    '{username}-photo-{id}-{YYYYMMDD}_{HHMMSS}_{description}', # username-photo-shortcode-date_hash
    '{username}-{id}-{YYYYMMDD}_{HHMMSS}_{description}', # username-shortcode-date_hash (no type prefix, must be last)
]
def _instagram_preset(name: str, example: str, content_type: str) -> Dict[str, Any]:
    """Build one Instagram preset; all of them share the pattern and fallbacks."""
    return {
        'name': name,
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        # Same list object for every Instagram preset (not a copy).
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': example,
        'platform': 'instagram',
        'content_type': content_type,
    }


# Predefined patterns for common platforms, keyed by preset identifier.
# Each entry carries a display name, the primary filename pattern, an example
# filename, and the platform/content type it applies to. Instagram entries
# additionally list fallback patterns under 'alt_patterns'.
PRESET_PATTERNS = {
    'instagram_stories': _instagram_preset(
        'Instagram Stories',
        'evalongoria_20251127_172753_AQOGOcCUbrMy...',
        'stories',
    ),
    'instagram_posts': _instagram_preset(
        'Instagram Posts',
        'evalongoria_20251127_172753_18538674661006538',
        'posts',
    ),
    'instagram_reels': _instagram_preset(
        'Instagram Reels',
        'evalongoria_20251127_172753_18538674661006538',
        'reels',
    ),
    'tiktok_videos': {
        'name': 'TikTok Videos',
        'pattern': '{YYYYMMDD}_{description}_{id}_{num}',
        'example': '20251127_beautiful_sunset_1234567890_1',
        'platform': 'tiktok',
        'content_type': 'videos',
    },
    'snapchat_stories': {
        'name': 'Snapchat Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'example': 'username_20251127_172753_story123',
        'platform': 'snapchat',
        'content_type': 'stories',
    },
    'youtube_videos': {
        'name': 'YouTube Videos',
        'pattern': '{id}',
        'example': 'dQw4w9WgXcQ',
        'platform': 'youtube',
        'content_type': 'videos',
        'use_ytdlp': True,
    },
}
def get_preset_patterns() -> Dict[str, Dict]:
    """
    Get all predefined filename patterns.

    Returns:
        An independent deep copy of PRESET_PATTERNS. The nested preset
        dictionaries and their 'alt_patterns' lists are copied too, so
        callers may modify the result without altering the module-level
        presets or INSTAGRAM_PATTERNS.
    """
    # Local import keeps this block self-contained.
    import copy

    # A shallow .copy() would still hand out the shared nested dicts/lists.
    return copy.deepcopy(PRESET_PATTERNS)
# Manual demo: parse a couple of sample filenames and print the results.
if __name__ == '__main__':
    demo_cases = [
        # The user's example (dashes around the date/time)
        (
            '{username}-{YYYYMMDD}_{HHMMSS}-{id}',
            'tiannahcgarcia-20251127_172753-AQOGOcCUbrMyAL0VXcQjnpHr6aY6U25C1SbaREqFJv7_MVXNVUvBd290MwlNFmwOTK5PuLx6DtK9cYoot0c5Y6a4vuDtOaug2heLank.jpg',
        ),
        # Instagram post format
        (
            '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
            'evalongoria_20251027_155842_18538674661006538.jpg',
        ),
    ]
    for position, (demo_pattern, demo_filename) in enumerate(demo_cases):
        if position:
            # Blank line between cases, none after the last one.
            print()
        demo_result = FilenameParser(demo_pattern).parse(demo_filename)
        print(f"Pattern: {demo_pattern}")
        print(f"Filename: {demo_filename}")
        print(f"Result: {demo_result}")