383 lines
13 KiB
Python
383 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Filename Parser Module for Manual Import
|
|
Parses filenames based on configurable patterns to extract metadata
|
|
"""
|
|
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
|
|
|
|
class FilenameParser:
    """
    Parse filenames using configurable patterns to extract metadata.

    Supported pattern tokens:
    - {username} - Username/source (alphanumeric, underscores, periods)
    - {YYYYMMDD} - Date as 8 digits (20251127)
    - {HHMMSS} - Time as 6 digits (172753)
    - {YYYYMMDD_HHMMSS} - Combined date_time with underscore
    - {id} - Media ID (any characters until next separator)
    - {description} - Text content (any characters until next separator)
    - {num} - Sequence number (digits)
    - {ext} - File extension (optional, auto-handled)

    Example patterns:
    - Instagram Stories: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - Instagram Posts: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - TikTok: "{YYYYMMDD}_{description}_{id}_{num}"

    Matching notes:
    - Free-text tokens ({id}, {description}, unknown tokens) are compiled as
      lazy ``.+?`` groups, so each one captures the shortest text that still
      lets the rest of the pattern match. With two adjacent free-text tokens
      the earlier one therefore ends at the first separator.
    - The whole name must match (the regex is anchored at both ends) and a
      trailing ".ext" is always tolerated.
    """

    # Token definitions: token_name -> (regex_pattern, is_greedy)
    # Tokens flagged "greedy" are free-text tokens; _compile_pattern swaps
    # their regex for a lazy ``.+?`` so they stop at the next separator.
    TOKEN_PATTERNS = {
        'username': (r'[a-zA-Z0-9_.]+', False),
        'YYYYMMDD': (r'\d{8}', False),
        'HHMMSS': (r'\d{6}', False),
        'YYYYMMDD_HHMMSS': (r'\d{8}_\d{6}', False),
        'id': (r'.+', True),  # Greedy - matches everything until separator
        'description': (r'.+', True),  # Greedy
        'num': (r'\d+', False),
        'ext': (r'\.[a-zA-Z0-9]+', False),
    }

    def __init__(self, pattern: str):
        """
        Initialize parser with a filename pattern.

        Args:
            pattern: Pattern string like "{username}-{YYYYMMDD}_{HHMMSS}-{id}"

        Raises:
            ValueError: If the pattern cannot be compiled to a regex.
        """
        self.pattern = pattern
        self.regex, self.token_order = self._compile_pattern(pattern)

    def _compile_pattern(self, pattern: str) -> tuple:
        """
        Convert pattern string to compiled regex.

        The pattern is split on ``{token}`` markers. Literal text between
        tokens is regex-escaped and each token becomes one capture group, so
        capture group N always corresponds to the N-th token in the pattern.

        Returns:
            Tuple of (compiled_regex, list_of_token_names)

        Raises:
            ValueError: If the assembled regex fails to compile.
        """
        # re.split with a single capture group yields alternating pieces:
        # even indexes are literal text, odd indexes are token names.
        # Building the regex piece by piece avoids placeholder text that
        # could collide with literal characters in the pattern.
        pieces = re.split(r'\{(\w+)\}', pattern)
        tokens = pieces[1::2]

        regex_parts = ['^']
        for index, piece in enumerate(pieces):
            if index % 2 == 0:
                regex_parts.append(re.escape(piece))
                continue

            # Unknown tokens are treated as free text (any characters).
            token_pattern, is_greedy = self.TOKEN_PATTERNS.get(piece, (r'.+?', True))
            if is_greedy:
                # Lazy match so the token stops at the next separator.
                token_pattern = r'.+?'
            regex_parts.append(f'({token_pattern})')

        # Anchor the end, but allow an (uncaptured) file extension.
        regex_parts.append(r'(?:\.[a-zA-Z0-9]+)?$')
        regex_pattern = ''.join(regex_parts)

        try:
            compiled = re.compile(regex_pattern)
        except re.error as e:
            raise ValueError(f"Invalid pattern '{pattern}': {e}") from e

        return compiled, tokens

    def parse(self, filename: str) -> Dict[str, Any]:
        """
        Parse a filename and extract metadata.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            Dictionary with extracted metadata:
            - username: str or None (lower-cased)
            - datetime: datetime object or None
            - media_id: str or None
            - description: str or None
            - num: int or None
            - extension: str or None (lower-cased suffix including the dot)
            - valid: bool
            - error: str or None (if valid is False)
            - raw_values: dict of token name -> captured text

            ``valid`` only reflects whether the pattern matched; a matched
            name whose date/time digits do not form a real date still yields
            ``valid=True`` with ``datetime=None``.
        """
        result = {
            'username': None,
            'datetime': None,
            'media_id': None,
            'description': None,
            'num': None,
            'extension': None,
            'valid': False,
            'error': None,
            'raw_values': {},
        }

        # Extract extension
        path = Path(filename)
        result['extension'] = path.suffix.lower() if path.suffix else None

        # Try the name without its extension first; fall back to the name as
        # given so patterns that contain {ext} can still match.
        match = self.regex.match(path.stem) or self.regex.match(filename)
        if not match:
            result['error'] = f"Filename doesn't match pattern: {self.pattern}"
            return result

        # Capture groups line up one-to-one with the pattern's tokens.
        raw_values = result['raw_values']
        for token, value in zip(self.token_order, match.groups()):
            raw_values[token] = value

            # Map tokens to result fields
            if token == 'username':
                result['username'] = value.lower()
            elif token == 'id':
                result['media_id'] = value
            elif token == 'description':
                result['description'] = value
            elif token == 'num':
                try:
                    result['num'] = int(value)
                except ValueError:
                    result['num'] = value

        # Parse datetime from date/time tokens
        result['datetime'] = self._parse_datetime(raw_values)

        result['valid'] = True
        return result

    def _parse_datetime(self, raw_values: Dict[str, str]) -> Optional[datetime]:
        """
        Parse datetime from extracted raw values.

        Supports:
        - YYYYMMDD_HHMMSS combined
        - YYYYMMDD + HHMMSS separate
        - YYYYMMDD only (time defaults to 00:00:00)

        Returns:
            A naive datetime, or None when no date token was captured or the
            captured digits are not a valid date/time.
        """
        try:
            if 'YYYYMMDD_HHMMSS' in raw_values:
                return datetime.strptime(raw_values['YYYYMMDD_HHMMSS'], '%Y%m%d_%H%M%S')

            if 'YYYYMMDD' in raw_values:
                date_str = raw_values['YYYYMMDD']
                if 'HHMMSS' in raw_values:
                    time_str = raw_values['HHMMSS']
                    return datetime.strptime(f'{date_str}_{time_str}', '%Y%m%d_%H%M%S')
                # Date only, no time
                return datetime.strptime(date_str, '%Y%m%d')

            return None
        except ValueError:
            # Digits matched the regex but are not a real date (e.g. month 13).
            return None

    def validate_pattern(self) -> tuple:
        """
        Validate the pattern is properly formed.

        A pattern is valid when it contains at least one ``{token}`` and every
        token is a key of TOKEN_PATTERNS.

        Returns:
            Tuple of (is_valid: bool, error_message: str or None)
        """
        try:
            tokens = re.findall(r'\{(\w+)\}', self.pattern)
        except TypeError as e:
            # self.pattern was replaced with a non-string value.
            return False, str(e)

        if not tokens:
            return False, "Pattern must contain at least one token"

        # Check all tokens are recognized
        unknown_tokens = [t for t in tokens if t not in self.TOKEN_PATTERNS]
        if unknown_tokens:
            return False, f"Unknown tokens: {', '.join(unknown_tokens)}"

        return True, None
|
|
|
|
|
|
def create_parser(pattern: str) -> FilenameParser:
    """
    Build a FilenameParser for the given pattern.

    Thin convenience wrapper around the FilenameParser constructor.

    Args:
        pattern: Pattern string, e.g. "{username}_{YYYYMMDD}_{HHMMSS}_{id}"

    Returns:
        A new FilenameParser instance compiled from ``pattern``
    """
    parser = FilenameParser(pattern)
    return parser
|
|
|
|
|
|
def parse_with_fallbacks(filename: str, patterns: List[str]) -> Dict[str, Any]:
    """
    Parse a filename against several patterns and return the first match.

    Patterns are tried in list order. A pattern that raises while being
    compiled or applied is skipped and its message is remembered.

    Args:
        filename: Filename to parse
        patterns: Pattern strings to try, in priority order

    Returns:
        On success, the dictionary produced by FilenameParser.parse with an
        extra 'matched_pattern' key. On failure, a dictionary of the same
        shape with valid=False and 'error' set to the most recent failure
        message (or a generic message when no pattern produced one).
    """
    failure_reason = None

    for candidate in patterns:
        try:
            outcome = FilenameParser(candidate).parse(filename)
        except Exception as exc:
            failure_reason = str(exc)
            continue

        if outcome['valid']:
            outcome['matched_pattern'] = candidate
            return outcome

        failure_reason = outcome.get('error')

    # Nothing matched: report the failure in the same shape parse() uses.
    suffix = Path(filename).suffix
    if not failure_reason:
        failure_reason = f"Filename doesn't match any of {len(patterns)} patterns"

    return {
        'username': None,
        'datetime': None,
        'media_id': None,
        'description': None,
        'num': None,
        'extension': suffix.lower() if suffix else None,
        'valid': False,
        'error': failure_reason,
        'raw_values': {},
    }
|
|
|
|
|
|
# Instagram has many filename formats from different download sources.
#
# ORDER MATTERS: parse_with_fallbacks() returns the first pattern that
# matches. {id} compiles to a lazy ".+?" that must still reach the end of the
# name, so a pattern ending in "{id}" also accepts names ending in "_n" (the
# suffix simply becomes part of the id), and {username} accepts a leading
# underscore. The more specific variants (leading underscore, "_n" suffix)
# are therefore listed BEFORE the general formats; otherwise they can never
# be selected.
# NOTE(review): the leading-underscore patterns strip one "_" from names that
# end in "_n"; confirm this is the desired result for accounts whose real
# username starts with an underscore.
INSTAGRAM_PATTERNS = [
    # Formats with leading underscore (some scrapers prefix underscore)
    '_{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # leading underscore + lang + _n
    '_{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # leading underscore + _n suffix
    # Formats with hl=en language parameter (imginn/instaloader variants)
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # language tag + _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}',  # language tag, no _n suffix
    # Formats with _n suffix (common from some scrapers)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # with _n suffix
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}_n',  # dashes + _n suffix
    # Standard gallery-dl formats (general; must follow their _n variants)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}',  # gallery-dl default (underscores)
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}',  # alternative format (dashes around date)
    # Formats with media shortcode before date (some browser extensions / save tools)
    '{username}-video-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-video-shortcode-date_hash
    '{username}-photo-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-photo-shortcode-date_hash
    '{username}-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-shortcode-date_hash (no type prefix, must be last)
]
|
|
|
|
|
|
# Built-in presets for common platforms, keyed by preset id.
# Each preset carries a display name, its primary filename pattern, an example
# filename, and the platform / content type it applies to. Instagram presets
# additionally list INSTAGRAM_PATTERNS as alternative patterns.
PRESET_PATTERNS = {
    # --- Instagram -------------------------------------------------------
    'instagram_stories': {
        'name': 'Instagram Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_AQOGOcCUbrMy...',
        'platform': 'instagram',
        'content_type': 'stories',
    },
    'instagram_posts': {
        'name': 'Instagram Posts',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'posts',
    },
    'instagram_reels': {
        'name': 'Instagram Reels',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'reels',
    },
    # --- TikTok ----------------------------------------------------------
    'tiktok_videos': {
        'name': 'TikTok Videos',
        'pattern': '{YYYYMMDD}_{description}_{id}_{num}',
        'example': '20251127_beautiful_sunset_1234567890_1',
        'platform': 'tiktok',
        'content_type': 'videos',
    },
    # --- Snapchat --------------------------------------------------------
    'snapchat_stories': {
        'name': 'Snapchat Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'example': 'username_20251127_172753_story123',
        'platform': 'snapchat',
        'content_type': 'stories',
    },
    # --- YouTube ---------------------------------------------------------
    'youtube_videos': {
        'name': 'YouTube Videos',
        'pattern': '{id}',
        'example': 'dQw4w9WgXcQ',
        'platform': 'youtube',
        'content_type': 'videos',
        'use_ytdlp': True,
    },
}
|
|
|
|
|
|
def get_preset_patterns() -> Dict[str, Dict]:
    """
    Get all predefined filename patterns.

    Returns:
        An independent deep copy of PRESET_PATTERNS. A shallow copy would
        still hand out the module's own nested preset dicts and the shared
        'alt_patterns' list, so a caller editing a preset would silently
        change the presets for every later caller.
    """
    # Local import keeps this function self-contained with respect to the
    # module's existing import block.
    import copy

    return copy.deepcopy(PRESET_PATTERNS)
|
|
|
|
|
|
# Manual demo: run this module directly to print the parse result for two
# sample filenames.
if __name__ == '__main__':
    demo_cases = [
        # The user's example (dashes around the date, long story id)
        (
            '{username}-{YYYYMMDD}_{HHMMSS}-{id}',
            'tiannahcgarcia-20251127_172753-AQOGOcCUbrMyAL0VXcQjnpHr6aY6U25C1SbaREqFJv7_MVXNVUvBd290MwlNFmwOTK5PuLx6DtK9cYoot0c5Y6a4vuDtOaug2heLank.jpg',
        ),
        # Instagram post format
        (
            '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
            'evalongoria_20251027_155842_18538674661006538.jpg',
        ),
    ]

    for position, (demo_pattern, demo_filename) in enumerate(demo_cases):
        if position:
            # Blank line between consecutive reports (none after the last).
            print()

        demo_result = FilenameParser(demo_pattern).parse(demo_filename)

        print(f"Pattern: {demo_pattern}")
        print(f"Filename: {demo_filename}")
        print(f"Result: {demo_result}")
|