# modules/filename_parser.py (382 lines; header rows from the diff viewer removed)
#!/usr/bin/env python3
|
||||
"""
|
||||
Filename Parser Module for Manual Import
|
||||
Parses filenames based on configurable patterns to extract metadata
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
|
||||
class FilenameParser:
    """
    Parse filenames using configurable patterns to extract metadata.

    Supported pattern tokens:
    - {username} - Username/source (alphanumeric, underscores, periods)
    - {YYYYMMDD} - Date as 8 digits (20251127)
    - {HHMMSS} - Time as 6 digits (172753)
    - {YYYYMMDD_HHMMSS} - Combined date_time with underscore
    - {id} - Media ID (any characters until next separator)
    - {description} - Text content (any characters until next separator)
    - {num} - Sequence number (digits)
    - {ext} - File extension (optional, auto-handled)

    Example patterns:
    - Instagram Stories: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - Instagram Posts: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - TikTok: "{YYYYMMDD}_{description}_{id}_{num}"
    """

    # Token definitions: token_name -> (regex_pattern, is_greedy)
    TOKEN_PATTERNS = {
        'username': (r'[a-zA-Z0-9_.]+', False),
        'YYYYMMDD': (r'\d{8}', False),
        'HHMMSS': (r'\d{6}', False),
        'YYYYMMDD_HHMMSS': (r'\d{8}_\d{6}', False),
        'id': (r'.+', True),  # Greedy flag: compiled lazily so it stops at the next separator
        'description': (r'.+', True),  # Greedy flag: compiled lazily
        'num': (r'\d+', False),
        'ext': (r'\.[a-zA-Z0-9]+', False),
    }

    def __init__(self, pattern: str):
        """
        Initialize parser with a filename pattern.

        Args:
            pattern: Pattern string like "{username}-{YYYYMMDD}_{HHMMSS}-{id}"

        Raises:
            ValueError: If the pattern produces an invalid regex.
        """
        self.pattern = pattern
        self.regex, self.token_order = self._compile_pattern(pattern)

    def _compile_pattern(self, pattern: str) -> tuple:
        """
        Convert pattern string to compiled regex.

        Literal text between tokens is regex-escaped; each token becomes a
        capturing group. "Greedy" tokens ({id}, {description}) are emitted
        as lazy `.+?` so they stop at the next literal separator.

        Returns:
            Tuple of (compiled_regex, list_of_token_names)

        Raises:
            ValueError: If the resulting regex fails to compile.
        """
        # Find all tokens in the pattern
        tokens = re.findall(r'\{(\w+)\}', pattern)

        # Temporarily replace tokens with placeholders made only of
        # [A-Za-z0-9_], which pass through re.escape() unchanged; then
        # escape the remaining literal separator text safely.
        regex_pattern = pattern
        for i, token in enumerate(tokens):
            regex_pattern = regex_pattern.replace(f'{{{token}}}', f'__TOKEN_{i}__', 1)

        regex_pattern = re.escape(regex_pattern)

        # Swap the placeholders back in as capturing groups.
        for i, token in enumerate(tokens):
            if token in self.TOKEN_PATTERNS:
                token_pattern, is_greedy = self.TOKEN_PATTERNS[token]
                if is_greedy:
                    # Lazy match so the token stops at the next separator.
                    token_pattern = r'.+?'
                regex_pattern = regex_pattern.replace(f'__TOKEN_{i}__', f'({token_pattern})', 1)
            else:
                # Unknown token - treat as any characters (lazy).
                regex_pattern = regex_pattern.replace(f'__TOKEN_{i}__', r'(.+?)', 1)

        # NOTE: a previous revision looped over reversed(tokens) here to
        # re-greedify the last lazy token, but the loop only hit `break`
        # without modifying anything — it was dead code and is removed.
        # The trailing optional-extension group below already lets a final
        # lazy token expand to the end of the basename.

        # Anchor the pattern; allow an optional trailing file extension so
        # the same pattern matches with or without the extension present.
        regex_pattern = '^' + regex_pattern + r'(?:\.[a-zA-Z0-9]+)?$'

        try:
            compiled = re.compile(regex_pattern)
        except re.error as e:
            raise ValueError(f"Invalid pattern '{pattern}': {e}")

        return compiled, tokens

    def parse(self, filename: str) -> Dict[str, Any]:
        """
        Parse a filename and extract metadata.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            Dictionary with extracted metadata:
            - username: str or None (lowercased)
            - datetime: datetime object or None
            - media_id: str or None
            - description: str or None
            - num: int or None (falls back to str if non-numeric)
            - extension: str or None (lowercased, including the dot)
            - valid: bool
            - error: str or None (if valid is False)
            - raw_values: dict of token name -> matched text
        """
        result = {
            'username': None,
            'datetime': None,
            'media_id': None,
            'description': None,
            'num': None,
            'extension': None,
            'valid': False,
            'error': None,
            'raw_values': {}
        }

        # Extract extension
        path = Path(filename)
        extension = path.suffix.lower() if path.suffix else None
        basename = path.stem
        result['extension'] = extension

        # Try the stem first; fall back to the full filename in case the
        # suffix stripping removed part of a dotted id.
        match = self.regex.match(basename) or self.regex.match(filename)

        if not match:
            result['error'] = f"Filename doesn't match pattern: {self.pattern}"
            return result

        # Extract values for each token (groups align 1:1 with token_order;
        # the trailing extension group is non-capturing).
        groups = match.groups()
        for i, token in enumerate(self.token_order):
            if i < len(groups):
                value = groups[i]
                result['raw_values'][token] = value

                # Map tokens to result fields
                if token == 'username':
                    result['username'] = value.lower()
                elif token == 'id':
                    result['media_id'] = value
                elif token == 'description':
                    result['description'] = value
                elif token == 'num':
                    try:
                        result['num'] = int(value)
                    except ValueError:
                        result['num'] = value

        # Parse datetime from date/time tokens
        result['datetime'] = self._parse_datetime(result['raw_values'])

        result['valid'] = True
        return result

    def _parse_datetime(self, raw_values: Dict[str, str]) -> Optional[datetime]:
        """
        Parse datetime from extracted raw values.

        Supports:
        - YYYYMMDD_HHMMSS combined
        - YYYYMMDD + HHMMSS separate
        - YYYYMMDD only (time defaults to 00:00:00)

        Returns None when no date token was captured or the digits do not
        form a real calendar date/time.
        """
        try:
            if 'YYYYMMDD_HHMMSS' in raw_values:
                dt_str = raw_values['YYYYMMDD_HHMMSS']
                return datetime.strptime(dt_str, '%Y%m%d_%H%M%S')

            if 'YYYYMMDD' in raw_values:
                date_str = raw_values['YYYYMMDD']

                if 'HHMMSS' in raw_values:
                    time_str = raw_values['HHMMSS']
                    return datetime.strptime(f'{date_str}_{time_str}', '%Y%m%d_%H%M%S')
                else:
                    # Date only, no time
                    return datetime.strptime(date_str, '%Y%m%d')

            return None
        except ValueError:
            # Matched digits but not a valid date/time (e.g. month 13).
            return None

    def validate_pattern(self) -> tuple:
        """
        Validate the pattern is properly formed.

        Returns:
            Tuple of (is_valid: bool, error_message: str or None)
        """
        try:
            # Check for at least one recognized token
            tokens = re.findall(r'\{(\w+)\}', self.pattern)

            if not tokens:
                return False, "Pattern must contain at least one token"

            # Check all tokens are recognized
            unknown_tokens = [t for t in tokens if t not in self.TOKEN_PATTERNS]
            if unknown_tokens:
                return False, f"Unknown tokens: {', '.join(unknown_tokens)}"

            return True, None
        except Exception as e:
            # Defensive: surface any unexpected failure as a message
            # rather than raising out of a validation helper.
            return False, str(e)
|
||||
|
||||
|
||||
def create_parser(pattern: str) -> FilenameParser:
    """
    Factory function: build a FilenameParser for the given pattern.

    Args:
        pattern: Pattern string (see FilenameParser for token syntax)

    Returns:
        A ready-to-use FilenameParser instance
    """
    parser = FilenameParser(pattern)
    return parser
|
||||
|
||||
|
||||
def parse_with_fallbacks(filename: str, patterns: List[str]) -> Dict[str, Any]:
    """
    Try parsing a filename with multiple patterns, return first successful match.

    Args:
        filename: Filename to parse
        patterns: List of pattern strings to try in order

    Returns:
        Metadata dict from the first matching pattern (same shape as
        FilenameParser.parse, plus a 'matched_pattern' key), or a failure
        dict whose 'error' records the last reason seen.
    """
    failure_reason = None

    for candidate in patterns:
        try:
            outcome = FilenameParser(candidate).parse(filename)
        except Exception as exc:
            # Bad pattern (e.g. ValueError from compilation) - remember and move on.
            failure_reason = str(exc)
            continue

        if outcome['valid']:
            outcome['matched_pattern'] = candidate
            return outcome

        failure_reason = outcome.get('error')

    # Nothing matched: build a failure result mirroring parse()'s shape.
    suffix = Path(filename).suffix
    return {
        'username': None,
        'datetime': None,
        'media_id': None,
        'description': None,
        'num': None,
        'extension': suffix.lower() if suffix else None,
        'valid': False,
        'error': failure_reason or f"Filename doesn't match any of {len(patterns)} patterns",
        'raw_values': {}
    }
|
||||
|
||||
|
||||
# Instagram has many filename formats from different download sources.
# These are tried in order by parse_with_fallbacks(), so the more specific
# variants come first and the generic catch-all must remain last.
INSTAGRAM_PATTERNS = [
    # Standard gallery-dl formats
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}',  # gallery-dl default (underscores)
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}',  # alternative format (dashes around date)
    # Formats with _n suffix (common from some scrapers)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # with _n suffix
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}_n',  # dashes + _n suffix
    # Formats with hl=en language parameter (imginn/instaloader variants)
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # language tag + _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}',  # language tag, no _n suffix
    # Formats with leading underscore (some scrapers prefix underscore)
    '_{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # leading underscore + _n suffix
    '_{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # leading underscore + lang + _n
    # Formats with media shortcode before date (some browser extensions / save tools)
    '{username}-video-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-video-shortcode-date_hash
    '{username}-photo-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-photo-shortcode-date_hash
    '{username}-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-shortcode-date_hash (no type prefix, must be last)
]
|
||||
|
||||
|
||||
# Predefined patterns for common platforms.
# Each preset entry provides:
#   name         - human-readable label for UI display
#   pattern      - primary filename pattern (FilenameParser token syntax)
#   alt_patterns - optional fallback pattern list (tried in order)
#   example      - sample filename illustrating the primary pattern
#   platform     - platform identifier string
#   content_type - content category within the platform
#   use_ytdlp    - optional flag; presumably signals that metadata should
#                  come from yt-dlp rather than the filename — TODO confirm
#                  against the consumer of these presets.
PRESET_PATTERNS = {
    'instagram_stories': {
        'name': 'Instagram Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_AQOGOcCUbrMy...',
        'platform': 'instagram',
        'content_type': 'stories'
    },
    'instagram_posts': {
        'name': 'Instagram Posts',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'posts'
    },
    'instagram_reels': {
        'name': 'Instagram Reels',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'reels'
    },
    'tiktok_videos': {
        'name': 'TikTok Videos',
        'pattern': '{YYYYMMDD}_{description}_{id}_{num}',
        'example': '20251127_beautiful_sunset_1234567890_1',
        'platform': 'tiktok',
        'content_type': 'videos'
    },
    'snapchat_stories': {
        'name': 'Snapchat Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'example': 'username_20251127_172753_story123',
        'platform': 'snapchat',
        'content_type': 'stories'
    },
    'youtube_videos': {
        'name': 'YouTube Videos',
        'pattern': '{id}',
        'example': 'dQw4w9WgXcQ',
        'platform': 'youtube',
        'content_type': 'videos',
        'use_ytdlp': True
    }
}
|
||||
|
||||
|
||||
def get_preset_patterns() -> Dict[str, Dict]:
    """
    Get all predefined filename patterns.

    Returns:
        A deep copy of PRESET_PATTERNS. The previous implementation used a
        shallow .copy(), which still shared the nested per-preset dicts, so
        a caller mutating an entry would silently corrupt the module-level
        definitions. A deep copy makes the returned structure safe to edit.
    """
    from copy import deepcopy  # local import: only needed by this helper
    return deepcopy(PRESET_PATTERNS)
|
||||
|
||||
|
||||
# Test/demo entry point: exercises the parser against sample filenames.
if __name__ == '__main__':
    demo_cases = [
        # (pattern, filename) pairs — dash-separated story format, then
        # the underscore-separated Instagram post format.
        ('{username}-{YYYYMMDD}_{HHMMSS}-{id}',
         'tiannahcgarcia-20251127_172753-AQOGOcCUbrMyAL0VXcQjnpHr6aY6U25C1SbaREqFJv7_MVXNVUvBd290MwlNFmwOTK5PuLx6DtK9cYoot0c5Y6a4vuDtOaug2heLank.jpg'),
        ('{username}_{YYYYMMDD}_{HHMMSS}_{id}',
         'evalongoria_20251027_155842_18538674661006538.jpg'),
    ]

    for index, (demo_pattern, demo_filename) in enumerate(demo_cases):
        if index:
            # Blank line between successive demo outputs.
            print()
        outcome = FilenameParser(demo_pattern).parse(demo_filename)
        print(f"Pattern: {demo_pattern}")
        print(f"Filename: {demo_filename}")
        print(f"Result: {outcome}")
|
||||
# (removed trailing web-UI residue: "Reference in New Issue / Block a user")