Files
media-downloader/modules/paid_content/filename_parser.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

172 lines
5.7 KiB
Python

"""
Filename parser for extracting dates and metadata from Fansly/paid content filenames.
Supports:
1. Fansly snowflake IDs: 871257582885416960.mp4
2. Embedded date format: 2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP.mp4
3. Date-prefixed files: 2022-07-08.mp4 or 2022-07-08_video.mp4
"""
import re
from datetime import datetime, timezone
from typing import Optional, Dict, Tuple
from pathlib import Path
# Offset (milliseconds since the Unix epoch) added to the timestamp bits of a
# Fansly snowflake ID to obtain a wall-clock time. The value was calibrated
# empirically from known files; the reference sample recorded for it is
# ID 513099759796367360 <-> 2023-05-11 15:51 UTC.
# NOTE(review): with this constant the reference ID decodes to
# 2023-05-11 14:37:32 UTC, not 15:51 -- confirm whether the calibration or
# the reference time is the approximate one.
FANSLY_EPOCH_MS = 1_561_483_337_101
def decode_fansly_snowflake(snowflake_id: str) -> Optional[datetime]:
    """
    Turn a Fansly snowflake ID into the UTC datetime it encodes.

    The ID is treated as a Twitter-style snowflake: dropping the low 22 bits
    leaves a millisecond count, which is offset by ``FANSLY_EPOCH_MS`` to get
    milliseconds since the Unix epoch.

    Args:
        snowflake_id: The ID as a string of digits.

    Returns:
        A timezone-aware (UTC) datetime, or None when the string is not an
        integer or the resulting timestamp cannot be represented.
    """
    try:
        # Everything above bit 22 is the millisecond timestamp.
        elapsed_ms = int(snowflake_id) >> 22
        unix_seconds = (elapsed_ms + FANSLY_EPOCH_MS) / 1000
        return datetime.fromtimestamp(unix_seconds, tz=timezone.utc)
    except (ValueError, OverflowError, OSError):
        # Non-numeric input or an out-of-range timestamp: no date available.
        return None
def parse_filename(filename: str) -> Dict:
    """
    Extract a date and, when available, a Fansly ID from a single filename.

    Only the stem (the basename without its final extension) is inspected.
    The strategies below are tried in order and the first one that yields a
    valid date wins:

    1. Embedded date and ID, ``YYYY-MM-DD_at_HH-MM_id_<15-20 digits>`` where
       ``-``, ``_`` or a space may separate the fields (case-insensitive)
       -> date_source 'embedded', confidence 'high'.
    2. Leading ``YYYY-MM-DD`` / ``YYYY_MM_DD`` -> date_source 'prefix',
       confidence 'high'; the time of day defaults to 12:00 UTC.
    3. Stem consisting solely of a 15-20 digit ID, optionally followed by
       ``_<digits>`` -> date_source 'snowflake', confidence 'high'.
    4. Any 15-20 digit run inside the stem -> date_source 'snowflake',
       confidence 'medium'.

    Dates decoded from an ID (strategies 3 and 4) are accepted only when
    their year lies in 2020-2030.

    Returns:
        {
            'original_filename': str,
            'detected_date': datetime or None,   # always UTC when set
            'fansly_id': str or None,
            'date_source': str or None,  # 'snowflake', 'embedded', 'prefix', None
            'confidence': str,           # 'high', 'medium', 'low'
        }
    """
    info = {
        'original_filename': filename,
        'detected_date': None,
        'fansly_id': None,
        'date_source': None,
        'confidence': 'low',
    }
    stem = Path(filename).stem
    utc = timezone.utc

    # Strategy 1: date, time and ID all spelled out in the name, e.g.
    #   2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP-YcNs55W9
    #   2023 05 11_at_15 51_id_513099759796367360
    embedded = re.search(
        r'(\d{4})[-_ ](\d{2})[-_ ](\d{2})[-_ ]?at[-_ ](\d{2})[-_ ](\d{2})[-_ ]?id[-_ ](\d{15,20})',
        stem,
        re.IGNORECASE,
    )
    if embedded is not None:
        *clock_fields, snowflake = embedded.groups()
        try:
            stamp = datetime(*[int(f) for f in clock_fields], 0, tzinfo=utc)
        except ValueError:
            # Impossible calendar/clock values: fall through to the next strategy.
            stamp = None
        if stamp is not None:
            info.update(
                detected_date=stamp,
                fansly_id=snowflake,
                date_source='embedded',
                confidence='high',
            )
            return info

    # Strategy 2: the stem starts with a calendar date, e.g.
    #   2022-07-08 or 2022-07-08_video
    prefixed = re.match(r'^(\d{4})[-_](\d{2})[-_](\d{2})(?:[_\-\s]|$)', stem)
    if prefixed is not None:
        try:
            yyyy, mm, dd = (int(part) for part in prefixed.groups())
            # No time of day in the name, so noon is used as a neutral default.
            noon = datetime(yyyy, mm, dd, 12, 0, 0, tzinfo=utc)
        except ValueError:
            noon = None
        if noon is not None:
            info.update(
                detected_date=noon,
                date_source='prefix',
                confidence='high',
            )
            return info

    # Strategies 3 and 4 both decode a snowflake ID; they differ only in how
    # the ID was located and therefore in how much the result is trusted.
    candidates = []
    bare = re.match(r'^(\d{15,20})(?:_\d+)?$', stem)
    if bare is not None:
        # The stem is nothing but the ID (plus an optional numeric suffix).
        candidates.append((bare.group(1), 'high'))
    # An ID buried in other text, e.g. video_871257582885416960_hd.
    candidates.extend(
        (digits, 'medium') for digits in re.findall(r'(\d{15,20})', stem)
    )

    for digits, confidence in candidates:
        decoded = decode_fansly_snowflake(digits)
        # Sanity window: discard anything that does not decode to 2020-2030.
        if decoded is None or not 2020 <= decoded.year <= 2030:
            continue
        info.update(
            detected_date=decoded,
            fansly_id=digits,
            date_source='snowflake',
            confidence=confidence,
        )
        return info

    return info
def parse_filenames(filenames: list) -> Dict:
    """
    Parse a batch of filenames and summarise the dates that were found.

    Each name is run through ``parse_filename``; the detected dates are then
    reduced to an earliest/latest pair.

    Returns:
        {
            'files': [parse_filename() result for each input, in order],
            'earliest_date': datetime or None,
            'latest_date': datetime or None,
            'suggested_date': datetime or None,  # currently the earliest date
            'has_dates': bool,                   # True if any file had a date
        }
    """
    parsed = []
    found_dates = []
    for entry in filenames:
        info = parse_filename(entry)
        parsed.append(info)
        if info['detected_date']:
            found_dates.append(info['detected_date'])

    if found_dates:
        first, last = min(found_dates), max(found_dates)
    else:
        first = last = None

    return {
        'files': parsed,
        'earliest_date': first,
        'latest_date': last,
        # The earliest date is used as the default suggestion.
        'suggested_date': first,
        'has_dates': bool(found_dates),
    }
def format_date_for_display(dt: Optional[datetime]) -> str:
    """
    Format a datetime for display, e.g. 'May 11, 2023 at 3:51 PM'.

    The day of the month is zero-padded and the hour is an unpadded 12-hour
    value. Month abbreviation and AM/PM marker come from ``strftime`` and
    therefore follow the active locale.

    Args:
        dt: The datetime to format, or None.

    Returns:
        The formatted string, or '' when ``dt`` is None.
    """
    if dt is None:
        return ''
    # The unpadded-hour directive ``%-I`` is a platform-specific strftime
    # extension that is not available everywhere (notably Windows), so the
    # 12-hour value is computed here instead: 0 and 12 both display as 12.
    hour_12 = dt.hour % 12 or 12
    return '{} at {}:{}'.format(
        dt.strftime('%b %d, %Y'),
        hour_12,
        dt.strftime('%M %p'),
    )
def format_date_for_input(dt: datetime) -> Tuple[str, str]:
    """
    Split a datetime into the two strings expected by HTML date/time inputs.

    Returns:
        ``(date_str, time_str)`` as ``('YYYY-MM-DD', 'HH:MM')`` (24-hour
        clock), or ``('', '')`` when ``dt`` is None.
    """
    if dt is not None:
        date_str = dt.strftime('%Y-%m-%d')
        time_str = dt.strftime('%H:%M')
        return (date_str, time_str)
    return ('', '')