Files
media-downloader/modules/imginn_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

3775 lines
200 KiB
Python

#!/usr/bin/env python3
"""
ImgInn downloader module with FastDL-compatible file naming
Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}
"""
import os
import json
import time
import random
import re
import subprocess
import platform
import requests
from pathlib import Path
from datetime import datetime, timedelta
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
CloudflareHandler, SiteStatus, get_flaresolverr_user_agent,
get_playwright_context_options, get_playwright_stealth_scripts
)
from modules.instagram_utils import (
extract_instagram_media_id,
scan_existing_files_for_media_ids,
record_instagram_download,
is_instagram_downloaded
)
from typing import Dict, Optional
from playwright.sync_api import sync_playwright
class ImgInnDownloader(LoggingMixin):
"""ImgInn downloader with FastDL-compatible naming"""
def __init__(self,
             headless: bool = True,
             cookie_file: str = "/opt/media-downloader/cookies/imginn_cookies.json",
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None,
             ):
    """Set up an ImgInn downloader instance.

    Args:
        headless: Passed to the Chromium launch call when the browser starts.
        cookie_file: JSON cookie file path; only used when no unified_db is given.
        show_progress: Stored on the instance for callers.
        use_database: Enables database-backed tracking (requires unified_db).
        log_callback: Forwarded to the logging mixin.
        unified_db: Database object used for scraper config, cookies and downloads.
    """
    # Logging comes first so later steps can call self.log()
    self._init_logger('Instagram', log_callback, default_module='Download')

    # Caller-supplied options
    self.headless = headless
    self.show_progress = show_progress
    self.use_database = use_database

    # Per-session bookkeeping
    self.scraper_id = 'imginn'  # Scraper ID in database
    self.downloaded_files = set()  # Media IDs already seen
    self.pending_downloads = []  # Downloads awaiting deferred database recording
    self.download_count = 0

    # Rate limiting between scrape types (Cloudflare avoidance)
    self._last_scrape_time = None
    self._min_scrape_interval = 15  # seconds

    # Transient page-load failures; escalated to error after the threshold
    self._page_load_failures = 0
    self._page_load_failure_threshold = 5

    # Browser handles are created lazily and reused across profiles
    self.playwright = self.browser = self.context = self.page = None

    # Database access is only enabled when a database was supplied AND requested
    if unified_db and use_database:
        self.unified_db = unified_db
    else:
        self.unified_db = None
        self.use_database = False

    # Activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Proxy comes from the scraper configuration; cookie file is only a
    # fallback for the case where no database object was passed at all
    self.proxy_url = None
    self.cookie_file = None
    if unified_db:
        scraper_config = unified_db.get_scraper(self.scraper_id)
        if (scraper_config
                and scraper_config.get('proxy_enabled')
                and scraper_config.get('proxy_url')):
            self.proxy_url = scraper_config['proxy_url']
            self.log(f"Using proxy: {self.proxy_url}", "info")
    else:
        self.cookie_file = Path(cookie_file)
        self.cookie_file.parent.mkdir(parents=True, exist_ok=True)

    # User-Agent fetched dynamically so it matches FlareSolverr
    self.user_agent = get_flaresolverr_user_agent()

    # Shared Cloudflare handler; cookie_file=None means database storage
    handler_cookie_file = str(self.cookie_file) if self.cookie_file else None
    self.cf_handler = CloudflareHandler(
        module_name="ImgInn",
        cookie_file=handler_cookie_file,
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,  # Refresh cookies expiring within 7 days
        proxy_url=self.proxy_url,  # Pass proxy to FlareSolverr
    )

    # Mirrored attributes kept for backwards compatibility
    self.flaresolverr_url = self.cf_handler.flaresolverr_url
    self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

    # Prime the handler with any cookies already stored in the database
    self._load_cookies_from_db()
def _load_cookies_from_db(self):
    """Copy cookies stored for this scraper in the database into the Cloudflare handler.

    Does nothing when no database is configured or no cookies are stored.
    Database errors are logged as warnings and otherwise ignored.
    """
    database = self.unified_db
    if not database:
        return
    try:
        stored = database.get_scraper_cookies(self.scraper_id)
        if not stored:
            return
        # The handler keeps its cookies in a private attribute
        self.cf_handler._cookies = stored
        self.log(f"Loaded {len(stored)} cookies from database", "debug")
    except Exception as e:
        self.log(f"Error loading cookies from database: {e}", "warning")
def _save_cookies_to_db(self, cookies: list, user_agent: str = None):
    """Save cookies to the database (merged with any existing ones).

    Args:
        cookies: List of cookie dictionaries.
        user_agent: User agent to associate with the cookies (important for
            cf_clearance). Falls back to self.user_agent when not provided.

    Database errors are logged as warnings; nothing is raised.
    """
    if not self.unified_db:
        return
    # Use provided user_agent or fall back to self.user_agent
    ua = user_agent or self.user_agent
    try:
        self.unified_db.save_scraper_cookies(
            self.scraper_id,
            cookies,
            user_agent=ua,
            merge=True
        )
    except Exception as e:
        self.log(f"Error saving cookies to database: {e}", "warning")
        return
    # Build the log line outside the try block and tolerate a missing UA, so a
    # successful save is never reported as a failure because of log formatting.
    ua_preview = f"{ua[:50]}..." if ua else "unknown"
    cookie_count = len(cookies) if cookies is not None else 0
    self.log(f"Saved {cookie_count} cookies to database (UA: {ua_preview})", "debug")
def _cookies_expired(self):
    """Return the Cloudflare handler's verdict on whether its cookies are expired."""
    handler = self.cf_handler
    return handler.cookies_expired()
def _get_cookies_for_requests(self):
    """Return the Cloudflare handler's cookies as a dict for the requests library."""
    cookie_jar = self.cf_handler.get_cookies_dict()
    return cookie_jar
def _get_cookies_via_flaresolverr(self, url="https://imginn.com/", max_retries=2):
    """Ask the Cloudflare handler to obtain cookies through FlareSolverr.

    Args:
        url: URL to fetch.
        max_retries: Maximum number of retry attempts (default: 2).

    Returns:
        The handler's result: truthy when cookies were obtained.
    """
    handler = self.cf_handler
    solved = handler.get_cookies_via_flaresolverr(url, max_retries)
    if not (solved and self.unified_db):
        return solved

    # Persist the fresh cookies together with the user agent reported by the
    # handler: cf_clearance is fingerprinted to the browser that solved the
    # challenge, so self.user_agent is deliberately not used here.
    fresh_cookies = handler.get_cookies_list()
    if fresh_cookies:
        solver_ua = handler.get_user_agent()
        self._save_cookies_to_db(fresh_cookies, user_agent=solver_ua)
    return solved
def _enforce_rate_limit(self, scrape_type: str = "scrape"):
    """Sleep if the previous scrape was too recent, then stamp the current time.

    Args:
        scrape_type: Type of scrape (posts, stories, tagged); used only in the log line.
    """
    previous = self._last_scrape_time
    if previous is not None:
        since_last = time.time() - previous
        if since_last < self._min_scrape_interval:
            # Remaining interval plus 5-15 s of random jitter
            extra = random.uniform(5, 15)
            wait_time = self._min_scrape_interval - since_last + extra
            self.log(f"Rate limiting: waiting {wait_time:.1f}s before {scrape_type} (Cloudflare avoidance)", "info")
            time.sleep(wait_time)
    self._last_scrape_time = time.time()
def _has_valid_cookies(self) -> bool:
    """Check whether any cookies are stored (database first, then cookie file).

    Returns:
        True when the database holds at least one cookie for this scraper, or,
        without a database, when the cookie file exists. Always a real bool.
        Expiry is not checked here (see _cookies_expired).
    """
    if self.unified_db:
        try:
            return bool(self.unified_db.get_scraper_cookies(self.scraper_id))
        except Exception as e:
            # Treat an unreadable cookie store as "no cookies" so the caller
            # attempts a refresh instead of aborting browser startup.
            self.log(f"Error checking cookies in database: {e}", "warning")
            return False
    if self.cookie_file:
        return self.cookie_file.exists()
    return False
def _start_browser(self):
    """Start Chromium via Playwright if it is not already running (reused across profiles).

    Cookies are checked on every call, before the reuse check: when they are
    missing or expired a FlareSolverr refresh is attempted first.

    Raises:
        Whatever Playwright raises while launching; any partially started
        browser/driver is shut down before the exception propagates.
    """
    # Try to get fresh cookies via FlareSolverr if we don't have them or they're old.
    # Done BEFORE the browser reuse check so cookies are always checked.
    if not self._has_valid_cookies() or self._cookies_expired():
        self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info")
        if self._get_cookies_via_flaresolverr():
            self.log("Successfully got fresh cookies from FlareSolverr", "info")
        else:
            self.log("FlareSolverr unavailable, will try with Playwright", "warning")

    if self.browser is not None:
        self.log("Browser already running, reusing...", "debug")
        return

    # Use environment variable if set, otherwise use standard location
    if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ:
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
    os.environ['DISPLAY'] = ':100'  # Use Xvfb virtual display

    self.log("Starting browser (Chromium)...", "info")
    try:
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(
            headless=self.headless,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-infobars',
                '--disable-background-timer-throttling',
                '--disable-backgrounding-occluded-windows',
                '--disable-renderer-backgrounding'
            ]
        )

        # CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work.
        # Get dynamic fingerprint settings from FlareSolverr.
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent.
        # cf_clearance cookies are fingerprinted to the browser that solved the challenge.
        stored_user_agent = None
        if self.unified_db:  # no database -> nothing stored, keep the default fingerprint
            try:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
            except Exception as e:
                self.log(f"Error getting stored user_agent, using default: {e}", "debug")
        if stored_user_agent:
            self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug")
            context_options['user_agent'] = stored_user_agent
        else:
            default_ua = str(context_options.get('user_agent') or '')
            self.log(f"Using fingerprint: UA={default_ua[:50]}...", "debug")

        self.context = self.browser.new_context(**context_options)
        # Load cookies
        self.load_cookies(self.context)
        self.page = self.context.new_page()
        # Add comprehensive anti-detection scripts (dynamically from cloudflare_handler)
        self.page.add_init_script(get_playwright_stealth_scripts())
    except Exception:
        # Do not leave a half-started driver behind: a later call would see
        # self.browser is None and try to start a second Playwright instance.
        self.log("Browser startup failed, cleaning up partially started browser", "warning")
        self._stop_browser()
        raise
    self.log("Browser started and ready", "info")
def _stop_browser(self):
    """Shut down context, browser and Playwright driver, in that order.

    Each step is best-effort: a failure is logged as a warning, the matching
    attribute is reset to None regardless, and the next step still runs.
    """
    # (attribute, shutdown method, success message or None, failure prefix)
    teardown_steps = (
        ('context', 'close', "Browser context closed", "Error closing browser context"),
        ('browser', 'close', "Browser closed", "Error closing browser"),
        ('playwright', 'stop', None, "Error stopping playwright"),
    )
    for attr_name, method_name, done_message, failure_prefix in teardown_steps:
        resource = getattr(self, attr_name)
        if not resource:
            continue
        try:
            getattr(resource, method_name)()
            if done_message:
                self.log(done_message, "debug")
        except Exception as e:
            self.log(f"{failure_prefix}: {e}", "warning")
        finally:
            setattr(self, attr_name, None)
    # The page belonged to the context that was just closed
    self.page = None
def __del__(self):
    """Best-effort browser cleanup when the instance is garbage-collected."""
    try:
        self._stop_browser()
    except Exception:
        # A finalizer must never raise: it can run on a partially constructed
        # instance (if __init__ failed before the browser attributes existed)
        # or during interpreter shutdown. There is nothing useful to do here.
        pass
def __enter__(self):
    """Enter the 'with' block; the downloader itself is the managed object."""
    managed = self
    return managed
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the 'with' block: stop the browser and let any exception propagate."""
    self._stop_browser()
    suppress_exception = False  # never swallow errors raised inside the block
    return suppress_exception
def get_profile_info(self, username: str) -> Optional[Dict]:
    """Extract profile info (avatar URL, bio, display name) from an imginn profile page.

    Args:
        username: Profile name; lower-cased when building the page URL.

    Returns:
        Dict with any of the keys avatar_url, bio, display_name that could be
        found, or None when the page did not load, nothing was found, or an
        error occurred.
    """
    self._enforce_rate_limit("posts")
    self._start_browser()
    page = self.page
    if not page:
        return None
    try:
        url = f"https://imginn.com/{username.lower()}/?ref=index"
        self.log(f"Fetching profile info for @{username} from imginn", "info")
        page.goto(url, wait_until='domcontentloaded')
        # Give the page 5-7 s before checking for a Cloudflare interstitial
        wait_time = 5 + random.uniform(0, 2)
        time.sleep(wait_time)
        if not self.wait_for_cloudflare(page):
            self.log("Page didn't load for profile info extraction", "warning")
            return None
        # wait_for_cloudflare() may restart the browser (new fingerprint) and
        # replace self.page; the local handle would then point at a closed page.
        page = self.page or page
        self.save_cookies(self.context)
        time.sleep(2)
        # Use JavaScript to extract profile info with multiple selector strategies
        profile_info = page.evaluate("""() => {
            const result = {};
            // --- Avatar ---
            // Strategy 1: img inside a profile/user info section
            const avatarSelectors = [
                '.profile-avatar img',
                '.user-avatar img',
                '.avatar img',
                '.profile-info img',
                '.info img:first-of-type',
                'header img',
                '.user img',
            ];
            for (const sel of avatarSelectors) {
                const el = document.querySelector(sel);
                if (el && el.src && !el.src.includes('lazy') && !el.src.includes('data:')) {
                    result.avatar_url = el.src;
                    break;
                }
            }
            // Strategy 2: find small/round img with scontent or profile in src
            if (!result.avatar_url) {
                const imgs = document.querySelectorAll('img');
                for (const img of imgs) {
                    const src = img.src || '';
                    if ((src.includes('scontent') || src.includes('profile') || src.includes('avatar')
                        || src.includes('imginn.com'))
                        && !src.includes('lazy') && !src.includes('data:')) {
                        const rect = img.getBoundingClientRect();
                        if (rect.width > 20 && rect.width < 250) {
                            result.avatar_url = src;
                            break;
                        }
                    }
                }
            }
            // Clean avatar URL: strip query params (imginn CDN works without them
            // and the full URL often has malformed double-? from Instagram CDN paths)
            if (result.avatar_url && result.avatar_url.includes('?')) {
                result.avatar_url = result.avatar_url.split('?')[0];
            }
            // --- Bio ---
            const bioSelectors = [
                '.biography',
                '.bio',
                '.user-bio',
                '.profile-bio',
                '.profile-info .description',
                '.info .bio',
            ];
            for (const sel of bioSelectors) {
                const el = document.querySelector(sel);
                if (el && el.textContent.trim().length > 2) {
                    result.bio = el.textContent.trim();
                    break;
                }
            }
            // --- Display Name ---
            const nameSelectors = [
                '.fullname',
                '.display-name',
                '.profile-name',
                '.name',
                '.user-info h1',
                'h1',
            ];
            for (const sel of nameSelectors) {
                const el = document.querySelector(sel);
                if (el && el.textContent.trim().length > 1 && el.textContent.trim().length < 100) {
                    result.display_name = el.textContent.trim();
                    break;
                }
            }
            return result;
        }""")
        # Save debug screenshot for future selector tuning (best effort)
        try:
            screenshot_path = Path(f"/tmp/imginn_profile_{username}.png")
            page.screenshot(path=str(screenshot_path))
            self.log(f"Profile screenshot saved to {screenshot_path}", "debug")
        except Exception as e:
            self.log(f"Could not save profile screenshot: {e}", "debug")
        if profile_info and any(profile_info.values()):
            self.log(f"Extracted profile info: avatar={'yes' if profile_info.get('avatar_url') else 'no'}, "
                     f"bio={'yes' if profile_info.get('bio') else 'no'}, "
                     f"name={profile_info.get('display_name', 'no')}", "info")
            return profile_info
        # Nothing matched: keep the first 50000 chars of HTML for debugging (best effort)
        try:
            html_path = Path(f"/tmp/imginn_profile_{username}.html")
            # Explicit UTF-8 so non-ASCII page text cannot fail under a narrow locale
            html_path.write_text(page.content()[:50000], encoding="utf-8")
            self.log(f"No profile info found - HTML saved to {html_path}", "warning")
        except Exception as e:
            self.log(f"No profile info found and HTML dump failed: {e}", "warning")
        return None
    except Exception as e:
        self.log(f"Error getting profile info for @{username}: {e}", "error")
        return None
def _extract_media_id_from_url(self, url: str) -> Optional[str]:
    """Extract the media ID from a post URL.

    Accepts full or relative URLs, e.g. https://imginn.com/p/MEDIA_ID/ or
    /p/MEDIA_ID, with or without a trailing slash.

    Returns:
        The path segment following /p/, or None when url is empty or has no
        /p/ segment.
    """
    if not url:
        return None
    # Stop at '/', '?' or '#' so a query string or fragment never becomes
    # part of the ID when the trailing slash is missing.
    match = re.search(r'/p/([^/?#]+)', url)
    return match.group(1) if match else None
def _update_file_timestamps(self, filepath: Path, post_date: datetime):
    """Stamp a downloaded file with the post date.

    Sets access/modification time, tries to set the creation time on macOS
    (SetFile) and Windows (PowerShell), and hands image files to
    _update_exif_timestamps. Any failure is logged as a warning.
    """
    try:
        # 1. File system access and modification time
        epoch_seconds = post_date.timestamp()
        os.utime(filepath, (epoch_seconds, epoch_seconds))
        self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")

        # 2. Creation time, where the platform offers a way to change it
        #    (no branch for Linux)
        system_name = platform.system()
        creation_cmd = None
        if system_name == 'Darwin':  # macOS
            creation_cmd = ['SetFile', '-d', post_date.strftime('%m/%d/%Y %H:%M:%S'), str(filepath)]
        elif system_name == 'Windows':
            # Single quotes are doubled so the values stay inside their
            # PowerShell string literals (prevents injection)
            quoted_path = str(filepath).replace("'", "''")
            quoted_date = post_date.isoformat().replace("'", "''")
            ps_command = f"(Get-Item -LiteralPath '{quoted_path}').CreationTime = Get-Date '{quoted_date}'"
            creation_cmd = ['powershell', '-Command', ps_command]
        if creation_cmd is not None:
            try:
                subprocess.run(creation_cmd, capture_output=True, text=True)
            except (subprocess.SubprocessError, FileNotFoundError, OSError):
                pass  # Tool not available or command failed

        # 3. EXIF data for images
        image_suffixes = ('.jpg', '.jpeg', '.png', '.heic')
        if str(filepath).lower().endswith(image_suffixes):
            self._update_exif_timestamps(filepath, post_date)
    except Exception as e:
        self.log(f"Error updating timestamps: {e}", "warning")
def _update_exif_timestamps(self, filepath: Path, post_date: datetime):
    """Write the post date into an image's EXIF/XMP date fields using exiftool.

    Does nothing when exiftool is not installed. Failures are logged at debug
    level and never raised.
    """
    import shutil  # local import: only needed here

    try:
        # shutil.which is portable; the external `which` command does not exist on Windows
        exiftool = shutil.which('exiftool')
        if not exiftool:
            return
        # Format date for EXIF
        exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S')
        # Update all date fields in EXIF including MetadataDate for Immich
        cmd = [
            exiftool, '-overwrite_original', '-quiet',
            f'-AllDates={exif_date}',
            f'-MetadataDate={exif_date}',
            '-HistoryWhen=',
            f'-FileModifyDate={exif_date}',
            str(filepath)
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        # Only report success when exiftool actually succeeded
        if result.returncode == 0:
            self.log("Updated EXIF timestamps", "debug")
        else:
            self.log(f"exiftool failed (exit {result.returncode}): {result.stderr.strip()}", "debug")
    except Exception as e:
        # Metadata is a nice-to-have; never fail the download over it
        self.log(f"Could not update EXIF timestamps: {e}", "debug")
def _extract_post_date(self, page) -> Optional[datetime]:
    """Try to extract the post date from the current page.

    Strategy, in order:
      1. A data-created attribute holding a Unix timestamp (retried once after
         waiting, if no such element exists yet).
      2. Generic date elements: their datetime attribute, then their text,
         which may be relative ("2 days ago", "an hour ago") or absolute
         ("September 6, 2025").

    Returns:
        A naive datetime, or None when nothing parseable was found or an
        unexpected error occurred (logged at debug level).
    """

    def from_data_created(elements, label=""):
        """Return the first parseable data-created timestamp among elements, else None."""
        for elem in elements:
            raw = elem.get_attribute('data-created')
            if not raw:
                continue
            try:
                # Convert Unix timestamp to datetime
                epoch = int(raw)
                parsed = datetime.fromtimestamp(epoch)
            except (TypeError, ValueError, OverflowError, OSError) as e:
                self.log(f"Failed to parse timestamp {raw}: {e}", "debug")
                continue
            self.log(f"Found data-created timestamp{label}: {epoch} -> {parsed.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
            return parsed
        return None

    def from_datetime_attr(value):
        """Parse an ISO-like datetime attribute (fraction and 'Z' dropped), else None."""
        cleaned = value.split('.')[0].replace('Z', '')
        for fmt in ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(cleaned, fmt)
            except ValueError:
                continue
        return None

    def from_text(raw_text):
        """Parse relative ("N hours/days/weeks ago") or absolute date text, else None."""
        text = raw_text.strip()
        lowered = text.lower()
        if "ago" in lowered:
            # "a"/"an" counts as 1; an unparseable phrase returns None so the
            # caller can move on to the next selector instead of aborting.
            found = re.search(r'\b(\d+|an?)\s*(hour|day|week)', lowered)
            if not found:
                return None
            amount = 1 if found.group(1) in ('a', 'an') else int(found.group(1))
            return datetime.now() - timedelta(**{found.group(2) + 's': amount})
        for fmt in ('%B %d, %Y', '%b %d, %Y', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    try:
        # Wait a moment for dynamic content to load
        page.wait_for_timeout(500)

        # FIRST: Look for data-created attribute (Unix timestamp)
        elements = page.locator('[data-created]').all()
        self.log(f"Found {len(elements)} elements with data-created attribute", "debug")
        post_date = from_data_created(elements)
        if post_date is not None:
            return post_date

        # If no data-created found, wait a bit more and try again
        if len(elements) == 0:
            self.log("No data-created elements found, waiting for dynamic content...", "debug")
            try:
                page.wait_for_selector('[data-created]', timeout=2000)
                elements = page.locator('[data-created]').all()
                self.log(f"After waiting for selector: found {len(elements)} elements with data-created", "debug")
            except Exception:
                # Still try one more time with a longer wait
                page.wait_for_timeout(1500)
                elements = page.locator('[data-created]').all()
                self.log(f"After timeout wait: found {len(elements)} elements with data-created", "debug")
            post_date = from_data_created(elements, " after wait")
            if post_date is not None:
                return post_date

        # Fallback: Look for other date elements
        date_selectors = [
            'time[datetime]',
            'time',
            '.date',
            '[datetime]',
            'span.date',
            'div.date'
        ]
        for selector in date_selectors:
            elem = page.locator(selector).first
            if elem.count() == 0:
                continue
            # Try datetime attribute first
            datetime_str = elem.get_attribute('datetime')
            if datetime_str:
                post_date = from_datetime_attr(datetime_str)
                if post_date is not None:
                    return post_date
            # Then the visible text
            text = elem.text_content()
            if text:
                post_date = from_text(text)
                if post_date is not None:
                    return post_date
    except Exception as e:
        self.log(f"Error extracting date: {e}", "debug")
    return None
def _scan_existing_files(self, output_dir: Path, profile_name: str):
    """Refresh self.downloaded_files from the files already present in output_dir.

    Delegates to scan_existing_files_for_media_ids (non-recursive,
    min_file_size=20000) and logs the count when anything was found.
    """
    found_ids = scan_existing_files_for_media_ids(
        output_dir, profile_name, min_file_size=20000, recursive=False
    )
    self.downloaded_files = found_ids
    if not found_ids:
        return
    self.log(f"Found {len(found_ids)} existing media IDs for {profile_name}", "debug")
def _is_already_downloaded(self, media_id: str) -> bool:
    """Tell whether media_id is already recorded, using the shared Instagram lookup.

    Always False when database tracking is disabled or no database is attached.
    """
    database_ready = self.use_database and self.unified_db
    if not database_ready:
        return False
    # Centralized function keeps detection consistent across modules
    return is_instagram_downloaded(self.unified_db, media_id)
def _record_download(self, media_id: str, username: str, filename: str,
                     url: str = None, post_date=None, file_path: str = None,
                     content_type: str = 'post', metadata: dict = None,
                     deferred: bool = False):
    """Record a successful download, either immediately or deferred.

    Args:
        deferred: If True, nothing is written to the database; the details are
            queued on self.pending_downloads for recording after the file move
            is complete.

    Returns:
        True when queued (deferred); otherwise the result of the shared
        record_instagram_download helper, or False when the database is
        disabled or recording raised.
    """
    if deferred:
        # Queue for later; post_date is stored as an ISO string
        queued_entry = {
            'media_id': media_id,
            'username': username,
            'filename': filename,
            'url': url,
            'post_date': post_date.isoformat() if post_date else None,
            'file_path': file_path,
            'content_type': content_type,
            'metadata': metadata
        }
        self.pending_downloads.append(queued_entry)
        self.log(f"Deferred recording for {media_id}", "debug")
        return True

    if not (self.use_database and self.unified_db):
        return False

    try:
        # Centralized function keeps storage consistent across modules
        outcome = record_instagram_download(
            db=self.unified_db,
            media_id=media_id,
            username=username,
            content_type=content_type,
            filename=filename,
            url=url,
            post_date=post_date,
            file_path=file_path,
            method='imginn',
            extra_metadata=metadata
        )
        if outcome:
            self.log(f"Recorded download for {media_id}", "debug")
        return outcome
    except Exception as e:
        self.log(f"Failed to record download: {e}", "debug")
        return False
def get_pending_downloads(self):
    """Return the downloads queued for deferred database recording.

    Returns:
        A shallow copy of the queue, so callers can iterate it while the
        downloader keeps appending.
    """
    snapshot = list(self.pending_downloads)
    return snapshot
def clear_pending_downloads(self):
    """Start a fresh, empty pending-downloads queue once the old one has been recorded."""
    # Rebinds to a new list; a list previously handed out is left untouched
    self.pending_downloads = list()
def _get_processed_posts(self, username: str) -> set:
    """Collect identifiers of Instagram posts/stories already recorded in the database.

    NOTE: Checks ALL Instagram downloads globally, not just this user's,
    because the same post can appear on multiple profiles (shared posts, tags,
    reposts). username is only used in the log message.

    Returns:
        Set containing, per row: the full URL, the post ID from /p/<id> URLs,
        the media ID parsed from story filenames, and post_id / media_id /
        media_id_full from the JSON metadata. Empty when no database is
        attached or the query fails.
    """
    processed = set()
    if not self.unified_db:
        return processed

    def add_media_id(media_id):
        """Add media_id plus its normalized Instagram media ID when that differs."""
        processed.add(media_id)
        normalized_id = extract_instagram_media_id(media_id)
        if normalized_id and normalized_id != media_id:
            processed.add(normalized_id)

    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            # Get all Instagram posts globally (same post can appear on multiple profiles)
            cursor.execute('''
                SELECT url, filename, metadata FROM downloads
                WHERE platform = 'instagram'
            ''')
            for url, filename, metadata_str in cursor.fetchall():
                if url:
                    # Full URL, plus the bare post ID for backward compatibility.
                    # The trailing slash is optional, matching
                    # _extract_media_id_from_url; '?'/'#' never join the ID.
                    processed.add(url)
                    match = re.search(r'/p/([^/?#]+)', url)
                    if match:
                        processed.add(match.group(1))

                # For stories, extract media_id from filename.
                # Format: username_YYYYMMDD_HHMMSS_MEDIAID_storyN.ext
                if filename and '_story' in filename:
                    pre_story = filename.split('_story')[0]
                    # Anchor on the timestamp so usernames containing
                    # underscores do not shift the media ID
                    stamp = re.search(r'_\d{8}_\d{6}_(.+)$', pre_story)
                    if stamp:
                        add_media_id(stamp.group(1))
                    else:
                        # Fallback: skip the first 3 underscore-separated parts
                        id_parts = pre_story.split('_')
                        if len(id_parts) > 3:
                            add_media_id('_'.join(id_parts[3:]))

                # Also check metadata for IDs
                if metadata_str:
                    try:
                        metadata = json.loads(metadata_str)
                        if 'post_id' in metadata:
                            processed.add(metadata['post_id'])
                        if 'media_id' in metadata:
                            add_media_id(metadata['media_id'])
                        if 'media_id_full' in metadata:
                            processed.add(metadata['media_id_full'])
                    except Exception:
                        # Deliberately best-effort: a malformed metadata blob
                        # must not stop the remaining rows from loading
                        pass
        if processed:
            self.log(f"Found {len(processed)} processed posts in database for {username}", "debug")
    except Exception as e:
        self.log(f"Error loading processed posts from database: {e}", "debug")
    return processed
def save_cookies(self, context):
    """Persist the browser context's cookies to the database, else to the cookie file.

    The file is written only when there is no database or the database save
    failed, and only if a cookie file is configured.
    NOTE(review): the database save tags the cookies with self.user_agent,
    while _start_browser may have created the context with a stored user
    agent; confirm this cannot overwrite the stored one.
    """
    cookies = context.cookies()

    database = self.unified_db
    if database:
        try:
            # CRITICAL: Include user_agent for cf_clearance cookies to work
            database.save_scraper_cookies(
                self.scraper_id,
                cookies,
                user_agent=self.user_agent,
                merge=True
            )
            self.log(f"Saved {len(cookies)} cookies to database", "debug")
        except Exception as e:
            self.log(f"Error saving cookies to database: {e}", "warning")
        else:
            return

    # Fallback to file-based storage
    if not self.cookie_file:
        return
    with open(self.cookie_file, 'w') as f:
        json.dump({'cookies': cookies, 'timestamp': datetime.now().isoformat()}, f, indent=2)
    self.log(f"Saved {len(cookies)} cookies to file", "debug")
def load_cookies(self, context):
    """Install saved cookies into a browser context (database first, then file).

    Returns:
        True when cookies were added to the context; False when none were
        available, the cookie file is older than 24 hours, or loading from
        the file failed.
    """
    # Chrome-specific keys that Playwright rejects
    unsupported_keys = ('partitionKey', '_crHasCrossSiteAncestor')

    def to_playwright(raw_cookies):
        """Drop unsupported keys and rename FlareSolverr's 'expiry' to 'expires'."""
        converted = []
        for raw in raw_cookies:
            entry = {key: value for key, value in raw.items() if key not in unsupported_keys}
            if 'expiry' in entry and 'expires' not in entry:
                entry['expires'] = entry.pop('expiry')
            converted.append(entry)
        return converted

    # Database is the preferred source
    if self.unified_db:
        try:
            stored = self.unified_db.get_scraper_cookies(self.scraper_id)
            if stored:
                prepared = to_playwright(stored)
                # CRITICAL: Clear existing cookies first so a new cf_clearance
                # takes effect; old cookies may otherwise override new ones
                try:
                    context.clear_cookies()
                    self.log("Cleared existing browser cookies", "debug")
                except Exception as e:
                    self.log(f"Could not clear cookies: {e}", "debug")
                context.add_cookies(prepared)
                self.log(f"Loaded {len(prepared)} cookies from database", "info")
                return True
        except Exception as e:
            self.log(f"Error loading cookies from database: {e}", "warning")

    # File fallback
    cookie_path = self.cookie_file
    if not cookie_path or not cookie_path.exists():
        return False
    try:
        with open(cookie_path, 'r') as f:
            payload = json.load(f)
        # File cookies are only trusted for 24 hours
        age = datetime.now() - datetime.fromisoformat(payload['timestamp'])
        if age > timedelta(hours=24):
            self.log("Cookies expired", "debug")
            return False
        prepared = to_playwright(payload['cookies'])
        context.add_cookies(prepared)
        self.log(f"Loaded {len(prepared)} cookies from file", "info")
        return True
    except Exception as e:
        self.log(f"Failed to load cookies: {e}", "warning")
        return False
def wait_for_cloudflare(self, page):
    """Wait for Cloudflare to auto-solve or page to load - uses CloudflareHandler with ImgInn-specific checks

    Polls the page once per second for up to max_wait iterations. Each
    iteration checks, in order: generic profile content, server-error text,
    Cloudflare interstitial text (triggering FlareSolverr on selected
    iterations), then page-type-specific content markers.

    Args:
        page: Playwright page that has already been navigated.

    Returns:
        True as soon as expected content is detected; False on a detected
        server error or when the loop runs out.

    Raises:
        Re-raises any error from reading page.url / page.content() whose
        message does not contain "navigating".

    NOTE(review): when the stored user agent differs from the browser's, the
    browser is restarted and the local `page` is rebound to self.page; the
    caller's own reference is not updated, so callers should re-read
    self.page after this returns.
    """
    self.log("Waiting for page to load...", "debug")
    max_wait = 120  # Extended wait - Cloudflare challenges can take up to 120 seconds
    flaresolverr_attempts = 0
    max_flaresolverr_attempts = 3
    # One iteration per second; reloads and FlareSolverr calls add to the real elapsed time
    for i in range(max_wait):
        time.sleep(1)
        # Check current URL and content
        try:
            current_url = page.url
            content = page.content().lower()
        except Exception as e:
            # Page is still navigating, wait and try again
            if "navigating" in str(e).lower():
                self.log("Page still navigating, waiting...", "debug")
                continue
            else:
                # Some other error, re-raise it
                raise
        # First check if the actual content is visible (not Cloudflare)
        # ImgInn pages will have profile content when loaded
        if 'imginn' in current_url.lower() and ('posts' in content or 'followers' in content or 'following' in content):
            # We have actual content, not a challenge
            self.log(f"Page loaded successfully after {i+1} seconds", "info")
            return True
        # Check for actual Cloudflare challenge or server error
        # NOTE: 'challenge-platform' is NOT a reliable indicator - it's embedded JS that stays on the page
        # even after successful bypass. Only check for visible interstitial text.
        challenge_indicators = ['checking your browser', 'just a moment', 'verify you are human', 'enable javascript']
        error_indicators = ['internal server error', 'error code 500', 'error code 502', 'error code 503']
        has_challenge = any(indicator in content for indicator in challenge_indicators)
        has_error = any(indicator in content for indicator in error_indicators)
        if has_error:
            self.log("Server error detected (500/502/503) - site is likely down", "error")
            # Save screenshot for debugging
            try:
                # Relative path: resolved against the process working directory
                debug_dir = Path("debug")
                debug_dir.mkdir(exist_ok=True)
                screenshot_path = debug_dir / f"server_error_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
                page.screenshot(path=str(screenshot_path))
                self.log(f"Screenshot saved to {screenshot_path}", "debug")
            except Exception:
                pass
            return False
        if has_challenge:
            # Try FlareSolverr at specific intervals (0s, 15s, 30s)
            # Note: Turnstile checkbox clicking doesn't work - it's designed to block automation
            # The attempt cap only applies to iterations 15 and 30; iteration 0 always tries
            if i == 0 or (i in [15, 30] and flaresolverr_attempts < max_flaresolverr_attempts):
                flaresolverr_attempts += 1
                self.log(f"Cloudflare challenge detected, attempting FlareSolverr bypass (attempt {flaresolverr_attempts})...", "info")
                # Get current browser user_agent for comparison
                current_browser_ua = None
                try:
                    current_browser_ua = page.evaluate('() => navigator.userAgent')
                except Exception:
                    pass
                # Try to get fresh cookies via FlareSolverr
                if self._get_cookies_via_flaresolverr(page.url):
                    self.log("Got fresh cookies, reloading page...", "info")
                    # Check if user_agent changed - if so, restart browser
                    new_ua = None
                    try:
                        # With no database attached this raises and is only logged below
                        new_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                        self.log(f"Stored cookie UA: {new_ua[:60] if new_ua else 'None'}...", "debug")
                        self.log(f"Browser UA: {current_browser_ua[:60] if current_browser_ua else 'None'}...", "debug")
                    except Exception as e:
                        self.log(f"Error getting stored UA: {e}", "debug")
                    if new_ua and current_browser_ua and new_ua != current_browser_ua:
                        self.log("User-agent changed, restarting browser with new fingerprint...", "info")
                        self._stop_browser()
                        self._start_browser()
                        # From here on this function polls the new page object
                        page = self.page
                        try:
                            page.goto(current_url, wait_until='domcontentloaded', timeout=30000)
                        except Exception as e:
                            self.log(f"Error navigating after browser restart: {e}", "debug")
                    else:
                        # Reload cookies in browser context
                        try:
                            self.load_cookies(self.context)
                            # Reload the page with new cookies
                            page.reload(wait_until='domcontentloaded', timeout=10000)
                            # CRITICAL: Wait for Cloudflare background JS validation (5-7 seconds)
                            wait_time = 5 + random.uniform(0, 2)
                            self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
                            time.sleep(wait_time)
                        except Exception as e:
                            self.log(f"Error reloading page with new cookies: {e}", "debug")
                else:
                    self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning")
            # Continue waiting for challenge to resolve
            # (this skips the content checks and status logging below for this iteration)
            continue
        # Check if we're on the correct page with content
        if '/p/' in current_url:  # Post page
            # Look for download button or image
            if 'download' in content or 'data-created' in content:
                self.log(f"Post page loaded after {i+1} seconds", "info")
                return True
        elif '/stories/' in current_url:  # Stories page
            # Stories pages have swiper, reels, or story content
            if 'swiper' in content or 'data-uid' in content or 'reel' in content:
                self.log(f"Stories page loaded after {i+1} seconds", "info")
                return True
            # Also check for counter/profile info which is on stories pages too
            if 'counter-item' in content or ('posts' in content and 'followers' in content):
                self.log(f"Stories page loaded after {i+1} seconds", "info")
                return True
        elif '/tagged/' in current_url:  # Tagged page
            # Tagged pages have items grid
            if 'class="item"' in content or 'data-uid' in content:
                self.log(f"Tagged page loaded after {i+1} seconds", "info")
                return True
            if 'posts' in content and 'followers' in content:
                self.log(f"Tagged page loaded after {i+1} seconds", "info")
                return True
        else:  # Profile page
            # Check if profile content is visible - ImgInn specific
            if 'imginn' in current_url.lower():
                if ('posts' in content and 'followers' in content) or 'following' in content:
                    self.log(f"Profile page loaded after {i+1} seconds", "info")
                    return True
            # Also check for actual post links
            if 'href="/p/' in content or 'class="item"' in content:
                self.log(f"Profile page loaded after {i+1} seconds", "info")
                return True
        # Debug: Log what we're seeing if we've been waiting a while
        if i == 15:
            self.log(f"Debug: URL={current_url[:50]}, has posts={('posts' in content)}, has swiper={('swiper' in content)}", "debug")
        # Status updates (only if we haven't detected content yet)
        if i == 10:
            self.log("Still waiting (10s)... page loading", "debug")
        elif i == 20:
            self.log("Still waiting (20s)... page not ready yet", "info")
        elif i == 30:
            self.log("Still waiting (30s)... slow response from server", "info")
        elif i == 45:
            self.log("Still waiting (45s)... checking if blocked", "info")
        elif i == 60:
            self.log("Still waiting (60s)... page load is slow", "warning")
        elif i == 90:
            self.log("Still waiting (90s)... this is taking too long", "warning")
    # Timeout reached - page didn't load
    # Failures accumulate per instance; the log level escalates at the threshold
    self._page_load_failures += 1
    level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning"
    self.log(f"Page load timeout ({self._page_load_failures}x this session). URL: {page.url}", level)
    # Save screenshot for debugging
    try:
        debug_dir = Path("debug")
        debug_dir.mkdir(exist_ok=True)
        screenshot_path = debug_dir / f"cloudflare_block_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
        page.screenshot(path=str(screenshot_path))
        self.log(f"Screenshot saved to {screenshot_path}", "debug")
    except Exception:
        pass
    return False
def _dismiss_consent_dialog(self, page):
    """Best-effort removal of the cookie consent / GDPR overlay (Google FundingChoices).

    Clicks the first matching consent/dismiss button when one exists and is
    visible. Otherwise, if an overlay root element is present, the overlay
    nodes are removed from the DOM via JavaScript. Any exception raised
    along the way is swallowed so callers are never interrupted.

    Args:
        page: Playwright page object
    """
    button_selectors = (
        'button.fc-cta-consent',            # "Consent" button
        'button.fc-cta-do-not-consent',     # "Do not consent" button
        'button[aria-label="Consent"]',
        'button.fc-dismiss-button',         # Dismiss/close button
        '.fc-dialog button.fc-primary-button',
    )
    try:
        button = page.locator(', '.join(button_selectors)).first
        if button.count() > 0 and button.is_visible():
            button.click(force=True)
            self.log("Dismissed consent dialog", "debug")
            time.sleep(0.5)
        elif page.locator('.fc-consent-root, .fc-dialog-overlay').first.count() > 0:
            # No usable button: strip the overlay nodes out of the DOM instead
            page.evaluate("document.querySelectorAll('.fc-consent-root, .fc-dialog-overlay, .fc-dialog-container').forEach(el => el.remove())")
            self.log("Removed consent overlay via JS", "debug")
    except Exception:
        # Deliberately silent: dismissing the dialog is optional
        pass
def _safe_go_back(self, page, username: str, tagged: bool = False):
    """Return to the profile listing without letting navigation errors escape.

    Browser history is tried first (10 s timeout). If that raises, the
    profile URL - or the tagged URL when ``tagged`` is set - is loaded
    directly (15 s timeout). A failure of the direct load is logged as a
    warning and not re-raised.

    Args:
        page: Playwright page object
        username: Profile whose listing page should be restored
        tagged: If True, fall back to the profile's tagged listing
    """
    try:
        page.go_back(timeout=10000)
        return
    except Exception:
        self.log("go_back timed out, navigating directly to profile", "debug")

    # History navigation failed - load the listing page explicitly
    try:
        listing_path = "/tagged/?ref=index" if tagged else "/?ref=index"
        listing_url = f"https://imginn.com/{username}{listing_path}"
        page.goto(listing_url, timeout=15000)
    except Exception as nav_err:
        self.log(f"Direct navigation back also failed: {nav_err}", "warning")
def _is_cloudflare_challenge(self, page) -> bool:
    """Report whether the current page looks like a Cloudflare challenge.

    The lowercased page title and the first 2000 characters of the
    lowercased page HTML are scanned for known challenge phrases.

    Args:
        page: Playwright page object

    Returns:
        True if any challenge phrase is found in the title or the inspected
        HTML prefix; False otherwise, including when reading the page raises.
    """
    markers = (
        'just a moment',
        'checking your browser',
        'verify you are human',
        'enable javascript',
        'cloudflare',
    )
    try:
        page_title = page.title().lower()
        html_prefix = page.content().lower()[:2000]  # Only the first 2000 chars are inspected
        return any(marker in page_title or marker in html_prefix for marker in markers)
    except Exception:
        return False
def _handle_cloudflare_on_post(self, page, post_url: str, max_retries: int = 2) -> bool:
    """Try to get past a Cloudflare challenge that appeared on a post page.

    Returns immediately when no challenge is present. Otherwise, for up to
    ``max_retries`` attempts it sleeps for a randomized cool-down, requests
    fresh cookies through FlareSolverr, restarts the browser when the stored
    user-agent differs from the browser's, reloads cookies into the browser
    context and navigates to ``post_url`` again.

    Args:
        page: Playwright page object
        post_url: URL of the post to retry
        max_retries: Maximum number of retry attempts

    Returns:
        True if the page is free of a Cloudflare challenge, False if it is
        still blocked after all attempts
    """
    if not self._is_cloudflare_challenge(page):
        return True  # Nothing to bypass

    self.log("Cloudflare challenge detected on post page, attempting bypass...", "warning")

    for attempt in range(max_retries):
        attempt_no = attempt + 1

        # Cool down before asking FlareSolverr; later attempts wait longer
        cooldown = random.uniform(15, 25) if attempt == 0 else random.uniform(30, 60)
        self.log(f"Waiting {cooldown:.1f}s before FlareSolverr attempt {attempt_no}...", "info")
        time.sleep(cooldown)

        # Fresh cookies are requested for the post URL itself
        if not self._get_cookies_via_flaresolverr(post_url):
            self.log(f"FlareSolverr failed (attempt {attempt_no})", "warning")
            continue

        self.log(f"Got fresh cookies (attempt {attempt_no}), reloading post...", "info")

        # A changed user-agent requires a browser restart
        try:
            current_browser_ua = page.evaluate('() => navigator.userAgent')
            new_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
            if new_ua and current_browser_ua and new_ua != current_browser_ua:
                self.log("User-agent changed, restarting browser...", "info")
                self._stop_browser()
                self._start_browser()
                # NOTE(review): this rebinds only the local name; a caller that keeps
                # its own page reference would still hold the pre-restart page -
                # confirm callers re-read self.page after this method returns.
                page = self.page
        except Exception as e:
            self.log(f"Error checking user_agent: {e}", "debug")

        # Push the refreshed cookies into the browser context
        try:
            self.load_cookies(self.context)
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "debug")

        # Retry the post itself
        try:
            page.goto(post_url, wait_until='domcontentloaded', timeout=30000)

            # Give Cloudflare's background JS validation 5-7 seconds to finish
            settle_time = 5 + random.uniform(0, 2)
            self.log(f"Waiting {settle_time:.1f}s for Cloudflare background validation...", "debug")
            time.sleep(settle_time)

            if self._is_cloudflare_challenge(page):
                self.log(f"Still blocked after retry {attempt_no}", "warning")
                continue

            self.log("Cloudflare bypass successful on post page", "info")
            # Persist browser cookies so anything set during validation is kept
            try:
                self.save_cookies(self.context)
                self.log("Saved browser cookies after successful bypass", "debug")
            except Exception as e:
                self.log(f"Error saving cookies after bypass: {e}", "debug")
            return True
        except Exception as e:
            self.log(f"Navigation failed after cookie refresh: {e}", "warning")

    self.log("Failed to bypass Cloudflare on post page after all retries", "error")
    return False
def _check_post_phrases(self, page, phrase_config: dict) -> bool:
    """Check if post contains required phrases

    Text is collected from the first element matching each caption selector
    (its text content, or its ``content`` attribute when the text is empty)
    plus the first main-content container, whitespace-normalized, and then
    searched for the configured phrases by substring match.

    Args:
        page: Playwright page object
        phrase_config: Phrase search configuration
            {
                'phrases': list of phrases to search for,
                'case_sensitive': bool,
                'match_all': bool (True = all phrases must match, False = any phrase)
            }

    Returns:
        True if post matches phrase criteria (or no phrases are configured),
        False otherwise - including when no text could be extracted or an
        unexpected error occurs
    """
    try:
        phrases = phrase_config.get('phrases', [])
        if not phrases:
            # No phrases to match = match all. Decided before touching the DOM
            # so a post without extractable text is not rejected by mistake.
            return True

        # Get post caption/text
        caption_selectors = [
            '.caption',
            '.post-caption',
            'meta[property="og:description"]',
            'meta[name="description"]',
            '.content',
            'div[class*="caption"]',
            'span[class*="caption"]'
        ]
        fragments = []
        for selector in caption_selectors:
            try:
                element = page.locator(selector).first
                if element.count() > 0:
                    # Fall back to the content attribute when the element has no text
                    text = element.text_content() or element.get_attribute('content') or ""
                    if text:
                        fragments.append(text)
            except Exception:
                continue

        # Also check visible text in the main content area
        try:
            main_content = page.locator('main, article, .post-content, div[role="main"]').first
            if main_content.count() > 0:
                fragments.append(main_content.text_content() or "")
        except Exception:
            pass

        # Normalize whitespace BEFORE the emptiness check so whitespace-only
        # text is treated as "nothing extracted"
        post_text = ' '.join(' '.join(fragments).split())
        if not post_text:
            self.log("Could not extract post text for phrase matching", "warning")
            return False

        case_sensitive = phrase_config.get('case_sensitive', False)
        match_all = phrase_config.get('match_all', False)

        if not case_sensitive:
            post_text = post_text.lower()
            phrases = [p.lower() for p in phrases]

        self.log(f"Checking post text ({len(post_text)} chars) for phrases: {phrases}", "debug")

        # Check phrase matching
        matches = [phrase for phrase in phrases if phrase in post_text]
        for phrase in matches:
            self.log(f"Found phrase: '{phrase}'", "debug")

        if match_all:
            # All phrases must be found
            result = len(matches) == len(phrases)
            if not result:
                missing = [p for p in phrases if p not in matches]
                self.log(f"Missing required phrases: {missing}", "debug")
        else:
            # At least one phrase must be found
            result = len(matches) > 0
            if not result:
                self.log("No matching phrases found", "debug")

        return result

    except Exception as e:
        self.log(f"Error checking post phrases: {e}", "error")
        return False
def download(self, username: str, content_type: str = "posts", days_back: int = 14, max_downloads: int = 50, output_dir: str = None, phrase_config: dict = None, defer_database: bool = False):
    """Entry point compatible with the media-downloader interface.

    Resets the per-run tracking state, resolves the per-user output folder
    and hands off to the downloader matching ``content_type``.

    Args:
        username: Instagram username
        content_type: Type of content ("posts", "stories", or "tagged")
        days_back: How many days back to search
        max_downloads: Maximum posts/stories to download
        output_dir: Base output directory; the username is appended to it.
                    When omitted, /opt/media-downloader/downloads/<username> is used
        phrase_config: Optional phrase search configuration (forwarded for
                       "posts" and "tagged" only)
            {
                'enabled': bool,
                'phrases': list of phrases to search for,
                'case_sensitive': bool,
                'match_all': bool (True = all phrases must match, False = any phrase)
            }
        defer_database: If True, defer database recording to pending_downloads list
                        for later recording after file move is complete

    Returns:
        Number of items in the list returned by the selected downloader, or 0
        when the content type is not supported
    """
    # Fresh state per account: avoids memory growth and stale deferred records
    self.downloaded_files.clear()
    self.pending_downloads = []

    target_dir = (
        Path(output_dir) / username
        if output_dir
        else Path(f"/opt/media-downloader/downloads/{username}")
    )

    # Keyword arguments every downloader receives
    shared_kwargs = {
        'username': username,
        'days_back': days_back,
        'output_dir': target_dir,
        'defer_database': defer_database,
    }

    if content_type == "posts":
        saved = self.download_posts(max_posts=max_downloads, phrase_config=phrase_config, **shared_kwargs)
    elif content_type == "stories":
        saved = self.download_stories(max_stories=max_downloads, **shared_kwargs)
    elif content_type == "tagged":
        saved = self.download_tagged(max_posts=max_downloads, phrase_config=phrase_config, **shared_kwargs)
    else:
        self.log(f"ImgInn does not support content type: {content_type}", "warning")
        return 0

    return len(saved)
def download_posts(self, username: str, days_back: int = 14, max_posts: int = 50, specific_post_url: str = None, output_dir: Path = None, phrase_config: dict = None, skip_database: bool = False, max_age_hours: int = None, defer_database: bool = False):
"""Download posts from a user with FastDL naming
Args:
username: Instagram username
days_back: How many days back to search
max_posts: Maximum posts to check
specific_post_url: Download a specific post
output_dir: Output directory
phrase_config: Optional phrase search configuration
skip_database: If True, don't record downloads in database (for temporary processing)
max_age_hours: If specified, only download posts newer than N hours (overrides days_back)
defer_database: If True, defer database recording to pending_downloads list
for later recording after file move is complete
"""
# Rate limiting to avoid Cloudflare blocks
self._enforce_rate_limit("posts")
profile_name = username.lower()
if output_dir is None:
output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
output_dir.mkdir(parents=True, exist_ok=True)
# Check site status before doing anything else
self.log("Checking ImgInn site status...", "debug")
site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10)
if self.cf_handler.should_skip_download(site_status):
self.log(f"Skipping download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning")
self.activity_manager.update_status(f"Skipped - ImgInn unavailable ({error_msg})")
return []
elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
self.log("Cloudflare challenge detected, will attempt bypass during download", "info")
# Scan existing files
self._scan_existing_files(output_dir, profile_name)
# Get processed posts from database
processed_posts = self._get_processed_posts(profile_name)
self.log(f"Loaded {len(processed_posts)} processed posts for {profile_name} from database", "info")
if len(processed_posts) > 0 and len(processed_posts) < 20:
self.log(f"Processed post IDs: {processed_posts}", "debug")
downloaded_files = []
# Use max_age_hours if specified, otherwise use days_back
if max_age_hours is not None:
cutoff_date = datetime.now() - timedelta(hours=max_age_hours)
else:
cutoff_date = datetime.now() - timedelta(days=days_back)
# Update activity status
if specific_post_url and profile_name == 'unknown':
self.activity_manager.update_status(f"Fetching post...")
else:
self.activity_manager.update_status("Checking posts")
# Start or reuse browser
self._start_browser()
page = self.page
try:
# If specific post URL provided, go directly to it
if specific_post_url:
self.log(f"Navigating to specific post", "info")
page.goto(specific_post_url, wait_until='domcontentloaded')
else:
# Navigate to profile
self.log(f"Navigating to @{username} profile", "info")
page.goto(f"https://imginn.com/{username}/?ref=index", wait_until='domcontentloaded')
# CRITICAL: Wait 5-7 seconds for Cloudflare background JS challenges to complete
# Per browserless.io: "Allow 5+ seconds post-page load for background JavaScript challenges"
import random
wait_time = 5 + random.uniform(0, 2) # 5-7 seconds
self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
time.sleep(wait_time)
# Wait for page to load
if not self.wait_for_cloudflare(page):
self._page_load_failures += 1
level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning"
self.log(f"Page didn't load properly ({self._page_load_failures}x this session)", level)
return []
# Save cookies
self.save_cookies(self.context)
# Wait for JavaScript to load posts (ImgInn loads posts dynamically)
self.log("Waiting for posts to load via JavaScript...", "info")
try:
# Wait for post links to appear (up to 10 seconds)
page.wait_for_selector('a[href*="/p/"]', timeout=10000)
self.log("Posts loaded successfully", "info")
except Exception:
# Timeout - posts might not exist, or page structure changed
self.log("Timeout waiting for posts to appear", "warning")
time.sleep(2) # Give it a bit more time anyway
# If specific post, process it directly
if specific_post_url:
self.log("Processing specific post", "info")
# Extract media ID from URL
media_id = self._extract_media_id_from_url(specific_post_url)
if not media_id:
self.log("Could not extract media ID", "warning")
return []
self.log(f"URL Media ID: {media_id}", "debug")
# Process this single post (bypass date filter for specific posts)
post_links = [None] # Dummy list for iteration
bypass_date_filter = True
else:
# Find posts on profile page
self.log("Finding posts...", "info")
# Debug: Check what's actually on the page
page_content = page.content()
if 'no posts' in page_content.lower() or 'page not found' in page_content.lower():
self.log("Page shows 'no posts' or 'not found'", "warning")
post_links = page.locator('a[href*="/p/"]').all()
self.log(f"Found {len(post_links)} posts", "info")
if not post_links:
# Debug: Save screenshot to see what's wrong
try:
screenshot_path = Path(f"/tmp/imginn_no_posts_{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
page.screenshot(path=str(screenshot_path))
self.log(f"No posts found - screenshot saved to {screenshot_path}", "warning")
except Exception:
pass
self.log("No posts found", "warning")
return []
bypass_date_filter = False
self.log(f"Processing posts (max {max_posts})", "info")
# Collect all post URLs upfront to avoid stale element issues
post_urls_to_process = []
if not specific_post_url:
for idx, pl in enumerate(post_links[:max_posts]):
try:
href = pl.get_attribute('href', timeout=5000)
if href:
if not href.startswith('http'):
href = f"https://imginn.com{href}"
post_urls_to_process.append(href)
except Exception as e:
self.log(f"Post {idx+1}: Failed to get URL: {str(e)[:50]}", "debug")
continue
self.log(f"Collected {len(post_urls_to_process)} post URLs", "debug")
# Track consecutive old posts to handle pinned posts
consecutive_old_posts = 0
max_consecutive_old_posts = 5 # Allow up to 5 old posts (pinned) before stopping
# Set initial progress so dashboard shows 0/N immediately
total_posts = len(post_urls_to_process) if not specific_post_url else 1
self.activity_manager.update_status(
"Downloading posts",
progress_current=0,
progress_total=total_posts
)
for i, post_url in enumerate(post_urls_to_process if not specific_post_url else [specific_post_url]):
# Update progress at start of each iteration (fires even on skips)
self.activity_manager.update_status(
"Downloading posts",
progress_current=i + 1,
progress_total=total_posts
)
try:
# Handle specific post vs regular posts
if specific_post_url:
# Already on the specific post page
post_url = specific_post_url
media_id = self._extract_media_id_from_url(post_url)
else:
# URL already collected and formatted
media_id = self._extract_media_id_from_url(post_url)
if not media_id:
self.log(f"Post {i+1}: Could not extract media ID", "warning")
continue
# Check if post was already processed (from database)
if media_id in processed_posts:
# Skip if in database - trust the database tracking
self.log(f"Post {i+1}: {media_id} already processed (database), skipping", "debug")
continue
# Rate limiting between post downloads to avoid Cloudflare blocks
if i > 0:
post_delay = random.uniform(3, 8)
self.log(f"Rate limit: waiting {post_delay:.1f}s before post {i+1}", "debug")
time.sleep(post_delay)
self.log(f"Post {i+1}: Processing {media_id}", "info")
# Navigate directly to post URL (more reliable than clicking which can timeout)
try:
page.goto(post_url, wait_until='domcontentloaded', timeout=30000)
except Exception as nav_err:
self.log(f"Post {i+1}: Navigation failed: {nav_err}", "warning")
continue
# Wait for page to load
time.sleep(2)
# Wait for navigation to complete
try:
page.wait_for_load_state('networkidle', timeout=5000)
except Exception:
# Continue even if network isn't idle - page might still be usable
self.log("Network didn't idle, but continuing", "debug")
# Check if on post page
if "/p/" not in page.url:
self.log(f"Not a downloadable post (URL: {page.url})", "warning")
self._safe_go_back(page, username)
continue
# IMPORTANT: Wait for post page content to fully render
# This ensures download buttons are from the POST PAGE, not profile page preview
try:
# Wait for the post container to be visible (imginn uses main-content now)
page.wait_for_selector('div.main-content, div.post, div.content, div.single-post', timeout=3000)
time.sleep(1) # Additional wait for download buttons to render
except Exception:
self.log("Post container not found, checking for Cloudflare...", "debug")
# Check for Cloudflare challenge and handle it
cloudflare_bypassed = False
if self._is_cloudflare_challenge(page):
self.log(f"Cloudflare challenge detected on post {media_id}", "warning")
if not self._handle_cloudflare_on_post(page, post_url):
# Cloudflare bypass failed - skip this post WITHOUT marking as processed
# so it can be retried on next run
self.log(f"Skipping post {media_id} due to Cloudflare block (will retry later)", "warning")
try:
page.goto(f"https://imginn.com/{username}/?ref=index")
time.sleep(3)
except Exception:
pass
continue
cloudflare_bypassed = True
self.log(f"Navigated to post page: {page.url}", "debug")
self._dismiss_consent_dialog(page)
# Extract actual username from post page if we don't have it (e.g., specific_post_url with unknown user)
if profile_name == 'unknown' or specific_post_url:
try:
username_elem = page.locator('div.username a').first
if username_elem.count() > 0:
username_href = username_elem.get_attribute('href')
if username_href:
# Extract username from href like "/evalongoria/" -> "evalongoria"
extracted_username = username_href.strip('/').lower()
if extracted_username and extracted_username != 'unknown':
profile_name = extracted_username
self.log(f"Extracted username from post page: @{profile_name}", "info")
# Update activity status with real username
self.activity_manager.update_status("Downloading posts")
except Exception as e:
self.log(f"Could not extract username from post page: {e}", "debug")
# Extract post date - ALWAYS extract for proper file naming
post_date = self._extract_post_date(page)
# Use post date for filename, or current date
if post_date:
date_str = post_date.strftime('%Y%m%d_%H%M%S')
self.log(f"Original post date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
else:
date_str = datetime.now().strftime('%Y%m%d_%H%M%S')
self.log(f"No original date found, using current time", "debug")
# Check date filter AFTER extracting date (bypass for specific posts)
if not bypass_date_filter and post_date and post_date < cutoff_date:
consecutive_old_posts += 1
self.log(f"Post too old ({post_date.strftime('%Y-%m-%d')}), skipping (consecutive old: {consecutive_old_posts}/{max_consecutive_old_posts})", "info")
# Mark this old post as checked in database to avoid re-checking
# Only mark if doing phrase search (has phrase_config)
if phrase_config and media_id:
self._record_download(
media_id=media_id,
username=profile_name,
filename=f"_old_post_{media_id}",
url=post_url,
post_date=post_date,
content_type='post',
metadata={'marker': True, 'reason': 'old_post'}
)
self._safe_go_back(page, username)
# Stop only after 5 consecutive old posts (handles pinned posts at top)
if consecutive_old_posts >= max_consecutive_old_posts:
self.log(f"Found {consecutive_old_posts} consecutive old posts - stopping", "info")
break
else:
continue # Skip this old post but keep checking (might be pinned)
# Reset consecutive old posts counter - we found a post within date range
consecutive_old_posts = 0
# Check for phrase matching if configured
if phrase_config and phrase_config.get('enabled'):
if not self._check_post_phrases(page, phrase_config):
self.log(f"Post does not match phrase criteria, skipping download", "info")
# Mark this post as checked (but not downloaded) in database
# This prevents re-checking the same post every run
if media_id:
self._record_download(
media_id=media_id,
username=profile_name,
filename=f"_phrase_checked_{media_id}",
url=post_url,
post_date=post_date,
content_type='post',
metadata={'marker': True, 'reason': 'phrase_checked'}
)
self._safe_go_back(page, username)
continue
else:
self.log(f"Post matches phrase criteria, using high-res download", "info")
# Check for carousel
carousel_next = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
has_carousel = carousel_next.count() > 0
if has_carousel:
self.log(f"Carousel detected - will download all carousel images", "info")
self._dismiss_consent_dialog(page)
# CRITICAL: Wait for POST PAGE carousel download buttons to be ready
# This prevents downloading from the profile page preview
try:
# Wait for download buttons with POST PAGE URLs (have "scontent" or "post" in them)
page.wait_for_selector('a.btn[href*="scontent"], a[download], a.download', timeout=3000)
time.sleep(1.5) # Additional wait for all carousel images to load
self.log("Carousel download buttons ready on post page", "debug")
except Exception:
self.log("Download buttons not found, but continuing", "debug")
else:
self.log("Single image post", "debug")
# Handle downloads - always use download buttons from post page
image_count = 0
max_images = 10
# Download images (carousel or single)
if has_carousel:
# First, let's find all carousel slides
all_slides = page.locator('.swiper-slide').all()
self.log(f"Found {len(all_slides)} carousel slides", "debug")
# Download each slide's image
for slide_index in range(min(len(all_slides), max_images)):
self.log(f"Processing carousel slide {slide_index + 1}/{len(all_slides)}", "debug")
# Get the current slide element to scope our searches
current_slide = all_slides[slide_index]
# Click next to navigate to this slide (except for first one)
if slide_index > 0:
next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
if next_btn.count() > 0 and next_btn.is_visible():
try:
next_btn.click(force=True)
except Exception:
self.log(f"Carousel next button click timed out at slide {slide_index + 1}, stopping carousel", "warning")
break
time.sleep(2) # Wait for slide transition and image to load
# First, try to find a download button for this carousel item
# IMPORTANT: Search within CURRENT SLIDE only, not entire page
download_btn = None
download_url = None
webp_fallback_url = None
# Look for download button on the current slide - prefer high-res, fallback to .webp
download_selectors = [
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg
'a.btn[href*="scontent"][href*=".mp4"]', # Video
'a.btn[href*="scontent"]', # Any scontent
'a[download][href*=".jpg"]',
'a[download][href*=".mp4"]',
'a.download',
'a[download]',
'a[href*="/post"]'
]
# Search for download buttons - first try within slide, then try page-level
# Imginn often has download buttons outside the .swiper-slide elements
search_contexts = [current_slide, page]
for search_context in search_contexts:
if download_url: # Already found, skip other contexts
break
for selector in download_selectors:
btn = search_context.locator(selector).first
if btn.count() > 0:
temp_url = btn.get_attribute('href')
if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)':
if not temp_url.startswith('http'):
temp_url = f"https://imginn.com{temp_url}"
# Store .webp as fallback, but keep looking for better
if '.webp' in temp_url.lower():
if not webp_fallback_url:
webp_fallback_url = temp_url
self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
continue
# Found non-.webp link, use it
download_btn = btn
download_url = temp_url
self.log(f"Found high-res download for carousel slide {slide_index + 1}: {download_url[:80]}...", "debug")
break
# Use .webp fallback if no high-res found
used_webp_fallback = False
if not download_url and webp_fallback_url:
download_url = webp_fallback_url
used_webp_fallback = True
self.log(f"Using .webp fallback for carousel slide {slide_index + 1}", "info")
# If we found a download button, use it for high-res
if download_url:
try:
import requests
from urllib.parse import urlparse, unquote
response = requests.get(download_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename and media ID from the actual file
url_path = urlparse(download_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
# The media ID is the filename without extension
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename for carousel image using actual media ID
filename = f"{profile_name}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
# Check for duplicate hash (hash blacklist persists even if original deleted)
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
# Delete the duplicate regardless of whether original file still exists
try:
filepath.unlink()
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
# Log with appropriate quality label
quality_label = "fallback" if used_webp_fallback else "high-res"
self.log(f"Downloaded ({quality_label}): {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
image_count += 1
# Add to tracking
self.downloaded_files.add(actual_media_id)
# Mark in database (or defer for later)
if not skip_database or defer_database:
unique_url = f"{post_url}#{filename}"
self._record_download(
media_id=actual_media_id,
username=profile_name,
filename=filename,
url=unique_url,
post_date=post_date,
file_path=str(filepath),
content_type='post',
deferred=defer_database
)
continue # Skip to next slide
except Exception as e:
self.log(f"Failed to download high-res carousel image {slide_index + 1}: {e}, falling back to standard res", "warning")
# Fallback: Find the current slide's media (img or video) if no download button
# current_slide already defined at top of loop
# Try img first, then video
media_src = None
slide_img = current_slide.locator('img').first
if slide_img.count() > 0:
media_src = slide_img.get_attribute('src')
# If it's a lazy placeholder, wait for it to load
if media_src and 'lazy.jpg' in media_src:
self.log(f"Slide {slide_index + 1} is lazy, waiting for load...", "debug")
# Trigger load by making it visible
current_slide.scroll_into_view_if_needed()
time.sleep(1)
# Get src again
media_src = slide_img.get_attribute('src')
else:
# Check for video tag
slide_video = current_slide.locator('video source, video').first
if slide_video.count() > 0:
media_src = slide_video.get_attribute('src')
self.log(f"Found video for slide {slide_index + 1}", "debug")
if media_src and 'lazy.jpg' not in media_src and '483011604' not in media_src:
self.log(f"Downloading carousel media {slide_index + 1} (standard res): {media_src[:80]}...", "debug")
# Download this media
try:
import requests
from urllib.parse import urlparse, unquote
if not media_src.startswith('http'):
media_src = f"https:{media_src}" if media_src.startswith('//') else f"https://imginn.com{media_src}"
response = requests.get(media_src, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename and media ID from the actual file
url_path = urlparse(media_src).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
# The media ID is the filename without extension
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename for carousel image using actual media ID
filename = f"{profile_name}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
existing_path = PathLib(existing['file_path'])
if existing_path.exists():
self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
try:
filepath.unlink()
self.log(f"Deleted duplicate: {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
self.log(f"Downloaded: {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
image_count += 1
# Add to tracking
self.downloaded_files.add(actual_media_id)
# Mark in database (or defer for later)
if not skip_database or defer_database:
unique_url = f"{post_url}#{filename}"
self._record_download(
media_id=actual_media_id,
username=profile_name,
filename=filename,
url=unique_url,
post_date=post_date,
file_path=str(filepath),
content_type='post',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download carousel media {slide_index + 1}: {e}", "error")
else:
self.log(f"Slide {slide_index + 1} has no valid media (img/video)", "warning")
# Skip the old carousel download logic
pass
# This duplicate block is not needed - single image logic is already handled above
# OLD carousel logic removed - handled above
if False:
# Wait for carousel content to load
time.sleep(1)
# Find download button AND image elements
# ImgInn sometimes has the full image in an img tag, not just download button
download_selectors = [
'a[download]',
'a.download-btn',
'a[href*="scontent"]',
'a[href*="s3.imginn.com"]',
'a.download',
'a[href*="/post"][href*=".jpg"]',
'a[href*="/post"][href*=".mp4"]',
'button.download',
'a.btn-download'
]
# For carousel images, we need to find the actual post image, not the profile thumbnail
# Look for images that are NOT the profile pic and NOT lazy placeholders
img_src = None
# Try to find the carousel image (exclude profile pic and lazy images)
possible_images = page.locator('img[src*="post"], img[src*="scontent"]:not([src*="profile"])').all()
for img_elem in possible_images:
src = img_elem.get_attribute('src')
if src and 'lazy.jpg' not in src and '483011604' not in src: # Exclude profile pic
img_src = src
self.log(f"Found carousel image src: {img_src[:100]}...", "debug")
break
# If no good image found, wait and try again
if not img_src or 'lazy.jpg' in img_src:
time.sleep(1)
# Try once more after waiting
main_image = page.locator('img[src*="post"]:not([src*="lazy"])').first
if main_image.count() > 0:
img_src = main_image.get_attribute('src')
if img_src:
self.log(f"Found carousel image after wait: {img_src[:100]}...", "debug")
download_btn = None
for selector in download_selectors:
btn = page.locator(selector).first
if btn.count() > 0:
download_btn = btn
break
if download_btn and download_btn.count() > 0:
try:
# For ImgInn, we should click the download button to get the full-size image
# The href often points to a thumbnail, not the full image
download_url = download_btn.get_attribute('href')
self.log(f"Download button href: {download_url[:100] if download_url else 'None'}...", "debug")
# Try clicking the button for browser download first
try:
self.log(f"Attempting browser download (clicking button)", "debug")
with page.expect_download(timeout=5000) as download_info:
download_btn.click()
download = download_info.value
original_name = download.suggested_filename
media_id_from_file = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
download_method = 'browser'
response = None
self.log(f"Browser download completed: {original_name}", "debug")
except Exception:
# Fallback to direct download if clicking doesn't work
self.log(f"Browser download failed, trying direct download", "debug")
# For carousels, if no download URL or it's invalid, use image src
if has_carousel and (not download_url or download_url == "None" or download_url == "null"):
if img_src:
self.log(f"No download button for carousel, using image src", "debug")
download_url = img_src
# Be more lenient with download URLs - accept any https URL that looks like it could be an image/video
if download_url and download_url.startswith('http'):
# Make sure it's not just the post page URL
if '/p/' not in download_url or download_url.endswith(('.jpg', '.jpeg', '.png', '.heic', '.mp4', '.webm')):
import requests
response = requests.get(download_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
self.log(f"Downloaded {len(response.content)} bytes", "debug")
download_method = 'direct'
# Extract filename from URL
from urllib.parse import urlparse, unquote
url_path = urlparse(download_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
# Remove 'post' prefix if present
if original_name.startswith('post'):
original_name = original_name[4:]
media_id_from_file = Path(original_name).stem # This is the actual media ID
ext = Path(original_name).suffix or '.jpg'
else:
# Try to use image src instead
if img_src:
self.log(f"Download URL is post page, using image src instead", "debug")
download_url = img_src
if not download_url.startswith('http'):
download_url = f"https://imginn.com{download_url}"
import requests
response = requests.get(download_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
download_method = 'direct'
from urllib.parse import urlparse, unquote
url_path = urlparse(download_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
media_id_from_file = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
else:
raise Exception("No valid download URL found")
else:
raise Exception("No valid download URL found")
# Update our tracked media ID with the correct one from the file
if media_id_from_file:
media_id = media_id_from_file
self.log(f"Media ID from file: {media_id}", "debug")
# For carousels, if we don't get a unique media ID, generate one
normalized_media_id = extract_instagram_media_id(media_id) if media_id else None
if has_carousel and (not media_id or media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files)):
# Generate unique ID for this carousel image
media_id = f"{media_id_base}_carousel_{carousel_index}"
normalized_media_id = extract_instagram_media_id(media_id)
self.log(f"Generated carousel media ID: {media_id}", "debug")
# Check if this media ID is already downloaded (both original and normalized)
if media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files):
self.log(f"Already have {media_id}, skipping download but continuing carousel", "debug")
# Still count this as an image even if skipped
image_count += 1
if has_carousel:
carousel_index += 1
else:
self.log(f"Downloading new file for {media_id}", "debug")
# Build filename with FastDL format
if has_carousel:
# For carousel items, append index (simpler format)
filename = f"{profile_name}_{date_str}_{media_id_base}_{carousel_index}{ext}"
else:
filename = f"{profile_name}_{date_str}_{media_id}{ext}"
filepath = output_dir / filename
# Save the downloaded content
if download_method == 'direct':
with open(filepath, 'wb') as f:
f.write(response.content)
else:
download.save_as(filepath)
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
# Check for duplicate hash (hash blacklist persists even if original deleted)
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
# Delete the duplicate regardless of whether original file still exists
try:
filepath.unlink()
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update file timestamps to match post date
if post_date:
self._update_file_timestamps(filepath, post_date)
self.log(f"Downloaded: {filename}", "info")
downloaded_files.append(str(filepath))
image_count += 1
# Add to tracking
self.downloaded_files.add(media_id)
# Increment carousel index for next image
if has_carousel:
carousel_index += 1
# Mark as downloaded in database (or defer for later)
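# Use per-slide URL for carousels so each slide gets a unique url_hash
# NOTE(review): carousel_index was already incremented just above, so the img_index built here is offset from the index used in the filename - confirm this is intended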
record_url = f"{post_url}?img_index={carousel_index + 1}" if has_carousel else post_url
if not skip_database or defer_database:
self._record_download(
media_id=media_id,
username=profile_name,
filename=filename,
url=record_url,
post_date=post_date,
file_path=str(filepath),
content_type='post',
deferred=defer_database
)
except Exception as e:
self.log(f"Download failed: {e}", "error")
import traceback
self.log(f"Traceback: {traceback.format_exc()}", "debug")
break
else:
# No download button found, try using the image src as fallback
page_url = page.url
self.log(f"No download button found on {page_url}, trying image src", "warning")
# Use the image src we found earlier
if img_src:
try:
self.log(f"Using image src as fallback: {img_src[:100]}...", "debug")
import requests
from urllib.parse import urlparse, unquote
# Ensure full URL
if not img_src.startswith('http'):
img_src = f"https://imginn.com{img_src}"
response = requests.get(img_src, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename from URL
url_path = urlparse(img_src).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename with carousel index if needed
if has_carousel and carousel_index > 1:
filename = f"{profile_name}_{date_str}_{media_id}_{carousel_index}{ext}"
else:
filename = f"{profile_name}_{date_str}_{media_id}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded via image src: {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
# Check for duplicate hash (hash blacklist persists even if original deleted)
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
# Delete the duplicate regardless of whether original file still exists
try:
filepath.unlink()
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count += 1
self.downloaded_files.add(media_id)
# Mark in database (or defer for later)
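# Use per-slide URL for carousels so each slide gets a unique url_hash
# NOTE(review): this image-src fallback branch does not appear to advance carousel_index, so consecutive fallback slides may record the same img_index - confirm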
record_url = f"{post_url}?img_index={carousel_index + 1}" if has_carousel else post_url
if not skip_database or defer_database:
self._record_download(
media_id=media_id,
username=profile_name,
filename=filename,
url=record_url,
post_date=post_date,
file_path=str(filepath),
content_type='post',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download via image src: {e}", "error")
# Don't break here - might be a temporary issue with one image
if not has_carousel:
break
else:
self.log(f"No image src available as fallback", "debug")
# For carousels, we might still have more images after clicking next
if not has_carousel:
break
# Check for next image in carousel
if has_carousel and image_count < max_images:
next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
if next_btn.count() > 0 and next_btn.is_visible():
# Store current image src to detect when it changes
current_img_src = img_src if img_src else ""
self.log(f"Clicking next for carousel image {carousel_index}", "debug")
try:
next_btn.click(force=True)
except Exception:
self.log(f"Carousel next button click timed out at image {carousel_index}, stopping carousel", "warning")
break
# Wait for the image to change
time.sleep(2) # Give more time for slide transition and new image to load
else:
self.log("No more carousel images", "debug")
break
else:
break
else:
# Single image - download from post page using download button
download_url = None
webp_fallback_url = None
download_selectors = [
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg
'a.btn[href*="scontent"][href*=".mp4"]', # Video
'a.btn[href*="scontent"]', # Any scontent
'a[download][href*=".jpg"]',
'a[download][href*=".mp4"]',
'a.download',
'a[href*="/post"]'
]
for selector in download_selectors:
btn = page.locator(selector).first
if btn.count() > 0:
temp_url = btn.get_attribute('href')
if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)':
if not temp_url.startswith('http'):
temp_url = f"https://imginn.com{temp_url}"
# Store .webp as fallback, but keep looking for better
if '.webp' in temp_url.lower():
if not webp_fallback_url:
webp_fallback_url = temp_url
self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
continue
# Found non-.webp link, use it
download_url = temp_url
self.log(f"Found high-res download for single image: {download_url[:80]}...", "debug")
break
# Use .webp fallback if no high-res found
if not download_url and webp_fallback_url:
download_url = webp_fallback_url
self.log(f"Using .webp fallback for single image", "info")
if download_url:
try:
import requests
from urllib.parse import urlparse, unquote
response = requests.get(download_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename and media ID from the actual file
url_path = urlparse(download_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
# The media ID is the filename without extension
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename
filename = f"{profile_name}_{date_str}_{actual_media_id}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded (high-res): {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
existing_path = PathLib(existing['file_path'])
if existing_path.exists():
self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
try:
filepath.unlink()
self.log(f"Deleted duplicate: {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count = 1
# Add to tracking
self.downloaded_files.add(actual_media_id)
# Mark in database (or defer for later)
if not skip_database or defer_database:
self._record_download(
media_id=actual_media_id,
username=profile_name,
filename=filename,
url=post_url,
post_date=post_date,
file_path=str(filepath),
content_type='post',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download single image: {e}", "warning")
else:
# No download button found - try video/image src as fallback
self.log("No download button found, trying video/image src fallback", "debug")
media_src = None
# Try video first - multiple selectors for different page structures
video_selectors = [
'video source[src]',
'video[src]',
'video source[type*="mp4"]',
'.video-container video',
'.post-video video',
'div[class*="video"] video',
'video'
]
for v_selector in video_selectors:
video_elem = page.locator(v_selector).first
if video_elem.count() > 0:
# Try src attribute first, then check source child
media_src = video_elem.get_attribute('src')
if not media_src:
source_elem = video_elem.locator('source').first
if source_elem.count() > 0:
media_src = source_elem.get_attribute('src')
if media_src and media_src != '#':
self.log(f"Found video src via '{v_selector}': {media_src[:80]}...", "debug")
break
# If no video found, wait a bit and try again (videos may lazy-load)
if not media_src:
time.sleep(2)
for v_selector in video_selectors:
video_elem = page.locator(v_selector).first
if video_elem.count() > 0:
media_src = video_elem.get_attribute('src')
if not media_src:
source_elem = video_elem.locator('source').first
if source_elem.count() > 0:
media_src = source_elem.get_attribute('src')
if media_src and media_src != '#':
self.log(f"Found video src after wait via '{v_selector}': {media_src[:80]}...", "debug")
break
# Try image if no video
if not media_src:
img_elem = page.locator('img[src*="scontent"]:not([src*="profile"]), img[src*="post"]').first
if img_elem.count() > 0:
media_src = img_elem.get_attribute('src')
if media_src and 'lazy.jpg' not in media_src:
self.log(f"Found image src: {media_src[:80]}...", "debug")
else:
media_src = None
if media_src:
try:
import requests
from urllib.parse import urlparse, unquote
if not media_src.startswith('http'):
media_src = f"https://imginn.com{media_src}"
response = requests.get(media_src, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename from URL
url_path = urlparse(media_src).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.mp4'
filename = f"{profile_name}_{date_str}_{actual_media_id}{ext}"
filepath = output_dir / filename
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded (fallback): {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count = 1
self.downloaded_files.add(actual_media_id)
if not skip_database or defer_database:
self._record_download(
media_id=actual_media_id,
username=profile_name,
filename=filename,
url=post_url,
post_date=post_date,
file_path=str(filepath),
content_type='post',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download via fallback: {e}", "error")
else:
self.log("No download button or media src found for single post", "warning")
# Debug: capture screenshot and page content when download fails
try:
debug_dir = Path("debug")
debug_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
screenshot_path = debug_dir / f"no_media_{media_id}_{timestamp}.png"
page.screenshot(path=str(screenshot_path))
self.log(f"Debug screenshot saved: {screenshot_path}", "debug")
# Also log page title and some content
title = page.title()
self.log(f"Page title: {title}", "debug")
# Check if this is a Cloudflare block - don't mark as processed if so
if self._is_cloudflare_challenge(page):
self.log(f"Cloudflare block detected - NOT marking {media_id} as processed (will retry later)", "warning")
# Skip to next post without marking as processed
try:
page.goto(f"https://imginn.com/{username}/?ref=index")
time.sleep(3)
except Exception:
pass
continue
except Exception as e:
self.log(f"Failed to capture debug screenshot: {e}", "debug")
# Mark post as processed in database even if no downloads
# (might be already downloaded or failed - but NOT if Cloudflare blocked)
if image_count == 0:
# Still mark the post URL as processed to avoid re-checking
self._record_download(
media_id=media_id,
username=profile_name,
filename=f"{media_id}_skipped",
url=post_url,
post_date=post_date,
content_type='post',
metadata={'marker': True, 'reason': 'skipped'}
)
# Go back to profile
self._safe_go_back(page, username)
# If we just bypassed Cloudflare, wait longer to let session stabilize
if cloudflare_bypassed:
cooldown = random.uniform(15, 25)
self.log(f"Post-bypass cooldown: waiting {cooldown:.1f}s to stabilize session", "info")
time.sleep(cooldown)
else:
time.sleep(random.uniform(1, 3))
# Check if back on profile
if username not in page.url:
page.goto(f"https://imginn.com/{username}/?ref=index")
time.sleep(3)
except Exception as e:
self.log(f"Error processing post: {e}", "error")
try:
page.goto(f"https://imginn.com/{username}/?ref=index")
time.sleep(3)
except Exception:
pass
self.log(f"Downloaded {len(downloaded_files)} files", "info")
except Exception as e:
self.log(f"Error: {e}", "error")
# Don't close browser here - reuse it for next profile
# Call _stop_browser() explicitly when done with all profiles
return downloaded_files
def download_tagged(self, username: str, days_back: int = 14, max_posts: int = 50, output_dir: Path = None, phrase_config: dict = None, defer_database: bool = False):
"""Download tagged posts from a user
Args:
username: Instagram username
days_back: How many days back to search
max_posts: Maximum posts to check
output_dir: Output directory
phrase_config: Optional phrase search configuration
defer_database: If True, defer database recording to pending_downloads list
for later recording after file move is complete
"""
# Rate limiting to avoid Cloudflare blocks
self._enforce_rate_limit("tagged")
profile_name = username.lower()
if output_dir is None:
output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
output_dir.mkdir(parents=True, exist_ok=True)
# Check site status before doing anything else
self.log("Checking ImgInn site status...", "debug")
site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10)
if self.cf_handler.should_skip_download(site_status):
self.log(f"Skipping tagged download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning")
return []
elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
self.log("Cloudflare challenge detected, will attempt bypass during download", "info")
# Scan existing files
self._scan_existing_files(output_dir, profile_name)
# Get processed posts from database
processed_posts = self._get_processed_posts(profile_name)
self.log(f"Loaded {len(processed_posts)} processed tagged posts for {profile_name} from database", "info")
downloaded_files = []
cutoff_date = datetime.now() - timedelta(days=days_back)
# Start or reuse browser
self._start_browser()
page = self.page
try:
# Navigate to tagged page directly
self.log(f"Navigating to @{username} tagged posts page", "info")
page.goto(f"https://imginn.com/tagged/{username}/?ref=index", wait_until='domcontentloaded')
# CRITICAL: Wait for Cloudflare background JS challenges
import random
wait_time = 5 + random.uniform(0, 2)
self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
time.sleep(wait_time)
# Wait for page to load
if not self.wait_for_cloudflare(page):
self._page_load_failures += 1
level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning"
self.log(f"Page didn't load properly ({self._page_load_failures}x this session)", level)
return []
# Save cookies
self.save_cookies(self.context)
# Wait for JavaScript to load posts (ImgInn loads posts dynamically on tagged page)
self.log("Waiting for tagged posts to load via JavaScript...", "info")
try:
# Wait for post links to appear (up to 10 seconds)
page.wait_for_selector('a[href*="/p/"]', timeout=10000)
self.log("Tagged posts loaded successfully", "info")
except Exception:
# Timeout - posts might not exist, or page structure changed
self.log("Timeout waiting for tagged posts to appear", "warning")
time.sleep(2) # Give it a bit more time anyway
# Scroll to load more posts (ImgInn uses infinite scroll on tagged page)
self.log("Scrolling to load more tagged posts...", "info")
previous_count = 0
scroll_attempts = 0
max_scroll_attempts = 10 # Scroll up to 10 times to load posts
while scroll_attempts < max_scroll_attempts:
# Get current count of post links
current_count = page.locator('a[href*="/p/"]').count()
if current_count == previous_count and scroll_attempts > 0:
# No new posts loaded after scroll, we've reached the end
self.log(f"No more tagged posts to load (total: {current_count})", "debug")
break
if current_count >= max_posts:
# We have enough posts
self.log(f"Loaded {current_count} tagged posts (reached max_posts limit)", "debug")
break
previous_count = current_count
# Scroll to bottom of page
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(1.5) # Wait for new posts to load
scroll_attempts += 1
self.log(f"Scroll {scroll_attempts}: Found {current_count} tagged posts", "debug")
# Find posts on tagged page
self.log("Finding tagged posts...", "info")
# Debug: Check what's actually on the page
page_content = page.content()
if 'no posts' in page_content.lower() or 'page not found' in page_content.lower():
self.log("Page shows 'no posts' or 'not found'", "warning")
post_links = page.locator('a[href*="/p/"]').all()
self.log(f"Found {len(post_links)} tagged posts", "info")
if not post_links:
# Debug: Save screenshot to see what's wrong
try:
screenshot_path = Path(f"/tmp/imginn_no_tagged_{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
page.screenshot(path=str(screenshot_path))
self.log(f"No tagged posts found - screenshot saved to {screenshot_path}", "warning")
except Exception:
pass
self.log("No tagged posts found", "warning")
return []
# Extract all post URLs upfront to avoid stale element issues
# (elements become stale after page.go_back())
post_urls = []
for idx, post_link in enumerate(post_links[:max_posts]):
try:
href = post_link.get_attribute('href', timeout=5000)
if href:
# Ensure full URL
if not href.startswith('http'):
href = f"https://imginn.com{href}"
post_urls.append(href)
except Exception as e:
self.log(f"Tagged {idx+1}: Failed to get URL: {str(e)[:50]}", "debug")
continue
self.log(f"Processing {len(post_urls)} tagged posts (max {max_posts})", "info")
# Track consecutive old posts to handle pinned posts
consecutive_old_posts = 0
max_consecutive_old_posts = 5 # Allow up to 5 old posts (pinned) before stopping
# Set initial progress so dashboard shows 0/N immediately
self.activity_manager.update_status(
"Downloading tagged",
progress_current=0,
progress_total=len(post_urls)
)
for i, post_url in enumerate(post_urls):
# Update progress at start of each iteration (fires even on skips)
self.activity_manager.update_status(
"Downloading tagged",
progress_current=i + 1,
progress_total=len(post_urls)
)
try:
# Extract media ID from URL
media_id = self._extract_media_id_from_url(post_url)
if not media_id:
self.log(f"Could not extract media ID from {post_url}", "warning")
continue
self.log(f"[{i+1}/{len(post_urls)}] Checking tagged post {media_id}", "debug")
# Check if already processed (either downloaded or checked for phrases/age)
if media_id in processed_posts or post_url in processed_posts:
self.log(f"Post {media_id} already processed, skipping", "debug")
continue
# Rate limiting between post downloads to avoid Cloudflare blocks
if i > 0:
post_delay = random.uniform(3, 8)
self.log(f"Rate limit: waiting {post_delay:.1f}s before tagged post {i+1}", "debug")
time.sleep(post_delay)
# For tagged posts, ALWAYS navigate to post page for high-res download
# (Never use profile download which gives low-res .webp)
page.goto(post_url, wait_until='domcontentloaded')
# Wait for page to load
time.sleep(2)
# Wait for navigation to complete
try:
page.wait_for_load_state('networkidle', timeout=5000)
except Exception:
# Continue even if network isn't idle - page might still be usable
self.log("Network didn't idle, but continuing", "debug")
# Check if on post page
if "/p/" not in page.url:
self.log(f"Not a downloadable post (URL: {page.url})", "warning")
self._safe_go_back(page, username, tagged=True)
continue
# IMPORTANT: Wait for post page content to fully render
# This ensures download buttons are from the POST PAGE, not tagged page preview
try:
# Wait for the post container to be visible
page.wait_for_selector('div.main-content, div.post, div.content, div.single-post', timeout=3000)
time.sleep(1) # Additional wait for download buttons to render
except Exception:
self.log("Post container not found, checking for Cloudflare...", "debug")
# Check for Cloudflare challenge and handle it
cloudflare_bypassed = False
if self._is_cloudflare_challenge(page):
self.log(f"Cloudflare challenge detected on tagged post {media_id}", "warning")
if not self._handle_cloudflare_on_post(page, post_url):
# Cloudflare bypass failed - skip this post WITHOUT marking as processed
# so it can be retried on next run
self.log(f"Skipping tagged post {media_id} due to Cloudflare block (will retry later)", "warning")
try:
page.goto(f"https://imginn.com/tagged/{username}/?ref=index")
time.sleep(3)
except Exception:
pass
continue
cloudflare_bypassed = True
self.log(f"Navigated to tagged post page: {page.url}", "debug")
self._dismiss_consent_dialog(page)
# Extract the actual poster's username (not the tagged user)
# On tagged pages, posts are FROM other users who tagged this user
poster_username = profile_name # Default to tagged user
try:
username_elem = page.locator('div.username a').first
if username_elem.count() > 0:
username_href = username_elem.get_attribute('href')
if username_href:
# Extract username from href like "/evalongoria.of/" -> "evalongoria.of"
poster_username = username_href.strip('/').lower()
self.log(f"Poster username: @{poster_username}", "debug")
except Exception as e:
self.log(f"Could not extract poster username, using default: {e}", "debug")
# Extract post date - ALWAYS extract for proper file naming
post_date = self._extract_post_date(page)
# Use post date for filename, or current date
if post_date:
date_str = post_date.strftime('%Y%m%d_%H%M%S')
self.log(f"Original post date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
else:
date_str = datetime.now().strftime('%Y%m%d_%H%M%S')
self.log(f"No original date found, using current time", "debug")
# Check date filter
if post_date and post_date < cutoff_date:
consecutive_old_posts += 1
self.log(f"Tagged post too old ({post_date.strftime('%Y-%m-%d')}), skipping (consecutive old: {consecutive_old_posts}/{max_consecutive_old_posts})", "info")
# Clean up temp file if exists
if 'temp_download_path' in locals() and temp_download_path and temp_download_path.exists():
temp_download_path.unlink()
self.log(f"Deleted temp file for old post", "debug")
# Mark this old post as checked in database - use poster_username for tagged content
if phrase_config and media_id:
self._record_download(
media_id=media_id,
username=poster_username,
filename=f"_old_post_{media_id}",
url=post_url,
post_date=post_date,
content_type='tagged',
metadata={'marker': True, 'reason': 'old_post'}
)
self._safe_go_back(page, username, tagged=True)
# Stop only after 5 consecutive old posts (handles pinned posts at top)
if consecutive_old_posts >= max_consecutive_old_posts:
self.log(f"Found {consecutive_old_posts} consecutive old tagged posts - stopping", "info")
break
else:
continue # Skip this old post but keep checking (might be pinned)
# Reset consecutive old posts counter - we found a post within date range
consecutive_old_posts = 0
# Check for phrase matching if configured
if phrase_config and phrase_config.get('enabled'):
if not self._check_post_phrases(page, phrase_config):
self.log(f"Tagged post does not match phrase criteria, skipping download", "info")
# Clean up temp file if exists
if 'temp_download_path' in locals() and temp_download_path and temp_download_path.exists():
temp_download_path.unlink()
self.log(f"Deleted temp file for non-matching post", "debug")
# Mark this post as checked (but not downloaded) in database - use poster_username
if media_id:
self._record_download(
media_id=media_id,
username=poster_username,
filename=f"_phrase_checked_{media_id}",
url=post_url,
post_date=post_date,
content_type='tagged',
metadata={'marker': True, 'reason': 'phrase_checked'}
)
self._safe_go_back(page, username, tagged=True)
continue
else:
self.log(f"Tagged post matches phrase criteria, using high-res download", "info")
# Check for carousel
carousel_next = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
has_carousel = carousel_next.count() > 0
if has_carousel:
self.log(f"Carousel detected in tagged post - will download all carousel images", "info")
self._dismiss_consent_dialog(page)
# CRITICAL: Wait for POST PAGE carousel download buttons to be ready
# This prevents downloading from the tagged page preview
try:
# Wait for post-page download buttons (a.btn with "scontent" in the href, a[download], or a.download)
page.wait_for_selector('a.btn[href*="scontent"], a[download], a.download', timeout=3000)
time.sleep(1.5) # Additional wait for all carousel images to load
self.log("Carousel download buttons ready on post page", "debug")
except Exception:
self.log("Download buttons not found, but continuing", "debug")
else:
self.log("Single image tagged post", "debug")
# Handle downloads - always use download buttons from post page
image_count = 0
max_images = 10
# Download images (carousel or single)
if has_carousel:
all_slides = page.locator('.swiper-slide').all()
self.log(f"Found {len(all_slides)} carousel slides in tagged post", "debug")
# Download each slide's image
for slide_index in range(min(len(all_slides), max_images)):
self.log(f"Processing carousel slide {slide_index + 1}/{len(all_slides)}", "debug")
# Get the current slide element to scope our searches
current_slide = all_slides[slide_index]
# Click next to navigate to this slide (except for first one)
if slide_index > 0:
next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
if next_btn.count() > 0 and next_btn.is_visible():
try:
next_btn.click(force=True)
except Exception:
self.log(f"Carousel next button click timed out at slide {slide_index + 1}, stopping carousel", "warning")
break
time.sleep(2) # Wait for slide transition and image to load
# Look for download button - prefer high-res, fallback to .webp
# IMPORTANT: Search the CURRENT SLIDE first; the whole page is only a fallback when the slide yields no non-.webp link
download_url = None
webp_fallback_url = None
slide_downloaded = False # Track if this specific slide was downloaded
download_selectors = [
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg
'a.btn[href*="scontent"][href*=".mp4"]', # Video
'a.btn[href*="scontent"]', # Any scontent
'a[download][href*=".jpg"]',
'a[download][href*=".mp4"]',
'a.download',
'a[href*="/post"]'
]
# Search for download buttons - first try within slide, then try page-level
# Imginn often has download buttons outside the .swiper-slide elements
search_contexts = [current_slide, page]
for search_context in search_contexts:
if download_url: # Already found, skip other contexts
break
for selector in download_selectors:
btn = search_context.locator(selector).first
if btn.count() > 0:
temp_url = btn.get_attribute('href')
if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)':
if not temp_url.startswith('http'):
temp_url = f"https://imginn.com{temp_url}"
# Store .webp as fallback, but keep looking for better
if '.webp' in temp_url.lower():
if not webp_fallback_url:
webp_fallback_url = temp_url
self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
continue
# Found non-.webp link, use it
download_url = temp_url
self.log(f"Found high-res download for carousel slide {slide_index + 1}: {download_url[:80]}...", "debug")
break
# Use .webp fallback if no high-res found
used_webp_fallback = False
if not download_url and webp_fallback_url:
download_url = webp_fallback_url
used_webp_fallback = True
self.log(f"Using .webp fallback for carousel slide {slide_index + 1}", "info")
# If we found a download button, use it for high-res
if download_url:
try:
import requests
from urllib.parse import urlparse, unquote
response = requests.get(download_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename and media ID from the actual file
url_path = urlparse(download_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
# The media ID is the filename without extension
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename for carousel image using actual media ID (use poster's username)
filename = f"{poster_username}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
# Log with appropriate quality label
quality_label = "fallback" if used_webp_fallback else "high-res"
self.log(f"Downloaded tagged ({quality_label}): {filename} from @{poster_username} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
# Check for duplicate hash (hash blacklist persists even if original deleted)
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
# Delete the duplicate regardless of whether original file still exists
try:
filepath.unlink()
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count += 1
slide_downloaded = True # Mark this slide as successfully downloaded
# Add to tracking
self.downloaded_files.add(actual_media_id)
# Mark in database (or defer for later) - use poster_username for tagged content
unique_url = f"{post_url}#{filename}"
self._record_download(
media_id=actual_media_id,
username=poster_username,
filename=filename,
url=unique_url,
post_date=post_date,
file_path=str(filepath),
content_type='tagged',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download carousel image {slide_index + 1}: {e}", "error")
# Don't continue - try fallback method below
# Fallback: Download from current slide's img/video src if no download button worked
if not slide_downloaded:
self.log(f"Trying fallback: downloading from slide {slide_index + 1} media src", "debug")
# current_slide already defined at top of loop
# Try img first, then video
media_src = None
slide_img = current_slide.locator('img').first
if slide_img.count() > 0:
media_src = slide_img.get_attribute('src')
else:
# Check for video tag
slide_video = current_slide.locator('video source, video').first
if slide_video.count() > 0:
media_src = slide_video.get_attribute('src')
self.log(f"Found video for slide {slide_index + 1}", "debug")
if media_src:
# Skip lazy placeholders
if 'lazy.jpg' not in media_src and '483011604' not in media_src:
try:
import requests
from urllib.parse import urlparse, unquote
if not media_src.startswith('http'):
media_src = f"https:{media_src}" if media_src.startswith('//') else f"https://imginn.com{media_src}"
response = requests.get(media_src, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename
url_path = urlparse(media_src).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename
filename = f"{poster_username}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded tagged (fallback): {filename} from @{poster_username} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
existing_path = PathLib(existing['file_path'])
if existing_path.exists():
self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
try:
filepath.unlink()
self.log(f"Deleted duplicate: {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count += 1
# Add to tracking
self.downloaded_files.add(actual_media_id)
# Mark in database (or defer for later) - use poster_username for tagged content
unique_url = f"{post_url}#{filename}"
self._record_download(
media_id=actual_media_id,
username=poster_username,
filename=filename,
url=unique_url,
post_date=post_date,
file_path=str(filepath),
content_type='tagged',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download from media src for slide {slide_index + 1}: {e}", "error")
else:
self.log(f"No media (img/video) found for carousel slide {slide_index + 1}", "warning")
else:
# Single image - download from post page using download button
download_url = None
webp_fallback_url = None
download_selectors = [
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg
'a.btn[href*="scontent"][href*=".mp4"]', # Video
'a.btn[href*="scontent"]', # Any scontent
'a[download][href*=".jpg"]',
'a[download][href*=".mp4"]',
'a.download',
'a[href*="/post"]'
]
for selector in download_selectors:
btn = page.locator(selector).first
if btn.count() > 0:
temp_url = btn.get_attribute('href')
if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)':
if not temp_url.startswith('http'):
temp_url = f"https://imginn.com{temp_url}"
# Store .webp as fallback, but keep looking for better
if '.webp' in temp_url.lower():
if not webp_fallback_url:
webp_fallback_url = temp_url
self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
continue
# Found non-.webp link, use it
download_url = temp_url
self.log(f"Found high-res download for single image: {download_url[:80]}...", "debug")
break
# Use .webp fallback if no high-res found
if not download_url and webp_fallback_url:
download_url = webp_fallback_url
self.log(f"Using .webp fallback for single image", "info")
if download_url:
try:
import requests
from urllib.parse import urlparse, unquote
response = requests.get(download_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Extract filename and media ID from the actual file
url_path = urlparse(download_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
# The media ID is the filename without extension
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.jpg'
# Build filename using poster's username
filename = f"{poster_username}_{date_str}_{actual_media_id}{ext}"
filepath = output_dir / filename
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded tagged (high-res): {filename} from @{poster_username} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
# Check for duplicate hash before recording
if self.unified_db:
from pathlib import Path as PathLib
file_hash = self.unified_db.get_file_hash(str(filepath))
if file_hash:
existing = self.unified_db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
existing_path = PathLib(existing['file_path'])
if existing_path.exists():
self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
try:
filepath.unlink()
self.log(f"Deleted duplicate: {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count = 1
# Add to tracking
self.downloaded_files.add(actual_media_id)
# Mark in database (or defer for later) - use poster_username for tagged content
self._record_download(
media_id=actual_media_id,
username=poster_username,
filename=filename,
url=post_url,
post_date=post_date,
file_path=str(filepath),
content_type='tagged',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download single image: {e}", "warning")
else:
# No download button found - try video/image src as fallback
self.log("No download button found, trying video/image src fallback", "debug")
media_src = None
# Try video first - multiple selectors for different page structures
video_selectors = [
'video source[src]',
'video[src]',
'video source[type*="mp4"]',
'.video-container video',
'.post-video video',
'div[class*="video"] video',
'video'
]
for v_selector in video_selectors:
video_elem = page.locator(v_selector).first
if video_elem.count() > 0:
# Try src attribute first, then check source child
media_src = video_elem.get_attribute('src')
if not media_src:
source_elem = video_elem.locator('source').first
if source_elem.count() > 0:
media_src = source_elem.get_attribute('src')
if media_src and media_src != '#':
self.log(f"Found video src via '{v_selector}': {media_src[:80]}...", "debug")
break
# If no video found, wait a bit and try again (videos may lazy-load)
if not media_src:
time.sleep(2)
for v_selector in video_selectors:
video_elem = page.locator(v_selector).first
if video_elem.count() > 0:
media_src = video_elem.get_attribute('src')
if not media_src:
source_elem = video_elem.locator('source').first
if source_elem.count() > 0:
media_src = source_elem.get_attribute('src')
if media_src and media_src != '#':
self.log(f"Found video src after wait via '{v_selector}': {media_src[:80]}...", "debug")
break
# Try image if no video
if not media_src:
img_elem = page.locator('img[src*="scontent"]:not([src*="profile"]), img[src*="post"]').first
if img_elem.count() > 0:
media_src = img_elem.get_attribute('src')
if media_src and 'lazy.jpg' not in media_src:
self.log(f"Found image src: {media_src[:80]}...", "debug")
else:
media_src = None
if media_src:
try:
import requests
from urllib.parse import urlparse, unquote
if not media_src.startswith('http'):
media_src = f"https://imginn.com{media_src}"
response = requests.get(media_src, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': 'https://imginn.com/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
url_path = urlparse(media_src).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
if original_name.startswith('post'):
original_name = original_name[4:]
actual_media_id = Path(original_name).stem
ext = Path(original_name).suffix or '.mp4'
filename = f"{poster_username}_{date_str}_{actual_media_id}{ext}"
filepath = output_dir / filename
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded (fallback): {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
if post_date:
self._update_file_timestamps(filepath, post_date)
image_count = 1
self.downloaded_files.add(actual_media_id)
self._record_download(
media_id=actual_media_id,
username=poster_username,
filename=filename,
url=post_url,
post_date=post_date,
file_path=str(filepath),
content_type='tagged',
deferred=defer_database
)
except Exception as e:
self.log(f"Failed to download via fallback: {e}", "error")
else:
self.log("No download button or media src found for single post", "warning")
# Debug: capture screenshot and page content when download fails
try:
debug_dir = Path("debug")
debug_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
screenshot_path = debug_dir / f"no_media_tagged_{media_id}_{timestamp}.png"
page.screenshot(path=str(screenshot_path))
self.log(f"Debug screenshot saved: {screenshot_path}", "debug")
# Also log page title
title = page.title()
self.log(f"Page title: {title}", "debug")
# Check if this is a Cloudflare block - don't mark as processed if so
if self._is_cloudflare_challenge(page):
self.log(f"Cloudflare block detected - NOT marking tagged post {media_id} as processed (will retry later)", "warning")
# Skip to next post without marking as processed
try:
page.goto(f"https://imginn.com/tagged/{username}/?ref=index")
time.sleep(3)
except Exception:
pass
continue
except Exception as e:
self.log(f"Failed to capture debug screenshot: {e}", "debug")
# Navigate back to tagged page
if image_count > 0:
self.log(f"Successfully downloaded {image_count} image(s) from tagged post {media_id}", "info")
self._safe_go_back(page, username, tagged=True)
# If we just bypassed Cloudflare, wait longer to let session stabilize
if cloudflare_bypassed:
cooldown = random.uniform(15, 25)
self.log(f"Post-bypass cooldown: waiting {cooldown:.1f}s to stabilize session", "info")
time.sleep(cooldown)
else:
time.sleep(1)
except KeyboardInterrupt:
self.log("Download interrupted by user", "warning")
break
except Exception as e:
self.log(f"Error processing tagged post: {e}", "error")
self._safe_go_back(page, username, tagged=True)
self.log(f"Downloaded {len(downloaded_files)} tagged files", "info")
except Exception as e:
self.log(f"Error: {e}", "error")
# Don't close browser here - reuse it for next profile
return downloaded_files
def download_stories(self, username: str, days_back: int = 1, max_stories: int = 50, output_dir: Path = None, skip_database: bool = False, defer_database: bool = False) -> list:
    """Download the currently visible stories of a user with FastDL naming.

    Files are saved as
    ``{profile}_{YYYYMMDD_HHMMSS}_{media_id}_story{index}{ext}`` where the
    timestamp is the download time (not the story's publication time).

    Args:
        username: Instagram username
        days_back: Accepted for signature compatibility with the other
            download methods; this method does not filter by date because
            only the stories currently listed by ImgInn can be fetched.
        max_stories: Maximum number of story links to process
        output_dir: Output directory (defaults to
            /opt/media-downloader/downloads/{profile})
        skip_database: If True, don't record downloads in database (for temporary processing)
        defer_database: If True, defer database recording to pending_downloads list
            for later recording after file move is complete

    Returns:
        List of file paths (as strings) of the stories that were saved and
        kept. Files removed as hash duplicates are not included.
    """
    from urllib.parse import urlparse, unquote

    profile_name = username.lower()
    if output_dir is None:
        output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
    # Tolerate callers handing in a plain string path
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Check site status before doing anything else
    self.log("Checking ImgInn site status...", "debug")
    site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10)
    if self.cf_handler.should_skip_download(site_status):
        self.log(f"Skipping stories download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning")
        return []
    elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
        self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

    # Scan existing files
    self._scan_existing_files(output_dir, profile_name)

    # Get processed stories from database
    processed_stories = self._get_processed_posts(profile_name)
    self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info")

    downloaded_files = []

    # Start or reuse browser
    self._start_browser()
    page = self.page

    try:
        # Navigate to stories page
        self.log(f"Navigating to @{username} stories page", "info")
        page.goto(f"https://imginn.com/stories/{username}/?ref=index", wait_until='domcontentloaded')

        # CRITICAL: Wait for Cloudflare background JS challenges
        wait_time = 5 + random.uniform(0, 2)
        self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
        time.sleep(wait_time)

        # Wait for page to load
        if not self.wait_for_cloudflare(page):
            self.log("Stories page didn't load properly", "error")
            return []

        # Save cookies
        self.save_cookies(self.context)

        # Wait for stories container to load
        self.log("Waiting for stories to load...", "info")
        try:
            page.wait_for_selector('.swiper-container.reels', timeout=10000)
            self.log("Stories container loaded", "info")
        except Exception:
            self.log("No stories found - may have expired or page structure changed", "warning")
            return []

        # Find the Stories reel (first li.reel with data-uid and title "Stories")
        self.log("Looking for Stories reel...", "info")
        stories_reel = None
        for reel in page.locator('li.reel[data-uid]').all():
            try:
                # Check if this is the "Stories" reel
                title = reel.locator('.title').first.text_content()
                if title and title.strip().lower() == "stories":
                    stories_reel = reel
                    self.log(f"Found Stories reel", "info")
                    break
            except Exception:
                # Best effort: a reel without a readable title is simply not ours
                continue

        if not stories_reel:
            self.log("No active Stories found for this user", "warning")
            return []

        # Click the Stories reel to open viewer
        self.log("Opening Stories viewer...", "info")
        stories_reel.click()
        time.sleep(2)  # Wait for viewer to open

        # Find all download buttons in the story viewer
        self.log("Finding story download links...", "info")
        download_links = page.locator('div.action a.download').all()
        if not download_links:
            self.log("No story download links found", "warning")
            return []
        self.log(f"Found {len(download_links)} stories", "info")

        # Set initial progress so dashboard shows 0/N immediately
        stories_to_download = min(len(download_links), max_stories)
        self.activity_manager.update_status(
            "Downloading stories",
            progress_current=0,
            progress_total=stories_to_download
        )

        # Download each story
        story_index = 1
        for i, download_link in enumerate(download_links[:max_stories]):
            # Update progress at start of each iteration (fires even on skips)
            self.activity_manager.update_status(
                "Downloading stories",
                progress_current=i + 1,
                progress_total=stories_to_download
            )
            try:
                # Get download URL
                download_url = download_link.get_attribute('href')
                if not download_url or download_url == '#':
                    self.log(f"Story {story_index}: Invalid download URL", "warning")
                    continue
                self.log(f"Story {story_index}: {download_url[:80]}...", "debug")

                # Extract media ID from URL or generate unique ID
                url_path = urlparse(download_url).path
                original_name = unquote(url_path.split('/')[-1].split('?')[0])
                media_id_full = Path(original_name).stem  # Full filename stem for unique naming
                ext = Path(original_name).suffix or '.jpg'

                # Extract real Instagram media ID (18-digit number) for duplicate checking
                media_id_for_tracking = extract_instagram_media_id(media_id_full)
                self.log(f"Story {story_index}: Full ID: {media_id_full[:40]}..., Tracking ID: {media_id_for_tracking}", "debug")

                # Check if already downloaded using the normalized media ID
                if media_id_for_tracking in self.downloaded_files or media_id_for_tracking in processed_stories:
                    self.log(f"Story {story_index}: Already downloaded (tracking ID: {media_id_for_tracking}), skipping", "debug")
                    story_index += 1
                    continue

                # Also check with full ID for backwards compatibility
                if media_id_full in self.downloaded_files or media_id_full in processed_stories:
                    self.log(f"Story {story_index}: Already downloaded (full ID: {media_id_full[:30]}...), skipping", "debug")
                    story_index += 1
                    continue

                # Use current date for stories (they expire after 24h)
                story_date = datetime.now()
                date_str = story_date.strftime('%Y%m%d_%H%M%S')

                # Build filename: {profile}_{date}_{media_id}_story{index}{ext}
                # Use full media ID in filename for uniqueness
                filename = f"{profile_name}_{date_str}_{media_id_full}_story{story_index}{ext}"
                filepath = output_dir / filename

                # Download the story
                try:
                    import requests
                    response = requests.get(download_url, timeout=30, headers={
                        'User-Agent': self.user_agent,
                        'Referer': 'https://imginn.com/'
                    }, cookies=self._get_cookies_for_requests())
                    response.raise_for_status()

                    # Save file
                    with open(filepath, 'wb') as f:
                        f.write(response.content)
                    self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info")

                    # Check for duplicate hash before reporting/recording the file.
                    # The file has to exist on disk first because the hash is
                    # computed from the saved path.
                    duplicate_removed = False
                    if self.unified_db:
                        file_hash = self.unified_db.get_file_hash(str(filepath))
                        if file_hash:
                            existing = self.unified_db.get_download_by_file_hash(file_hash)
                            if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
                                # Only treat it as a duplicate while the original is still on disk
                                if Path(existing['file_path']).exists():
                                    self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                                    try:
                                        filepath.unlink()
                                        self.log(f"Deleted duplicate: {filename}", "debug")
                                        duplicate_removed = True
                                    except OSError as e:
                                        # Could not remove it: keep and record the file below
                                        self.log(f"Failed to delete duplicate {filename}: {e}", "warning")

                    if duplicate_removed:
                        # The file no longer exists, so it must not be returned
                        # to the caller; the story still consumes its index.
                        story_index += 1
                        continue

                    # Only files that are still on disk are reported as downloaded
                    downloaded_files.append(str(filepath))

                    # Update timestamps
                    self._update_file_timestamps(filepath, story_date)

                    # Add both tracking ID and full ID to tracking set for comprehensive duplicate prevention
                    self.downloaded_files.add(media_id_for_tracking)
                    self.downloaded_files.add(media_id_full)

                    # Mark in database with media_id in metadata (or defer for later)
                    # Use the normalized media ID for database tracking to prevent future duplicates
                    if not skip_database or defer_database:
                        self._record_download(
                            media_id=media_id_for_tracking,
                            username=profile_name,
                            filename=filename,
                            url=download_url,
                            post_date=story_date,
                            file_path=str(filepath),
                            content_type='stories',
                            metadata={'media_id_full': media_id_full},
                            deferred=defer_database
                        )
                    story_index += 1
                except Exception as e:
                    self.log(f"Failed to download story {story_index}: {e}", "error")
                    story_index += 1
                    continue
            except Exception as e:
                self.log(f"Error processing story {story_index}: {e}", "error")
                story_index += 1
                continue

        self.log(f"Downloaded {len(downloaded_files)} story files", "info")
    except Exception as e:
        self.log(f"Error downloading stories: {e}", "error")

    # Don't close browser here - reuse it for next profile
    return downloaded_files
def main():
    """Test the downloader with FastDL naming.

    Manual smoke test: downloads posts for a fixed test profile and prints
    the resulting files. If the last command-line argument contains
    ``imginn.com/p/``, that single post URL is downloaded instead of the
    last two weeks of posts.
    """
    import sys

    # Fixed profile used by this manual test harness
    test_username = "evalongoria"

    print("=" * 60)
    print("ImgInn Downloader - FastDL Compatible Naming")
    print("=" * 60)
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # ImgInnDownloader.__init__ takes no api_key parameter; passing one
    # raised TypeError before any download could start.
    downloader = ImgInnDownloader(
        headless=False  # Use with xvfb
    )

    # Check for specific post URL in arguments
    if len(sys.argv) > 1 and 'imginn.com/p/' in sys.argv[-1]:
        # Download specific post without date filter
        files = downloader.download_posts(
            username=test_username,
            days_back=365,  # Use large value to bypass date filter
            max_posts=5,
            specific_post_url=sys.argv[-1]
        )
    else:
        # Download posts from last 2 weeks
        files = downloader.download_posts(
            username=test_username,
            days_back=14,
            max_posts=50
        )

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    if files:
        print(f"Successfully downloaded {len(files)} files!")
        print("\n📁 Downloaded files (FastDL naming format):")
        for f in files:
            name = Path(f).name
            size = Path(f).stat().st_size / 1024
            # Show the naming format: profile, date, time, then media ID + extension
            parts = name.split('_', 3)
            if len(parts) >= 4:
                print(f" - {name}")
                print(f" Profile: {parts[0]}")
                print(f" Date: {parts[1]}_{parts[2]}")
                print(f" Media ID: {parts[3].split('.')[0]}")
                print(f" Size: {size:.1f} KB")
    else:
        print("No files downloaded")

    # Check total in folder
    download_dir = Path(f"/opt/media-downloader/downloads/{test_username}")
    if download_dir.exists():
        all_files = list(download_dir.glob("*"))
        total_size = sum(f.stat().st_size for f in all_files) / 1024
        print(f"\n📊 Total in folder: {len(all_files)} files ({total_size:.1f} KB)")


# Run the manual test only when executed as a script, not on import
if __name__ == "__main__":
    main()