3775 lines
200 KiB
Python
3775 lines
200 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
ImgInn downloader module with FastDL-compatible file naming
|
|
Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import time
|
|
import random
|
|
import re
|
|
import subprocess
|
|
import platform
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from modules.base_module import LoggingMixin
|
|
from modules.cloudflare_handler import (
|
|
CloudflareHandler, SiteStatus, get_flaresolverr_user_agent,
|
|
get_playwright_context_options, get_playwright_stealth_scripts
|
|
)
|
|
from modules.instagram_utils import (
|
|
extract_instagram_media_id,
|
|
scan_existing_files_for_media_ids,
|
|
record_instagram_download,
|
|
is_instagram_downloaded
|
|
)
|
|
|
|
from typing import Dict, Optional
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
|
|
class ImgInnDownloader(LoggingMixin):
|
|
"""ImgInn downloader with FastDL-compatible naming"""
|
|
|
|
def __init__(self,
             headless: bool = True,
             cookie_file: str = "/opt/media-downloader/cookies/imginn_cookies.json",
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None,
             ):
    """Set up downloader state, scraper configuration and the Cloudflare handler.

    Args:
        headless: Passed to the Chromium launch in ``_start_browser``.
        cookie_file: Path of the JSON cookie file; only used when no
            ``unified_db`` is supplied.
        show_progress: Stored on the instance as ``show_progress``.
        use_database: Whether downloads are tracked in ``unified_db``.
            Forced to False when ``unified_db`` is not supplied.
        log_callback: Forwarded to ``_init_logger``.
        unified_db: Database object used for scraper config, cookies and
            download records.
    """
    # Logging comes from the mixin and must be ready before any self.log call
    self._init_logger('Instagram', log_callback, default_module='Download')

    # Plain option and bookkeeping state
    self.headless = headless
    self.show_progress = show_progress
    self.scraper_id = 'imginn'  # Scraper ID in database
    self.downloaded_files = set()  # Media IDs already downloaded
    self.pending_downloads = []  # Downloads awaiting deferred database recording
    self.download_count = 0

    # Rate limiting between scrape types (Cloudflare avoidance)
    self._last_scrape_time = None
    self._min_scrape_interval = 15  # seconds

    # Transient page load failures per session
    self._page_load_failures = 0
    self._page_load_failure_threshold = 5  # Escalate to error after this many

    # Browser objects, created lazily and reused across profiles
    self.playwright = None
    self.browser = None
    self.context = None
    self.page = None

    # The database is only kept when it was supplied AND its use is enabled
    database_active = bool(unified_db and use_database)
    self.unified_db = unified_db if database_active else None
    self.use_database = use_database if database_active else False

    # Activity status manager for real-time updates (gets the raw argument)
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Scraper configuration: proxy from the database, or a cookie file
    # when no database object was passed in at all
    self.proxy_url = None
    self.cookie_file = None
    if unified_db:
        config = unified_db.get_scraper(self.scraper_id)
        if config and config.get('proxy_enabled') and config.get('proxy_url'):
            self.proxy_url = config['proxy_url']
            self.log(f"Using proxy: {self.proxy_url}", "info")
    else:
        self.cookie_file = Path(cookie_file)
        self.cookie_file.parent.mkdir(parents=True, exist_ok=True)

    # User-Agent is fetched from the FlareSolverr helper
    self.user_agent = get_flaresolverr_user_agent()

    # Cloudflare handler; cookie_file=None means no file path is handed over
    handler_cookie_path = str(self.cookie_file) if self.cookie_file else None
    self.cf_handler = CloudflareHandler(
        module_name="ImgInn",
        cookie_file=handler_cookie_path,
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,
        proxy_url=self.proxy_url,
    )

    # Mirrored attributes kept for backwards compatibility
    self.flaresolverr_url = self.cf_handler.flaresolverr_url
    self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

    # Seed the handler with cookies stored in the database, if any
    self._load_cookies_from_db()
|
|
|
|
def _load_cookies_from_db(self):
|
|
"""Load cookies from database if available"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
# Load into CloudflareHandler
|
|
self.cf_handler._cookies = cookies
|
|
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
def _save_cookies_to_db(self, cookies: list, user_agent: str = None):
|
|
"""Save cookies to database
|
|
|
|
Args:
|
|
cookies: List of cookie dictionaries
|
|
user_agent: User agent to associate with cookies (important for cf_clearance).
|
|
If not provided, uses self.user_agent as fallback.
|
|
"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
# Use provided user_agent or fall back to self.user_agent
|
|
ua = user_agent or self.user_agent
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=ua,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
def _cookies_expired(self):
|
|
"""Check if cookies are expired - delegates to CloudflareHandler"""
|
|
return self.cf_handler.cookies_expired()
|
|
|
|
def _get_cookies_for_requests(self):
|
|
"""Get cookies in format for requests library - delegates to CloudflareHandler"""
|
|
return self.cf_handler.get_cookies_dict()
|
|
|
|
def _get_cookies_via_flaresolverr(self, url="https://imginn.com/", max_retries=2):
|
|
"""Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler
|
|
|
|
Args:
|
|
url: URL to fetch
|
|
max_retries: Maximum number of retry attempts (default: 2)
|
|
|
|
Returns:
|
|
True if cookies obtained successfully, False otherwise
|
|
"""
|
|
success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
|
|
|
|
# Save cookies to database if successful
|
|
if success and self.unified_db:
|
|
cookies_list = self.cf_handler.get_cookies_list()
|
|
if cookies_list:
|
|
# CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
|
|
# cf_clearance cookies are fingerprinted to the browser that solved the challenge
|
|
flaresolverr_ua = self.cf_handler.get_user_agent()
|
|
self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
|
|
|
|
return success
|
|
|
|
def _enforce_rate_limit(self, scrape_type: str = "scrape"):
|
|
"""Enforce rate limiting between scrape operations to avoid Cloudflare blocks.
|
|
|
|
Args:
|
|
scrape_type: Type of scrape (posts, stories, tagged) for logging
|
|
"""
|
|
import random
|
|
|
|
if self._last_scrape_time is not None:
|
|
elapsed = time.time() - self._last_scrape_time
|
|
if elapsed < self._min_scrape_interval:
|
|
# Add random jitter (5-15 seconds) to the delay
|
|
jitter = random.uniform(5, 15)
|
|
wait_time = self._min_scrape_interval - elapsed + jitter
|
|
self.log(f"Rate limiting: waiting {wait_time:.1f}s before {scrape_type} (Cloudflare avoidance)", "info")
|
|
time.sleep(wait_time)
|
|
|
|
self._last_scrape_time = time.time()
|
|
|
|
def _has_valid_cookies(self):
|
|
"""Check if we have valid cookies (either in file or database)"""
|
|
if self.unified_db:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
return cookies and len(cookies) > 0
|
|
elif self.cookie_file:
|
|
return self.cookie_file.exists()
|
|
return False
|
|
|
|
def _start_browser(self):
|
|
"""Start browser if not already running (reusable across profiles)"""
|
|
# Try to get fresh cookies via FlareSolverr if we don't have them or they're old
|
|
# Do this BEFORE the browser reuse check so cookies are always checked
|
|
if not self._has_valid_cookies() or self._cookies_expired():
|
|
self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info")
|
|
if self._get_cookies_via_flaresolverr():
|
|
self.log("Successfully got fresh cookies from FlareSolverr", "info")
|
|
else:
|
|
self.log("FlareSolverr unavailable, will try with Playwright", "warning")
|
|
|
|
if self.browser is not None:
|
|
self.log("Browser already running, reusing...", "debug")
|
|
return
|
|
|
|
import os
|
|
# Use environment variable if set, otherwise use standard location
|
|
if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ:
|
|
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
|
|
os.environ['DISPLAY'] = ':100' # Use Xvfb virtual display
|
|
|
|
self.log("Starting browser (Chromium)...", "info")
|
|
self.playwright = sync_playwright().start()
|
|
|
|
self.browser = self.playwright.chromium.launch(
|
|
headless=self.headless,
|
|
args=[
|
|
'--disable-blink-features=AutomationControlled',
|
|
'--disable-dev-shm-usage',
|
|
'--no-sandbox',
|
|
'--disable-setuid-sandbox',
|
|
'--disable-infobars',
|
|
'--disable-background-timer-throttling',
|
|
'--disable-backgrounding-occluded-windows',
|
|
'--disable-renderer-backgrounding'
|
|
]
|
|
)
|
|
|
|
# CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work
|
|
# Get dynamic fingerprint settings from FlareSolverr
|
|
context_options = get_playwright_context_options()
|
|
|
|
# IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
|
|
# Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
|
|
try:
|
|
stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
|
|
if stored_user_agent:
|
|
self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug")
|
|
context_options['user_agent'] = stored_user_agent
|
|
else:
|
|
self.log(f"Using fingerprint: UA={context_options['user_agent'][:50]}...", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error getting stored user_agent, using default: {e}", "debug")
|
|
|
|
self.context = self.browser.new_context(**context_options)
|
|
|
|
# Load cookies
|
|
self.load_cookies(self.context)
|
|
|
|
self.page = self.context.new_page()
|
|
|
|
# Add comprehensive anti-detection scripts (dynamically from cloudflare_handler)
|
|
self.page.add_init_script(get_playwright_stealth_scripts())
|
|
|
|
self.log("Browser started and ready", "info")
|
|
|
|
def _stop_browser(self):
|
|
"""Stop the browser safely with proper error handling"""
|
|
# Close context first
|
|
if self.context:
|
|
try:
|
|
self.context.close()
|
|
self.log("Browser context closed", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error closing browser context: {e}", "warning")
|
|
finally:
|
|
self.context = None
|
|
|
|
# Close browser
|
|
if self.browser:
|
|
try:
|
|
self.browser.close()
|
|
self.log("Browser closed", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error closing browser: {e}", "warning")
|
|
finally:
|
|
self.browser = None
|
|
|
|
# Stop playwright
|
|
if self.playwright:
|
|
try:
|
|
self.playwright.stop()
|
|
except Exception as e:
|
|
self.log(f"Error stopping playwright: {e}", "warning")
|
|
finally:
|
|
self.playwright = None
|
|
|
|
self.page = None
|
|
|
|
def __del__(self):
|
|
"""Cleanup browser when instance is destroyed"""
|
|
self._stop_browser()
|
|
|
|
def __enter__(self):
|
|
"""Context manager entry - allows using 'with' statement"""
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
"""Context manager exit - ensures browser cleanup"""
|
|
self._stop_browser()
|
|
return False # Don't suppress exceptions
|
|
|
|
def get_profile_info(self, username: str) -> Optional[Dict]:
|
|
"""Extract profile info (avatar URL, bio, display name) from imginn profile page.
|
|
|
|
Returns dict with keys: avatar_url, bio, display_name, or None on failure.
|
|
"""
|
|
import time as _time
|
|
import random as _random
|
|
|
|
self._enforce_rate_limit("posts")
|
|
self._start_browser()
|
|
page = self.page
|
|
if not page:
|
|
return None
|
|
|
|
try:
|
|
url = f"https://imginn.com/{username.lower()}/?ref=index"
|
|
self.log(f"Fetching profile info for @{username} from imginn", "info")
|
|
page.goto(url, wait_until='domcontentloaded')
|
|
|
|
wait_time = 5 + _random.uniform(0, 2)
|
|
_time.sleep(wait_time)
|
|
|
|
if not self.wait_for_cloudflare(page):
|
|
self.log("Page didn't load for profile info extraction", "warning")
|
|
return None
|
|
|
|
self.save_cookies(self.context)
|
|
_time.sleep(2)
|
|
|
|
# Use JavaScript to extract profile info with multiple selector strategies
|
|
profile_info = page.evaluate("""() => {
|
|
const result = {};
|
|
|
|
// --- Avatar ---
|
|
// Strategy 1: img inside a profile/user info section
|
|
const avatarSelectors = [
|
|
'.profile-avatar img',
|
|
'.user-avatar img',
|
|
'.avatar img',
|
|
'.profile-info img',
|
|
'.info img:first-of-type',
|
|
'header img',
|
|
'.user img',
|
|
];
|
|
for (const sel of avatarSelectors) {
|
|
const el = document.querySelector(sel);
|
|
if (el && el.src && !el.src.includes('lazy') && !el.src.includes('data:')) {
|
|
result.avatar_url = el.src;
|
|
break;
|
|
}
|
|
}
|
|
// Strategy 2: find small/round img with scontent or profile in src
|
|
if (!result.avatar_url) {
|
|
const imgs = document.querySelectorAll('img');
|
|
for (const img of imgs) {
|
|
const src = img.src || '';
|
|
if ((src.includes('scontent') || src.includes('profile') || src.includes('avatar')
|
|
|| src.includes('imginn.com'))
|
|
&& !src.includes('lazy') && !src.includes('data:')) {
|
|
const rect = img.getBoundingClientRect();
|
|
if (rect.width > 20 && rect.width < 250) {
|
|
result.avatar_url = src;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean avatar URL: strip query params (imginn CDN works without them
|
|
// and the full URL often has malformed double-? from Instagram CDN paths)
|
|
if (result.avatar_url && result.avatar_url.includes('?')) {
|
|
result.avatar_url = result.avatar_url.split('?')[0];
|
|
}
|
|
|
|
// --- Bio ---
|
|
const bioSelectors = [
|
|
'.biography',
|
|
'.bio',
|
|
'.user-bio',
|
|
'.profile-bio',
|
|
'.profile-info .description',
|
|
'.info .bio',
|
|
];
|
|
for (const sel of bioSelectors) {
|
|
const el = document.querySelector(sel);
|
|
if (el && el.textContent.trim().length > 2) {
|
|
result.bio = el.textContent.trim();
|
|
break;
|
|
}
|
|
}
|
|
|
|
// --- Display Name ---
|
|
const nameSelectors = [
|
|
'.fullname',
|
|
'.display-name',
|
|
'.profile-name',
|
|
'.name',
|
|
'.user-info h1',
|
|
'h1',
|
|
];
|
|
for (const sel of nameSelectors) {
|
|
const el = document.querySelector(sel);
|
|
if (el && el.textContent.trim().length > 1 && el.textContent.trim().length < 100) {
|
|
result.display_name = el.textContent.trim();
|
|
break;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}""")
|
|
|
|
# Save debug screenshot for future selector tuning
|
|
try:
|
|
screenshot_path = Path(f"/tmp/imginn_profile_{username}.png")
|
|
page.screenshot(path=str(screenshot_path))
|
|
self.log(f"Profile screenshot saved to {screenshot_path}", "debug")
|
|
except Exception:
|
|
pass
|
|
|
|
if profile_info and any(profile_info.values()):
|
|
self.log(f"Extracted profile info: avatar={'yes' if profile_info.get('avatar_url') else 'no'}, "
|
|
f"bio={'yes' if profile_info.get('bio') else 'no'}, "
|
|
f"name={profile_info.get('display_name', 'no')}", "info")
|
|
return profile_info
|
|
else:
|
|
# Save page HTML for debugging
|
|
try:
|
|
html_path = Path(f"/tmp/imginn_profile_{username}.html")
|
|
html_path.write_text(page.content()[:50000])
|
|
self.log(f"No profile info found - HTML saved to {html_path}", "warning")
|
|
except Exception:
|
|
pass
|
|
return None
|
|
|
|
except Exception as e:
|
|
self.log(f"Error getting profile info for @{username}: {e}", "error")
|
|
return None
|
|
|
|
def _extract_media_id_from_url(self, url: str) -> str:
|
|
"""Extract Instagram media ID from URL"""
|
|
# URL format: https://imginn.com/p/MEDIA_ID/
|
|
# or just /p/MEDIA_ID/
|
|
match = re.search(r'/p/([^/]+)/?', url)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
def _update_file_timestamps(self, filepath: Path, post_date: datetime):
|
|
"""Update all timestamps for a file to match the post date"""
|
|
try:
|
|
# Convert datetime to timestamp
|
|
timestamp = post_date.timestamp()
|
|
|
|
# 1. Update file system timestamps (access time and modification time)
|
|
os.utime(filepath, (timestamp, timestamp))
|
|
self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
|
|
|
|
# 2. Try to update creation time (platform-specific)
|
|
if platform.system() == 'Darwin': # macOS
|
|
# Use SetFile command on macOS
|
|
date_str = post_date.strftime('%m/%d/%Y %H:%M:%S')
|
|
try:
|
|
subprocess.run(
|
|
['SetFile', '-d', date_str, str(filepath)],
|
|
capture_output=True,
|
|
text=True
|
|
)
|
|
except (subprocess.SubprocessError, FileNotFoundError, OSError):
|
|
pass # SetFile not available on this system
|
|
elif platform.system() == 'Windows':
|
|
# On Windows, use PowerShell with proper escaping to prevent injection
|
|
filepath_escaped = str(filepath).replace("'", "''")
|
|
date_escaped = post_date.isoformat().replace("'", "''")
|
|
ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'"
|
|
try:
|
|
subprocess.run(
|
|
['powershell', '-Command', ps_command],
|
|
capture_output=True,
|
|
text=True
|
|
)
|
|
except (subprocess.SubprocessError, FileNotFoundError, OSError):
|
|
pass # PowerShell command failed
|
|
# Linux doesn't support changing creation time
|
|
|
|
# 3. Update EXIF data for images
|
|
if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png', '.heic')):
|
|
self._update_exif_timestamps(filepath, post_date)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error updating timestamps: {e}", "warning")
|
|
|
|
def _update_exif_timestamps(self, filepath: Path, post_date: datetime):
|
|
"""Update EXIF timestamps in image files"""
|
|
try:
|
|
# Check if exiftool is available
|
|
result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True)
|
|
if result.returncode == 0:
|
|
# Format date for EXIF
|
|
exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S')
|
|
|
|
# Update all date fields in EXIF including MetadataDate for Immich
|
|
cmd = [
|
|
'exiftool', '-overwrite_original', '-quiet',
|
|
f'-AllDates={exif_date}',
|
|
f'-MetadataDate={exif_date}',
|
|
'-HistoryWhen=',
|
|
f'-FileModifyDate={exif_date}',
|
|
str(filepath)
|
|
]
|
|
|
|
subprocess.run(cmd, capture_output=True, text=True)
|
|
self.log(f"Updated EXIF timestamps", "debug")
|
|
except Exception:
|
|
# Silently skip if exiftool not available
|
|
pass
|
|
|
|
def _extract_post_date(self, page) -> datetime:
|
|
"""Try to extract post date from page"""
|
|
try:
|
|
# Wait a moment for dynamic content to load
|
|
page.wait_for_timeout(500)
|
|
|
|
# FIRST: Look for data-created attribute (Unix timestamp)
|
|
elements_with_data_created = page.locator('[data-created]').all()
|
|
self.log(f"Found {len(elements_with_data_created)} elements with data-created attribute", "debug")
|
|
|
|
for elem in elements_with_data_created:
|
|
timestamp_str = elem.get_attribute('data-created')
|
|
if timestamp_str:
|
|
try:
|
|
# Convert Unix timestamp to datetime
|
|
timestamp = int(timestamp_str)
|
|
post_date = datetime.fromtimestamp(timestamp)
|
|
self.log(f"Found data-created timestamp: {timestamp} -> {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
|
|
return post_date
|
|
except Exception as e:
|
|
self.log(f"Failed to parse timestamp {timestamp_str}: {e}", "debug")
|
|
pass
|
|
|
|
# If no data-created found, wait a bit more and try again
|
|
if len(elements_with_data_created) == 0:
|
|
self.log("No data-created elements found, waiting for dynamic content...", "debug")
|
|
|
|
# Try to wait for the element to appear
|
|
try:
|
|
page.wait_for_selector('[data-created]', timeout=2000)
|
|
elements_with_data_created = page.locator('[data-created]').all()
|
|
self.log(f"After waiting for selector: found {len(elements_with_data_created)} elements with data-created", "debug")
|
|
except Exception:
|
|
# Still try one more time with a longer wait
|
|
page.wait_for_timeout(1500)
|
|
elements_with_data_created = page.locator('[data-created]').all()
|
|
self.log(f"After timeout wait: found {len(elements_with_data_created)} elements with data-created", "debug")
|
|
|
|
for elem in elements_with_data_created:
|
|
timestamp_str = elem.get_attribute('data-created')
|
|
if timestamp_str:
|
|
try:
|
|
timestamp = int(timestamp_str)
|
|
post_date = datetime.fromtimestamp(timestamp)
|
|
self.log(f"Found data-created timestamp after wait: {timestamp} -> {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
|
|
return post_date
|
|
except Exception as e:
|
|
self.log(f"Failed to parse timestamp {timestamp_str}: {e}", "debug")
|
|
|
|
# Fallback: Look for other date elements
|
|
date_selectors = [
|
|
'time[datetime]',
|
|
'time',
|
|
'.date',
|
|
'[datetime]',
|
|
'span.date',
|
|
'div.date'
|
|
]
|
|
|
|
for selector in date_selectors:
|
|
elem = page.locator(selector).first
|
|
if elem.count() > 0:
|
|
# Try datetime attribute first
|
|
datetime_str = elem.get_attribute('datetime')
|
|
if datetime_str:
|
|
# Parse ISO format
|
|
for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']:
|
|
try:
|
|
return datetime.strptime(datetime_str.split('.')[0].replace('Z', ''), fmt)
|
|
except Exception:
|
|
continue
|
|
|
|
# Try text content
|
|
text = elem.text_content()
|
|
if text:
|
|
# Parse various date formats
|
|
# Could be "2 days ago", "September 6, 2025", etc.
|
|
if "ago" in text.lower():
|
|
# Handle relative dates
|
|
if "hour" in text:
|
|
hours = int(re.search(r'(\d+)', text).group(1))
|
|
return datetime.now() - timedelta(hours=hours)
|
|
elif "day" in text:
|
|
days = int(re.search(r'(\d+)', text).group(1))
|
|
return datetime.now() - timedelta(days=days)
|
|
elif "week" in text:
|
|
weeks = int(re.search(r'(\d+)', text).group(1))
|
|
return datetime.now() - timedelta(weeks=weeks)
|
|
else:
|
|
# Try parsing absolute date
|
|
for fmt in ['%B %d, %Y', '%b %d, %Y', '%Y-%m-%d']:
|
|
try:
|
|
return datetime.strptime(text, fmt)
|
|
except Exception:
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Error extracting date: {e}", "debug")
|
|
|
|
return None
|
|
|
|
def _scan_existing_files(self, output_dir: Path, profile_name: str):
    """Refresh ``self.downloaded_files`` from the files already present in ``output_dir``.

    Delegates to ``scan_existing_files_for_media_ids`` with a minimum file
    size of 20000 and no recursion.

    Args:
        output_dir: Directory to scan.
        profile_name: Profile the files belong to.
    """
    found_ids = scan_existing_files_for_media_ids(
        output_dir, profile_name, min_file_size=20000, recursive=False
    )
    self.downloaded_files = found_ids
    if found_ids:
        self.log(f"Found {len(self.downloaded_files)} existing media IDs for {profile_name}", "debug")
|
|
|
|
def _is_already_downloaded(self, media_id: str) -> bool:
|
|
"""Check if media_id has already been downloaded (uses centralized function)"""
|
|
if not self.use_database or not self.unified_db:
|
|
return False
|
|
|
|
# Use centralized function for consistent cross-module detection
|
|
return is_instagram_downloaded(self.unified_db, media_id)
|
|
|
|
def _record_download(self, media_id: str, username: str, filename: str,
|
|
url: str = None, post_date=None, file_path: str = None,
|
|
content_type: str = 'post', metadata: dict = None,
|
|
deferred: bool = False):
|
|
"""Record a successful download in the database (uses centralized function)
|
|
|
|
Args:
|
|
deferred: If True, don't record to database now - add to pending_downloads list
|
|
for later recording after file move is complete
|
|
"""
|
|
# If deferred, store for later recording instead of recording now
|
|
if deferred:
|
|
self.pending_downloads.append({
|
|
'media_id': media_id,
|
|
'username': username,
|
|
'filename': filename,
|
|
'url': url,
|
|
'post_date': post_date.isoformat() if post_date else None,
|
|
'file_path': file_path,
|
|
'content_type': content_type,
|
|
'metadata': metadata
|
|
})
|
|
self.log(f"Deferred recording for {media_id}", "debug")
|
|
return True
|
|
|
|
if not self.use_database or not self.unified_db:
|
|
return False
|
|
|
|
try:
|
|
# Use centralized function for consistent cross-module storage
|
|
result = record_instagram_download(
|
|
db=self.unified_db,
|
|
media_id=media_id,
|
|
username=username,
|
|
content_type=content_type,
|
|
filename=filename,
|
|
url=url,
|
|
post_date=post_date,
|
|
file_path=file_path,
|
|
method='imginn',
|
|
extra_metadata=metadata
|
|
)
|
|
if result:
|
|
self.log(f"Recorded download for {media_id}", "debug")
|
|
return result
|
|
except Exception as e:
|
|
self.log(f"Failed to record download: {e}", "debug")
|
|
return False
|
|
|
|
def get_pending_downloads(self):
|
|
"""Get list of downloads that were deferred for later recording
|
|
|
|
Returns:
|
|
List of download metadata dicts ready for database recording
|
|
"""
|
|
return self.pending_downloads.copy()
|
|
|
|
def clear_pending_downloads(self):
|
|
"""Clear the pending downloads list after they've been recorded"""
|
|
self.pending_downloads = []
|
|
|
|
def _get_processed_posts(self, username: str) -> set:
|
|
"""Get set of post/story IDs that have been processed from database
|
|
|
|
NOTE: Checks ALL Instagram posts globally, not just this user's, because
|
|
the same post can appear on multiple profiles (shared posts, tags, reposts)
|
|
"""
|
|
processed = set()
|
|
if not self.unified_db:
|
|
return processed
|
|
|
|
try:
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
# Get all Instagram posts globally (same post can appear on multiple profiles)
|
|
cursor.execute('''
|
|
SELECT url, filename, metadata FROM downloads
|
|
WHERE platform = 'instagram'
|
|
''')
|
|
|
|
for row in cursor.fetchall():
|
|
url, filename, metadata_str = row
|
|
# Add full URL to processed set
|
|
if url:
|
|
processed.add(url)
|
|
# Also extract and add post ID from URL for backward compatibility
|
|
if url and '/p/' in url:
|
|
match = re.search(r'/p/([^/]+)/', url)
|
|
if match:
|
|
processed.add(match.group(1))
|
|
|
|
# For stories, extract media_id from filename
|
|
if filename and '_story' in filename:
|
|
# Extract the long media ID before _story
|
|
# Format: username_date_MEDIAID_storyN.ext
|
|
parts = filename.split('_story')
|
|
if len(parts) >= 2:
|
|
# Get everything before _story, then get the media ID (last underscore-separated part)
|
|
pre_story = parts[0]
|
|
# Split by underscore and skip first 3 parts (username_YYYYMMDD_HHMMSS)
|
|
id_parts = pre_story.split('_')
|
|
if len(id_parts) > 3:
|
|
# Join everything after date as the media_id
|
|
media_id_full = '_'.join(id_parts[3:])
|
|
processed.add(media_id_full)
|
|
# Also add the extracted Instagram media ID (18-digit number)
|
|
normalized_id = extract_instagram_media_id(media_id_full)
|
|
if normalized_id and normalized_id != media_id_full:
|
|
processed.add(normalized_id)
|
|
|
|
# Also check metadata for media_id
|
|
if metadata_str:
|
|
try:
|
|
metadata = json.loads(metadata_str)
|
|
if 'post_id' in metadata:
|
|
processed.add(metadata['post_id'])
|
|
if 'media_id' in metadata:
|
|
media_id = metadata['media_id']
|
|
processed.add(media_id)
|
|
# Also add the extracted Instagram media ID
|
|
normalized_id = extract_instagram_media_id(media_id)
|
|
if normalized_id and normalized_id != media_id:
|
|
processed.add(normalized_id)
|
|
if 'media_id_full' in metadata:
|
|
processed.add(metadata['media_id_full'])
|
|
except Exception:
|
|
pass
|
|
|
|
if processed:
|
|
self.log(f"Found {len(processed)} processed posts in database for {username}", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error loading processed posts from database: {e}", "debug")
|
|
|
|
return processed
|
|
|
|
def save_cookies(self, context):
|
|
"""Save cookies to database or file"""
|
|
cookies = context.cookies()
|
|
|
|
# Save to database if available
|
|
if self.unified_db:
|
|
try:
|
|
# CRITICAL: Include user_agent for cf_clearance cookies to work
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=self.user_agent,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database", "debug")
|
|
return
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
# Fallback to file-based storage
|
|
if self.cookie_file:
|
|
storage_data = {
|
|
'cookies': cookies,
|
|
'timestamp': datetime.now().isoformat()
|
|
}
|
|
with open(self.cookie_file, 'w') as f:
|
|
json.dump(storage_data, f, indent=2)
|
|
self.log(f"Saved {len(cookies)} cookies to file", "debug")
|
|
|
|
def load_cookies(self, context):
|
|
"""Load saved cookies from database or file"""
|
|
# Try loading from database first
|
|
if self.unified_db:
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
# Clean cookies - remove unsupported properties and convert expiry->expires
|
|
cleaned_cookies = []
|
|
for cookie in cookies:
|
|
cleaned = {k: v for k, v in cookie.items()
|
|
if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
|
|
# FlareSolverr uses 'expiry' but Playwright uses 'expires'
|
|
if 'expiry' in cleaned and 'expires' not in cleaned:
|
|
cleaned['expires'] = cleaned.pop('expiry')
|
|
cleaned_cookies.append(cleaned)
|
|
|
|
# CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
|
|
# Otherwise old cookies may override new ones from FlareSolverr
|
|
try:
|
|
context.clear_cookies()
|
|
self.log("Cleared existing browser cookies", "debug")
|
|
except Exception as e:
|
|
self.log(f"Could not clear cookies: {e}", "debug")
|
|
|
|
context.add_cookies(cleaned_cookies)
|
|
self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "info")
|
|
return True
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
# Fallback to file-based cookies
|
|
if not self.cookie_file or not self.cookie_file.exists():
|
|
return False
|
|
|
|
try:
|
|
with open(self.cookie_file, 'r') as f:
|
|
data = json.load(f)
|
|
|
|
# Check age (24 hours)
|
|
saved_time = datetime.fromisoformat(data['timestamp'])
|
|
if datetime.now() - saved_time > timedelta(hours=24):
|
|
self.log("Cookies expired", "debug")
|
|
return False
|
|
|
|
# Clean cookies - remove unsupported properties and convert expiry->expires
|
|
cleaned_cookies = []
|
|
for cookie in data['cookies']:
|
|
# Remove Chrome-specific properties that Playwright doesn't support
|
|
cleaned = {k: v for k, v in cookie.items()
|
|
if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
|
|
# FlareSolverr uses 'expiry' but Playwright uses 'expires'
|
|
if 'expiry' in cleaned and 'expires' not in cleaned:
|
|
cleaned['expires'] = cleaned.pop('expiry')
|
|
cleaned_cookies.append(cleaned)
|
|
|
|
context.add_cookies(cleaned_cookies)
|
|
self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "info")
|
|
return True
|
|
except Exception as e:
|
|
self.log(f"Failed to load cookies: {e}", "warning")
|
|
return False
|
|
|
|
def wait_for_cloudflare(self, page):
    """Poll the page until ImgInn content is visible, a server error shows, or the wait runs out.

    The loop runs up to ``max_wait`` (120) iterations, sleeping one second at the
    start of each. Every iteration reads ``page.url`` and the lower-cased
    ``page.content()`` and then, in this order:

    1. returns True if the URL contains ``imginn`` and the HTML mentions
       posts/followers/following;
    2. returns False (after a best-effort screenshot) if server-error text is present;
    3. if Cloudflare interstitial text is present, asks
       ``_get_cookies_via_flaresolverr`` for fresh cookies on iterations 0, 15 and 30,
       applies them (restarting the browser when the stored user agent differs
       from the browser's), and keeps polling;
    4. otherwise applies page-type specific content checks (post, stories,
       tagged, profile) and returns True on the first hit.

    Args:
        page: Playwright page object that has already been navigated.

    Returns:
        True once page content is detected; False on a detected server error
        or when the loop finishes without detecting content.

    Raises:
        Exception: any error raised while reading ``page.url``/``page.content()``
            whose message does not contain "navigating" is re-raised.

    Side effects:
        On timeout ``self._page_load_failures`` is incremented and a screenshot
        is written under the relative ``debug`` directory.

    NOTE(review): when the browser is restarted inside this method the local
    ``page`` is rebound to ``self.page``; the caller's own reference is not
    updated by this method - confirm callers re-read ``self.page``.
    """
    self.log("Waiting for page to load...", "debug")

    max_wait = 120  # Extended wait - Cloudflare challenges can take up to 120 seconds
    flaresolverr_attempts = 0
    max_flaresolverr_attempts = 3

    # NOTE: `i` counts loop iterations, not wall-clock seconds - the FlareSolverr
    # branch below adds extra sleeps that are not reflected in `i`.
    for i in range(max_wait):
        time.sleep(1)

        # Check current URL and content
        try:
            current_url = page.url
            content = page.content().lower()
        except Exception as e:
            # Page is still navigating, wait and try again
            if "navigating" in str(e).lower():
                self.log("Page still navigating, waiting...", "debug")
                continue
            else:
                # Some other error, re-raise it
                raise

        # First check if the actual content is visible (not Cloudflare)
        # ImgInn pages will have profile content when loaded
        if 'imginn' in current_url.lower() and ('posts' in content or 'followers' in content or 'following' in content):
            # We have actual content, not a challenge
            self.log(f"Page loaded successfully after {i+1} seconds", "info")
            return True

        # Check for actual Cloudflare challenge or server error
        # NOTE: 'challenge-platform' is NOT a reliable indicator - it's embedded JS that stays on the page
        # even after successful bypass. Only check for visible interstitial text.
        challenge_indicators = ['checking your browser', 'just a moment', 'verify you are human', 'enable javascript']
        error_indicators = ['internal server error', 'error code 500', 'error code 502', 'error code 503']

        # Plain substring tests against the lower-cased HTML
        has_challenge = any(indicator in content for indicator in challenge_indicators)
        has_error = any(indicator in content for indicator in error_indicators)

        # A server error ends the wait immediately - no retries are attempted here
        if has_error:
            self.log("Server error detected (500/502/503) - site is likely down", "error")
            # Save screenshot for debugging
            try:
                debug_dir = Path("debug")
                debug_dir.mkdir(exist_ok=True)
                screenshot_path = debug_dir / f"server_error_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
                page.screenshot(path=str(screenshot_path))
                self.log(f"Screenshot saved to {screenshot_path}", "debug")
            except Exception:
                # Screenshot is diagnostic only; failures are deliberately ignored
                pass
            return False

        if has_challenge:
            # Try FlareSolverr at specific intervals (0s, 15s, 30s)
            # Note: Turnstile checkbox clicking doesn't work - it's designed to block automation
            if i == 0 or (i in [15, 30] and flaresolverr_attempts < max_flaresolverr_attempts):
                flaresolverr_attempts += 1
                self.log(f"Cloudflare challenge detected, attempting FlareSolverr bypass (attempt {flaresolverr_attempts})...", "info")

                # Get current browser user_agent for comparison
                current_browser_ua = None
                try:
                    current_browser_ua = page.evaluate('() => navigator.userAgent')
                except Exception:
                    # Leave it as None; the UA comparison below is then skipped
                    pass

                # Try to get fresh cookies via FlareSolverr
                if self._get_cookies_via_flaresolverr(page.url):
                    self.log("Got fresh cookies, reloading page...", "info")

                    # Check if user_agent changed - if so, restart browser
                    new_ua = None
                    try:
                        new_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                        self.log(f"Stored cookie UA: {new_ua[:60] if new_ua else 'None'}...", "debug")
                        self.log(f"Browser UA: {current_browser_ua[:60] if current_browser_ua else 'None'}...", "debug")
                    except Exception as e:
                        self.log(f"Error getting stored UA: {e}", "debug")

                    if new_ua and current_browser_ua and new_ua != current_browser_ua:
                        self.log("User-agent changed, restarting browser with new fingerprint...", "info")
                        self._stop_browser()
                        self._start_browser()
                        # Rebind the local name to the page of the restarted browser
                        page = self.page
                        try:
                            page.goto(current_url, wait_until='domcontentloaded', timeout=30000)
                        except Exception as e:
                            self.log(f"Error navigating after browser restart: {e}", "debug")
                    else:
                        # Reload cookies in browser context
                        try:
                            self.load_cookies(self.context)
                            # Reload the page with new cookies
                            page.reload(wait_until='domcontentloaded', timeout=10000)
                            # CRITICAL: Wait for Cloudflare background JS validation (5-7 seconds)
                            wait_time = 5 + random.uniform(0, 2)
                            self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
                            time.sleep(wait_time)
                        except Exception as e:
                            self.log(f"Error reloading page with new cookies: {e}", "debug")
                else:
                    self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning")

            # Continue waiting for challenge to resolve
            # (the content checks and status logs below are skipped for this iteration)
            continue

        # Check if we're on the correct page with content
        if '/p/' in current_url:  # Post page
            # Look for download button or image
            if 'download' in content or 'data-created' in content:
                self.log(f"Post page loaded after {i+1} seconds", "info")
                return True
        elif '/stories/' in current_url:  # Stories page
            # Stories pages have swiper, reels, or story content
            if 'swiper' in content or 'data-uid' in content or 'reel' in content:
                self.log(f"Stories page loaded after {i+1} seconds", "info")
                return True
            # Also check for counter/profile info which is on stories pages too
            if 'counter-item' in content or ('posts' in content and 'followers' in content):
                self.log(f"Stories page loaded after {i+1} seconds", "info")
                return True
        elif '/tagged/' in current_url:  # Tagged page
            # Tagged pages have items grid
            if 'class="item"' in content or 'data-uid' in content:
                self.log(f"Tagged page loaded after {i+1} seconds", "info")
                return True
            if 'posts' in content and 'followers' in content:
                self.log(f"Tagged page loaded after {i+1} seconds", "info")
                return True
        else:  # Profile page
            # Check if profile content is visible - ImgInn specific
            if 'imginn' in current_url.lower():
                if ('posts' in content and 'followers' in content) or 'following' in content:
                    self.log(f"Profile page loaded after {i+1} seconds", "info")
                    return True
                # Also check for actual post links
                if 'href="/p/' in content or 'class="item"' in content:
                    self.log(f"Profile page loaded after {i+1} seconds", "info")
                    return True

        # Debug: Log what we're seeing if we've been waiting a while
        if i == 15:
            self.log(f"Debug: URL={current_url[:50]}, has posts={('posts' in content)}, has swiper={('swiper' in content)}", "debug")

        # Status updates (only if we haven't detected content yet)
        if i == 10:
            self.log("Still waiting (10s)... page loading", "debug")
        elif i == 20:
            self.log("Still waiting (20s)... page not ready yet", "info")
        elif i == 30:
            self.log("Still waiting (30s)... slow response from server", "info")
        elif i == 45:
            self.log("Still waiting (45s)... checking if blocked", "info")
        elif i == 60:
            self.log("Still waiting (60s)... page load is slow", "warning")
        elif i == 90:
            self.log("Still waiting (90s)... this is taking too long", "warning")

    # Timeout reached - page didn't load
    # The failure counter escalates the log level once the session threshold is hit
    self._page_load_failures += 1
    level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning"
    self.log(f"Page load timeout ({self._page_load_failures}x this session). URL: {page.url}", level)

    # Save screenshot for debugging
    try:
        debug_dir = Path("debug")
        debug_dir.mkdir(exist_ok=True)
        screenshot_path = debug_dir / f"cloudflare_block_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
        page.screenshot(path=str(screenshot_path))
        self.log(f"Screenshot saved to {screenshot_path}", "debug")
    except Exception:
        # Screenshot is diagnostic only; failures are deliberately ignored
        pass

    return False
|
|
|
|
def _dismiss_consent_dialog(self, page):
|
|
"""Dismiss cookie consent / GDPR overlay if present (Google FundingChoices)."""
|
|
try:
|
|
# Google FundingChoices consent dialog
|
|
consent_btn = page.locator(
|
|
'button.fc-cta-consent, ' # "Consent" button
|
|
'button.fc-cta-do-not-consent, ' # "Do not consent" button
|
|
'button[aria-label="Consent"], '
|
|
'button.fc-dismiss-button, ' # Dismiss/close button
|
|
'.fc-dialog button.fc-primary-button'
|
|
).first
|
|
if consent_btn.count() > 0 and consent_btn.is_visible():
|
|
consent_btn.click(force=True)
|
|
self.log("Dismissed consent dialog", "debug")
|
|
time.sleep(0.5)
|
|
return
|
|
# Fallback: remove the overlay via JS if buttons aren't found
|
|
overlay = page.locator('.fc-consent-root, .fc-dialog-overlay').first
|
|
if overlay.count() > 0:
|
|
page.evaluate("document.querySelectorAll('.fc-consent-root, .fc-dialog-overlay, .fc-dialog-container').forEach(el => el.remove())")
|
|
self.log("Removed consent overlay via JS", "debug")
|
|
except Exception:
|
|
pass
|
|
|
|
def _safe_go_back(self, page, username: str, tagged: bool = False):
|
|
"""Navigate back to profile page safely with timeout handling.
|
|
|
|
Tries go_back() first with a short timeout, falls back to direct navigation.
|
|
"""
|
|
try:
|
|
page.go_back(timeout=10000)
|
|
except Exception:
|
|
self.log("go_back timed out, navigating directly to profile", "debug")
|
|
try:
|
|
suffix = f"/tagged/?ref=index" if tagged else "/?ref=index"
|
|
page.goto(f"https://imginn.com/{username}{suffix}", timeout=15000)
|
|
except Exception as nav_err:
|
|
self.log(f"Direct navigation back also failed: {nav_err}", "warning")
|
|
|
|
def _is_cloudflare_challenge(self, page) -> bool:
|
|
"""Check if current page is a Cloudflare challenge page.
|
|
|
|
Returns:
|
|
True if Cloudflare challenge detected, False otherwise
|
|
"""
|
|
try:
|
|
title = page.title().lower()
|
|
content = page.content().lower()[:2000] # Check first 2000 chars
|
|
|
|
challenge_indicators = ['just a moment', 'checking your browser', 'verify you are human',
|
|
'enable javascript', 'cloudflare']
|
|
|
|
# Check title first (most reliable)
|
|
if any(indicator in title for indicator in challenge_indicators):
|
|
return True
|
|
|
|
# Check content
|
|
if any(indicator in content for indicator in challenge_indicators):
|
|
return True
|
|
|
|
return False
|
|
except Exception:
|
|
return False
|
|
|
|
def _handle_cloudflare_on_post(self, page, post_url: str, max_retries: int = 2) -> bool:
|
|
"""Handle Cloudflare challenge on a post page by getting fresh cookies and retrying.
|
|
|
|
Args:
|
|
page: Playwright page object
|
|
post_url: URL of the post to retry
|
|
max_retries: Maximum number of retry attempts
|
|
|
|
Returns:
|
|
True if page loaded successfully (no Cloudflare), False if still blocked
|
|
"""
|
|
if not self._is_cloudflare_challenge(page):
|
|
return True # No challenge, page is good
|
|
|
|
self.log(f"Cloudflare challenge detected on post page, attempting bypass...", "warning")
|
|
|
|
for attempt in range(max_retries):
|
|
# Wait before FlareSolverr attempt - give Cloudflare time to cool down
|
|
if attempt == 0:
|
|
wait_time = random.uniform(15, 25)
|
|
else:
|
|
wait_time = random.uniform(30, 60)
|
|
self.log(f"Waiting {wait_time:.1f}s before FlareSolverr attempt {attempt + 1}...", "info")
|
|
time.sleep(wait_time)
|
|
|
|
# Get fresh cookies via FlareSolverr using the post URL
|
|
if self._get_cookies_via_flaresolverr(post_url):
|
|
self.log(f"Got fresh cookies (attempt {attempt + 1}), reloading post...", "info")
|
|
|
|
# Check if user_agent changed - if so, restart browser
|
|
try:
|
|
current_browser_ua = page.evaluate('() => navigator.userAgent')
|
|
new_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
|
|
|
|
if new_ua and current_browser_ua and new_ua != current_browser_ua:
|
|
self.log("User-agent changed, restarting browser...", "info")
|
|
self._stop_browser()
|
|
self._start_browser()
|
|
page = self.page
|
|
except Exception as e:
|
|
self.log(f"Error checking user_agent: {e}", "debug")
|
|
|
|
# Reload cookies into browser context
|
|
try:
|
|
self.load_cookies(self.context)
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies: {e}", "debug")
|
|
|
|
# Navigate directly to the post URL
|
|
try:
|
|
page.goto(post_url, wait_until='domcontentloaded', timeout=30000)
|
|
|
|
# Wait for Cloudflare background JS validation (5-7 seconds)
|
|
wait_time = 5 + random.uniform(0, 2)
|
|
self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
|
|
time.sleep(wait_time)
|
|
|
|
# Check if still blocked
|
|
if not self._is_cloudflare_challenge(page):
|
|
self.log("Cloudflare bypass successful on post page", "info")
|
|
# IMPORTANT: Save browser cookies after successful bypass
|
|
# This captures any cookies set by Cloudflare's JS validation
|
|
try:
|
|
self.save_cookies(self.context)
|
|
self.log("Saved browser cookies after successful bypass", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies after bypass: {e}", "debug")
|
|
return True
|
|
else:
|
|
self.log(f"Still blocked after retry {attempt + 1}", "warning")
|
|
except Exception as e:
|
|
self.log(f"Navigation failed after cookie refresh: {e}", "warning")
|
|
else:
|
|
self.log(f"FlareSolverr failed (attempt {attempt + 1})", "warning")
|
|
|
|
self.log("Failed to bypass Cloudflare on post page after all retries", "error")
|
|
return False
|
|
|
|
def _check_post_phrases(self, page, phrase_config: dict) -> bool:
|
|
"""Check if post contains required phrases
|
|
|
|
Args:
|
|
page: Playwright page object
|
|
phrase_config: Phrase search configuration
|
|
{
|
|
'phrases': list of phrases to search for,
|
|
'case_sensitive': bool,
|
|
'match_all': bool (True = all phrases must match, False = any phrase)
|
|
}
|
|
|
|
Returns:
|
|
True if post matches phrase criteria, False otherwise
|
|
"""
|
|
try:
|
|
# Get post caption/text
|
|
caption_selectors = [
|
|
'.caption',
|
|
'.post-caption',
|
|
'meta[property="og:description"]',
|
|
'meta[name="description"]',
|
|
'.content',
|
|
'div[class*="caption"]',
|
|
'span[class*="caption"]'
|
|
]
|
|
|
|
post_text = ""
|
|
for selector in caption_selectors:
|
|
try:
|
|
element = page.locator(selector).first
|
|
if element.count() > 0:
|
|
text = element.text_content() or element.get_attribute('content') or ""
|
|
if text:
|
|
post_text += " " + text
|
|
except Exception:
|
|
continue
|
|
|
|
# Also check visible text in the main content area
|
|
try:
|
|
main_content = page.locator('main, article, .post-content, div[role="main"]').first
|
|
if main_content.count() > 0:
|
|
post_text += " " + (main_content.text_content() or "")
|
|
except Exception:
|
|
pass
|
|
|
|
if not post_text:
|
|
self.log("Could not extract post text for phrase matching", "warning")
|
|
return False
|
|
|
|
# Clean up text
|
|
post_text = ' '.join(post_text.split()) # Normalize whitespace
|
|
|
|
phrases = phrase_config.get('phrases', [])
|
|
if not phrases:
|
|
return True # No phrases to match = match all
|
|
|
|
case_sensitive = phrase_config.get('case_sensitive', False)
|
|
match_all = phrase_config.get('match_all', False)
|
|
|
|
if not case_sensitive:
|
|
post_text = post_text.lower()
|
|
phrases = [p.lower() for p in phrases]
|
|
|
|
self.log(f"Checking post text ({len(post_text)} chars) for phrases: {phrases}", "debug")
|
|
|
|
# Check phrase matching
|
|
matches = []
|
|
for phrase in phrases:
|
|
if phrase in post_text:
|
|
matches.append(phrase)
|
|
self.log(f"Found phrase: '{phrase}'", "debug")
|
|
|
|
if match_all:
|
|
# All phrases must be found
|
|
result = len(matches) == len(phrases)
|
|
if not result:
|
|
missing = [p for p in phrases if p not in matches]
|
|
self.log(f"Missing required phrases: {missing}", "debug")
|
|
else:
|
|
# At least one phrase must be found
|
|
result = len(matches) > 0
|
|
if not result:
|
|
self.log(f"No matching phrases found", "debug")
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
self.log(f"Error checking post phrases: {e}", "error")
|
|
return False
|
|
|
|
def download(self, username: str, content_type: str = "posts", days_back: int = 14, max_downloads: int = 50, output_dir: str = None, phrase_config: dict = None, defer_database: bool = False):
    """Entry point used by the media-downloader system to fetch one user's content.

    Resets the per-account caches, resolves the output directory and hands
    off to the downloader matching ``content_type``.

    Args:
        username: Instagram username
        content_type: Type of content ("posts", "stories", or "tagged")
        days_back: How many days back to search
        max_downloads: Maximum posts to download
        output_dir: Base output directory; the username is appended to it.
            When omitted, a directory under /opt/media-downloader/downloads
            is used.
        phrase_config: Optional phrase search configuration (passed to the
            posts and tagged downloaders only)
            {
                'enabled': bool,
                'phrases': list of phrases to search for,
                'case_sensitive': bool,
                'match_all': bool (True = all phrases must match, False = any phrase)
            }
        defer_database: If True, defer database recording to pending_downloads list
                       for later recording after file move is complete

    Returns:
        Number of items in the list returned by the selected downloader, or
        0 when ``content_type`` is not supported.
    """
    # Start each account with empty caches so they cannot grow across accounts
    self.downloaded_files.clear()
    self.pending_downloads = []

    # Resolve where this user's files go
    output_path = (
        Path(output_dir) / username
        if output_dir
        else Path(f"/opt/media-downloader/downloads/{username}")
    )

    if content_type == "stories":
        # Stories take max_stories and no phrase filter
        files = self.download_stories(
            username=username,
            days_back=days_back,
            max_stories=max_downloads,
            output_dir=output_path,
            defer_database=defer_database
        )
    elif content_type in ("posts", "tagged"):
        # Posts and tagged share the same keyword arguments
        fetch = self.download_posts if content_type == "posts" else self.download_tagged
        files = fetch(
            username=username,
            days_back=days_back,
            max_posts=max_downloads,
            output_dir=output_path,
            phrase_config=phrase_config,
            defer_database=defer_database
        )
    else:
        self.log(f"ImgInn does not support content type: {content_type}", "warning")
        return 0

    return len(files)
|
|
|
|
def download_posts(self, username: str, days_back: int = 14, max_posts: int = 50, specific_post_url: str = None, output_dir: Path = None, phrase_config: dict = None, skip_database: bool = False, max_age_hours: int = None, defer_database: bool = False):
|
|
"""Download posts from a user with FastDL naming
|
|
|
|
Args:
|
|
username: Instagram username
|
|
days_back: How many days back to search
|
|
max_posts: Maximum posts to check
|
|
specific_post_url: Download a specific post
|
|
output_dir: Output directory
|
|
phrase_config: Optional phrase search configuration
|
|
skip_database: If True, don't record downloads in database (for temporary processing)
|
|
max_age_hours: If specified, only download posts newer than N hours (overrides days_back)
|
|
defer_database: If True, defer database recording to pending_downloads list
|
|
for later recording after file move is complete
|
|
"""
|
|
# Rate limiting to avoid Cloudflare blocks
|
|
self._enforce_rate_limit("posts")
|
|
|
|
profile_name = username.lower()
|
|
if output_dir is None:
|
|
output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Check site status before doing anything else
|
|
self.log("Checking ImgInn site status...", "debug")
|
|
site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10)
|
|
|
|
if self.cf_handler.should_skip_download(site_status):
|
|
self.log(f"Skipping download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning")
|
|
self.activity_manager.update_status(f"Skipped - ImgInn unavailable ({error_msg})")
|
|
return []
|
|
elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
|
|
self.log("Cloudflare challenge detected, will attempt bypass during download", "info")
|
|
|
|
# Scan existing files
|
|
self._scan_existing_files(output_dir, profile_name)
|
|
|
|
# Get processed posts from database
|
|
processed_posts = self._get_processed_posts(profile_name)
|
|
self.log(f"Loaded {len(processed_posts)} processed posts for {profile_name} from database", "info")
|
|
if len(processed_posts) > 0 and len(processed_posts) < 20:
|
|
self.log(f"Processed post IDs: {processed_posts}", "debug")
|
|
|
|
downloaded_files = []
|
|
# Use max_age_hours if specified, otherwise use days_back
|
|
if max_age_hours is not None:
|
|
cutoff_date = datetime.now() - timedelta(hours=max_age_hours)
|
|
else:
|
|
cutoff_date = datetime.now() - timedelta(days=days_back)
|
|
|
|
# Update activity status
|
|
if specific_post_url and profile_name == 'unknown':
|
|
self.activity_manager.update_status(f"Fetching post...")
|
|
else:
|
|
self.activity_manager.update_status("Checking posts")
|
|
|
|
# Start or reuse browser
|
|
self._start_browser()
|
|
page = self.page
|
|
|
|
try:
|
|
# If specific post URL provided, go directly to it
|
|
if specific_post_url:
|
|
self.log(f"Navigating to specific post", "info")
|
|
page.goto(specific_post_url, wait_until='domcontentloaded')
|
|
else:
|
|
# Navigate to profile
|
|
self.log(f"Navigating to @{username} profile", "info")
|
|
page.goto(f"https://imginn.com/{username}/?ref=index", wait_until='domcontentloaded')
|
|
|
|
# CRITICAL: Wait 5-7 seconds for Cloudflare background JS challenges to complete
|
|
# Per browserless.io: "Allow 5+ seconds post-page load for background JavaScript challenges"
|
|
import random
|
|
wait_time = 5 + random.uniform(0, 2) # 5-7 seconds
|
|
self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
|
|
time.sleep(wait_time)
|
|
|
|
# Wait for page to load
|
|
if not self.wait_for_cloudflare(page):
|
|
self._page_load_failures += 1
|
|
level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning"
|
|
self.log(f"Page didn't load properly ({self._page_load_failures}x this session)", level)
|
|
return []
|
|
|
|
# Save cookies
|
|
self.save_cookies(self.context)
|
|
|
|
# Wait for JavaScript to load posts (ImgInn loads posts dynamically)
|
|
self.log("Waiting for posts to load via JavaScript...", "info")
|
|
try:
|
|
# Wait for post links to appear (up to 10 seconds)
|
|
page.wait_for_selector('a[href*="/p/"]', timeout=10000)
|
|
self.log("Posts loaded successfully", "info")
|
|
except Exception:
|
|
# Timeout - posts might not exist, or page structure changed
|
|
self.log("Timeout waiting for posts to appear", "warning")
|
|
time.sleep(2) # Give it a bit more time anyway
|
|
|
|
# If specific post, process it directly
|
|
if specific_post_url:
|
|
self.log("Processing specific post", "info")
|
|
|
|
# Extract media ID from URL
|
|
media_id = self._extract_media_id_from_url(specific_post_url)
|
|
if not media_id:
|
|
self.log("Could not extract media ID", "warning")
|
|
return []
|
|
|
|
self.log(f"URL Media ID: {media_id}", "debug")
|
|
|
|
# Process this single post (bypass date filter for specific posts)
|
|
post_links = [None] # Dummy list for iteration
|
|
bypass_date_filter = True
|
|
else:
|
|
# Find posts on profile page
|
|
self.log("Finding posts...", "info")
|
|
|
|
# Debug: Check what's actually on the page
|
|
page_content = page.content()
|
|
if 'no posts' in page_content.lower() or 'page not found' in page_content.lower():
|
|
self.log("Page shows 'no posts' or 'not found'", "warning")
|
|
|
|
post_links = page.locator('a[href*="/p/"]').all()
|
|
|
|
self.log(f"Found {len(post_links)} posts", "info")
|
|
|
|
if not post_links:
|
|
# Debug: Save screenshot to see what's wrong
|
|
try:
|
|
screenshot_path = Path(f"/tmp/imginn_no_posts_{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
|
|
page.screenshot(path=str(screenshot_path))
|
|
self.log(f"No posts found - screenshot saved to {screenshot_path}", "warning")
|
|
except Exception:
|
|
pass
|
|
self.log("No posts found", "warning")
|
|
return []
|
|
|
|
bypass_date_filter = False
|
|
|
|
self.log(f"Processing posts (max {max_posts})", "info")
|
|
|
|
# Collect all post URLs upfront to avoid stale element issues
|
|
post_urls_to_process = []
|
|
if not specific_post_url:
|
|
for idx, pl in enumerate(post_links[:max_posts]):
|
|
try:
|
|
href = pl.get_attribute('href', timeout=5000)
|
|
if href:
|
|
if not href.startswith('http'):
|
|
href = f"https://imginn.com{href}"
|
|
post_urls_to_process.append(href)
|
|
except Exception as e:
|
|
self.log(f"Post {idx+1}: Failed to get URL: {str(e)[:50]}", "debug")
|
|
continue
|
|
self.log(f"Collected {len(post_urls_to_process)} post URLs", "debug")
|
|
|
|
# Track consecutive old posts to handle pinned posts
|
|
consecutive_old_posts = 0
|
|
max_consecutive_old_posts = 5 # Allow up to 5 old posts (pinned) before stopping
|
|
|
|
# Set initial progress so dashboard shows 0/N immediately
|
|
total_posts = len(post_urls_to_process) if not specific_post_url else 1
|
|
self.activity_manager.update_status(
|
|
"Downloading posts",
|
|
progress_current=0,
|
|
progress_total=total_posts
|
|
)
|
|
|
|
for i, post_url in enumerate(post_urls_to_process if not specific_post_url else [specific_post_url]):
|
|
# Update progress at start of each iteration (fires even on skips)
|
|
self.activity_manager.update_status(
|
|
"Downloading posts",
|
|
progress_current=i + 1,
|
|
progress_total=total_posts
|
|
)
|
|
|
|
try:
|
|
# Handle specific post vs regular posts
|
|
if specific_post_url:
|
|
# Already on the specific post page
|
|
post_url = specific_post_url
|
|
media_id = self._extract_media_id_from_url(post_url)
|
|
else:
|
|
# URL already collected and formatted
|
|
media_id = self._extract_media_id_from_url(post_url)
|
|
|
|
if not media_id:
|
|
self.log(f"Post {i+1}: Could not extract media ID", "warning")
|
|
continue
|
|
|
|
# Check if post was already processed (from database)
|
|
if media_id in processed_posts:
|
|
# Skip if in database - trust the database tracking
|
|
self.log(f"Post {i+1}: {media_id} already processed (database), skipping", "debug")
|
|
continue
|
|
|
|
# Rate limiting between post downloads to avoid Cloudflare blocks
|
|
if i > 0:
|
|
post_delay = random.uniform(3, 8)
|
|
self.log(f"Rate limit: waiting {post_delay:.1f}s before post {i+1}", "debug")
|
|
time.sleep(post_delay)
|
|
|
|
self.log(f"Post {i+1}: Processing {media_id}", "info")
|
|
|
|
# Navigate directly to post URL (more reliable than clicking which can timeout)
|
|
try:
|
|
page.goto(post_url, wait_until='domcontentloaded', timeout=30000)
|
|
except Exception as nav_err:
|
|
self.log(f"Post {i+1}: Navigation failed: {nav_err}", "warning")
|
|
continue
|
|
|
|
# Wait for page to load
|
|
time.sleep(2)
|
|
|
|
# Wait for navigation to complete
|
|
try:
|
|
page.wait_for_load_state('networkidle', timeout=5000)
|
|
except Exception:
|
|
# Continue even if network isn't idle - page might still be usable
|
|
self.log("Network didn't idle, but continuing", "debug")
|
|
|
|
# Check if on post page
|
|
if "/p/" not in page.url:
|
|
self.log(f"Not a downloadable post (URL: {page.url})", "warning")
|
|
self._safe_go_back(page, username)
|
|
continue
|
|
|
|
# IMPORTANT: Wait for post page content to fully render
|
|
# This ensures download buttons are from the POST PAGE, not profile page preview
|
|
try:
|
|
# Wait for the post container to be visible (imginn uses main-content now)
|
|
page.wait_for_selector('div.main-content, div.post, div.content, div.single-post', timeout=3000)
|
|
time.sleep(1) # Additional wait for download buttons to render
|
|
except Exception:
|
|
self.log("Post container not found, checking for Cloudflare...", "debug")
|
|
|
|
# Check for Cloudflare challenge and handle it
|
|
cloudflare_bypassed = False
|
|
if self._is_cloudflare_challenge(page):
|
|
self.log(f"Cloudflare challenge detected on post {media_id}", "warning")
|
|
if not self._handle_cloudflare_on_post(page, post_url):
|
|
# Cloudflare bypass failed - skip this post WITHOUT marking as processed
|
|
# so it can be retried on next run
|
|
self.log(f"Skipping post {media_id} due to Cloudflare block (will retry later)", "warning")
|
|
try:
|
|
page.goto(f"https://imginn.com/{username}/?ref=index")
|
|
time.sleep(3)
|
|
except Exception:
|
|
pass
|
|
continue
|
|
cloudflare_bypassed = True
|
|
|
|
self.log(f"Navigated to post page: {page.url}", "debug")
|
|
self._dismiss_consent_dialog(page)
|
|
|
|
# Extract actual username from post page if we don't have it (e.g., specific_post_url with unknown user)
|
|
if profile_name == 'unknown' or specific_post_url:
|
|
try:
|
|
username_elem = page.locator('div.username a').first
|
|
if username_elem.count() > 0:
|
|
username_href = username_elem.get_attribute('href')
|
|
if username_href:
|
|
# Extract username from href like "/evalongoria/" -> "evalongoria"
|
|
extracted_username = username_href.strip('/').lower()
|
|
if extracted_username and extracted_username != 'unknown':
|
|
profile_name = extracted_username
|
|
self.log(f"Extracted username from post page: @{profile_name}", "info")
|
|
# Update activity status with real username
|
|
self.activity_manager.update_status("Downloading posts")
|
|
except Exception as e:
|
|
self.log(f"Could not extract username from post page: {e}", "debug")
|
|
|
|
# Extract post date - ALWAYS extract for proper file naming
|
|
post_date = self._extract_post_date(page)
|
|
|
|
# Use post date for filename, or current date
|
|
if post_date:
|
|
date_str = post_date.strftime('%Y%m%d_%H%M%S')
|
|
self.log(f"Original post date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
|
|
else:
|
|
date_str = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
self.log(f"No original date found, using current time", "debug")
|
|
|
|
# Check date filter AFTER extracting date (bypass for specific posts)
|
|
if not bypass_date_filter and post_date and post_date < cutoff_date:
|
|
consecutive_old_posts += 1
|
|
self.log(f"Post too old ({post_date.strftime('%Y-%m-%d')}), skipping (consecutive old: {consecutive_old_posts}/{max_consecutive_old_posts})", "info")
|
|
|
|
# Mark this old post as checked in database to avoid re-checking
|
|
# Only mark if doing phrase search (has phrase_config)
|
|
if phrase_config and media_id:
|
|
self._record_download(
|
|
media_id=media_id,
|
|
username=profile_name,
|
|
filename=f"_old_post_{media_id}",
|
|
url=post_url,
|
|
post_date=post_date,
|
|
content_type='post',
|
|
metadata={'marker': True, 'reason': 'old_post'}
|
|
)
|
|
|
|
self._safe_go_back(page, username)
|
|
|
|
# Stop only after 5 consecutive old posts (handles pinned posts at top)
|
|
if consecutive_old_posts >= max_consecutive_old_posts:
|
|
self.log(f"Found {consecutive_old_posts} consecutive old posts - stopping", "info")
|
|
break
|
|
else:
|
|
continue # Skip this old post but keep checking (might be pinned)
|
|
|
|
# Reset consecutive old posts counter - we found a post within date range
|
|
consecutive_old_posts = 0
|
|
|
|
# Check for phrase matching if configured
|
|
if phrase_config and phrase_config.get('enabled'):
|
|
if not self._check_post_phrases(page, phrase_config):
|
|
self.log(f"Post does not match phrase criteria, skipping download", "info")
|
|
|
|
# Mark this post as checked (but not downloaded) in database
|
|
# This prevents re-checking the same post every run
|
|
if media_id:
|
|
self._record_download(
|
|
media_id=media_id,
|
|
username=profile_name,
|
|
filename=f"_phrase_checked_{media_id}",
|
|
url=post_url,
|
|
post_date=post_date,
|
|
content_type='post',
|
|
metadata={'marker': True, 'reason': 'phrase_checked'}
|
|
)
|
|
|
|
self._safe_go_back(page, username)
|
|
continue
|
|
else:
|
|
self.log(f"Post matches phrase criteria, using high-res download", "info")
|
|
|
|
# Check for carousel
|
|
carousel_next = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
|
|
has_carousel = carousel_next.count() > 0
|
|
|
|
if has_carousel:
|
|
self.log(f"Carousel detected - will download all carousel images", "info")
|
|
self._dismiss_consent_dialog(page)
|
|
|
|
# CRITICAL: Wait for POST PAGE carousel download buttons to be ready
|
|
# This prevents downloading from the profile page preview
|
|
try:
|
|
# Wait for download buttons with POST PAGE URLs (have "scontent" or "post" in them)
|
|
page.wait_for_selector('a.btn[href*="scontent"], a[download], a.download', timeout=3000)
|
|
time.sleep(1.5) # Additional wait for all carousel images to load
|
|
self.log("Carousel download buttons ready on post page", "debug")
|
|
except Exception:
|
|
self.log("Download buttons not found, but continuing", "debug")
|
|
else:
|
|
self.log("Single image post", "debug")
|
|
|
|
# Handle downloads - always use download buttons from post page
|
|
image_count = 0
|
|
max_images = 10
|
|
|
|
# Download images (carousel or single)
|
|
if has_carousel:
|
|
# First, let's find all carousel slides
|
|
all_slides = page.locator('.swiper-slide').all()
|
|
self.log(f"Found {len(all_slides)} carousel slides", "debug")
|
|
|
|
# Download each slide's image
|
|
for slide_index in range(min(len(all_slides), max_images)):
|
|
self.log(f"Processing carousel slide {slide_index + 1}/{len(all_slides)}", "debug")
|
|
|
|
# Get the current slide element to scope our searches
|
|
current_slide = all_slides[slide_index]
|
|
|
|
# Click next to navigate to this slide (except for first one)
|
|
if slide_index > 0:
|
|
next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
|
|
if next_btn.count() > 0 and next_btn.is_visible():
|
|
try:
|
|
next_btn.click(force=True)
|
|
except Exception:
|
|
self.log(f"Carousel next button click timed out at slide {slide_index + 1}, stopping carousel", "warning")
|
|
break
|
|
time.sleep(2) # Wait for slide transition and image to load
|
|
|
|
# First, try to find a download button for this carousel item
|
|
# IMPORTANT: Search within CURRENT SLIDE only, not entire page
|
|
download_btn = None
|
|
download_url = None
|
|
webp_fallback_url = None
|
|
|
|
# Look for download button on the current slide - prefer high-res, fallback to .webp
|
|
download_selectors = [
|
|
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg
|
|
'a.btn[href*="scontent"][href*=".mp4"]', # Video
|
|
'a.btn[href*="scontent"]', # Any scontent
|
|
'a[download][href*=".jpg"]',
|
|
'a[download][href*=".mp4"]',
|
|
'a.download',
|
|
'a[download]',
|
|
'a[href*="/post"]'
|
|
]
|
|
|
|
# Search for download buttons - first try within slide, then try page-level
|
|
# Imginn often has download buttons outside the .swiper-slide elements
|
|
search_contexts = [current_slide, page]
|
|
|
|
for search_context in search_contexts:
|
|
if download_url: # Already found, skip other contexts
|
|
break
|
|
|
|
for selector in download_selectors:
|
|
btn = search_context.locator(selector).first
|
|
if btn.count() > 0:
|
|
temp_url = btn.get_attribute('href')
|
|
if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)':
|
|
if not temp_url.startswith('http'):
|
|
temp_url = f"https://imginn.com{temp_url}"
|
|
|
|
# Store .webp as fallback, but keep looking for better
|
|
if '.webp' in temp_url.lower():
|
|
if not webp_fallback_url:
|
|
webp_fallback_url = temp_url
|
|
self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
|
|
continue
|
|
|
|
# Found non-.webp link, use it
|
|
download_btn = btn
|
|
download_url = temp_url
|
|
self.log(f"Found high-res download for carousel slide {slide_index + 1}: {download_url[:80]}...", "debug")
|
|
break
|
|
|
|
# Use .webp fallback if no high-res found
|
|
used_webp_fallback = False
|
|
if not download_url and webp_fallback_url:
|
|
download_url = webp_fallback_url
|
|
used_webp_fallback = True
|
|
self.log(f"Using .webp fallback for carousel slide {slide_index + 1}", "info")
|
|
|
|
# If we found a download button, use it for high-res
|
|
if download_url:
|
|
try:
|
|
import requests
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
response = requests.get(download_url, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
|
|
# Extract filename and media ID from the actual file
|
|
url_path = urlparse(download_url).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
|
|
# The media ID is the filename without extension
|
|
actual_media_id = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
|
|
# Build filename for carousel image using actual media ID
|
|
filename = f"{profile_name}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}"
|
|
filepath = output_dir / filename
|
|
|
|
# Save file
|
|
with open(filepath, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
# Check for duplicate hash before recording
|
|
if self.unified_db:
|
|
from pathlib import Path as PathLib
|
|
# Check for duplicate hash (hash blacklist persists even if original deleted)
|
|
file_hash = self.unified_db.get_file_hash(str(filepath))
|
|
if file_hash:
|
|
existing = self.unified_db.get_download_by_file_hash(file_hash)
|
|
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
|
|
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
|
|
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
|
|
# Delete the duplicate regardless of whether original file still exists
|
|
try:
|
|
filepath.unlink()
|
|
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
|
|
|
|
# Update timestamps
|
|
if post_date:
|
|
self._update_file_timestamps(filepath, post_date)
|
|
|
|
# Log with appropriate quality label
|
|
quality_label = "fallback" if used_webp_fallback else "high-res"
|
|
self.log(f"Downloaded ({quality_label}): {filename} ({len(response.content)} bytes)", "info")
|
|
downloaded_files.append(str(filepath))
|
|
image_count += 1
|
|
|
|
# Add to tracking
|
|
self.downloaded_files.add(actual_media_id)
|
|
|
|
# Mark in database (or defer for later)
|
|
if not skip_database or defer_database:
|
|
unique_url = f"{post_url}#{filename}"
|
|
self._record_download(
|
|
media_id=actual_media_id,
|
|
username=profile_name,
|
|
filename=filename,
|
|
url=unique_url,
|
|
post_date=post_date,
|
|
file_path=str(filepath),
|
|
content_type='post',
|
|
deferred=defer_database
|
|
)
|
|
|
|
continue # Skip to next slide
|
|
|
|
except Exception as e:
|
|
self.log(f"Failed to download high-res carousel image {slide_index + 1}: {e}, falling back to standard res", "warning")
|
|
|
|
# Fallback: Find the current slide's media (img or video) if no download button
|
|
# current_slide already defined at top of loop
|
|
|
|
# Try img first, then video
|
|
media_src = None
|
|
slide_img = current_slide.locator('img').first
|
|
if slide_img.count() > 0:
|
|
media_src = slide_img.get_attribute('src')
|
|
|
|
# If it's a lazy placeholder, wait for it to load
|
|
if media_src and 'lazy.jpg' in media_src:
|
|
self.log(f"Slide {slide_index + 1} is lazy, waiting for load...", "debug")
|
|
# Trigger load by making it visible
|
|
current_slide.scroll_into_view_if_needed()
|
|
time.sleep(1)
|
|
# Get src again
|
|
media_src = slide_img.get_attribute('src')
|
|
else:
|
|
# Check for video tag
|
|
slide_video = current_slide.locator('video source, video').first
|
|
if slide_video.count() > 0:
|
|
media_src = slide_video.get_attribute('src')
|
|
self.log(f"Found video for slide {slide_index + 1}", "debug")
|
|
|
|
if media_src and 'lazy.jpg' not in media_src and '483011604' not in media_src:
|
|
self.log(f"Downloading carousel media {slide_index + 1} (standard res): {media_src[:80]}...", "debug")
|
|
|
|
# Download this media
|
|
try:
|
|
import requests
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
if not media_src.startswith('http'):
|
|
media_src = f"https:{media_src}" if media_src.startswith('//') else f"https://imginn.com{media_src}"
|
|
|
|
response = requests.get(media_src, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
|
|
# Extract filename and media ID from the actual file
|
|
url_path = urlparse(media_src).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
|
|
# The media ID is the filename without extension
|
|
actual_media_id = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
|
|
# Build filename for carousel image using actual media ID
|
|
filename = f"{profile_name}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}"
|
|
filepath = output_dir / filename
|
|
|
|
# Save file
|
|
with open(filepath, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
# Check for duplicate hash before recording
|
|
if self.unified_db:
|
|
from pathlib import Path as PathLib
|
|
file_hash = self.unified_db.get_file_hash(str(filepath))
|
|
if file_hash:
|
|
existing = self.unified_db.get_download_by_file_hash(file_hash)
|
|
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
|
|
existing_path = PathLib(existing['file_path'])
|
|
if existing_path.exists():
|
|
self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
|
|
try:
|
|
filepath.unlink()
|
|
self.log(f"Deleted duplicate: {filename}", "debug")
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
|
|
|
|
# Update timestamps
|
|
if post_date:
|
|
self._update_file_timestamps(filepath, post_date)
|
|
|
|
self.log(f"Downloaded: {filename} ({len(response.content)} bytes)", "info")
|
|
downloaded_files.append(str(filepath))
|
|
image_count += 1
|
|
|
|
# Add to tracking
|
|
self.downloaded_files.add(actual_media_id)
|
|
|
|
# Mark in database (or defer for later)
|
|
if not skip_database or defer_database:
|
|
unique_url = f"{post_url}#{filename}"
|
|
self._record_download(
|
|
media_id=actual_media_id,
|
|
username=profile_name,
|
|
filename=filename,
|
|
url=unique_url,
|
|
post_date=post_date,
|
|
file_path=str(filepath),
|
|
content_type='post',
|
|
deferred=defer_database
|
|
)
|
|
|
|
except Exception as e:
|
|
self.log(f"Failed to download carousel media {slide_index + 1}: {e}", "error")
|
|
else:
|
|
self.log(f"Slide {slide_index + 1} has no valid media (img/video)", "warning")
|
|
|
|
# Skip the old carousel download logic
|
|
pass
|
|
|
|
# This duplicate block is not needed - single image logic is already handled above
|
|
|
|
# OLD carousel logic is disabled, not removed: the `if False:` block below never runs - carousels are handled above
|
|
if False:
|
|
# Wait for carousel content to load
|
|
time.sleep(1)
|
|
|
|
# Find download button AND image elements
|
|
# ImgInn sometimes has the full image in an img tag, not just download button
|
|
download_selectors = [
|
|
'a[download]',
|
|
'a.download-btn',
|
|
'a[href*="scontent"]',
|
|
'a[href*="s3.imginn.com"]',
|
|
'a.download',
|
|
'a[href*="/post"][href*=".jpg"]',
|
|
'a[href*="/post"][href*=".mp4"]',
|
|
'button.download',
|
|
'a.btn-download'
|
|
]
|
|
|
|
# For carousel images, we need to find the actual post image, not the profile thumbnail
|
|
# Look for images that are NOT the profile pic and NOT lazy placeholders
|
|
img_src = None
|
|
|
|
# Try to find the carousel image (exclude profile pic and lazy images)
|
|
possible_images = page.locator('img[src*="post"], img[src*="scontent"]:not([src*="profile"])').all()
|
|
for img_elem in possible_images:
|
|
src = img_elem.get_attribute('src')
|
|
if src and 'lazy.jpg' not in src and '483011604' not in src: # Exclude profile pic
|
|
img_src = src
|
|
self.log(f"Found carousel image src: {img_src[:100]}...", "debug")
|
|
break
|
|
|
|
# If no good image found, wait and try again
|
|
if not img_src or 'lazy.jpg' in img_src:
|
|
time.sleep(1)
|
|
# Try once more after waiting
|
|
main_image = page.locator('img[src*="post"]:not([src*="lazy"])').first
|
|
if main_image.count() > 0:
|
|
img_src = main_image.get_attribute('src')
|
|
if img_src:
|
|
self.log(f"Found carousel image after wait: {img_src[:100]}...", "debug")
|
|
|
|
download_btn = None
|
|
for selector in download_selectors:
|
|
btn = page.locator(selector).first
|
|
if btn.count() > 0:
|
|
download_btn = btn
|
|
break
|
|
|
|
if download_btn and download_btn.count() > 0:
|
|
try:
|
|
# For ImgInn, we should click the download button to get the full-size image
|
|
# The href often points to a thumbnail, not the full image
|
|
download_url = download_btn.get_attribute('href')
|
|
self.log(f"Download button href: {download_url[:100] if download_url else 'None'}...", "debug")
|
|
|
|
# Try clicking the button for browser download first
|
|
try:
|
|
self.log(f"Attempting browser download (clicking button)", "debug")
|
|
with page.expect_download(timeout=5000) as download_info:
|
|
download_btn.click()
|
|
download = download_info.value
|
|
original_name = download.suggested_filename
|
|
media_id_from_file = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
download_method = 'browser'
|
|
response = None
|
|
self.log(f"Browser download completed: {original_name}", "debug")
|
|
except Exception:
|
|
# Fallback to direct download if clicking doesn't work
|
|
self.log(f"Browser download failed, trying direct download", "debug")
|
|
|
|
# For carousels, if no download URL or it's invalid, use image src
|
|
if has_carousel and (not download_url or download_url == "None" or download_url == "null"):
|
|
if img_src:
|
|
self.log(f"No download button for carousel, using image src", "debug")
|
|
download_url = img_src
|
|
|
|
# Be more lenient with download URLs - accept any https URL that looks like it could be an image/video
|
|
if download_url and download_url.startswith('http'):
|
|
# Make sure it's not just the post page URL
|
|
if '/p/' not in download_url or download_url.endswith(('.jpg', '.jpeg', '.png', '.heic', '.mp4', '.webm')):
|
|
import requests
|
|
response = requests.get(download_url, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
self.log(f"Downloaded {len(response.content)} bytes", "debug")
|
|
download_method = 'direct'
|
|
|
|
# Extract filename from URL
|
|
from urllib.parse import urlparse, unquote
|
|
url_path = urlparse(download_url).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
|
|
# Remove 'post' prefix if present
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
|
|
media_id_from_file = Path(original_name).stem # This is the actual media ID
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
else:
|
|
# Try to use image src instead
|
|
if img_src:
|
|
self.log(f"Download URL is post page, using image src instead", "debug")
|
|
download_url = img_src
|
|
if not download_url.startswith('http'):
|
|
download_url = f"https://imginn.com{download_url}"
|
|
|
|
import requests
|
|
response = requests.get(download_url, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
download_method = 'direct'
|
|
|
|
from urllib.parse import urlparse, unquote
|
|
url_path = urlparse(download_url).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
media_id_from_file = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
else:
|
|
raise Exception("No valid download URL found")
|
|
else:
|
|
raise Exception("No valid download URL found")
|
|
|
|
# Update our tracked media ID with the correct one from the file
|
|
if media_id_from_file:
|
|
media_id = media_id_from_file
|
|
self.log(f"Media ID from file: {media_id}", "debug")
|
|
|
|
# For carousels, if we don't get a unique media ID, generate one
|
|
normalized_media_id = extract_instagram_media_id(media_id) if media_id else None
|
|
if has_carousel and (not media_id or media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files)):
|
|
# Generate unique ID for this carousel image
|
|
media_id = f"{media_id_base}_carousel_{carousel_index}"
|
|
normalized_media_id = extract_instagram_media_id(media_id)
|
|
self.log(f"Generated carousel media ID: {media_id}", "debug")
|
|
|
|
# Check if this media ID is already downloaded (both original and normalized)
|
|
if media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files):
|
|
self.log(f"Already have {media_id}, skipping download but continuing carousel", "debug")
|
|
# Still count this as an image even if skipped
|
|
image_count += 1
|
|
if has_carousel:
|
|
carousel_index += 1
|
|
else:
|
|
self.log(f"Downloading new file for {media_id}", "debug")
|
|
# Build filename with FastDL format
|
|
if has_carousel:
|
|
# For carousel items, append index (simpler format)
|
|
filename = f"{profile_name}_{date_str}_{media_id_base}_{carousel_index}{ext}"
|
|
else:
|
|
filename = f"{profile_name}_{date_str}_{media_id}{ext}"
|
|
|
|
filepath = output_dir / filename
|
|
|
|
# Save the downloaded content
|
|
if download_method == 'direct':
|
|
with open(filepath, 'wb') as f:
|
|
f.write(response.content)
|
|
else:
|
|
download.save_as(filepath)
|
|
|
|
# Check for duplicate hash before recording
|
|
if self.unified_db:
|
|
from pathlib import Path as PathLib
|
|
# Check for duplicate hash (hash blacklist persists even if original deleted)
|
|
file_hash = self.unified_db.get_file_hash(str(filepath))
|
|
if file_hash:
|
|
existing = self.unified_db.get_download_by_file_hash(file_hash)
|
|
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
|
|
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
|
|
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
|
|
# Delete the duplicate regardless of whether original file still exists
|
|
try:
|
|
filepath.unlink()
|
|
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
|
|
|
|
# Update file timestamps to match post date
|
|
if post_date:
|
|
self._update_file_timestamps(filepath, post_date)
|
|
|
|
self.log(f"Downloaded: {filename}", "info")
|
|
downloaded_files.append(str(filepath))
|
|
image_count += 1
|
|
|
|
# Add to tracking
|
|
self.downloaded_files.add(media_id)
|
|
|
|
# Increment carousel index for next image
|
|
if has_carousel:
|
|
carousel_index += 1
|
|
|
|
# Mark as downloaded in database (or defer for later)
|
|
# Use per-slide URL for carousels so each slide gets a unique url_hash
|
|
record_url = f"{post_url}?img_index={carousel_index + 1}" if has_carousel else post_url
|
|
if not skip_database or defer_database:
|
|
self._record_download(
|
|
media_id=media_id,
|
|
username=profile_name,
|
|
filename=filename,
|
|
url=record_url,
|
|
post_date=post_date,
|
|
file_path=str(filepath),
|
|
content_type='post',
|
|
deferred=defer_database
|
|
)
|
|
|
|
except Exception as e:
|
|
self.log(f"Download failed: {e}", "error")
|
|
import traceback
|
|
self.log(f"Traceback: {traceback.format_exc()}", "debug")
|
|
break
|
|
else:
|
|
# No download button found, try using the image src as fallback
|
|
page_url = page.url
|
|
self.log(f"No download button found on {page_url}, trying image src", "warning")
|
|
|
|
# Use the image src we found earlier
|
|
if img_src:
|
|
try:
|
|
self.log(f"Using image src as fallback: {img_src[:100]}...", "debug")
|
|
import requests
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
# Ensure full URL
|
|
if not img_src.startswith('http'):
|
|
img_src = f"https://imginn.com{img_src}"
|
|
|
|
response = requests.get(img_src, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
|
|
# Extract filename from URL
|
|
url_path = urlparse(img_src).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
|
|
media_id = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
|
|
# Build filename with carousel index if needed
|
|
if has_carousel and carousel_index > 1:
|
|
filename = f"{profile_name}_{date_str}_{media_id}_{carousel_index}{ext}"
|
|
else:
|
|
filename = f"{profile_name}_{date_str}_{media_id}{ext}"
|
|
filepath = output_dir / filename
|
|
|
|
# Save file
|
|
with open(filepath, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
self.log(f"Downloaded via image src: {filename} ({len(response.content)} bytes)", "info")
|
|
downloaded_files.append(str(filepath))
|
|
|
|
# Check for duplicate hash before recording
|
|
if self.unified_db:
|
|
from pathlib import Path as PathLib
|
|
# Check for duplicate hash (hash blacklist persists even if original deleted)
|
|
file_hash = self.unified_db.get_file_hash(str(filepath))
|
|
if file_hash:
|
|
existing = self.unified_db.get_download_by_file_hash(file_hash)
|
|
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
|
|
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
|
|
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
|
|
# Delete the duplicate regardless of whether original file still exists
|
|
try:
|
|
filepath.unlink()
|
|
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
|
|
|
|
# Update timestamps
|
|
if post_date:
|
|
self._update_file_timestamps(filepath, post_date)
|
|
|
|
image_count += 1
|
|
self.downloaded_files.add(media_id)
|
|
|
|
# Mark in database (or defer for later)
|
|
# Use per-slide URL for carousels so each slide gets a unique url_hash
|
|
record_url = f"{post_url}?img_index={carousel_index + 1}" if has_carousel else post_url
|
|
if not skip_database or defer_database:
|
|
self._record_download(
|
|
media_id=media_id,
|
|
username=profile_name,
|
|
filename=filename,
|
|
url=record_url,
|
|
post_date=post_date,
|
|
file_path=str(filepath),
|
|
content_type='post',
|
|
deferred=defer_database
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to download via image src: {e}", "error")
|
|
# Only stop for single posts - in a carousel this might be a temporary issue with one image
|
|
if not has_carousel:
|
|
break
|
|
else:
|
|
self.log(f"No image src available as fallback", "debug")
|
|
# For carousels, we might still have more images after clicking next
|
|
if not has_carousel:
|
|
break
|
|
|
|
# Check for next image in carousel
|
|
if has_carousel and image_count < max_images:
|
|
next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
|
|
if next_btn.count() > 0 and next_btn.is_visible():
|
|
# Store current image src to detect when it changes
|
|
current_img_src = img_src if img_src else ""
|
|
|
|
self.log(f"Clicking next for carousel image {carousel_index}", "debug")
|
|
try:
|
|
next_btn.click(force=True)
|
|
except Exception:
|
|
self.log(f"Carousel next button click timed out at image {carousel_index}, stopping carousel", "warning")
|
|
break
|
|
|
|
# Wait for the image to change
|
|
time.sleep(2) # Give more time for slide transition and new image to load
|
|
else:
|
|
self.log("No more carousel images", "debug")
|
|
break
|
|
else:
|
|
break
|
|
else:
|
|
# Single image - download from post page using download button
|
|
download_url = None
|
|
webp_fallback_url = None
|
|
download_selectors = [
|
|
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg
|
|
'a.btn[href*="scontent"][href*=".mp4"]', # Video
|
|
'a.btn[href*="scontent"]', # Any scontent
|
|
'a[download][href*=".jpg"]',
|
|
'a[download][href*=".mp4"]',
|
|
'a.download',
|
|
'a[href*="/post"]'
|
|
]
|
|
|
|
for selector in download_selectors:
|
|
btn = page.locator(selector).first
|
|
if btn.count() > 0:
|
|
temp_url = btn.get_attribute('href')
|
|
if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)':
|
|
if not temp_url.startswith('http'):
|
|
temp_url = f"https://imginn.com{temp_url}"
|
|
|
|
# Store .webp as fallback, but keep looking for better
|
|
if '.webp' in temp_url.lower():
|
|
if not webp_fallback_url:
|
|
webp_fallback_url = temp_url
|
|
self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
|
|
continue
|
|
|
|
# Found non-.webp link, use it
|
|
download_url = temp_url
|
|
self.log(f"Found high-res download for single image: {download_url[:80]}...", "debug")
|
|
break
|
|
|
|
# Use .webp fallback if no high-res found
|
|
if not download_url and webp_fallback_url:
|
|
download_url = webp_fallback_url
|
|
self.log(f"Using .webp fallback for single image", "info")
|
|
|
|
if download_url:
|
|
try:
|
|
import requests
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
response = requests.get(download_url, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
|
|
# Extract filename and media ID from the actual file
|
|
url_path = urlparse(download_url).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
|
|
# The media ID is the filename without extension
|
|
actual_media_id = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.jpg'
|
|
|
|
# Build filename
|
|
filename = f"{profile_name}_{date_str}_{actual_media_id}{ext}"
|
|
filepath = output_dir / filename
|
|
|
|
# Save file
|
|
with open(filepath, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
self.log(f"Downloaded (high-res): {filename} ({len(response.content)} bytes)", "info")
|
|
downloaded_files.append(str(filepath))
|
|
|
|
# Check for duplicate hash before recording
|
|
if self.unified_db:
|
|
from pathlib import Path as PathLib
|
|
file_hash = self.unified_db.get_file_hash(str(filepath))
|
|
if file_hash:
|
|
existing = self.unified_db.get_download_by_file_hash(file_hash)
|
|
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
|
|
existing_path = PathLib(existing['file_path'])
|
|
if existing_path.exists():
|
|
self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
|
|
try:
|
|
filepath.unlink()
|
|
self.log(f"Deleted duplicate: {filename}", "debug")
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
|
|
|
|
# Update timestamps
|
|
if post_date:
|
|
self._update_file_timestamps(filepath, post_date)
|
|
|
|
image_count = 1
|
|
|
|
# Add to tracking
|
|
self.downloaded_files.add(actual_media_id)
|
|
|
|
# Mark in database (or defer for later)
|
|
if not skip_database or defer_database:
|
|
self._record_download(
|
|
media_id=actual_media_id,
|
|
username=profile_name,
|
|
filename=filename,
|
|
url=post_url,
|
|
post_date=post_date,
|
|
file_path=str(filepath),
|
|
content_type='post',
|
|
deferred=defer_database
|
|
)
|
|
|
|
except Exception as e:
|
|
self.log(f"Failed to download single image: {e}", "warning")
|
|
else:
|
|
# No download button found - try video/image src as fallback
|
|
self.log("No download button found, trying video/image src fallback", "debug")
|
|
media_src = None
|
|
|
|
# Try video first - multiple selectors for different page structures
|
|
video_selectors = [
|
|
'video source[src]',
|
|
'video[src]',
|
|
'video source[type*="mp4"]',
|
|
'.video-container video',
|
|
'.post-video video',
|
|
'div[class*="video"] video',
|
|
'video'
|
|
]
|
|
for v_selector in video_selectors:
|
|
video_elem = page.locator(v_selector).first
|
|
if video_elem.count() > 0:
|
|
# Try src attribute first, then check source child
|
|
media_src = video_elem.get_attribute('src')
|
|
if not media_src:
|
|
source_elem = video_elem.locator('source').first
|
|
if source_elem.count() > 0:
|
|
media_src = source_elem.get_attribute('src')
|
|
if media_src and media_src != '#':
|
|
self.log(f"Found video src via '{v_selector}': {media_src[:80]}...", "debug")
|
|
break
|
|
|
|
# If no video found, wait a bit and try again (videos may lazy-load)
|
|
if not media_src:
|
|
time.sleep(2)
|
|
for v_selector in video_selectors:
|
|
video_elem = page.locator(v_selector).first
|
|
if video_elem.count() > 0:
|
|
media_src = video_elem.get_attribute('src')
|
|
if not media_src:
|
|
source_elem = video_elem.locator('source').first
|
|
if source_elem.count() > 0:
|
|
media_src = source_elem.get_attribute('src')
|
|
if media_src and media_src != '#':
|
|
self.log(f"Found video src after wait via '{v_selector}': {media_src[:80]}...", "debug")
|
|
break
|
|
|
|
# Try image if no video
|
|
if not media_src:
|
|
img_elem = page.locator('img[src*="scontent"]:not([src*="profile"]), img[src*="post"]').first
|
|
if img_elem.count() > 0:
|
|
media_src = img_elem.get_attribute('src')
|
|
if media_src and 'lazy.jpg' not in media_src:
|
|
self.log(f"Found image src: {media_src[:80]}...", "debug")
|
|
else:
|
|
media_src = None
|
|
|
|
if media_src:
|
|
try:
|
|
import requests
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
if not media_src.startswith('http'):
|
|
media_src = f"https://imginn.com{media_src}"
|
|
|
|
response = requests.get(media_src, timeout=30, headers={
|
|
'User-Agent': self.user_agent,
|
|
'Referer': 'https://imginn.com/'
|
|
}, cookies=self._get_cookies_for_requests())
|
|
response.raise_for_status()
|
|
|
|
# Extract filename from URL
|
|
url_path = urlparse(media_src).path
|
|
original_name = unquote(url_path.split('/')[-1].split('?')[0])
|
|
if original_name.startswith('post'):
|
|
original_name = original_name[4:]
|
|
|
|
actual_media_id = Path(original_name).stem
|
|
ext = Path(original_name).suffix or '.mp4'
|
|
|
|
filename = f"{profile_name}_{date_str}_{actual_media_id}{ext}"
|
|
filepath = output_dir / filename
|
|
|
|
with open(filepath, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
self.log(f"Downloaded (fallback): {filename} ({len(response.content)} bytes)", "info")
|
|
downloaded_files.append(str(filepath))
|
|
|
|
if post_date:
|
|
self._update_file_timestamps(filepath, post_date)
|
|
|
|
image_count = 1
|
|
self.downloaded_files.add(actual_media_id)
|
|
|
|
if not skip_database or defer_database:
|
|
self._record_download(
|
|
media_id=actual_media_id,
|
|
username=profile_name,
|
|
filename=filename,
|
|
url=post_url,
|
|
post_date=post_date,
|
|
file_path=str(filepath),
|
|
content_type='post',
|
|
deferred=defer_database
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to download via fallback: {e}", "error")
|
|
else:
|
|
self.log("No download button or media src found for single post", "warning")
|
|
# Debug: capture a screenshot and log the page title when no media could be found
|
|
try:
|
|
debug_dir = Path("debug")
|
|
debug_dir.mkdir(exist_ok=True)
|
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
screenshot_path = debug_dir / f"no_media_{media_id}_{timestamp}.png"
|
|
page.screenshot(path=str(screenshot_path))
|
|
self.log(f"Debug screenshot saved: {screenshot_path}", "debug")
|
|
# Also log the page title
|
|
title = page.title()
|
|
self.log(f"Page title: {title}", "debug")
|
|
|
|
# Check if this is a Cloudflare block - don't mark as processed if so
|
|
if self._is_cloudflare_challenge(page):
|
|
self.log(f"Cloudflare block detected - NOT marking {media_id} as processed (will retry later)", "warning")
|
|
# Skip to next post without marking as processed
|
|
try:
|
|
page.goto(f"https://imginn.com/{username}/?ref=index")
|
|
time.sleep(3)
|
|
except Exception:
|
|
pass
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Failed to capture debug screenshot: {e}", "debug")
|
|
|
|
# Mark post as processed in database even if no downloads
|
|
# (might be already downloaded or failed - but NOT if Cloudflare blocked)
|
|
if image_count == 0:
|
|
# Still mark the post URL as processed to avoid re-checking
|
|
self._record_download(
|
|
media_id=media_id,
|
|
username=profile_name,
|
|
filename=f"{media_id}_skipped",
|
|
url=post_url,
|
|
post_date=post_date,
|
|
content_type='post',
|
|
metadata={'marker': True, 'reason': 'skipped'}
|
|
)
|
|
|
|
# Go back to profile
|
|
self._safe_go_back(page, username)
|
|
|
|
# If we just bypassed Cloudflare, wait longer to let session stabilize
|
|
if cloudflare_bypassed:
|
|
cooldown = random.uniform(15, 25)
|
|
self.log(f"Post-bypass cooldown: waiting {cooldown:.1f}s to stabilize session", "info")
|
|
time.sleep(cooldown)
|
|
else:
|
|
time.sleep(random.uniform(1, 3))
|
|
|
|
# Check if back on profile
|
|
if username not in page.url:
|
|
page.goto(f"https://imginn.com/{username}/?ref=index")
|
|
time.sleep(3)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error processing post: {e}", "error")
|
|
try:
|
|
page.goto(f"https://imginn.com/{username}/?ref=index")
|
|
time.sleep(3)
|
|
except Exception:
|
|
pass
|
|
|
|
self.log(f"Downloaded {len(downloaded_files)} files", "info")
|
|
except Exception as e:
|
|
self.log(f"Error: {e}", "error")
|
|
|
|
# Don't close browser here - reuse it for next profile
|
|
# Call _stop_browser() explicitly when done with all profiles
|
|
return downloaded_files
|
|
|
|
def download_tagged(self, username: str, days_back: int = 14, max_posts: int = 50, output_dir: Path = None, phrase_config: dict = None, defer_database: bool = False):
    """Download posts in which a user is tagged.

    Opens the ImgInn tagged listing for ``username``, visits each post page
    and saves its media (carousel slides or a single image/video) into
    ``output_dir``. Files are named after the *poster's* username, not the
    tagged user.

    Args:
        username: Instagram username
        days_back: How many days back to search
        max_posts: Maximum posts to check
        output_dir: Output directory
        phrase_config: Optional phrase search configuration
        defer_database: If True, defer database recording to pending_downloads list
                        for later recording after file move is complete

    Returns:
        List of saved file paths (as strings). Files that were removed again
        as hash duplicates are not included. Empty when the site is
        unavailable, the listing fails to load, or no posts are found.
    """
    # Rate limiting to avoid Cloudflare blocks
    self._enforce_rate_limit("tagged")

    profile_name = username.lower()
    if output_dir is None:
        output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Check site status before doing anything else
    self.log("Checking ImgInn site status...", "debug")
    site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10)

    if self.cf_handler.should_skip_download(site_status):
        self.log(f"Skipping tagged download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning")
        return []
    elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
        self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

    # Scan existing files
    self._scan_existing_files(output_dir, profile_name)

    # Get processed posts from database
    processed_posts = self._get_processed_posts(profile_name)
    self.log(f"Loaded {len(processed_posts)} processed tagged posts for {profile_name} from database", "info")

    downloaded_files = []
    cutoff_date = datetime.now() - timedelta(days=days_back)

    # Start or reuse browser
    self._start_browser()
    page = self.page

    try:
        post_urls = self._tagged_collect_post_urls(page, username, max_posts)
        if post_urls is None:
            # Listing did not load or contained no posts (already logged)
            return []

        self.log(f"Processing {len(post_urls)} tagged posts (max {max_posts})", "info")

        # Track consecutive old posts to handle pinned posts
        consecutive_old_posts = 0
        max_consecutive_old_posts = 5  # Allow up to 5 old posts (pinned) before stopping

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading tagged",
            progress_current=0,
            progress_total=len(post_urls)
        )

        for i, post_url in enumerate(post_urls):
            # Update progress at start of each iteration (fires even on skips)
            self.activity_manager.update_status(
                "Downloading tagged",
                progress_current=i + 1,
                progress_total=len(post_urls)
            )

            try:
                # Extract media ID from URL
                media_id = self._extract_media_id_from_url(post_url)
                if not media_id:
                    self.log(f"Could not extract media ID from {post_url}", "warning")
                    continue

                self.log(f"[{i+1}/{len(post_urls)}] Checking tagged post {media_id}", "debug")

                # Check if already processed (either downloaded or checked for phrases/age)
                if media_id in processed_posts or post_url in processed_posts:
                    self.log(f"Post {media_id} already processed, skipping", "debug")
                    continue

                # Rate limiting between post downloads to avoid Cloudflare blocks
                if i > 0:
                    post_delay = random.uniform(3, 8)
                    self.log(f"Rate limit: waiting {post_delay:.1f}s before tagged post {i+1}", "debug")
                    time.sleep(post_delay)

                # For tagged posts, ALWAYS navigate to post page for high-res download
                # (Never use profile download which gives low-res .webp)
                page.goto(post_url, wait_until='domcontentloaded')

                # Wait for page to load
                time.sleep(2)

                # Wait for navigation to complete
                try:
                    page.wait_for_load_state('networkidle', timeout=5000)
                except Exception:
                    # Continue even if network isn't idle - page might still be usable
                    self.log("Network didn't idle, but continuing", "debug")

                # Check if on post page
                if "/p/" not in page.url:
                    self.log(f"Not a downloadable post (URL: {page.url})", "warning")
                    self._safe_go_back(page, username, tagged=True)
                    continue

                # IMPORTANT: Wait for post page content to fully render
                # This ensures download buttons are from the POST PAGE, not tagged page preview
                try:
                    page.wait_for_selector('div.main-content, div.post, div.content, div.single-post', timeout=3000)
                    time.sleep(1)  # Additional wait for download buttons to render
                except Exception:
                    self.log("Post container not found, checking for Cloudflare...", "debug")

                # Check for Cloudflare challenge and handle it
                cloudflare_bypassed = False
                if self._is_cloudflare_challenge(page):
                    self.log(f"Cloudflare challenge detected on tagged post {media_id}", "warning")
                    if not self._handle_cloudflare_on_post(page, post_url):
                        # Cloudflare bypass failed - skip this post WITHOUT marking as processed
                        # so it can be retried on next run
                        self.log(f"Skipping tagged post {media_id} due to Cloudflare block (will retry later)", "warning")
                        self._tagged_return_to_listing(page, username)
                        continue
                    cloudflare_bypassed = True

                self.log(f"Navigated to tagged post page: {page.url}", "debug")
                self._dismiss_consent_dialog(page)

                # Extract the actual poster's username (not the tagged user)
                # On tagged pages, posts are FROM other users who tagged this user
                poster_username = profile_name  # Default to tagged user
                try:
                    username_elem = page.locator('div.username a').first
                    if username_elem.count() > 0:
                        username_href = username_elem.get_attribute('href')
                        if username_href:
                            # Extract username from href like "/evalongoria.of/" -> "evalongoria.of"
                            poster_username = username_href.strip('/').lower()
                            self.log(f"Poster username: @{poster_username}", "debug")
                except Exception as e:
                    self.log(f"Could not extract poster username, using default: {e}", "debug")

                # Extract post date - ALWAYS extract for proper file naming
                post_date = self._extract_post_date(page)

                # Use post date for filename, or current date
                if post_date:
                    date_str = post_date.strftime('%Y%m%d_%H%M%S')
                    self.log(f"Original post date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                else:
                    date_str = datetime.now().strftime('%Y%m%d_%H%M%S')
                    self.log("No original date found, using current time", "debug")

                # Check date filter
                if post_date and post_date < cutoff_date:
                    consecutive_old_posts += 1
                    self.log(f"Tagged post too old ({post_date.strftime('%Y-%m-%d')}), skipping (consecutive old: {consecutive_old_posts}/{max_consecutive_old_posts})", "info")

                    # Mark this old post as checked in database - use poster_username for tagged content
                    # (only done when a phrase configuration was supplied)
                    if phrase_config:
                        self._record_download(
                            media_id=media_id,
                            username=poster_username,
                            filename=f"_old_post_{media_id}",
                            url=post_url,
                            post_date=post_date,
                            content_type='tagged',
                            metadata={'marker': True, 'reason': 'old_post'}
                        )

                    self._safe_go_back(page, username, tagged=True)

                    # Stop only after 5 consecutive old posts (handles pinned posts at top)
                    if consecutive_old_posts >= max_consecutive_old_posts:
                        self.log(f"Found {consecutive_old_posts} consecutive old tagged posts - stopping", "info")
                        break
                    continue  # Skip this old post but keep checking (might be pinned)

                # Reset consecutive old posts counter - we found a post within date range
                consecutive_old_posts = 0

                # Check for phrase matching if configured
                if phrase_config and phrase_config.get('enabled'):
                    if not self._check_post_phrases(page, phrase_config):
                        self.log("Tagged post does not match phrase criteria, skipping download", "info")

                        # Mark this post as checked (but not downloaded) in database - use poster_username
                        self._record_download(
                            media_id=media_id,
                            username=poster_username,
                            filename=f"_phrase_checked_{media_id}",
                            url=post_url,
                            post_date=post_date,
                            content_type='tagged',
                            metadata={'marker': True, 'reason': 'phrase_checked'}
                        )

                        self._safe_go_back(page, username, tagged=True)
                        continue
                    self.log("Tagged post matches phrase criteria, using high-res download", "info")

                # Check for carousel
                carousel_next = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
                has_carousel = carousel_next.count() > 0

                if has_carousel:
                    self.log("Carousel detected in tagged post - will download all carousel images", "info")
                    self._dismiss_consent_dialog(page)

                    # CRITICAL: Wait for POST PAGE carousel download buttons to be ready
                    # This prevents downloading from the tagged page preview
                    try:
                        page.wait_for_selector('a.btn[href*="scontent"], a[download], a.download', timeout=3000)
                        time.sleep(1.5)  # Additional wait for all carousel images to load
                        self.log("Carousel download buttons ready on post page", "debug")
                    except Exception:
                        self.log("Download buttons not found, but continuing", "debug")

                    image_count = self._tagged_download_carousel(
                        page, downloaded_files, post_url, poster_username,
                        date_str, post_date, output_dir, defer_database
                    )
                else:
                    self.log("Single image tagged post", "debug")
                    image_count, cloudflare_blocked = self._tagged_download_single(
                        page, media_id, downloaded_files, post_url, poster_username,
                        date_str, post_date, output_dir, defer_database
                    )
                    if cloudflare_blocked:
                        # Skip to next post without marking as processed
                        self._tagged_return_to_listing(page, username)
                        continue

                if image_count > 0:
                    self.log(f"Successfully downloaded {image_count} image(s) from tagged post {media_id}", "info")

                # Navigate back to tagged page
                self._safe_go_back(page, username, tagged=True)

                # If we just bypassed Cloudflare, wait longer to let session stabilize
                if cloudflare_bypassed:
                    cooldown = random.uniform(15, 25)
                    self.log(f"Post-bypass cooldown: waiting {cooldown:.1f}s to stabilize session", "info")
                    time.sleep(cooldown)
                else:
                    time.sleep(1)

            except KeyboardInterrupt:
                self.log("Download interrupted by user", "warning")
                break
            except Exception as e:
                self.log(f"Error processing tagged post: {e}", "error")
                self._safe_go_back(page, username, tagged=True)

        self.log(f"Downloaded {len(downloaded_files)} tagged files", "info")
    except Exception as e:
        self.log(f"Error: {e}", "error")

    # Don't close browser here - reuse it for next profile
    return downloaded_files

def _tagged_collect_post_urls(self, page, username, max_posts):
    """Load the tagged listing and return up to ``max_posts`` absolute post URLs.

    Returns None when the page fails the Cloudflare wait or shows no post
    links; both cases are logged here and the caller should abort.
    """
    # Navigate to tagged page directly
    self.log(f"Navigating to @{username} tagged posts page", "info")
    page.goto(f"https://imginn.com/tagged/{username}/?ref=index", wait_until='domcontentloaded')

    # CRITICAL: Wait for Cloudflare background JS challenges
    wait_time = 5 + random.uniform(0, 2)
    self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
    time.sleep(wait_time)

    # Wait for page to load
    if not self.wait_for_cloudflare(page):
        self._page_load_failures += 1
        level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning"
        self.log(f"Page didn't load properly ({self._page_load_failures}x this session)", level)
        return None

    # Save cookies
    self.save_cookies(self.context)

    # Wait for JavaScript to load posts (ImgInn loads posts dynamically on tagged page)
    self.log("Waiting for tagged posts to load via JavaScript...", "info")
    try:
        # Wait for post links to appear (up to 10 seconds)
        page.wait_for_selector('a[href*="/p/"]', timeout=10000)
        self.log("Tagged posts loaded successfully", "info")
    except Exception:
        # Timeout - posts might not exist, or page structure changed
        self.log("Timeout waiting for tagged posts to appear", "warning")
        time.sleep(2)  # Give it a bit more time anyway

    # Scroll to load more posts (ImgInn uses infinite scroll on tagged page)
    self.log("Scrolling to load more tagged posts...", "info")
    previous_count = 0
    for attempt in range(10):  # Scroll up to 10 times to load posts
        current_count = page.locator('a[href*="/p/"]').count()

        if attempt > 0 and current_count == previous_count:
            # No new posts loaded after scroll, we've reached the end
            self.log(f"No more tagged posts to load (total: {current_count})", "debug")
            break

        if current_count >= max_posts:
            self.log(f"Loaded {current_count} tagged posts (reached max_posts limit)", "debug")
            break

        previous_count = current_count

        # Scroll to bottom of page
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(1.5)  # Wait for new posts to load
        self.log(f"Scroll {attempt + 1}: Found {current_count} tagged posts", "debug")

    # Find posts on tagged page
    self.log("Finding tagged posts...", "info")

    # Debug: Check what's actually on the page
    page_content = page.content().lower()
    if 'no posts' in page_content or 'page not found' in page_content:
        self.log("Page shows 'no posts' or 'not found'", "warning")

    post_links = page.locator('a[href*="/p/"]').all()
    self.log(f"Found {len(post_links)} tagged posts", "info")

    if not post_links:
        # Debug: Save screenshot to see what's wrong
        try:
            screenshot_path = Path(f"/tmp/imginn_no_tagged_{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
            page.screenshot(path=str(screenshot_path))
            self.log(f"No tagged posts found - screenshot saved to {screenshot_path}", "warning")
        except Exception as e:
            # Best effort only - the screenshot is purely diagnostic
            self.log(f"Could not save no-posts screenshot: {e}", "debug")
        self.log("No tagged posts found", "warning")
        return None

    # Extract all post URLs upfront to avoid stale element issues
    # (elements become stale after page.go_back())
    post_urls = []
    for idx, post_link in enumerate(post_links[:max_posts]):
        try:
            href = post_link.get_attribute('href', timeout=5000)
        except Exception as e:
            self.log(f"Tagged {idx+1}: Failed to get URL: {str(e)[:50]}", "debug")
            continue
        if href:
            # Ensure full URL
            if not href.startswith('http'):
                href = f"https://imginn.com{href}"
            post_urls.append(href)
    return post_urls

def _tagged_return_to_listing(self, page, username):
    """Best-effort navigation back to the tagged listing after a Cloudflare block."""
    try:
        page.goto(f"https://imginn.com/tagged/{username}/?ref=index")
        time.sleep(3)
    except Exception as e:
        # Non-fatal: the next post iteration navigates explicitly anyway
        self.log(f"Could not return to tagged page: {e}", "debug")

def _tagged_absolute_url(self, url):
    """Turn a site-relative or protocol-relative URL into an absolute https URL."""
    if url.startswith('http'):
        return url
    if url.startswith('//'):
        return f"https:{url}"
    return f"https://imginn.com{url}"

def _tagged_find_download_url(self, search_contexts, description):
    """Find the best download link within the given locator scopes.

    Scopes are searched in order; the first non-.webp link wins. The first
    .webp link seen in any scope is kept as a fallback.

    Returns:
        (url, used_webp_fallback); url is None when nothing usable was found.
    """
    download_selectors = [
        'a.btn[href*="scontent"][href*=".jpg"]',  # High-res jpg
        'a.btn[href*="scontent"][href*=".mp4"]',  # Video
        'a.btn[href*="scontent"]',  # Any scontent
        'a[download][href*=".jpg"]',
        'a[download][href*=".mp4"]',
        'a.download',
        'a[href*="/post"]'
    ]
    webp_fallback_url = None

    for search_context in search_contexts:
        for selector in download_selectors:
            btn = search_context.locator(selector).first
            if btn.count() == 0:
                continue
            temp_url = btn.get_attribute('href')
            if not temp_url or temp_url in ('#', 'javascript:void(0)'):
                continue
            temp_url = self._tagged_absolute_url(temp_url)

            # Store .webp as fallback, but keep looking for better
            if '.webp' in temp_url.lower():
                if not webp_fallback_url:
                    webp_fallback_url = temp_url
                    self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug")
                continue

            # Found non-.webp link, use it
            self.log(f"Found high-res download for {description}: {temp_url[:80]}...", "debug")
            return temp_url, False

    # Use .webp fallback if no high-res found
    if webp_fallback_url:
        self.log(f"Using .webp fallback for {description}", "info")
        return webp_fallback_url, True
    return None, False

def _tagged_fetch_media(self, media_url, output_dir, poster_username, date_str,
                        slide_number=None, strip_post_prefix=True, default_ext='.jpg'):
    """Download ``media_url`` and write it into ``output_dir``.

    The media ID is the URL's file name without extension (optionally with a
    leading ``post`` removed). Carousel files get a ``_<slide_number>`` suffix.

    Returns:
        (filepath, filename, actual_media_id, byte_count)

    Raises:
        requests exceptions on network/HTTP errors, OSError on write failure.
    """
    from urllib.parse import urlparse, unquote

    response = requests.get(media_url, timeout=30, headers={
        'User-Agent': self.user_agent,
        'Referer': 'https://imginn.com/'
    }, cookies=self._get_cookies_for_requests())
    response.raise_for_status()

    # Extract filename and media ID from the actual file
    url_path = urlparse(media_url).path
    original_name = unquote(url_path.split('/')[-1].split('?')[0])
    if strip_post_prefix and original_name.startswith('post'):
        original_name = original_name[4:]

    actual_media_id = Path(original_name).stem
    ext = Path(original_name).suffix or default_ext

    slide_suffix = f"_{slide_number}" if slide_number is not None else ""
    filename = f"{poster_username}_{date_str}_{actual_media_id}{slide_suffix}{ext}"
    filepath = output_dir / filename

    with open(filepath, 'wb') as f:
        f.write(response.content)

    return filepath, filename, actual_media_id, len(response.content)

def _tagged_discard_duplicate(self, filepath, filename, hash_blacklist=False):
    """Delete ``filepath`` if the database already knows a file with the same hash.

    With ``hash_blacklist=True`` the file is removed even when the earlier
    copy no longer exists on disk; otherwise only when that copy still exists.

    Returns:
        True only if the file was actually deleted.
    """
    if not self.unified_db:
        return False
    file_hash = self.unified_db.get_file_hash(str(filepath))
    if not file_hash:
        return False
    existing = self.unified_db.get_download_by_file_hash(file_hash)
    if not (existing and existing.get('file_path') and str(filepath) != existing.get('file_path')):
        return False

    if hash_blacklist:
        # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
        self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
        deleted_message = f"Deleted duplicate (hash blacklist): {filename}"
    else:
        if not Path(existing['file_path']).exists():
            return False
        self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
        deleted_message = f"Deleted duplicate: {filename}"

    try:
        filepath.unlink()
        self.log(deleted_message, "debug")
        return True
    except Exception as e:
        # Keep the file and let the caller record it as a normal download
        self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
        return False

def _tagged_record_media(self, actual_media_id, poster_username, filename, url,
                         post_date, filepath, defer_database):
    """Track the media ID in memory and record (or defer) the database entry."""
    self.downloaded_files.add(actual_media_id)
    self._record_download(
        media_id=actual_media_id,
        username=poster_username,
        filename=filename,
        url=url,
        post_date=post_date,
        file_path=str(filepath),
        content_type='tagged',
        deferred=defer_database
    )

def _tagged_download_carousel(self, page, saved_paths, post_url, poster_username,
                              date_str, post_date, output_dir, defer_database, max_images=10):
    """Download every slide of the carousel on the current post page.

    For each slide the download button is tried first (slide scope, then
    page scope); if that yields nothing, the slide's img/video ``src`` is
    used. Saved paths are appended to ``saved_paths``.

    Returns:
        Number of slides saved.
    """
    image_count = 0
    all_slides = page.locator('.swiper-slide').all()
    self.log(f"Found {len(all_slides)} carousel slides in tagged post", "debug")

    for slide_index, current_slide in enumerate(all_slides[:max_images]):
        slide_number = slide_index + 1
        self.log(f"Processing carousel slide {slide_number}/{len(all_slides)}", "debug")

        # Click next to navigate to this slide (except for first one)
        if slide_index > 0:
            next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first
            if next_btn.count() > 0 and next_btn.is_visible():
                try:
                    next_btn.click(force=True)
                except Exception:
                    self.log(f"Carousel next button click timed out at slide {slide_number}, stopping carousel", "warning")
                    break
                time.sleep(2)  # Wait for slide transition and image to load

        slide_downloaded = False  # Track if this specific slide was downloaded

        # Imginn often has download buttons outside the .swiper-slide elements,
        # so search the slide first and then the whole page
        download_url, used_webp_fallback = self._tagged_find_download_url(
            [current_slide, page], f"carousel slide {slide_number}"
        )

        if download_url:
            try:
                filepath, filename, actual_media_id, size = self._tagged_fetch_media(
                    download_url, output_dir, poster_username, date_str, slide_number=slide_number
                )
                quality_label = "fallback" if used_webp_fallback else "high-res"
                self.log(f"Downloaded tagged ({quality_label}): {filename} from @{poster_username} ({size} bytes)", "info")
                saved_paths.append(str(filepath))

                # Hash blacklist persists even if original deleted
                if self._tagged_discard_duplicate(filepath, filename, hash_blacklist=True):
                    saved_paths.pop()  # file no longer exists - do not report it
                    continue

                if post_date:
                    self._update_file_timestamps(filepath, post_date)

                image_count += 1
                slide_downloaded = True

                # Unique URL per slide so each carousel file gets its own record
                self._tagged_record_media(
                    actual_media_id, poster_username, filename,
                    f"{post_url}#{filename}", post_date, filepath, defer_database
                )
            except Exception as e:
                self.log(f"Failed to download carousel image {slide_number}: {e}", "error")
                # Don't continue - try fallback method below

        if slide_downloaded:
            continue

        # Fallback: Download from current slide's img/video src if no download button worked
        self.log(f"Trying fallback: downloading from slide {slide_number} media src", "debug")
        media_src = None
        slide_img = current_slide.locator('img').first
        if slide_img.count() > 0:
            media_src = slide_img.get_attribute('src')
        else:
            slide_video = current_slide.locator('video source, video').first
            if slide_video.count() > 0:
                media_src = slide_video.get_attribute('src')
                self.log(f"Found video for slide {slide_number}", "debug")

        if not media_src:
            self.log(f"No media (img/video) found for carousel slide {slide_number}", "warning")
            continue

        # Skip lazy placeholders
        if 'lazy.jpg' in media_src or '483011604' in media_src:
            continue

        try:
            media_src = self._tagged_absolute_url(media_src)
            filepath, filename, actual_media_id, size = self._tagged_fetch_media(
                media_src, output_dir, poster_username, date_str,
                slide_number=slide_number, strip_post_prefix=False
            )
            self.log(f"Downloaded tagged (fallback): {filename} from @{poster_username} ({size} bytes)", "info")
            saved_paths.append(str(filepath))

            if self._tagged_discard_duplicate(filepath, filename):
                saved_paths.pop()  # file no longer exists - do not report it
                continue

            if post_date:
                self._update_file_timestamps(filepath, post_date)

            image_count += 1
            self._tagged_record_media(
                actual_media_id, poster_username, filename,
                f"{post_url}#{filename}", post_date, filepath, defer_database
            )
        except Exception as e:
            self.log(f"Failed to download from media src for slide {slide_number}: {e}", "error")

    return image_count

def _tagged_find_video_src(self, page, log_prefix):
    """Return the first usable video ``src`` on the page, or None.

    A ``src`` of ``'#'`` or an empty value is treated as unusable.
    """
    video_selectors = [
        'video source[src]',
        'video[src]',
        'video source[type*="mp4"]',
        '.video-container video',
        '.post-video video',
        'div[class*="video"] video',
        'video'
    ]
    for v_selector in video_selectors:
        video_elem = page.locator(v_selector).first
        if video_elem.count() == 0:
            continue
        # Try src attribute first, then check source child
        media_src = video_elem.get_attribute('src')
        if not media_src:
            source_elem = video_elem.locator('source').first
            if source_elem.count() > 0:
                media_src = source_elem.get_attribute('src')
        if media_src and media_src != '#':
            self.log(f"{log_prefix} '{v_selector}': {media_src[:80]}...", "debug")
            return media_src
    return None

def _tagged_capture_no_media_debug(self, page, media_id):
    """Save a debug screenshot for a post without media.

    Returns:
        True if the page is a Cloudflare challenge (the post must then NOT be
        treated as processed), False otherwise or if the capture itself failed.
    """
    try:
        debug_dir = Path("debug")
        debug_dir.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        screenshot_path = debug_dir / f"no_media_tagged_{media_id}_{timestamp}.png"
        page.screenshot(path=str(screenshot_path))
        self.log(f"Debug screenshot saved: {screenshot_path}", "debug")
        # Also log page title
        title = page.title()
        self.log(f"Page title: {title}", "debug")

        # Check if this is a Cloudflare block - don't mark as processed if so
        if self._is_cloudflare_challenge(page):
            self.log(f"Cloudflare block detected - NOT marking tagged post {media_id} as processed (will retry later)", "warning")
            return True
    except Exception as e:
        self.log(f"Failed to capture debug screenshot: {e}", "debug")
    return False

def _tagged_download_single(self, page, media_id, saved_paths, post_url, poster_username,
                            date_str, post_date, output_dir, defer_database):
    """Download the media of a non-carousel post from the current post page.

    Uses the download button when one exists; otherwise falls back to the
    video/image ``src``. Saved paths are appended to ``saved_paths``.

    Returns:
        (image_count, cloudflare_blocked). ``cloudflare_blocked`` is True when
        no media was found and the page turned out to be a Cloudflare challenge.
    """
    image_count = 0
    download_url, used_webp_fallback = self._tagged_find_download_url([page], "single image")

    if download_url:
        try:
            filepath, filename, actual_media_id, size = self._tagged_fetch_media(
                download_url, output_dir, poster_username, date_str
            )
            quality_label = "fallback" if used_webp_fallback else "high-res"
            self.log(f"Downloaded tagged ({quality_label}): {filename} from @{poster_username} ({size} bytes)", "info")
            saved_paths.append(str(filepath))

            if self._tagged_discard_duplicate(filepath, filename):
                saved_paths.pop()  # file no longer exists - do not report it
                return 0, False

            if post_date:
                self._update_file_timestamps(filepath, post_date)

            image_count = 1
            self._tagged_record_media(
                actual_media_id, poster_username, filename,
                post_url, post_date, filepath, defer_database
            )
        except Exception as e:
            self.log(f"Failed to download single image: {e}", "warning")
        return image_count, False

    # No download button found - try video/image src as fallback
    self.log("No download button found, trying video/image src fallback", "debug")

    # Try video first - multiple selectors for different page structures
    media_src = self._tagged_find_video_src(page, "Found video src via")

    # If no video found, wait a bit and try again (videos may lazy-load)
    if not media_src:
        time.sleep(2)
        media_src = self._tagged_find_video_src(page, "Found video src after wait via")

    # Try image if no video
    if not media_src:
        img_elem = page.locator('img[src*="scontent"]:not([src*="profile"]), img[src*="post"]').first
        if img_elem.count() > 0:
            media_src = img_elem.get_attribute('src')
            if media_src and 'lazy.jpg' not in media_src:
                self.log(f"Found image src: {media_src[:80]}...", "debug")
            else:
                media_src = None

    if media_src:
        try:
            media_src = self._tagged_absolute_url(media_src)
            filepath, filename, actual_media_id, size = self._tagged_fetch_media(
                media_src, output_dir, poster_username, date_str, default_ext='.mp4'
            )
            self.log(f"Downloaded (fallback): {filename} ({size} bytes)", "info")
            saved_paths.append(str(filepath))

            if post_date:
                self._update_file_timestamps(filepath, post_date)

            image_count = 1
            self._tagged_record_media(
                actual_media_id, poster_username, filename,
                post_url, post_date, filepath, defer_database
            )
        except Exception as e:
            self.log(f"Failed to download via fallback: {e}", "error")
        return image_count, False

    self.log("No download button or media src found for single post", "warning")
    # Debug: capture screenshot and page content when download fails
    return 0, self._tagged_capture_no_media_debug(page, media_id)
|
|
|
|
def download_stories(self, username: str, days_back: int = 1, max_stories: int = 50, output_dir: Path = None, skip_database: bool = False, defer_database: bool = False):
    """Download the stories currently listed for a user, with FastDL naming.

    Files are saved as
    ``{profile}_{YYYYMMDD_HHMMSS}_{media_id}_story{index}{ext}``. The timestamp
    is the download time and ``index`` is the 1-based position of the story
    in the viewer.

    Args:
        username: Instagram username
        days_back: Accepted for interface compatibility; this method does not
                   filter by date (every story offered by the viewer is considered)
        max_stories: Maximum stories to download
        output_dir: Output directory (defaults to
                    /opt/media-downloader/downloads/{profile})
        skip_database: If True, don't record downloads in database (for temporary processing)
        defer_database: If True, the record call is made with ``deferred=True``
                        (this also applies when skip_database is set)

    Returns:
        List of file paths (as strings) for the stories that were saved and
        kept. Files removed as hash duplicates are not included. An empty
        list is returned when the site is unavailable or no stories are found.
    """
    from urllib.parse import urlparse, unquote

    profile_name = username.lower()
    if output_dir is None:
        output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Check site status before doing anything else
    self.log("Checking ImgInn site status...", "debug")
    site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10)

    if self.cf_handler.should_skip_download(site_status):
        self.log(f"Skipping stories download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning")
        return []
    elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
        self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

    # Scan existing files
    self._scan_existing_files(output_dir, profile_name)

    # Get processed stories from database
    processed_stories = self._get_processed_posts(profile_name)
    self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info")

    downloaded_files = []

    # Start or reuse browser
    self._start_browser()
    page = self.page

    try:
        # Navigate to stories page
        self.log(f"Navigating to @{username} stories page", "info")
        page.goto(f"https://imginn.com/stories/{username}/?ref=index", wait_until='domcontentloaded')

        # CRITICAL: Wait for Cloudflare background JS challenges
        wait_time = 5 + random.uniform(0, 2)
        self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
        time.sleep(wait_time)

        # Wait for page to load
        if not self.wait_for_cloudflare(page):
            self.log("Stories page didn't load properly", "error")
            return []

        # Save cookies
        self.save_cookies(self.context)

        # Wait for stories container to load
        self.log("Waiting for stories to load...", "info")
        try:
            page.wait_for_selector('.swiper-container.reels', timeout=10000)
            self.log("Stories container loaded", "info")
        except Exception:
            self.log("No stories found - may have expired or page structure changed", "warning")
            return []

        # Find the Stories reel (first li.reel with data-uid and title "Stories")
        self.log("Looking for Stories reel...", "info")
        stories_reel = None
        for reel in page.locator('li.reel[data-uid]').all():
            try:
                title = reel.locator('.title').first.text_content()
            except Exception:
                # A reel without a readable title cannot be the Stories reel
                continue
            if title and title.strip().lower() == "stories":
                stories_reel = reel
                self.log("Found Stories reel", "info")
                break

        if not stories_reel:
            self.log("No active Stories found for this user", "warning")
            return []

        # Click the Stories reel to open viewer
        self.log("Opening Stories viewer...", "info")
        stories_reel.click()
        time.sleep(2)  # Wait for viewer to open

        # Find all download buttons in the story viewer
        self.log("Finding story download links...", "info")
        download_links = page.locator('div.action a.download').all()

        if not download_links:
            self.log("No story download links found", "warning")
            return []

        self.log(f"Found {len(download_links)} stories", "info")

        # Set initial progress so dashboard shows 0/N immediately
        stories_to_download = min(len(download_links), max_stories)
        self.activity_manager.update_status(
            "Downloading stories",
            progress_current=0,
            progress_total=stories_to_download
        )

        # Download each story. story_index is the 1-based viewer position, so
        # every skip/error path advances it and log lines stay unambiguous.
        for story_index, download_link in enumerate(download_links[:max_stories], start=1):
            # Update progress at start of each iteration (fires even on skips)
            self.activity_manager.update_status(
                "Downloading stories",
                progress_current=story_index,
                progress_total=stories_to_download
            )

            try:
                # Get download URL
                download_url = download_link.get_attribute('href')
                if not download_url or download_url == '#':
                    self.log(f"Story {story_index}: Invalid download URL", "warning")
                    continue

                self.log(f"Story {story_index}: {download_url[:80]}...", "debug")

                # Derive the media ID and extension from the last URL path segment
                url_path = urlparse(download_url).path
                original_name = unquote(url_path.split('/')[-1].split('?')[0])
                media_id_full = Path(original_name).stem  # Full filename stem for unique naming
                ext = Path(original_name).suffix or '.jpg'

                # Normalized Instagram media ID used for duplicate checking
                media_id_for_tracking = extract_instagram_media_id(media_id_full)
                self.log(f"Story {story_index}: Full ID: {media_id_full[:40]}..., Tracking ID: {media_id_for_tracking}", "debug")

                # Check if already downloaded using the normalized media ID
                if media_id_for_tracking in self.downloaded_files or media_id_for_tracking in processed_stories:
                    self.log(f"Story {story_index}: Already downloaded (tracking ID: {media_id_for_tracking}), skipping", "debug")
                    continue

                # Also check with full ID for backwards compatibility
                if media_id_full in self.downloaded_files or media_id_full in processed_stories:
                    self.log(f"Story {story_index}: Already downloaded (full ID: {media_id_full[:30]}...), skipping", "debug")
                    continue

                # Use current date for stories (they expire after 24h)
                story_date = datetime.now()
                date_str = story_date.strftime('%Y%m%d_%H%M%S')

                # Build filename: {profile}_{date}_{media_id}_story{index}{ext}
                # Use full media ID in filename for uniqueness
                filename = f"{profile_name}_{date_str}_{media_id_full}_story{story_index}{ext}"
                filepath = output_dir / filename

                # Download the story
                try:
                    response = requests.get(download_url, timeout=30, headers={
                        'User-Agent': self.user_agent,
                        'Referer': 'https://imginn.com/'
                    }, cookies=self._get_cookies_for_requests())
                    response.raise_for_status()

                    # Save file
                    with open(filepath, 'wb') as f:
                        f.write(response.content)

                    self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info")
                    # The file is on disk from here on; it is taken back out of
                    # the result list below if it gets deleted as a duplicate.
                    downloaded_files.append(str(filepath))

                    # Check for duplicate hash before recording
                    duplicate_removed = False
                    if self.unified_db:
                        file_hash = self.unified_db.get_file_hash(str(filepath))
                        existing = self.unified_db.get_download_by_file_hash(file_hash) if file_hash else None
                        if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
                            if Path(existing['file_path']).exists():
                                self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                                try:
                                    filepath.unlink()
                                except Exception as e:
                                    # Deletion failed: keep the file and record it as usual
                                    self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
                                else:
                                    self.log(f"Deleted duplicate: {filename}", "debug")
                                    duplicate_removed = True

                    if duplicate_removed:
                        # Never report a path that no longer exists to the caller
                        downloaded_files.pop()
                        continue

                    # Update timestamps
                    self._update_file_timestamps(filepath, story_date)

                    # Add both tracking ID and full ID to tracking set for comprehensive duplicate prevention
                    self.downloaded_files.add(media_id_for_tracking)
                    self.downloaded_files.add(media_id_full)

                    # Mark in database with media_id in metadata (or defer for later)
                    # Use the normalized media ID for database tracking to prevent future duplicates
                    if not skip_database or defer_database:
                        self._record_download(
                            media_id=media_id_for_tracking,
                            username=profile_name,
                            filename=filename,
                            url=download_url,
                            post_date=story_date,
                            file_path=str(filepath),
                            content_type='stories',
                            metadata={'media_id_full': media_id_full},
                            deferred=defer_database
                        )

                except Exception as e:
                    self.log(f"Failed to download story {story_index}: {e}", "error")
                    continue

            except Exception as e:
                self.log(f"Error processing story {story_index}: {e}", "error")
                continue

        self.log(f"Downloaded {len(downloaded_files)} story files", "info")

    except Exception as e:
        self.log(f"Error downloading stories: {e}", "error")

    # Don't close browser here - reuse it for next profile
    return downloaded_files
|
|
|
|
|
|
def main():
    """Test the downloader with FastDL naming.

    With a final command-line argument containing ``imginn.com/p/``, that
    single post is downloaded; otherwise the last 14 days of posts are
    fetched. The downloaded files and the folder totals are then printed.
    """
    import sys

    username = "evalongoria"

    print("=" * 60)
    print("ImgInn Downloader - FastDL Compatible Naming")
    print("=" * 60)
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # ImgInnDownloader.__init__ takes no api_key parameter; passing one
    # raises TypeError before anything is downloaded.
    downloader = ImgInnDownloader(
        headless=False  # Use with xvfb
    )

    # Check for specific post URL in arguments
    if len(sys.argv) > 1 and 'imginn.com/p/' in sys.argv[-1]:
        # Download specific post without date filter
        files = downloader.download_posts(
            username=username,
            days_back=365,  # Use large value to bypass date filter
            max_posts=5,
            specific_post_url=sys.argv[-1]
        )
    else:
        # Download posts from last 2 weeks
        files = downloader.download_posts(
            username=username,
            days_back=14,
            max_posts=50
        )

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)

    if files:
        print(f"Successfully downloaded {len(files)} files!")
        print("\n📁 Downloaded files (FastDL naming format):")
        for file_path in files:
            name = Path(file_path).name
            size = Path(file_path).stat().st_size / 1024
            # Show the naming format: {profile}_{YYYYMMDD}_{HHMMSS}_{media_id}{ext}
            parts = name.split('_', 3)
            if len(parts) >= 4:
                print(f" - {name}")
                print(f" Profile: {parts[0]}")
                print(f" Date: {parts[1]}_{parts[2]}")
                print(f" Media ID: {parts[3].split('.')[0]}")
                print(f" Size: {size:.1f} KB")
    else:
        print("No files downloaded")

    # Check total in folder
    download_dir = Path(f"/opt/media-downloader/downloads/{username}")
    if download_dir.exists():
        all_files = list(download_dir.glob("*"))
        total_size = sum(entry.stat().st_size for entry in all_files) / 1024
        print(f"\n📊 Total in folder: {len(all_files)} files ({total_size:.1f} KB)")
|
|
|
|
|
|
# Run the manual test harness only when this file is executed as a script.
if __name__ == "__main__":
    main()