#!/usr/bin/env python3 """ Forum Downloader Module Advanced forum scraping with database tracking, search monitoring, image host support, and comprehensive authentication for major forum platforms. Supported Forum Platforms: - XenForo (1.x and 2.x) - vBulletin (3.x, 4.x, 5.x) - phpBB (all versions) - Discourse - Invision Power Board (IPB 4.x) - MyBB - Simple Machines Forum (SMF) Key Features: - Automatic forum type detection - User authentication with cookie persistence - Database tracking to avoid re-downloads - Search monitoring with auto-tracking - Bulk downloading from forum sections - Support for multiple image hosting services - Thread update monitoring - Rate limiting and retry logic Authentication: The module supports automatic login for all major forum platforms. Login credentials are used to access private/members-only content. Cookies are saved for session persistence across runs. Usage: # Initialize downloader (use with ForumDatabaseAdapter for unified database) from modules.forum_db_adapter import ForumDatabaseAdapter forum_db_adapter = ForumDatabaseAdapter(unified_db) downloader = ForumDownloader( headless=True, show_progress=True, use_database=True, db_path=forum_db_adapter # Pass adapter for unified database ) # Login to forum (auto-detects forum type) downloader.login( forum_name="MyForum", username="your_username", password="your_password", forum_url="https://forum.example.com" ) # Download private thread downloader.download_thread( thread_url="https://forum.example.com/private/thread/123", forum_name="MyForum", username="your_username", # Optional if already logged in password="your_password" # Optional if already logged in ) # Download entire forum section downloader.download_forum_section( section_url="https://forum.example.com/forums/general-discussion", forum_name="MyForum", max_threads=50, username="your_username", password="your_password" ) Based on FastDL architecture """ # Suppress pkg_resources deprecation warning from 
face_recognition_models import warnings warnings.filterwarnings('ignore', category=UserWarning, message='.*pkg_resources is deprecated.*') from pathlib import Path from datetime import datetime, timedelta from urllib.parse import urlparse, urljoin import os import re import sqlite3 import json import hashlib import time import random import platform import subprocess from typing import Dict, List, Optional, Tuple import requests from bs4 import BeautifulSoup from enum import Enum from modules.base_module import LoggingMixin from modules.universal_logger import get_logger # Module-level logger for classes without instance logger (ForumAuthenticator, etc.) forum_logger = get_logger('Forum') # Set Playwright browser path - use environment variable if set, otherwise use standard location if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ: os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright' from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout import nest_asyncio # Apply nest_asyncio to allow Playwright in asyncio contexts try: nest_asyncio.apply() except Exception as e: pass # Import shared date utilities try: from modules.date_utils import DateHandler, extract_date, update_timestamps from modules.download_manager import DownloadManager, DownloadItem from modules.move_module import MoveManager DATE_UTILS_AVAILABLE = True # OMDB API key is now set dynamically from settings in ForumDownloader.__init__ except ImportError: DATE_UTILS_AVAILABLE = False from datetime import datetime as dt forum_logger.warning("date_utils module not found, using built-in date handling", module="Import") # Optional imports try: from tqdm import tqdm TQDM_AVAILABLE = True except ImportError: TQDM_AVAILABLE = False # Cloudflare handler for protected sites try: from modules.cloudflare_handler import ( CloudflareHandler, SiteStatus, get_flaresolverr_user_agent, get_playwright_context_options, get_playwright_stealth_scripts ) CLOUDFLARE_HANDLER_AVAILABLE 
= True
except ImportError:
    CLOUDFLARE_HANDLER_AVAILABLE = False

    # Fallback functions if import fails
    def get_flaresolverr_user_agent():
        # Static Chrome UA used when cloudflare_handler cannot supply one dynamically
        return 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'

    def get_playwright_context_options():
        # Minimal browser-context fingerprint matching the fallback user agent
        return {
            'viewport': {'width': 1920, 'height': 1080},
            'user_agent': get_flaresolverr_user_agent(),
            'locale': 'en-US',
            'timezone_id': 'America/New_York',
            'color_scheme': 'light'
        }

    def get_playwright_stealth_scripts():
        # Hides the navigator.webdriver automation flag from page scripts
        return "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"


class ForumType(Enum):
    """Supported forum types"""
    XENOFORO = "xenoforo"
    VBULLETIN = "vbulletin"
    PHPBB = "phpbb"
    DISCOURSE = "discourse"
    INVISION = "invision"
    MYBB = "mybb"
    SMF = "smf"
    UNKNOWN = "unknown"


class ForumAuthenticator:
    """Handle authentication for various forum platforms"""

    def __init__(self, log_func=None):
        # Per-forum caches keyed by forum name
        self.credentials = {}
        self.cookies = {}
        self.session_data = {}
        # Fall back to a no-op logger when no callback is supplied
        self.log = log_func if log_func else lambda *args: None

    def detect_forum_type(self, page) -> ForumType:
        """Detect the forum software type from page content.

        Checks run in a fixed priority order; the first match wins, so the
        broader/looser checks (e.g. MyBB, SMF) come last.
        """
        try:
            # Check meta tags and common identifiers
            html = page.content()
            url = page.url.lower()
            # Known XenForo forums - check URL first to avoid false positives
            if 'phun.org' in url or 'forum.phun.org' in url:
                return ForumType.XENOFORO
            # XenForo detection
            if 'data-app="public"' in html or 'XenForo' in html or 'xf-init' in html:
                return ForumType.XENOFORO
            # vBulletin detection
            if 'vBulletin' in html or 'vbulletin_' in html or 'vbmenu_' in html:
                return ForumType.VBULLETIN
            # phpBB detection
            if 'phpBB' in html or 'phpbb' in html or 'viewtopic.php' in html:
                return ForumType.PHPBB
            # Discourse detection
            if 'discourse' in html.lower() or 'data-discourse-' in html:
                return ForumType.DISCOURSE
            # Invision Power Board detection
            if 'ips4' in html or 'ipb' in html.lower() or 'invisioncommunity' in html:
                return ForumType.INVISION
            # MyBB detection
            if 'mybb' in html.lower() or 'MyBB' in html:
                return ForumType.MYBB
            # SMF (Simple Machines Forum) detection
            if 'SMF' in html or 'smf_' in html:
                return ForumType.SMF
        except Exception as e:
            forum_logger.error(f"Error detecting forum type: {e}")
        # No marker matched, or the page could not be inspected
        return ForumType.UNKNOWN
    def login_xenoforo(self, page, username: str, password: str, login_url: str) -> bool:
        """Login to XenForo forums (handles both 2.x and 1.x form layouts).

        Returns True when `_verify_login` confirms the session; False on any
        error or verification failure.
        """
        try:
            page.goto(login_url, wait_until='domcontentloaded', timeout=60000)
            # Try to wait for networkidle but don't fail if it times out
            # (Cloudflare-protected sites may have ongoing background requests)
            try:
                page.wait_for_load_state('networkidle', timeout=15000)
            except PlaywrightTimeout:
                page.wait_for_timeout(3000)  # Give page a moment to stabilize
            # Look for login form
            if page.locator('input[name="login"]').count() > 0:
                # XenForo 2.x
                page.fill('input[name="login"]', username)
                page.fill('input[name="password"]', password)
                # Check for remember me checkbox
                if page.locator('input[name="remember"]').count() > 0:
                    page.check('input[name="remember"]')
                # Try different submit button selectors
                # First try the visible login button
                login_buttons = [
                    'button:has-text("Log in")',
                    'button:has-text("Login")',
                    'button.button--primary:not(.button--icon)',
                    'button[type="submit"]:visible',
                    'input[type="submit"]:visible',
                    'button.button--primary[type="submit"]'
                ]
                clicked = False
                for selector in login_buttons:
                    try:
                        if page.locator(selector).count() > 0:
                            # Make sure it's the login button, not search
                            button = page.locator(selector).first
                            button_text = button.inner_text()
                            if 'search' not in button_text.lower():
                                button.click()
                                clicked = True
                                break
                    except Exception:
                        # Selector may not be valid/visible on this theme; try next
                        continue
                if not clicked:
                    # Try pressing Enter in password field
                    page.locator('input[name="password"]').press('Enter')
            elif page.locator('input[id="ctrl_pageLogin_login"]').count() > 0:
                # XenForo 1.x
                page.fill('input[id="ctrl_pageLogin_login"]', username)
                page.fill('input[id="ctrl_pageLogin_password"]', password)
                if page.locator('input[id="ctrl_pageLogin_remember"]').count() > 0:
                    page.check('input[id="ctrl_pageLogin_remember"]')
                page.click('input[type="submit"]')
            # Wait for login to process - use domcontentloaded with fallback
            try:
                page.wait_for_load_state('networkidle', timeout=15000)
            except PlaywrightTimeout:
                pass
            page.wait_for_timeout(2000)  # Wait a bit for login to process
            # Check if login was successful
            return self._verify_login(page, username)
        except Exception as e:
            forum_logger.error(f"XenForo login error: {e}")
            return False
    def login_vbulletin(self, page, username: str, password: str, login_url: str) -> bool:
        """Login to vBulletin forums (5.x and 3.x/4.x form layouts)."""
        try:
            page.goto(login_url, wait_until='domcontentloaded', timeout=60000)
            try:
                page.wait_for_load_state('networkidle', timeout=15000)
            except PlaywrightTimeout:
                page.wait_for_timeout(3000)
            # vBulletin 5.x
            if page.locator('input[name="username"]').count() > 0:
                page.fill('input[name="username"]', username)
                page.fill('input[name="password"]', password)
                # Remember me
                if page.locator('input[name="cookieuser"]').count() > 0:
                    page.check('input[name="cookieuser"]')
                page.click('input[type="submit"], button[type="submit"]')
            # vBulletin 3.x/4.x
            elif page.locator('input[name="vb_login_username"]').count() > 0:
                page.fill('input[name="vb_login_username"]', username)
                page.fill('input[name="vb_login_password"]', password)
                if page.locator('input[name="cookieuser"]').count() > 0:
                    page.check('input[name="cookieuser"]')
                page.click('input[type="submit"]')
            # NOTE(review): unlike login_xenoforo there is no timeout fallback here;
            # a slow page raises, is caught below, and the login reports as failed
            page.wait_for_load_state('networkidle')
            return self._verify_login(page, username)
        except Exception as e:
            forum_logger.error(f"vBulletin login error: {e}")
            return False

    def login_phpbb(self, page, username: str, password: str, login_url: str) -> bool:
        """Login to phpBB forums"""
        try:
            page.goto(login_url)
            page.wait_for_load_state('networkidle')
            # Standard phpBB login
            if page.locator('input[name="username"]').count() > 0:
                page.fill('input[name="username"]', username)
                page.fill('input[name="password"]', password)
                # Auto login
                if page.locator('input[name="autologin"]').count() > 0:
                    page.check('input[name="autologin"]')
                page.click('input[name="login"], input[type="submit"]')
            page.wait_for_load_state('networkidle')
            return self._verify_login(page, username)
        except Exception as e:
            forum_logger.error(f"phpBB login error: {e}")
            return False
    def login_discourse(self, page, username: str, password: str, login_url: str) -> bool:
        """Login to Discourse forums"""
        try:
            page.goto(login_url)
            page.wait_for_load_state('networkidle')
            # Click login button if needed (Discourse often shows a modal)
            if page.locator('button.login-button').count() > 0:
                page.click('button.login-button')
                page.wait_for_timeout(1000)
            # Fill login form
            if page.locator('input[id="login-account-name"]').count() > 0:
                page.fill('input[id="login-account-name"]', username)
                page.fill('input[id="login-account-password"]', password)
                page.click('button[id="login-button"]')
            page.wait_for_load_state('networkidle')
            return self._verify_login(page, username)
        except Exception as e:
            forum_logger.error(f"Discourse login error: {e}")
            return False

    def login_invision(self, page, username: str, password: str, login_url: str) -> bool:
        """Login to Invision Power Board forums (IPB 4.x and older layouts)."""
        try:
            page.goto(login_url)
            page.wait_for_load_state('networkidle')
            # IPB 4.x
            if page.locator('input[name="auth"]').count() > 0:
                page.fill('input[name="auth"]', username)
                page.fill('input[name="password"]', password)
                if page.locator('input[name="remember_me"]').count() > 0:
                    page.check('input[name="remember_me"]')
                page.click('button[type="submit"]')
            # Older versions
            elif page.locator('input[name="UserName"]').count() > 0:
                page.fill('input[name="UserName"]', username)
                page.fill('input[name="PassWord"]', password)
                page.click('input[type="submit"]')
            page.wait_for_load_state('networkidle')
            return self._verify_login(page, username)
        except Exception as e:
            forum_logger.error(f"Invision login error: {e}")
            return False

    def login_mybb(self, page, username: str, password: str, login_url: str) -> bool:
        """Login to MyBB forums"""
        try:
            page.goto(login_url)
            page.wait_for_load_state('networkidle')
            if page.locator('input[name="username"]').count() > 0:
                page.fill('input[name="username"]', username)
                page.fill('input[name="password"]', password)
                if page.locator('input[name="remember"]').count() > 0:
                    page.check('input[name="remember"]')
                page.click('input[type="submit"]')
            page.wait_for_load_state('networkidle')
            return self._verify_login(page, username)
        except Exception as e:
            forum_logger.error(f"MyBB login error: {e}")
            return False
try: page.goto(login_url) page.wait_for_load_state('networkidle') if page.locator('input[name="username"]').count() > 0: page.fill('input[name="username"]', username) page.fill('input[name="password"]', password) if page.locator('input[name="remember"]').count() > 0: page.check('input[name="remember"]') page.click('input[type="submit"]') page.wait_for_load_state('networkidle') return self._verify_login(page, username) except Exception as e: forum_logger.error(f"MyBB login error: {e}") return False def login_smf(self, page, username: str, password: str, login_url: str) -> bool: """Login to Simple Machines Forum""" try: page.goto(login_url) page.wait_for_load_state('networkidle') if page.locator('input[name="user"]').count() > 0: page.fill('input[name="user"]', username) page.fill('input[name="passwrd"]', password) if page.locator('input[name="cookielength"]').count() > 0: page.select_option('select[name="cookielength"]', 'always') page.click('input[type="submit"]') page.wait_for_load_state('networkidle') return self._verify_login(page, username) except Exception as e: forum_logger.error(f"SMF login error: {e}") return False def _verify_login(self, page, username: str) -> bool: """Verify if login was successful""" try: html = page.content().lower() username_lower = username.lower() # Common indicators of successful login success_indicators = [ f'welcome, {username_lower}', f'hello {username_lower}', f'logged in as {username_lower}', username_lower, 'logout', 'log out', 'sign out', 'private messages', 'notifications', 'user cp', 'control panel' ] for indicator in success_indicators: if indicator in html: return True # Check for login error messages error_indicators = [ 'invalid', 'incorrect', 'error', 'failed', 'wrong password', 'not found' ] for error in error_indicators: if error in html and 'login' in html: return False except Exception as e: forum_logger.error(f"Login verification error: {e}") return False def login_with_type(self, page, username: str, password: 
    def auto_login(self, page, username: str, password: str, forum_url: str) -> bool:
        """Automatically detect forum type and login.

        Navigates to `forum_url`, detects the platform, finds a login page,
        and dispatches to the matching platform-specific login method.
        Returns False when detection or login-page discovery fails.
        """
        # Navigate to the forum URL first to detect forum type
        page.goto(forum_url, wait_until='domcontentloaded')
        page.wait_for_timeout(1000)
        forum_type = self.detect_forum_type(page)
        forum_logger.info(f"Detected forum type: {forum_type.value}")
        login_methods = {
            ForumType.XENOFORO: self.login_xenoforo,
            ForumType.VBULLETIN: self.login_vbulletin,
            ForumType.PHPBB: self.login_phpbb,
            ForumType.DISCOURSE: self.login_discourse,
            ForumType.INVISION: self.login_invision,
            ForumType.MYBB: self.login_mybb,
            ForumType.SMF: self.login_smf
        }
        if forum_type in login_methods:
            # Try to find login page
            login_url = self._find_login_url(page, forum_url)
            if login_url:
                return login_methods[forum_type](page, username, password, login_url)
        forum_logger.warning(f"Unsupported or unknown forum type: {forum_type.value}")
        return False
forum type page.goto(forum_url, wait_until='domcontentloaded') page.wait_for_timeout(1000) forum_type = self.detect_forum_type(page) forum_logger.info(f"Detected forum type: {forum_type.value}") login_methods = { ForumType.XENOFORO: self.login_xenoforo, ForumType.VBULLETIN: self.login_vbulletin, ForumType.PHPBB: self.login_phpbb, ForumType.DISCOURSE: self.login_discourse, ForumType.INVISION: self.login_invision, ForumType.MYBB: self.login_mybb, ForumType.SMF: self.login_smf } if forum_type in login_methods: # Try to find login page login_url = self._find_login_url(page, forum_url) if login_url: return login_methods[forum_type](page, username, password, login_url) forum_logger.warning(f"Unsupported or unknown forum type: {forum_type.value}") return False def _find_login_url(self, page, base_url: str) -> Optional[str]: """Find the login URL for a forum""" common_paths = [ '/login', '/login/', '/index.php?login/', '/login.php', '/member.php?action=login', '/ucp.php?mode=login', '/index.php?action=login', '/account/login', '/signin', '/user/login' ] # Try common login paths for path in common_paths: login_url = urljoin(base_url, path) try: page.goto(login_url, wait_until='domcontentloaded', timeout=5000) if 'login' in page.content().lower() or 'sign in' in page.content().lower(): return login_url except Exception as e: self.log(f"Failed to check login path {path}: {e}", level="debug") continue # Try to find login link on current page try: page.goto(base_url) login_link = page.locator('a:has-text("Login"), a:has-text("Sign In"), a:has-text("Log In")').first if login_link: return login_link.get_attribute('href') except Exception as e: self.log(f"Failed to find login link on base page: {e}", level="debug") return None def save_cookies(self, page, forum_name: str): """Save cookies for session persistence""" cookies = page.context.cookies() self.cookies[forum_name] = cookies # Save to file for persistence in cookies directory cookies_dir = Path("cookies") 
    def load_cookies(self, context, forum_name: str) -> bool:
        """Load saved cookies into a Playwright browser context.

        Accepts either a raw cookie list or the CloudflareHandler dict format
        ({'cookies': [...]}). Returns True when cookies were added.
        """
        # Prioritize cookies directory, then check root for backwards compatibility
        possible_paths = [
            Path("cookies") / f"forum_cookies_{forum_name}.json",
            Path(f"forum_cookies_{forum_name}.json")  # backwards compatibility
        ]
        cookies_file = None
        for path in possible_paths:
            if path.exists():
                cookies_file = path
                self.log(f"Found cookie file at: {path}", "debug")
                break
        if cookies_file and cookies_file.exists():
            try:
                with open(cookies_file, 'r') as f:
                    data = json.load(f)
                # Handle both formats: raw list or CloudflareHandler dict format
                if isinstance(data, list):
                    cookies = data
                elif isinstance(data, dict) and 'cookies' in data:
                    cookies = data['cookies']
                else:
                    self.log(f"Unknown cookie format for {forum_name}", "debug")
                    return False
                # Format cookies for Playwright
                formatted_cookies = []
                for c in cookies:
                    cookie = {
                        'name': c['name'],
                        'value': c['value'],
                        'domain': c['domain'],
                        'path': c.get('path', '/'),
                        'secure': c.get('secure', True),
                        'httpOnly': c.get('httpOnly', False)
                    }
                    # Map Selenium-style 'expiry' to Playwright's 'expires'
                    if c.get('expiry'):
                        cookie['expires'] = c['expiry']
                    if c.get('sameSite'):
                        cookie['sameSite'] = c['sameSite']
                    formatted_cookies.append(cookie)
                context.add_cookies(formatted_cookies)
                self.cookies[forum_name] = cookies
                self.log(f"Successfully loaded {len(cookies)} cookies for {forum_name}", "debug")
                return True
            except Exception as e:
                self.log(f"Error loading cookies: {e}", "debug")
        else:
            self.log(f"No cookie file found for {forum_name}", "debug")
        return False


class ImageHostHandler:
    """Handle downloads from various image hosting services"""

    # Supported image hosts and their patterns.
    # 'domains' entries may be plain substrings or regex fragments (see
    # identify_host, which tries substring containment then re.match).
    IMAGE_HOSTS = {
        'imgur': {
            'domains': ['imgur.com', 'i.imgur.com'],
            'patterns': [
                r'https?://(?:i\.)?imgur\.com/([a-zA-Z0-9]+)(?:\.([a-z]+))?',
                r'https?://imgur\.com/a/([a-zA-Z0-9]+)',  # Albums
                r'https?://imgur\.com/gallery/([a-zA-Z0-9]+)'  # Galleries
            ]
        },
        'imgbb': {
            'domains': ['imgbb.com', 'i.ibb.co', 'ibb.co'],
            'patterns': [
                r'https?://(?:i\.)?ibb\.co/([a-zA-Z0-9]+)',
                r'https?://imgbb\.com/image/([a-zA-Z0-9]+)'
            ]
        },
        'postimage': {
            'domains': ['postimg.cc', 'postimages.org', 'i.postimg.cc'],
            'patterns': [
                r'https?://(?:i\.)?postimg\.cc/([a-zA-Z0-9]+)/([a-zA-Z0-9\-]+)',
                r'https?://postimages\.org/image/([a-zA-Z0-9]+)'
            ]
        },
        'imagebam': {
            'domains': ['imagebam.com', 'www.imagebam.com'],
            'patterns': [
                r'https?://(?:www\.)?imagebam\.com/(?:image|view)/([a-zA-Z0-9]+)'
            ]
        },
        'imagevenue': {
            'domains': ['imagevenue.com', 'img[0-9]+.imagevenue.com'],
            'patterns': [
                r'https?://img[0-9]+\.imagevenue\.com/.*?/([a-zA-Z0-9_]+\.(?:jpg|jpeg|png|gif))'
            ]
        },
        'pixhost': {
            'domains': ['pixhost.to', 't.pixhost.to'],
            'patterns': [
                r'https?://(?:t\.)?pixhost\.to/(?:show|thumbs)/([0-9]+)/([a-zA-Z0-9_\-]+)'
            ]
        },
        'catbox': {
            'domains': ['catbox.moe', 'files.catbox.moe'],
            'patterns': [
                r'https?://files\.catbox\.moe/([a-zA-Z0-9]+\.[a-z]+)'
            ]
        },
        'imagetwist': {
            'domains': ['imagetwist.com', 'phun.imagetwist.com', 'i.imagetwist.com'],
            'patterns': [
                r'https?://(?:phun\.)?imagetwist\.com/([a-zA-Z0-9]+)',
                r'https?://i\.imagetwist\.com/[^/]+/([a-zA-Z0-9]+\.[a-z]+)'
            ]
        }
    }
r'https?://imgur\.com/gallery/([a-zA-Z0-9]+)' # Galleries ] }, 'imgbb': { 'domains': ['imgbb.com', 'i.ibb.co', 'ibb.co'], 'patterns': [ r'https?://(?:i\.)?ibb\.co/([a-zA-Z0-9]+)', r'https?://imgbb\.com/image/([a-zA-Z0-9]+)' ] }, 'postimage': { 'domains': ['postimg.cc', 'postimages.org', 'i.postimg.cc'], 'patterns': [ r'https?://(?:i\.)?postimg\.cc/([a-zA-Z0-9]+)/([a-zA-Z0-9\-]+)', r'https?://postimages\.org/image/([a-zA-Z0-9]+)' ] }, 'imagebam': { 'domains': ['imagebam.com', 'www.imagebam.com'], 'patterns': [ r'https?://(?:www\.)?imagebam\.com/(?:image|view)/([a-zA-Z0-9]+)' ] }, 'imagevenue': { 'domains': ['imagevenue.com', 'img[0-9]+.imagevenue.com'], 'patterns': [ r'https?://img[0-9]+\.imagevenue\.com/.*?/([a-zA-Z0-9_]+\.(?:jpg|jpeg|png|gif))' ] }, 'pixhost': { 'domains': ['pixhost.to', 't.pixhost.to'], 'patterns': [ r'https?://(?:t\.)?pixhost\.to/(?:show|thumbs)/([0-9]+)/([a-zA-Z0-9_\-]+)' ] }, 'catbox': { 'domains': ['catbox.moe', 'files.catbox.moe'], 'patterns': [ r'https?://files\.catbox\.moe/([a-zA-Z0-9]+\.[a-z]+)' ] }, 'imagetwist': { 'domains': ['imagetwist.com', 'phun.imagetwist.com', 'i.imagetwist.com'], 'patterns': [ r'https?://(?:phun\.)?imagetwist\.com/([a-zA-Z0-9]+)', r'https?://i\.imagetwist\.com/[^/]+/([a-zA-Z0-9]+\.[a-z]+)' ] } } @classmethod def identify_host(cls, url: str) -> Optional[str]: """Identify which image host a URL belongs to""" domain = urlparse(url).netloc.lower() for host_name, host_info in cls.IMAGE_HOSTS.items(): for host_domain in host_info['domains']: if host_domain in domain or re.match(host_domain, domain): return host_name return None @classmethod def extract_direct_url(cls, url: str, page_content: str = None) -> Optional[str]: """Extract direct image URL from image host page""" host = cls.identify_host(url) if not host: return None # Direct extraction methods for known hosts if host == 'imgur': # Convert gallery/album URLs to direct image URLs if '/a/' in url or '/gallery/' in url: # Would need to fetch album data via Imgur 
class ForumDownloader(LoggingMixin):
    """
    Forum downloader with database tracking and monitoring

    Features:
    - Download threads, posts, and search results
    - Monitor searches for new content
    - Track threads for updates
    - Support multiple image hosts
    - Database tracking to avoid re-downloads
    - Automatic retry and rate limiting
    """

    def __init__(self, headless: bool = True, show_progress: bool = True,
                 use_database: bool = True, db_path=None,
                 download_dir: str = "forum_downloads", max_retries: int = 3,
                 rate_limit: Tuple[int, int] = (1, 3), user_agent: str = None,
                 forum_type: str = None, log_callback=None):
        """
        Initialize forum downloader

        Args:
            headless: Run browser in headless mode
            show_progress: Show progress messages
            use_database: Enable database tracking
            db_path: Path to SQLite database, OR a ForumDatabaseAdapter
                instance (detected via its `unified_db` attribute)
            download_dir: Base directory for downloads
            max_retries: Maximum retry attempts
            rate_limit: (min, max) seconds between requests
            user_agent: Custom user agent string
            forum_type: Optional platform name to skip auto-detection
            log_callback: Optional logging callback passed to LoggingMixin
        """
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        # Check if db_path is actually a database adapter object
        if hasattr(db_path, 'unified_db'):
            # It's an adapter - use it directly
            self.db_adapter = db_path
            self.db_path = None  # Not needed when using adapter
            self.use_database = True
        else:
            # It's a regular path - use traditional database
            self.db_adapter = None
            self.db_path = db_path
        self.download_dir = Path(download_dir)
        # Don't create directory here - only create when actually downloading
        self.max_retries = max_retries
        self.rate_limit = rate_limit
        self.user_agent = user_agent or self._get_random_user_agent()
        # Initialize logging via mixin
        self._init_logger('Forum', log_callback, default_module='Download')
        # Statistics
        self.stats = {
            'threads_processed': 0,
            'posts_downloaded': 0,
            'images_downloaded': 0,
            'searches_monitored': 0,
            'new_threads_found': 0,
            'errors': 0
        }
        self.pending_downloads = []  # Track downloads for deferred database recording
        # Authentication
        self.authenticator = ForumAuthenticator(log_func=self.log)
        self.logged_in_forums = {}
        # Browser context for session persistence
        self.browser = None
        self.context = None
        self.playwright = None
        # Forum type (can pre-set to skip detection)
        if forum_type:
            forum_type_map = {
                'xenoforo': ForumType.XENOFORO,
                'xenforo': ForumType.XENOFORO,
                'vbulletin': ForumType.VBULLETIN,
                'phpbb': ForumType.PHPBB,
                'discourse': ForumType.DISCOURSE,
                'invision': ForumType.INVISION,
                'mybb': ForumType.MYBB,
                'smf': ForumType.SMF
            }
            self.forum_type = forum_type_map.get(forum_type.lower(), None)
        else:
            self.forum_type = None
        # FlareSolverr configuration
        self.flaresolverr_url = "http://localhost:8191/v1"
        self.flaresolverr_enabled = True  # Set to False to disable
        # Update User-Agent to match FlareSolverr if not custom (dynamically fetched)
        if not user_agent:
            self.user_agent = get_flaresolverr_user_agent()
        # Initialize database (skip if using adapter)
        if self.use_database and not self.db_adapter:
            self._init_database()
        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        unified_db_instance = self.db_adapter.unified_db if self.db_adapter else None
        self.unified_db = unified_db_instance  # Store for scraper config access
        self.activity_manager = get_activity_manager(unified_db_instance)
        # Set OMDB API key from settings for TV show date lookups
        if DATE_UTILS_AVAILABLE and unified_db_instance:
            try:
                from modules.settings_manager import SettingsManager
                settings = SettingsManager(unified_db_instance)
                omdb_config = settings.get('omdb', {})
                omdb_api_key = omdb_config.get('api_key', '')
                if omdb_api_key:
                    DateHandler.set_omdb_api_key(omdb_api_key)
                    self.log("OMDB API key configured for date lookups", "debug")
            except Exception as e:
                self.log(f"Could not load OMDB API key from settings: {e}", "debug")
key from settings for TV show date lookups if DATE_UTILS_AVAILABLE and unified_db_instance: try: from modules.settings_manager import SettingsManager settings = SettingsManager(unified_db_instance) omdb_config = settings.get('omdb', {}) omdb_api_key = omdb_config.get('api_key', '') if omdb_api_key: DateHandler.set_omdb_api_key(omdb_api_key) self.log("OMDB API key configured for date lookups", "debug") except Exception as e: self.log(f"Could not load OMDB API key from settings: {e}", "debug") def _create_browser_context(self, browser, **extra_options): """Create a browser context with dynamic fingerprinting from FlareSolverr. Args: browser: Playwright browser instance **extra_options: Additional options to merge (e.g., proxy) Returns: Browser context with proper fingerprinting """ context_options = get_playwright_context_options() context_options.update(extra_options) self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug") context = browser.new_context(**context_options) # Add anti-detection scripts context.add_init_script(get_playwright_stealth_scripts()) return context def _get_forum_scraper_id(self, forum_name: str) -> str: """Convert forum name to scraper ID format""" # Normalize forum name to match database IDs normalized = forum_name.lower().replace(' ', '_').replace('.', '_').replace('-', '_') return f"forum_{normalized}" def _get_forum_scraper_config(self, forum_name: str) -> Optional[Dict]: """Get scraper configuration for a forum from database""" if not self.unified_db: return None scraper_id = self._get_forum_scraper_id(forum_name) try: return self.unified_db.get_scraper(scraper_id) except Exception as e: self.log(f"Error getting scraper config for {forum_name}: {e}", "warning") return None def _get_forum_proxy_url(self, forum_name: str) -> Optional[str]: """Get proxy URL for a forum from database config""" config = self._get_forum_scraper_config(forum_name) if config and 
config.get('proxy_enabled') and config.get('proxy_url'): return config['proxy_url'] return None def _get_cookies_for_requests(self, forum_name: str = None): """Get cookies in format for requests library from database, FlareSolverr, or forum cookies""" cookies = {} # Try database first if available if forum_name and self.unified_db: scraper_id = self._get_forum_scraper_id(forum_name) try: cookie_list = self.unified_db.get_scraper_cookies(scraper_id) if cookie_list: for cookie in cookie_list: cookies[cookie['name']] = cookie['value'] return cookies except Exception as e: self.log(f"Error loading cookies from database for {forum_name}: {e}", "debug") # Fall back to cookie file if forum_name: cookies_file = Path(f"cookies/forum_cookies_{forum_name}.json") if cookies_file.exists(): try: with open(cookies_file, 'r') as f: data = json.load(f) # If it's FlareSolverr format with timestamp if isinstance(data, dict) and 'cookies' in data: for cookie in data['cookies']: cookies[cookie['name']] = cookie['value'] # If it's raw cookie list elif isinstance(data, list): for cookie in data: cookies[cookie['name']] = cookie['value'] except (json.JSONDecodeError, KeyError, TypeError) as e: self.log(f"Failed to parse cookies from {cookie_file}: {e}", level="debug") return cookies def _navigate_with_cloudflare(self, page, url: str, forum_name: str = None, cloudflare_enabled: bool = False, wait_until: str = 'networkidle', timeout: int = 60000) -> bool: """Navigate to a URL with Cloudflare bypass support Args: page: Playwright page object url: URL to navigate to forum_name: Forum name for cookie management cloudflare_enabled: Whether this forum uses Cloudflare protection wait_until: Playwright wait condition timeout: Navigation timeout in ms Returns: True if navigation succeeded, False otherwise """ if not cloudflare_enabled: # Standard navigation without Cloudflare handling try: page.goto(url, wait_until=wait_until, timeout=timeout) return True except PlaywrightTimeout: 
self.log(f"Navigation timeout for {url}", "error") return False # Cloudflare-protected navigation if not CLOUDFLARE_HANDLER_AVAILABLE: self.log("CloudflareHandler not available, falling back to standard navigation", "warning") try: page.goto(url, wait_until=wait_until, timeout=timeout) return True except PlaywrightTimeout: return False # Parse domain for CloudflareHandler parsed = urlparse(url) base_url = f"{parsed.scheme}://{parsed.netloc}" # Get proxy URL from database config if available proxy_url = self._get_forum_proxy_url(forum_name) if forum_name else None # Use database for cookies if unified_db available, otherwise use file cookie_file = None if not self.unified_db: cookie_file = f"cookies/forum_cookies_{forum_name}.json" if forum_name else "cookies/forum_cloudflare.json" # Initialize CloudflareHandler for this forum cf_handler = CloudflareHandler( module_name=f"Forum.{forum_name}" if forum_name else "Forum", cookie_file=cookie_file, flaresolverr_url=self.flaresolverr_url, flaresolverr_enabled=self.flaresolverr_enabled, user_agent=self.user_agent, logger=self.logger, aggressive_expiry=False, # Use conservative expiry for forum cookies proxy_url=proxy_url # Pass proxy to FlareSolverr ) # Load cookies from database if available if self.unified_db and forum_name: scraper_id = self._get_forum_scraper_id(forum_name) try: cookies = self.unified_db.get_scraper_cookies(scraper_id) if cookies: cf_handler._cookies = cookies except Exception as e: self.log(f"Error loading cookies from database: {e}", "debug") # Always load existing cookies into the page context first # This is critical for new pages that don't have cookies loaded existing_cookies = cf_handler.get_cookies_list() if existing_cookies: self.log(f"Loading {len(existing_cookies)} existing Cloudflare cookies for {forum_name}", "debug") try: page.context.add_cookies(existing_cookies) except Exception as e: self.log(f"Error loading cookies: {e}", "debug") # Check if we need fresh cookies if 
cf_handler.cookies_expired(): self.log(f"Cloudflare cookies expired for {forum_name}, refreshing via FlareSolverr...", "info") if cf_handler.get_cookies_via_flaresolverr(base_url): self.log(f"Successfully refreshed Cloudflare cookies for {forum_name}", "success") # Reload cookies into browser context cookies = cf_handler.get_cookies_list() if cookies: page.context.add_cookies(cookies) # Save cookies to database if self.unified_db and forum_name: scraper_id = self._get_forum_scraper_id(forum_name) try: self.unified_db.save_scraper_cookies(scraper_id, cookies, self.user_agent) self.log(f"Saved {len(cookies)} Cloudflare cookies to database for {forum_name}", "debug") except Exception as e: self.log(f"Error saving cookies to database: {e}", "debug") else: self.log(f"Failed to refresh Cloudflare cookies for {forum_name}", "warning") # Navigate to the URL with longer timeout for Cloudflare try: # Use domcontentloaded instead of networkidle for Cloudflare pages # networkidle can timeout during challenge page.goto(url, wait_until='domcontentloaded', timeout=timeout) # Wait a moment for any Cloudflare JavaScript to execute page.wait_for_timeout(3000) # Check for Cloudflare challenge try: content = page.content().lower() except Exception as e: # Page might still be navigating self.log(f"Page still loading, waiting...", "debug") page.wait_for_timeout(5000) content = page.content().lower() challenge_indicators = [ 'challenge-platform', 'checking your browser', 'just a moment', 'verify you are human', 'cf-challenge' ] # Only consider it a challenge if we find indicators AND the page is short # (Real forum pages are much longer than Cloudflare challenge pages) is_challenge = any(indicator in content for indicator in challenge_indicators) and len(content) < 10000 if is_challenge: self.log(f"Cloudflare challenge detected for {forum_name}, waiting for resolution...", "info") # Wait for challenge to resolve (up to 120 seconds) start_time = time.time() while time.time() - start_time 
< 120: try: page.wait_for_timeout(3000) content = page.content().lower() # Check if challenge is still present still_challenge = any(ind in content for ind in challenge_indicators) and len(content) < 10000 if not still_challenge: self.log(f"Cloudflare challenge resolved for {forum_name}", "success") # Save the new cookies cf_handler.save_cookies_from_playwright(page.context) # Also save to database if available if self.unified_db and forum_name: scraper_id = self._get_forum_scraper_id(forum_name) cookies = cf_handler.get_cookies_list() if cookies: try: self.unified_db.save_scraper_cookies(scraper_id, cookies, self.user_agent) self.log(f"Saved {len(cookies)} Cloudflare cookies to database for {forum_name}", "debug") except Exception as e: self.log(f"Error saving cookies to database: {e}", "debug") return True # Log progress elapsed = int(time.time() - start_time) if elapsed % 15 == 0 and elapsed > 0: self.log(f"Still waiting for Cloudflare ({elapsed}s)...", "debug") except Exception as e: self.log(f"Error during Cloudflare wait: {e}", "debug") self.log(f"Cloudflare challenge did not resolve for {forum_name} after 120s", "error") return False # No challenge detected - check if we're on the right page # Try to wait for networkidle, but don't fail if it times out try: page.wait_for_load_state('networkidle', timeout=15000) except PlaywrightTimeout: # Page may be loaded enough even if networkidle times out self.log(f"networkidle timeout for {url}, checking if page is usable...", "debug") # Verify we're on the expected page (not blocked/redirected) if 'celebboard' in url.lower() and 'celebboard' in page.url.lower(): self.log(f"Successfully navigated to {page.url}", "success") return True return True except PlaywrightTimeout: self.log(f"Navigation timeout for Cloudflare-protected URL: {url}", "error") return False except Exception as e: self.log(f"Navigation error for {url}: {e}", "error") return False def _get_random_user_agent(self) -> str: """Get random user agent for 
def _get_random_user_agent(self) -> str:
    """Return a random desktop-browser User-Agent string for HTTP requests."""
    agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
    ]
    return random.choice(agents)

def get_pending_downloads(self):
    """Get list of downloads that were deferred for later recording.

    Returns:
        list: a shallow copy, so callers cannot mutate the internal list.
    """
    return self.pending_downloads.copy()

def clear_pending_downloads(self):
    """Clear the pending downloads list after they've been recorded."""
    self.pending_downloads = []

def _get_db_connection(self):
    """Get database connection - either from adapter or direct.

    Returns:
        sqlite3.Connection: in adapter mode, a connection to a lazily
        created temporary on-disk database (legacy compatibility while the
        adapter owns the real unified database); otherwise the configured
        ``db_path`` file, or an in-memory database when none is configured.
    """
    if self.db_adapter:
        # Use temporary database for compatibility with code paths that
        # still expect a raw sqlite3 connection.
        if not hasattr(self, '_temp_db_path'):
            import tempfile
            # mkstemp instead of the deprecated, race-prone tempfile.mktemp:
            # the file is created atomically so no other process can claim
            # the name between generation and first use.
            fd, self._temp_db_path = tempfile.mkstemp(suffix='.db')
            os.close(fd)
            # Initialize the temporary database schema once
            temp_conn = sqlite3.connect(self._temp_db_path)
            self._init_database_conn(temp_conn)
            temp_conn.close()
        return sqlite3.connect(self._temp_db_path)
    elif self.db_path:
        return sqlite3.connect(self.db_path)
    else:
        # No database configured - create in-memory database
        return sqlite3.connect(':memory:')

def _init_database_conn(self, conn):
    """Initialize database schema using provided connection."""
    cursor = conn.cursor()
    self._create_database_tables(cursor)
    conn.commit()

def _init_database(self):
    """Initialize SQLite database for tracking (no-op in adapter mode)."""
    if self.db_adapter:
        # Skip initialization when using adapter - it has its own database
        return
    conn = self._get_db_connection()
    cursor = conn.cursor()
    self._create_database_tables(cursor)
    conn.commit()
    conn.close()

def _create_database_tables(self, cursor):
    """Create database tables, indexes, pragmas and cleanup triggers.

    Idempotent: every statement uses IF NOT EXISTS, so it is safe to run
    against an already-initialized database.
    """
    # Threads table - one row per tracked forum thread
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS threads (
            thread_id TEXT PRIMARY KEY,
            forum_name TEXT,
            thread_url TEXT UNIQUE,
            thread_title TEXT,
            author TEXT,
            created_date DATETIME,
            last_checked DATETIME,
            last_post_date DATETIME,
            post_count INTEGER DEFAULT 0,
            status TEXT DEFAULT 'active',
            monitor_until DATETIME,
            metadata TEXT
        )
    ''')

    # Posts table - individual posts within a thread
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS posts (
            post_id TEXT PRIMARY KEY,
            thread_id TEXT,
            post_url TEXT UNIQUE,
            author TEXT,
            post_date DATETIME,
            content_hash TEXT,
            has_images BOOLEAN DEFAULT 0,
            downloaded BOOLEAN DEFAULT 0,
            download_date DATETIME,
            metadata TEXT,
            FOREIGN KEY (thread_id) REFERENCES threads (thread_id)
        )
    ''')

    # Images table - media found inside posts
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS images (
            image_id TEXT PRIMARY KEY,
            post_id TEXT,
            image_url TEXT,
            direct_url TEXT,
            filename TEXT,
            file_hash TEXT,
            downloaded BOOLEAN DEFAULT 0,
            download_date DATETIME,
            file_size INTEGER,
            metadata TEXT,
            FOREIGN KEY (post_id) REFERENCES posts (post_id)
        )
    ''')

    # Searches table - monitored search queries
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS searches (
            search_id TEXT PRIMARY KEY,
            forum_name TEXT,
            search_query TEXT,
            search_url TEXT,
            last_checked DATETIME,
            check_frequency_hours INTEGER DEFAULT 24,
            active BOOLEAN DEFAULT 1,
            results_found INTEGER DEFAULT 0,
            metadata TEXT
        )
    ''')

    # Search results table (links searches to threads)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS search_results (
            search_id TEXT,
            thread_id TEXT,
            found_date DATETIME,
            PRIMARY KEY (search_id, thread_id),
            FOREIGN KEY (search_id) REFERENCES searches (search_id),
            FOREIGN KEY (thread_id) REFERENCES threads (thread_id)
        )
    ''')

    # Download queue table (similar to fastdl_module)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS download_queue (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            referer TEXT,
            save_path TEXT NOT NULL,
            thread_id TEXT,
            post_id TEXT,
            forum_name TEXT,
            status TEXT DEFAULT 'pending',
            attempts INTEGER DEFAULT 0,
            created_date DATETIME DEFAULT CURRENT_TIMESTAMP,
            downloaded_date DATETIME,
            error_message TEXT,
            file_hash TEXT,
            metadata TEXT
        )
    ''')

    # Create indexes - both single and composite for optimization
    # Single column indexes
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_threads_status ON threads(status)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_threads_monitor ON threads(monitor_until)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_posts_thread ON posts(thread_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_posts_downloaded ON posts(downloaded)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_images_post ON images(post_id)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_searches_active ON searches(active)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_queue_status ON download_queue(status)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_queue_url ON download_queue(url)')

    # Composite indexes for common query patterns
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_threads_forum_status ON threads(forum_name, status)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_posts_thread_downloaded ON posts(thread_id, downloaded)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_images_post_downloaded ON images(post_id, downloaded)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_queue_status_attempts ON download_queue(status, attempts)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_searches_forum_active ON searches(forum_name, active)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_threads_monitor_status ON threads(monitor_until, status)')

    # Enable WAL mode for better concurrency
    cursor.execute('PRAGMA journal_mode=WAL')
    cursor.execute('PRAGMA synchronous=NORMAL')

    # Create triggers for automatic cleanup
    # Clean up old completed downloads after 90 days
    cursor.execute('''
        CREATE TRIGGER IF NOT EXISTS cleanup_old_downloads
        AFTER INSERT ON download_queue
        WHEN (SELECT COUNT(*) FROM download_queue WHERE status = 'completed') > 10000
        BEGIN
            DELETE FROM download_queue
            WHERE status = 'completed'
            AND downloaded_date < datetime('now', '-90 days');
        END
    ''')

    # Clean up expired monitoring threads
    cursor.execute('''
        CREATE TRIGGER IF NOT EXISTS cleanup_expired_monitors
        AFTER INSERT ON threads
        BEGIN
            UPDATE threads
            SET status = 'expired'
            WHERE monitor_until IS NOT NULL
            AND monitor_until < datetime('now')
            AND status = 'active';
        END
    ''')

    # Clean up old search results after 180 days
    cursor.execute('''
        CREATE TRIGGER IF NOT EXISTS cleanup_old_search_results
        AFTER INSERT ON search_results
        WHEN (SELECT COUNT(*) FROM search_results) > 50000
        BEGIN
            DELETE FROM search_results
            WHERE found_date < datetime('now', '-180 days');
        END
    ''')
def reset_download_queue(self, forum_name=None, status=None):
    """Reset download queue by removing records

    Args:
        forum_name: If specified, only reset records for this forum
        status: If specified, only reset records with this status

    Returns:
        Number of records deleted
    """
    if not self.use_database:
        return 0
    conn = self._get_db_connection()
    cursor = conn.cursor()
    try:
        if forum_name and status:
            cursor.execute(
                "DELETE FROM download_queue WHERE forum_name = ? AND status = ?",
                (forum_name, status)
            )
        elif forum_name:
            cursor.execute(
                "DELETE FROM download_queue WHERE forum_name = ?",
                (forum_name,)
            )
        elif status:
            cursor.execute(
                "DELETE FROM download_queue WHERE status = ?",
                (status,)
            )
        else:
            cursor.execute("DELETE FROM download_queue")
        deleted = cursor.rowcount
        conn.commit()
        self.log(f"Deleted {deleted} records from download queue", "info")
        return deleted
    finally:
        conn.close()

def add_to_download_queue(self, url, save_path, referer=None, thread_id=None, post_id=None, forum_name=None, metadata=None):
    """Add an item to the download queue

    Args:
        url: URL to download
        save_path: Where to save the file
        referer: Referer URL
        thread_id: Associated thread ID
        post_id: Associated post ID
        forum_name: Forum name
        metadata: Additional metadata as dict

    Returns:
        True if queued (new or re-queued), False if already pending/completed
    """
    if not self.use_database:
        return False

    # Use adapter if available
    if self.db_adapter:
        return self.db_adapter.add_to_download_queue(
            url=url,
            referer=referer,
            save_path=save_path,
            thread_id=thread_id,
            post_id=post_id,
            forum_name=forum_name,
            metadata=metadata
        )

    conn = self._get_db_connection()
    cursor = conn.cursor()
    try:
        # Check if already in queue or downloaded
        cursor.execute(
            "SELECT status FROM download_queue WHERE url = ?",
            (url,)
        )
        existing = cursor.fetchone()
        if existing:
            if existing[0] == 'completed':
                self.log(f"Skipping already downloaded: {Path(save_path).name}", "info")
                return False  # Already downloaded
            elif existing[0] == 'pending':
                # Already in queue, don't duplicate
                return False
            # Row exists with another status (e.g. 'failed'): url is UNIQUE,
            # so a fresh INSERT would raise IntegrityError. Re-queue instead.
            cursor.execute(
                "UPDATE download_queue SET status = 'pending' WHERE url = ?",
                (url,)
            )
            conn.commit()
            return True

        # Only insert if not existing
        metadata_str = json.dumps(metadata) if metadata else None
        cursor.execute('''
            INSERT INTO download_queue
            (url, referer, save_path, thread_id, post_id, forum_name, status, metadata)
            VALUES (?, ?, ?, ?, ?, ?, 'pending', ?)
        ''', (url, referer, str(save_path), thread_id, post_id, forum_name, metadata_str))
        conn.commit()
        return True
    finally:
        conn.close()

def is_in_download_queue(self, url):
    """Check if a URL is in the download queue with pending status

    Args:
        url: URL to check

    Returns:
        bool: True if in queue with pending status
    """
    if not self.use_database:
        return False

    # Use adapter if available
    if self.db_adapter:
        return self.db_adapter.is_in_download_queue(url)

    conn = self._get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT status FROM download_queue WHERE url = ? AND status = 'pending'",
            (url,)
        )
        result = cursor.fetchone()
        return result is not None
    finally:
        conn.close()
AND status = 'pending'", (url,) ) result = cursor.fetchone() return result is not None finally: conn.close() def process_download_queue(self, context=None, max_items=None): """Process all pending items in the download queue using gallery-dl Args: context: Playwright context to use for downloads (optional, will use gallery-dl) max_items: Maximum number of items to process Returns: Dict with download statistics """ if not self.use_database: return {'processed': 0, 'successful': 0, 'failed': 0} conn = self._get_db_connection() cursor = conn.cursor() # Get pending items with metadata query = """ SELECT id, url, referer, save_path, thread_id, post_id, forum_name, metadata FROM download_queue WHERE status = 'pending' ORDER BY created_date """ if max_items: query += f" LIMIT {max_items}" cursor.execute(query) items = cursor.fetchall() stats = {'processed': 0, 'successful': 0, 'failed': 0} for item in items: item_id, url, referer, save_path, thread_id, post_id, forum_name, metadata_str = item save_path = Path(save_path) # Parse metadata metadata = json.loads(metadata_str) if metadata_str else {} post_date = None post_title = metadata.get('post_title', '') # Extract date from title first (takes precedence) if post_title: post_date = DateHandler.extract_date_from_text(post_title) # Fall back to post date if no date in title if not post_date and metadata.get('post_date'): try: post_date = datetime.fromisoformat(metadata['post_date']) except (ValueError, TypeError): pass # Invalid date format in metadata, use None try: # Download using Playwright if context available if context: page = context.new_page() try: # Set referer if provided if referer: page.set_extra_http_headers({'Referer': referer}) # For pixhost direct URLs (img*.pixhost.to), download directly # For pixhost show URLs, they should have been converted during scraping # but handle them here as fallback if 'pixhost.to/show/' in url: # This shouldn't happen if extraction worked during scraping # Navigate to the 
def _hash_content(self, content: str) -> str:
    """Generate hash of content for duplicate detection"""
    # Incremental-update form; identical result to hashing in one call.
    digest = hashlib.sha256()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()
def _download_with_gallery_dl(self, url: str, save_path: Path, referer: str = None) -> bool:
    """Try to download using gallery-dl as fallback for unsupported hosts

    Args:
        url: URL to hand to gallery-dl
        save_path: Target file path; gallery-dl writes into its parent dir
        referer: Optional Referer header forwarded to gallery-dl

    Returns:
        bool: True if a file ended up at save_path, False otherwise
    """
    try:
        import shutil
        import subprocess

        # shutil.which is portable and avoids spawning an extra process
        # just to probe for the binary (the old approach ran `which`).
        if shutil.which("gallery-dl") is None:
            self.log("gallery-dl not installed, skipping fallback", "debug")
            return False

        self.log(f"Attempting download with gallery-dl: {url}", "debug")

        # Build gallery-dl command
        cmd = [
            "gallery-dl",
            "--dest", str(save_path.parent),
            "--filename", f"{save_path.name}",
            "--no-skip",
            "--no-part",
            "--quiet"
        ]

        # Add referer if provided
        if referer:
            cmd.extend(["--header", f"Referer: {referer}"])

        # Add the URL
        cmd.append(url)

        # Run gallery-dl with timeout
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode == 0 and save_path.exists():
            return True

        # Check if file was saved with different extension
        # (gallery-dl may pick the extension from the response headers)
        base_name = save_path.stem
        for file in save_path.parent.glob(f"{base_name}.*"):
            if file != save_path:
                # Rename to expected path
                file.rename(save_path)
                return True
        return False
    except subprocess.TimeoutExpired:
        self.log("gallery-dl timeout", "debug")
        return False
    except Exception as e:
        self.log(f"gallery-dl fallback failed: {e}", "debug")
        return False
def _download_with_retry(self, download_func, *args, max_retries=3, **kwargs):
    """Download with exponential backoff retry logic.

    Args:
        download_func: callable performing one download attempt; truthy
            return means success, falsy means a non-exceptional failure.
        *args, **kwargs: forwarded to download_func.
        max_retries: maximum number of attempts.

    Returns:
        bool: True on the first successful attempt, False otherwise.
    """
    # NOTE: relies on the module-level `time` import; the old local
    # `import time` was redundant.
    for attempt in range(max_retries):
        try:
            if download_func(*args, **kwargs):
                return True
            # Falsy result without an exception - likely a permanent
            # failure (e.g. 404); give up after the last attempt.
            if attempt == max_retries - 1:
                return False
        except requests.exceptions.HTTPError as e:
            # e.response can be None (connection aborted before a response)
            status = e.response.status_code if e.response is not None else None
            if status in (404, 410):
                # Don't retry on not found
                self.log(f"Resource not found (HTTP {status})", "warning")
                return False
            elif status == 429:
                # Rate limited - wait longer
                wait_time = min(60, (2 ** attempt) * 5)
                self.log(f"Rate limited, waiting {wait_time}s", "warning")
                time.sleep(wait_time)
            elif status is not None and status >= 500:
                # Server error - retry with backoff
                wait_time = min(30, (2 ** attempt) * 2)
                self.log(f"Server error {status}, retrying in {wait_time}s", "warning")
                time.sleep(wait_time)
            else:
                raise
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            if attempt < max_retries - 1:
                # Network error - retry with exponential backoff
                wait_time = min(30, (2 ** attempt) * 2)
                self.log(f"Network error, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})", "info")
                time.sleep(wait_time)
            else:
                self.log(f"Failed after {max_retries} attempts: {e}", "error")
                return False
        except Exception as e:
            self.log(f"Unexpected error in download: {e}", "error")
            return False
    return False
try: # ImageBam requires clicking "Continue to image" button # Run Playwright in a separate thread to avoid event loop conflicts def run_playwright(): with sync_playwright() as p: browser = p.chromium.launch( headless=True, executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None ) page = browser.new_page(user_agent=self.user_agent) # Set referer if referer: page.set_extra_http_headers({'Referer': referer}) # Go to ImageBam page page.goto(url, wait_until='domcontentloaded') page.wait_for_timeout(2000) # Click "Continue to image" or similar button continue_buttons = [ 'button:has-text("Continue")', 'a:has-text("Continue")', 'input[value*="Continue"]', '.continue-button', 'button:has-text("Continue to image")', 'a:has-text("Continue to image")', 'a:has-text("Continue to your image")' ] for selector in continue_buttons: try: if page.locator(selector).count() > 0: page.locator(selector).first.click() page.wait_for_timeout(2000) break except Exception: continue # Now look for the actual image img_url = None # Try different methods to find the image # Method 1: Look for ImageBam hosted images (images*.imagebam.com) img_elems = page.locator('img').all() for img in img_elems: src = img.get_attribute('src') if src: # ImageBam full images are on images*.imagebam.com domains if 'images' in src and 'imagebam.com' in src and src.endswith(('.jpg', '.jpeg', '.png', '.gif')): # Check it's not a logo or small image if 'logo' not in src.lower() and 'thumb' not in src.lower(): img_url = src break # Found the full image # Method 2: Look for image in a specific container if not img_url: main_img = page.locator('#imageTarget, .main-image, .the-image, #thepic').first if main_img: img_url = main_img.get_attribute('src') # Method 3: Get from page content if not img_url: content = page.content() import re # Look for image URL in page match = 
re.search(r'(https?://[^"]+images[^"]+\.(?:jpg|jpeg|png|gif))', content) if match: img_url = match.group(1) browser.close() if img_url: # Make sure it's a full URL if not img_url.startswith('http'): img_url = urljoin(url, img_url) # Download the image headers = { 'User-Agent': self.user_agent, 'Referer': url } response = requests.get(img_url, headers=headers, timeout=30, stream=True, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Read first chunk to validate content type first_chunk = None chunks = [] for chunk in response.iter_content(chunk_size=8192): if first_chunk is None: first_chunk = chunk # Check if we got HTML instead of an image if first_chunk[:100].lower().find(b'') != -1 or \ first_chunk[:100].lower().find(b' bool: """Download image from ImageTwist (requires parsing page for direct image URL)""" import time # Rate limiting for ImageTwist (they return error images if too fast) if not hasattr(self, '_imagetwist_last_request'): self._imagetwist_last_request = 0 elapsed = time.time() - self._imagetwist_last_request if elapsed < 2.0: # Minimum 2 seconds between ImageTwist requests time.sleep(2.0 - elapsed) try: self.log(f"Fetching ImageTwist page: {url}", "debug") # First, fetch the page to find the direct image URL headers = { 'User-Agent': self.user_agent, 'Referer': referer or 'https://forum.phun.org/' } response = requests.get(url, headers=headers, timeout=30) response.raise_for_status() self._imagetwist_last_request = time.time() page_content = response.text # Look for the direct image URL in the page # ImageTwist pattern: \s]+)', page_content) if match: img_url = match.group(1) self.log(f"Found ImageTwist direct URL via regex: {img_url}", "debug") # Method 3: Look for download link if not img_url: download_link = soup.find('a', class_='ddownloader') if download_link and download_link.get('href'): img_url = download_link['href'] self.log(f"Found ImageTwist direct URL via download link: {img_url}", "debug") if not img_url: 
def login(self, forum_name: str, username: str, password: str,
          forum_url: str = None, forum_type: str = None,
          cloudflare_enabled: bool = False) -> bool:
    # NOTE(review): the def line was garbled in extraction; this signature is
    # reconstructed from the Args docstring below and the module-level usage
    # example - confirm against callers before relying on it.
    """
    Login to a forum and keep browser context alive for subsequent operations

    Args:
        forum_name: Name identifier for the forum
        username: Login username
        password: Login password
        forum_url: Base URL of the forum (optional if thread_url provided)
        forum_type: Forum software type (xenoforo, vbulletin, phpbb, discourse, ipb, mybb, smf)
            If not specified, will auto-detect
        cloudflare_enabled: Whether this forum uses Cloudflare protection

    Returns:
        bool: True if login successful
    """
    # Only create new browser if we don't have one - the browser, context
    # and Playwright driver are reused across calls for session persistence.
    if not self.playwright:
        self.playwright = sync_playwright().start()
    if not self.browser:
        # Prefer the bundled Chromium build when present; otherwise fall back
        # to Playwright's default browser resolution.
        self.browser = self.playwright.chromium.launch(
            headless=self.headless,
            executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None
        )
    if not self.context:
        self.context = self._create_browser_context(self.browser)
        # Remember which thread created the context (Playwright sync API is
        # not thread-safe across threads).
        import threading
        self._context_thread_id = threading.current_thread().ident

    # Try to load existing cookies first - if a saved session still
    # verifies, skip the credential login entirely.
    if self.authenticator.load_cookies(self.context, forum_name):
        page = self.context.new_page()
        # Use Cloudflare-aware navigation if needed
        if cloudflare_enabled:
            if not self._navigate_with_cloudflare(page, forum_url, forum_name, cloudflare_enabled):
                self.log(f"Failed to navigate to {forum_name} (Cloudflare)", "error")
                page.close()
                return False
        else:
            page.goto(forum_url)
        # Verify if still logged in
        if self.authenticator._verify_login(page, username):
            self.logged_in_forums[forum_name] = True
            self.log(f"Restored session for {forum_name}", "debug")
            # Keep browser open for subsequent operations
            return True
        # Stale session - discard this page and fall through to a fresh login
        page.close()

    page = self.context.new_page()

    # Navigate to forum (with Cloudflare support if needed)
    if forum_url:
        if cloudflare_enabled:
            if not self._navigate_with_cloudflare(page, forum_url, forum_name, cloudflare_enabled):
                self.log(f"Failed to navigate to {forum_name} for login", "error")
                return False
        else:
            page.goto(forum_url)

    # Use provided forum_type or auto-detect
    if forum_type:
        success = self.authenticator.login_with_type(page, username, password, forum_url, forum_type)
    else:
        success = self.authenticator.auto_login(page, username, password, forum_url)

    if success:
        # Persist session cookies so future runs can skip the login form
        self.authenticator.save_cookies(page, forum_name)
        self.logged_in_forums[forum_name] = True
        self.log(f"Successfully logged in to {forum_name}", "success")
    else:
        self.log(f"Failed to login to {forum_name}", "error")
        # Close browser on failure so the next attempt starts clean
        self.browser.close()
        self.browser = None
        self.context = None

    # Keep browser open for subsequent operations if successful
    return success
auto_track_days: int = 30, number_of_days: int = None, base_download_path: str = None, destination_path: str = None, username: str = None, password: str = None, newer_than_days: int = None, older_than_days: int = None, external_only: bool = True, cloudflare_enabled: bool = False) -> Dict: """ Monitor a search for new threads/posts Args: forum_name: Name of the forum search_query: Search query string search_url: URL of the search results (optional if using date filters) forum_url: Base URL of the forum (e.g., https://example.com) check_frequency_hours: How often to check (hours) auto_track_days: Days to track new threads found number_of_days: Only download posts from last N days (None = all) base_download_path: Temporary download path (default: downloads/{forum_name}/temp) destination_path: Final destination path (default: downloads/{forum_name}) username: Optional username for login password: Optional password for login newer_than_days: Search for threads newer than N days older_than_days: Search for threads older than N days cloudflare_enabled: Whether this forum uses Cloudflare protection Returns: Dictionary with search results """ if not self.use_database: self.log("Database required for search monitoring", "error") return {} conn = self._get_db_connection() cursor = conn.cursor() # Generate search ID search_id = hashlib.sha256(f"{forum_name}:{search_query}".encode()).hexdigest() # Check if search exists cursor.execute( "SELECT last_checked FROM searches WHERE search_id = ?", (search_id,) ) existing = cursor.fetchone() # Check if we should run the search if existing: last_checked = existing[0] if isinstance(existing[0], datetime) else datetime.fromisoformat(existing[0]) if datetime.now() - last_checked < timedelta(hours=check_frequency_hours): self.log(f"Search '{search_query}' checked recently, skipping", "info") conn.close() return {'status': 'skipped', 'reason': 'checked_recently'} # Perform the search self.log(f"Monitoring search: {search_query}", "info") # 
If date filters are provided and no search_url, perform advanced search if (newer_than_days or older_than_days) and not search_url: search_url = self._perform_advanced_search( forum_name=forum_name, search_query=search_query, forum_url=forum_url, newer_than_days=newer_than_days, older_than_days=older_than_days, username=username, password=password, cloudflare_enabled=cloudflare_enabled ) if not search_url: self.log("Advanced search failed", "error") conn.close() return {'status': 'error', 'message': 'Advanced search failed'} elif not search_url: self.log("Search URL required when not using date filters", "error") conn.close() return {'status': 'error', 'message': 'Search URL required'} # Check for special phun.org marker (results already scraped to avoid Cloudflare) if search_url == "PHUN_RESULTS_READY": results = getattr(self, '_phun_search_results', []) self._phun_search_results = [] # Clear after use else: results = self._scrape_search_results(search_url) # Filter results to only include threads that contain ALL search terms in the title if search_query and results: filtered_results = [] search_terms = search_query.lower().split() # Split search query into words for result in results: title = result.get('title', '').lower() # Check if ALL search terms appear in the title if title and all(term in title for term in search_terms): filtered_results.append(result) else: self.log(f"Skipping thread (search term not in title): {result.get('title', 'Unknown')[:60]}...", "debug") if len(filtered_results) < len(results): self.log(f"Filtered {len(results) - len(filtered_results)} threads that don't match search query", "info") results = filtered_results # Update or insert search record if existing: cursor.execute(''' UPDATE searches SET last_checked = ?, results_found = ? WHERE search_id = ? 
''', (datetime.now().isoformat(), len(results), search_id)) else: cursor.execute(''' INSERT INTO searches (search_id, forum_name, search_query, search_url, last_checked, check_frequency_hours, active, results_found) VALUES (?, ?, ?, ?, ?, ?, TRUE, ?) ''', (search_id, forum_name, search_query, search_url, datetime.now().isoformat(), check_frequency_hours, len(results))) # Process results new_threads = 0 new_thread_results = [] # Track only new threads to download monitor_until = datetime.now() + timedelta(days=auto_track_days) for result in results: thread_id = result.get('thread_id') thread_url = result.get('url') # Check if thread exists and if it's still being monitored thread_exists = False should_monitor = True if self.db_adapter: # Check if URL is already downloaded in unified database thread_exists = self.db_adapter.is_already_downloaded(thread_url, forum_name=forum_name) if thread_exists: # Check if monitor_until has expired thread_data = self.db_adapter.db_get_thread(thread_id) if thread_data and thread_data.get('monitor_until'): monitor_until_str = thread_data.get('monitor_until') try: monitor_until_date = datetime.fromisoformat(monitor_until_str) if datetime.now() > monitor_until_date: should_monitor = False self.log(f"Thread monitoring expired, skipping: {result.get('title', 'Unknown')[:60]}...", "debug") else: self.log(f"Thread exists but still monitoring for updates: {result.get('title', 'Unknown')[:60]}...", "debug") except Exception: pass # If parsing fails, continue monitoring else: # Fallback to local database check cursor.execute( "SELECT thread_id, monitor_until FROM threads WHERE thread_id = ? 
OR thread_url = ?", (thread_id, thread_url) ) row = cursor.fetchone() if row: thread_exists = True if row[1]: # monitor_until exists try: monitor_until_date = datetime.fromisoformat(row[1]) if datetime.now() > monitor_until_date: should_monitor = False except Exception: pass if not thread_exists or (thread_exists and should_monitor): # New thread OR existing thread still being monitored if not thread_exists: # Add new thread to tracking if self.db_adapter: thread_added = self.db_adapter.db_add_thread( thread_id=thread_id or hashlib.sha256(thread_url.encode()).hexdigest(), forum_name=forum_name, thread_url=thread_url, thread_title=result.get('title', 'Unknown'), monitor_until=monitor_until ) if thread_added: self.log(f"Added thread to monitoring for 30 days: {result.get('title', 'Unknown')[:60]}...", "info") else: # Fallback to local database cursor.execute(''' INSERT OR IGNORE INTO threads (thread_id, forum_name, thread_url, thread_title, author, created_date, last_checked, status, monitor_until) VALUES (?, ?, ?, ?, ?, ?, ?, 'active', ?) 
''', ( thread_id or hashlib.sha256(thread_url.encode()).hexdigest(), forum_name, thread_url, result.get('title', 'Unknown'), result.get('author', 'Unknown'), result.get('date', datetime.now().isoformat()) if isinstance(result.get('date'), str) else datetime.now().isoformat(), datetime.now().isoformat(), monitor_until.isoformat() )) new_threads += 1 self.log(f"New thread found: {result.get('title', 'Unknown')[:60]}...", "info") else: # Existing thread still being monitored - just update last_checked if self.db_adapter: self.db_adapter.db_update_thread( thread_id=thread_id, last_post_date=None, post_count=None ) self.log(f"Checking monitored thread for updates: {result.get('title', 'Unknown')[:60]}...", "info") # Add to results list for downloading/checking new_thread_results.append(result) else: # Thread already downloaded - skip it self.log(f"Thread already downloaded, skipping: {result.get('title', 'Unknown')[:60]}...", "info") # Update monitoring in unified database if using adapter if self.db_adapter: self.db_adapter.db_update_thread( thread_id=thread_id, last_post_date=None, post_count=None ) else: # Thread exists - update monitor_until if it's NULL cursor.execute(''' UPDATE threads SET monitor_until = ?, last_checked = ? WHERE thread_id = ? AND monitor_until IS NULL ''', (monitor_until.isoformat(), datetime.now().isoformat(), thread_id)) # Link to search cursor.execute(''' INSERT OR IGNORE INTO search_results (search_id, thread_id, found_date) VALUES (?, ?, ?) 
            ''', (search_id, thread_id, datetime.now().isoformat()))

        conn.commit()
        conn.close()

        self.stats['searches_monitored'] += 1
        self.stats['new_threads_found'] += new_threads

        skipped_threads = len(results) - new_threads
        if skipped_threads > 0:
            self.log(f"Search complete: {len(results)} results found, {new_threads} new threads, {skipped_threads} already downloaded", "success")
        else:
            self.log(f"Search complete: {len(results)} results, {new_threads} new threads", "success")

        # Don't close browser here - it might be needed for downloads
        # Let download_thread handle its own browser lifecycle
        return {
            'status': 'success',
            'total_results': len(results),
            'new_threads': new_threads,
            'skipped_threads': len(results) - new_threads,
            'search_id': search_id,
            'results': new_thread_results  # Return ONLY new threads to download
        }

    def download_thread(self, thread_url: str, forum_name: str = None,
                        download_images: bool = True,
                        update_existing: bool = True,
                        number_of_days: int = None,
                        base_download_path: str = None,
                        destination_path: str = None,
                        username: str = None, password: str = None,
                        external_only: bool = True,
                        recycle_context: bool = True,
                        skip_file_move: bool = False,
                        cloudflare_enabled: bool = False,
                        defer_database: bool = False,
                        auto_track_days: int = 30) -> Dict:
        """
        Download a forum thread with all posts and images

        Args:
            thread_url: URL of the thread
            forum_name: Name of the forum (auto-detected if not provided)
            download_images: Whether to download images
            update_existing: Update existing posts
            number_of_days: Only download posts from last N days (None = all)
            base_download_path: Temporary download path (default: downloads/{forum_name}/temp)
            destination_path: Final destination path (default: downloads/{forum_name})
            username: Forum username for authentication (optional if already logged in)
            password: Forum password for authentication (optional if already logged in)
            external_only: If True, skip internal forum attachments and only
                download externally-hosted images
            recycle_context: If True, recycle self.context after external-only downloads
            skip_file_move: If True, leave files in the temp directory (no move step)
            cloudflare_enabled: Passed through for Cloudflare-protected navigation
            defer_database: If True, don't record to unified database immediately -
                store in pending_downloads for later recording after file move is complete
            auto_track_days: Number of days to monitor the thread for updates (default: 30)

        Returns:
            Dictionary with download results (status, thread_id, counts, paths)
        """
        # Store defer_database and cloudflare_enabled for use in method
        self._current_defer_database = defer_database
        self._current_cloudflare_enabled = cloudflare_enabled

        self.log(f"Downloading thread: {thread_url}", "info")
        self.activity_manager.update_status(f"Checking forum thread: {forum_name or 'unknown'}")

        # Extract thread ID from URL (forum-specific)
        thread_id = self._extract_thread_id(thread_url)

        if self.use_database:
            conn = self._get_db_connection()
            cursor = conn.cursor()

            # Check if thread exists
            cursor.execute(
                "SELECT last_post_date, post_count FROM threads WHERE thread_id = ?",
                (thread_id,)
            )
            existing = cursor.fetchone()

            if existing and not update_existing:
                self.log(f"Thread {thread_id} already downloaded, skipping", "info")
                conn.close()
                return {'status': 'skipped', 'thread_id': thread_id}

        # Setup authentication if needed
        context = None
        browser = None
        thread_data = None
        local_playwright = None

        # Check if we're running in a different thread than where self.context was created
        # Playwright contexts cannot be shared across threads
        import threading
        current_thread_id = threading.current_thread().ident
        context_thread_id = getattr(self, '_context_thread_id', None)
        can_reuse_context = (self.context and self.browser and context_thread_id == current_thread_id)

        # Check if we already have a browser context from login() in the SAME thread
        if can_reuse_context:
            # Use existing authenticated browser context
            context = self.context
            browser = self.browser
            self.log(f"Using existing browser context for {forum_name}", "debug")
            thread_data = self._scrape_thread(thread_url, context)
        elif username and password:
            # Create new browser context if not already logged in
            local_playwright = sync_playwright().start()
            browser = local_playwright.chromium.launch(
                headless=self.headless,
                executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None
            )
            context = self._create_browser_context(browser)

            # Try to load existing cookies first
            cookies_loaded = forum_name and self.authenticator.load_cookies(context, forum_name)
            if cookies_loaded:
                self.log(f"Loaded saved cookies for {forum_name}", "debug")
                # Visit forum base URL to renew session (xf_user remember-me cookie)
                try:
                    from urllib.parse import urlparse
                    base_url = f"{urlparse(thread_url).scheme}://{urlparse(thread_url).netloc}/"
                    temp_page = context.new_page()
                    temp_page.goto(base_url, wait_until='load', timeout=15000)
                    temp_page.wait_for_timeout(2000)
                    if self.authenticator._verify_login(temp_page, username):
                        self.logged_in_forums[forum_name] = True
                        self.authenticator.save_cookies(temp_page, forum_name)
                        self.log(f"Session renewed for {forum_name}", "debug")
                    else:
                        self.log(f"Session expired for {forum_name}, will re-login", "debug")
                        cookies_loaded = False
                    temp_page.close()
                except Exception as e:
                    self.log(f"Error renewing session: {e}", "debug")
                    cookies_loaded = False

            # Login if no cookies or session expired
            if not cookies_loaded and forum_name and forum_name not in self.logged_in_forums:
                temp_page = context.new_page()
                if self.authenticator.auto_login(temp_page, username, password, thread_url):
                    self.authenticator.save_cookies(temp_page, forum_name)
                    self.logged_in_forums[forum_name] = True
                    self.log(f"Logged in to {forum_name}", "success")
                temp_page.close()

            # Scrape thread within the context
            thread_data = self._scrape_thread(thread_url, context)
        else:
            # Scrape without authentication
            thread_data = self._scrape_thread(thread_url, None)

        if not thread_data:
            self.log(f"Failed to scrape thread: {thread_url}", "error")
            return {'status': 'error', 'thread_id': thread_id}

        # Create thread directory with custom paths
        # (strip filesystem-unsafe characters, cap title length at 100)
        safe_title = re.sub(r'[<>:"/\\|?*]', '_', thread_data['title'][:100])

        # Use custom paths if provided
        if base_download_path:
            base_path = Path(base_download_path)
        else:
            base_path = self.download_dir / (forum_name or 'unknown') / 'temp'
        if destination_path:
            dest_path = Path(destination_path)
        else:
            dest_path = self.download_dir / (forum_name or 'unknown')

        # Initially download to base path
        thread_dir = base_path / safe_title
        thread_dir.mkdir(parents=True, exist_ok=True)

        # Final destination directory
        final_dir = dest_path / safe_title
        final_dir.mkdir(parents=True, exist_ok=True)

        # Save thread info
        if self.use_database:
            # Add to unified database if using adapter
            if self.db_adapter:
                # Calculate monitor_until using configured auto_track_days
                monitor_until = datetime.now() + timedelta(days=auto_track_days)
                thread_added = self.db_adapter.db_add_thread(
                    thread_id=thread_id,
                    forum_name=forum_name or self._detect_forum(thread_url),
                    thread_url=thread_url,
                    thread_title=thread_data['title'],
                    monitor_until=monitor_until
                )
                if thread_added:
                    self.log(f"Added thread to monitoring database for {auto_track_days} days", "debug")
                # Update with post count and mark as just checked
                self.db_adapter.db_update_thread(
                    thread_id=thread_id,
                    last_post_date=thread_data.get('last_post_date'),
                    post_count=len(thread_data.get('posts', []))
                )
            else:
                # Fallback to local database (upsert keyed on thread_id)
                cursor.execute('''
                    INSERT INTO threads
                    (thread_id, forum_name, thread_url, thread_title, author, created_date, last_checked, last_post_date, post_count, status)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (thread_id) DO UPDATE SET
                        forum_name = EXCLUDED.forum_name,
                        thread_url = EXCLUDED.thread_url,
                        thread_title = EXCLUDED.thread_title,
                        author = EXCLUDED.author,
                        last_checked = EXCLUDED.last_checked,
                        last_post_date = EXCLUDED.last_post_date,
                        post_count = EXCLUDED.post_count,
                        status = EXCLUDED.status
                ''', (
                    thread_id,
                    forum_name or self._detect_forum(thread_url),
                    thread_url,
                    thread_data['title'],
                    thread_data.get('author', 'Unknown'),
                    thread_data.get('created_date') if isinstance(thread_data.get('created_date'), str) else datetime.now().isoformat(),
                    datetime.now().isoformat(),
                    thread_data.get('last_post_date') if isinstance(thread_data.get('last_post_date'), str) else datetime.now().isoformat(),
                    len(thread_data.get('posts', [])),
                    'active'
                ))
            conn.commit()
            conn.close()  # Close connection before queueing to avoid database lock

        # Process posts
        downloaded_posts = 0
        downloaded_images = 0
        queued_images = 0
        images_to_queue = []  # Collect images to queue

        # Apply date filtering if specified
        cutoff_date = None
        if number_of_days:
            cutoff_date = datetime.now() - timedelta(days=number_of_days)
            self.log(f"Filtering posts from last {number_of_days} days (after {cutoff_date.strftime('%Y-%m-%d')})", "info")

        # Reopen database connection for post processing
        if self.use_database:
            conn = self._get_db_connection()
            cursor = conn.cursor()

        for post in thread_data.get('posts', []):
            # Check date filter
            if cutoff_date and post.get('date'):
                try:
                    post_date = datetime.fromisoformat(post.get('date').replace('Z', '+00:00'))
                    if post_date < cutoff_date:
                        continue  # Skip posts older than cutoff
                except Exception:
                    pass  # If can't parse date, include the post

            # Synthesize a stable post id when the forum didn't provide one
            post_id = post.get('id') or hashlib.sha256(
                f"{thread_id}:{post.get('author')}:{post.get('date')}".encode()
            ).hexdigest()

            # Check if post exists
            if self.use_database:
                cursor.execute(
                    "SELECT downloaded FROM posts WHERE post_id = ?",
                    (post_id,)
                )
                post_exists = cursor.fetchone()
                if post_exists and not update_existing:
                    continue

            # Skip JSON saving - we only want images

            # Save to database (upsert keyed on post_id)
            if self.use_database:
                cursor.execute('''
                    INSERT INTO posts
                    (post_id, thread_id, post_url, author, post_date, content_hash, has_images, downloaded, download_date)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (post_id) DO UPDATE SET
                        thread_id = EXCLUDED.thread_id,
                        post_url = EXCLUDED.post_url,
                        author = EXCLUDED.author,
                        post_date = EXCLUDED.post_date,
                        content_hash = EXCLUDED.content_hash,
                        has_images = EXCLUDED.has_images,
                        downloaded = EXCLUDED.downloaded,
                        download_date = EXCLUDED.download_date
                ''', (
                    post_id,
                    thread_id,
                    post.get('url') or None,
                    post.get('author', 'Unknown'),
                    post.get('date', datetime.now().isoformat()) if not isinstance(post.get('date'), str) else post.get('date'),
                    self._get_content_hash(post.get('content', '')),
                    len(post.get('images', [])) > 0,
                    True,
                    datetime.now().isoformat()
                ))

            downloaded_posts += 1

            # Collect images for download if requested
            if download_images and post.get('images'):
                # Extract date for timestamp updating and filename prefix
                post_date_str = None
                post_date_obj = None
                thread_title = thread_data.get('title', '')

                # Try to extract date from thread title first (most reliable for these forums)
                if DATE_UTILS_AVAILABLE and thread_title:
                    post_date_obj = DateHandler.extract_date_from_text(thread_title)
                    if post_date_obj:
                        self.log(f"Extracted date from title for filename: {post_date_obj.strftime('%Y%m%d_%H%M%S')}", "debug")

                # Fall back to post date from forum
                if not post_date_obj and post.get('date'):
                    post_date_str = post.get('date')
                    try:
                        if 'T' in str(post_date_str):
                            post_date_obj = datetime.fromisoformat(post_date_str.replace('Z', '+00:00'))
                        else:
                            # Try common forum date formats
                            for fmt in ['%b %d, %Y at %I:%M %p', '%B %d, %Y', '%d %b %Y', '%Y-%m-%d', '%m/%d/%Y']:
                                try:
                                    post_date_obj = datetime.strptime(str(post_date_str).strip(), fmt)
                                    break
                                except ValueError:
                                    continue
                    except Exception:
                        pass

                # Prepare metadata for queue
                # NOTE(review): this dict carries no 'thread_id'/'post_id' keys,
                # but the recording code below reads metadata.get('thread_id') /
                # metadata.get('post_id') - presumably add_to_download_queue or
                # the download manager enriches metadata; verify.
                metadata = {
                    'post_title': thread_title,  # Thread title for date extraction
                    'post_date': post_date_obj.isoformat() if post_date_obj else post_date_str,
                    'post_author': post.get('author', 'Unknown'),
                    'thread_title': thread_title
                }

                # Collect images to queue later
                for img_url in post.get('images', []):
                    # Skip if external_only and it's an internal attachment
                    if external_only and '/attachments/' in img_url:
                        continue

                    # Don't process pixhost URLs here - let download manager handle it
                    img_filename = self._get_image_filename(img_url, post_date=post_date_obj)
                    img_path = thread_dir / img_filename

                    # Check if already exists locally
                    if img_path.exists():
                        self.log(f"Skipping existing local file: {img_filename}", "info")
                        continue

                    # Collect item to queue later (after closing DB)
                    images_to_queue.append({
                        'url': img_url,
                        'save_path': img_path,
                        'referer': thread_url,
                        'thread_id': thread_id,
                        'post_id': post_id,
                        'forum_name': forum_name,
                        'metadata': metadata
                    })

        if self.use_database:
            conn.commit()
            conn.close()

        # Now add collected images to queue (after DB is closed)
        # Track which URLs were actually added (not duplicates)
        newly_queued_urls = set()
        for item in images_to_queue:
            if self.add_to_download_queue(**item):
                queued_images += 1
                newly_queued_urls.add(item['url'])
                self.log(f"Queued: {Path(item['save_path']).name}", "debug")

        # Count how many were actually queued vs skipped
        skipped_count = len(images_to_queue) - queued_images
        if skipped_count > 0:
            self.log(f"Skipped {skipped_count} duplicate images from database", "info")

        # Process downloads with the new DownloadManager
        if queued_images > 0:
            self.log(f"Processing {queued_images} new images with multi-threaded downloader...", "info")

            # Create download manager with appropriate settings
            # When using adapter (db_path is None), disable download manager's own database
            if self.db_path:
                dm_db_path = str(self.db_path).replace('.db', '_downloads.db')
                dm_use_db = self.use_database
            else:
                dm_db_path = ":memory:"  # Use in-memory database
                dm_use_db = False  # Don't track in download manager's DB

            download_manager = DownloadManager(
                max_workers=10,  # Increased concurrent downloads
                rate_limit=0.2,  # Faster rate limit
                timeout=60,  # Increased timeout for large images
                show_progress=self.show_progress,
                show_debug=False,  # Hide debug messages
                use_database=dm_use_db,
                db_path=dm_db_path
            )

            # Set Playwright context for authenticated downloads
            if context:
                download_manager.set_playwright_context(context)

            # Convert to DownloadItem objects (only newly queued items)
            download_items = []
            for item in images_to_queue:
                # Skip items that weren't actually queued in THIS run
                # (duplicates or already pending from previous runs)
                if item['url'] not in newly_queued_urls:
                    continue

                # Extract post date for timestamp updating
                post_date = None
                fallback_date = None
                if item['metadata']:
                    # First try to get the actual post date as fallback
                    if item['metadata'].get('post_date'):
                        try:
                            fallback_date = datetime.fromisoformat(item['metadata']['post_date'])
                        except Exception:
                            pass
                    # Try to extract date from post title, with post date as fallback
                    post_title = item['metadata'].get('post_title', '')
                    if post_title:
                        post_date = DateHandler.extract_date_from_text(post_title, fallback_date=fallback_date)
                    else:
                        # No title to extract from, use the post date directly
                        post_date = fallback_date

                download_items.append(DownloadItem(
                    url=item['url'],
                    save_path=item['save_path'],
                    referer=item['referer'],
                    metadata=item['metadata'],
                    post_date=post_date
                ))

            # Close browser context NOW if we're downloading external images only
            # The download manager uses requests for external images, not playwright
            if external_only:
                self.log("Closing browser pages (keeping context alive for reuse)", "debug")
                try:
                    # Only close the page, keep context alive for next thread
                    # NOTE(review): 'page' is never bound in this method, so this
                    # guard always fails here - likely a leftover; confirm.
                    if 'page' in locals() and page:
                        page.close()
                        page = None
                    # If recycle_context is True and this is self.context, recycle it
                    if recycle_context and context == self.context:
                        self.log("Recycling browser context", "debug")
                        if self.context:
                            self.context.close()
                            self.context = None
                        # Create new context for next use
                        if self.browser:
                            self.context = self._create_browser_context(self.browser)
                            import threading
                            self._context_thread_id = threading.current_thread().ident
                            # Reload cookies for authenticated forums
                            if forum_name and forum_name in self.logged_in_forums:
                                self.log(f"Reloading cookies for {forum_name}", "debug")
                                self.authenticator.load_cookies(self.context, forum_name)
                    # Only close local browser/context if different from self
                    elif context and context != self.context:
                        context.close()
                        context = None
                    if 'browser' in locals() and browser and browser != self.browser:
                        browser.close()
                        browser = None
                except Exception as e:
                    self.log(f"Error managing browser context: {e}", "debug")

            # Download all items - split large batches to prevent timeouts
            if len(download_items) > 50:
                self.log(f"Large batch ({len(download_items)} images), downloading in chunks", "info")
                all_results = []
                chunk_size = 30
                for i in range(0, len(download_items), chunk_size):
                    chunk = download_items[i:i+chunk_size]
                    self.log(f"Downloading chunk {i//chunk_size + 1}/{(len(download_items)-1)//chunk_size + 1} ({len(chunk)} images)", "info")
                    chunk_results = download_manager.download_batch(chunk)
                    all_results.extend(chunk_results)
                    # Keep browser alive between chunks if still in use
                    if self.context and i + chunk_size < len(download_items):
                        try:
                            self.keep_alive()
                        except Exception:
                            pass  # Browser may already be closed for external downloads
                results = all_results
            else:
                # Download all items at once for small batches
                results = download_manager.download_batch(download_items)

            # Count successful downloads
            downloaded_images = len([r for r in results if r.success])
            failed_images = len([r for r in results if not r.success])
            self.log(f"Download complete: {downloaded_images} successful, {failed_images} failed", "success")

            # Update download_queue status for successful downloads
            if self.use_database and results:
                conn = self._get_db_connection()
                cursor = conn.cursor()
                for result in results:
                    if result.success:
                        # Mark as completed in download_queue
                        cursor.execute('''
                            UPDATE download_queue
                            SET status = 'completed', downloaded_date = CURRENT_TIMESTAMP
                            WHERE url = ? AND status = 'pending'
                        ''', (result.item.url,))

                        # Also record in unified database if using adapter
                        if self.db_adapter:
                            try:
                                metadata = result.item.metadata or {}
                                # Extract filename and file_path from save_path
                                filename = result.item.save_path.name if result.item.save_path else None
                                file_path = str(result.item.save_path) if result.item.save_path else None
                                # Get post_date from the DownloadItem
                                item_post_date = result.item.post_date if hasattr(result.item, 'post_date') else None

                                # If deferred, store for later recording after file move
                                if getattr(self, '_current_defer_database', False):
                                    self.pending_downloads.append({
                                        'url': result.item.url,
                                        'thread_id': metadata.get('thread_id'),
                                        'post_id': metadata.get('post_id'),
                                        'filename': filename,
                                        'file_path': file_path,
                                        'metadata': metadata,
                                        'post_date': item_post_date
                                    })
                                    # NOTE(review): f-string has no placeholder -
                                    # probably intended to log the filename; confirm.
                                    self.log(f"Deferred recording for (unknown)", "debug")
                                else:
                                    self.db_adapter.record_download(
                                        url=result.item.url,
                                        thread_id=metadata.get('thread_id'),
                                        post_id=metadata.get('post_id'),
                                        filename=filename,
                                        metadata=metadata,
                                        file_path=file_path,
                                        post_date=item_post_date
                                    )
                            except Exception as e:
                                self.log(f"Failed to record download in unified database: {e}", "error")
                conn.commit()
                conn.close()
                self.log(f"Updated {downloaded_images} items in download queue to completed", "debug")

        # Update our stats already included in downloaded_images
        self.stats['threads_processed'] += 1
        self.stats['posts_downloaded'] += downloaded_posts
        self.stats['images_downloaded'] += downloaded_images

        # Track downloaded file paths for notification attachments
        downloaded_file_paths = []

        # Move files from base_path to destination_path if different (unless skip_file_move is True)
        if not skip_file_move and base_download_path and destination_path and thread_dir != final_dir and downloaded_images > 0:
            try:
                import shutil
                # Use MoveManager to move files (handles file_inventory registration and face recognition)
                unified_db = self.db_adapter.unified_db if self.db_adapter and hasattr(self.db_adapter, 'unified_db') else None
                move_manager = MoveManager(
                    unified_db=unified_db,
                    face_recognition_enabled=True  # Enable face recognition for forum downloads
                )
                # Set batch context for proper file_inventory registration
                move_manager.batch_context = {
                    'platform': 'forums',
                    'source': forum_name
                }
                files_moved = 0

                # Get post date from thread data (actual forum post date)
                post_date = None
                if thread_data:
                    # First try to get the actual last_post_date from the forum
                    last_post_date = thread_data.get('last_post_date')
                    if last_post_date:
                        try:
                            if isinstance(last_post_date, str):
                                post_date = datetime.fromisoformat(last_post_date.replace('Z', '+00:00'))
                            elif isinstance(last_post_date, datetime):
                                post_date = last_post_date
                            if post_date:
                                self.log(f"Using forum post date: {post_date.strftime('%Y-%m-%d %H:%M')}", "debug")
                        except Exception as e:
                            self.log(f"Failed to parse last_post_date: {e}", "debug")

                    # Fallback: try to extract from title if no post date
                    if not post_date and DATE_UTILS_AVAILABLE:
                        thread_title = thread_data.get('title', '')
                        if thread_title:
                            post_date = DateHandler.extract_date_from_text(thread_title)
                            if post_date:
                                self.log(f"Extracted date from title: {post_date.strftime('%Y-%m-%d')}", "debug")

                for file in thread_dir.rglob('*'):
                    if file.is_file():
                        relative_path = file.relative_to(thread_dir)
                        dest_file = final_dir / relative_path
                        dest_file.parent.mkdir(parents=True, exist_ok=True)

                        # Use MoveManager.move_file() which handles:
                        # - Duplicate detection via hash
                        # - file_inventory registration
                        # - EXIF and filesystem timestamp updates (centralized)
                        # - Face recognition (moves to review queue if no match)
                        if move_manager.move_file(file, dest_file, timestamp=post_date):
                            files_moved += 1
                        elif dest_file.exists():
                            # File was skipped (already exists at destination)
                            pass

                # Only add files that matched faces (not review queue) to notification list
                # move_manager.moved_files contains only matched files
                # move_manager.review_queue_files contains files without face matches
                matched_count = len(move_manager.moved_files)
                for file_info in move_manager.moved_files:
                    file_path = file_info.get('file_path')
                    if file_path:
                        downloaded_file_paths.append(file_path)

                # DEBUG: Log what we're adding to notification
                self.log(f"Added {matched_count} face-matched files to notification list", "debug")
                if matched_count > 0:
                    for fp in downloaded_file_paths[:3]:  # Log first 3
                        self.log(f" - {Path(fp).name}", "debug")

                # Log review queue files for debugging
                review_count = len(move_manager.review_queue_files)
                if review_count > 0:
                    self.log(f"{review_count} files moved to review queue (no face match)", "info")

                # Clean up temp directory completely
                if thread_dir.exists():
                    try:
                        # Force remove the entire thread directory and all its contents
                        import shutil
                        shutil.rmtree(thread_dir, ignore_errors=True)
                        self.log(f"Removed thread directory: {thread_dir}", "debug")
                    except Exception as e:
                        self.log(f"Failed to remove thread directory {thread_dir}: {e}", "warning")

                # Clean up all parent directories up to base_download_path
                # Start from the parent of thread_dir and work up
                parent = thread_dir.parent if not thread_dir.exists() else thread_dir.parent
                base_path = Path(base_download_path)

                # Keep going up until we reach base_download_path or its parent
                while parent and parent != base_path.parent and parent != base_path.parent.parent:
                    try:
                        if parent.exists():
                            # Check if directory is empty
                            if not any(parent.iterdir()):
                                parent.rmdir()
                                self.log(f"Removed empty parent directory: {parent}", "debug")
                            else:
                                # Directory not empty, check if it only contains empty subdirs
                                subdirs = [d for d in parent.iterdir() if d.is_dir()]
                                if subdirs and all(not any(d.iterdir()) for d in subdirs):
                                    # All subdirs are empty, remove them
                                    for subdir in subdirs:
                                        try:
                                            subdir.rmdir()
                                            self.log(f"Removed empty subdirectory: {subdir}", "debug")
                                        except Exception:
                                            pass
                                    # Try to remove parent again if now empty
                                    if not any(parent.iterdir()):
                                        parent.rmdir()
                                        self.log(f"Removed parent directory after cleaning subdirs: {parent}", "debug")
                        # Move up one level
                        parent = parent.parent
                    except Exception as e:
                        self.log(f"Error cleaning parent directory {parent}: {e}", "debug")
                        break

                if files_moved > 0:
                    self.log(f"Moved {files_moved} files to: {final_dir}", "info")
            except Exception as e:
                self.log(f"Error moving files: {e}", "error")
        elif downloaded_images > 0 and final_dir and final_dir.exists():
            # Files were downloaded directly to final location (no move needed)
            # Track the file paths for notification attachments
            for file in final_dir.rglob('*'):
                if file.is_file():
                    downloaded_file_paths.append(str(file))
            if downloaded_file_paths:
                self.log(f"Tracked {len(downloaded_file_paths)} files in: {final_dir}", "debug")

        self.log(
            f"Thread complete: {downloaded_posts} posts, {downloaded_images} images",
            "success"
        )

        # Update last_checked timestamp to prevent immediate re-checking by monitor
        if self.use_database and thread_id:
            try:
                conn = self._get_db_connection()
                cursor = conn.cursor()
                # Update last_checked to current time
                cursor.execute('''
                    UPDATE threads SET last_checked = ? WHERE thread_id = ?
                ''', (datetime.now().isoformat(), thread_id))
                conn.commit()
                conn.close()
                self.log(f"Updated last_checked timestamp for thread {thread_id}", "debug")
            except Exception as e:
                self.log(f"Failed to update last_checked timestamp: {e}", "warning")

        # Also update in unified database if available
        if self.db_adapter and thread_id:
            try:
                self.db_adapter.db_update_thread_last_checked(thread_id)
            except Exception as e:
                self.log(f"Failed to update last_checked in unified database: {e}", "warning")

        # Close browser only if we created it locally (not if using existing from login())
        if local_playwright and browser:
            browser.close()
            local_playwright.stop()

        return {
            'status': 'success',
            'thread_id': thread_id,
            'posts_downloaded': downloaded_posts,
            'images_downloaded': downloaded_images,
            'thread_dir': str(thread_dir),  # Temp directory where files were downloaded
            'final_dir': str(final_dir) if destination_path else None,  # Final destination directory
            'downloaded_file_paths': downloaded_file_paths  # List of final file paths for notifications
        }

    def update_monitored_threads(self, force_all: bool = False) -> Dict:
        """
        Update all monitored threads

        Args:
            force_all: Update all threads regardless of monitor_until date

        Returns:
            Dictionary with update results
        """
        if not self.use_database:
            self.log("Database required for thread monitoring", "error")
            return {}

        conn = self._get_db_connection()
        cursor = conn.cursor()

        # Get threads to update
        if force_all:
            cursor.execute(
                "SELECT thread_id, thread_url, forum_name FROM threads WHERE status = 'active'"
            )
        else:
            cursor.execute('''
                SELECT thread_id, thread_url, forum_name FROM threads
                WHERE status = 'active' AND (monitor_until IS NULL OR monitor_until > ?)
            ''', (datetime.now().isoformat(),))

        threads = cursor.fetchall()
        conn.close()

        self.log(f"Updating {len(threads)} monitored threads", "info")

        results = {
            'total': len(threads),
            'updated': 0,
            'new_posts': 0,
            'errors': 0
        }

        for thread_id, thread_url, forum_name in threads:
            try:
                result = self.download_thread(
                    thread_url,
                    forum_name=forum_name,
                    update_existing=True
                )
                if result['status'] == 'success':
                    results['updated'] += 1
                    # Track new posts (would need to compare with previous count)
            except Exception as e:
                self.log(f"Error updating thread {thread_id}: {e}", "error")
                results['errors'] += 1
                self.stats['errors'] += 1

            # Rate-limit between threads regardless of success or failure
            self._apply_rate_limit()

        return results

    def _perform_advanced_search(self, forum_name: str, search_query: str,
                                 forum_url: str = None,
                                 newer_than_days: int = None,
                                 older_than_days: int = None,
                                 username: str = None, password: str = None,
                                 cloudflare_enabled: bool = False) -> str:
        """
        Perform advanced search with date filters
        Returns the search results URL or None if failed
        """
        from datetime import datetime, timedelta

        # Calculate dates (MM/DD/YYYY, the format the search forms expect)
        newer_date = (datetime.now() - timedelta(days=newer_than_days)).strftime('%m/%d/%Y') if newer_than_days else None
        older_date = (datetime.now() - timedelta(days=older_than_days)).strftime('%m/%d/%Y') if older_than_days else None

        page = None
        try:
            # Check thread safety before using self.context - Playwright contexts
            # cannot be shared across threads (causes "Cannot switch to a different thread" error)
            import threading
            current_thread_id = threading.current_thread().ident
            context_thread_id = getattr(self, '_context_thread_id', None)
            can_use_self_context = (self.context and context_thread_id == current_thread_id)

            # Use existing context if available (from login session) AND in same thread
            if can_use_self_context:
                page = self.context.new_page()
            else:
                # Need to create a new browser context (thread-safe)
                if not self.playwright:
                    self.playwright = sync_playwright().start()
                if not self.browser:
                    self.browser = self.playwright.chromium.launch(
headless=self.headless, executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None ) if not self.context: self.context = self._create_browser_context(self.browser) self._context_thread_id = current_thread_id page = self.context.new_page() # Validate forum URL if not forum_url: self.log(f"Forum URL is required for {forum_name}", "error") return None # Special handling for PicturePub - use form with date fields if forum_name == 'PicturePub': return self._perform_picturepub_search(page, forum_url, newer_date, older_date, search_query) # Special handling for phun.org - use direct URL search to avoid Cloudflare form challenge if 'phun.org' in forum_url.lower(): return self._perform_phun_search(page, forum_url, newer_date, search_query, cloudflare_enabled, forum_name) # Navigate to search page (with Cloudflare support) search_page_url = f"{forum_url}/search/" if not self._navigate_with_cloudflare(page, search_page_url, forum_name, cloudflare_enabled): self.log(f"Failed to navigate to search page for {forum_name}", "error") return None page.wait_for_timeout(500) # Click "Search everything" tab - try multiple selectors (English and German) search_tab_selectors = [ "text='Search everything'", "text='Everything'", "text='Alles durchsuchen'", "a:has-text('Everything')", "a:has-text('Alles')", "a[data-nav-id='everything']", ".tabPanes a:first" ] tab_clicked = False for selector in search_tab_selectors: try: if page.locator(selector).count() > 0: page.locator(selector).first.click() tab_clicked = True break except Exception: continue if tab_clicked: page.wait_for_timeout(500) # Scroll down to see date fields page.evaluate("window.scrollBy(0, 400)") page.wait_for_timeout(300) # Fill date filters FIRST (important for XenForo) if newer_date: # Try multiple selectors for newer date field newer_selectors = [ 'input[name="c[newer_than]"]', 
'input[name="newer_than"]', 'input[placeholder*="Newer"]', 'input.input--date:first' ] newer_field = None for selector in newer_selectors: try: if page.locator(selector).count() > 0: newer_field = page.locator(selector).first break except Exception: continue if newer_field: newer_field.click() newer_field.clear() newer_field.type(newer_date, delay=50) page.keyboard.press('Tab') self.log(f"Set newer_than: {newer_date}", "info") if older_date: # Try multiple selectors for older date field older_selectors = [ 'input[name="c[older_than]"]', 'input[name="older_than"]', 'input[placeholder*="Older"]', 'input.input--date:last' ] older_field = None for selector in older_selectors: try: if page.locator(selector).count() > 0: older_field = page.locator(selector).first break except Exception: continue if older_field: older_field.click() older_field.clear() older_field.type(older_date, delay=50) page.keyboard.press('Tab') self.log(f"Set older_than: {older_date}", "info") page.wait_for_timeout(300) # Check "Search titles only" checkbox (supports English and German) titles_selectors = [ 'label:has-text("Search titles only")', 'label:has-text("Nur Titel durchsuchen")', 'input[name="c[title_only]"]' ] for selector in titles_selectors: try: elem = page.locator(selector).last if elem.count() > 0: elem.click(timeout=5000) page.wait_for_timeout(300) break except Exception: continue # Fill keywords LAST (important for XenForo) # Try multiple selectors for different languages keywords_selectors = [ 'input[name="keywords"][type="search"]', 'input[name="keywords"]', page.get_by_role("searchbox", name="Keywords:"), page.get_by_role("searchbox", name="Schlüsselwörter:") ] keywords_field = None for selector in keywords_selectors: try: if isinstance(selector, str): elem = page.locator(selector) else: elem = selector if elem.count() > 0: keywords_field = elem.last break except Exception: continue if keywords_field: keywords_field.click() keywords_field.clear() keywords_field.type(search_query, 
def _perform_picturepub_search(self, page, forum_url: str, newer_date: str,
                               older_date: str, search_query: str) -> str:
    """
    Perform PicturePub-specific advanced search using form with date fields.

    PicturePub exposes multiple forms posting to /search/search; this picks
    the one that carries the `c[newer_than]` date input (the advanced form),
    fills it, and submits it via JavaScript.

    Args:
        page: An open Playwright page (created by the caller).
        forum_url: Base URL of the forum.
        newer_date: Optional MM/DD/YYYY string; converted to YYYY-MM-DD
            for the form's date input.
        older_date: Same, for the upper bound.
        search_query: Keywords for the search.

    Returns the search results URL or None if failed.
    """
    from datetime import datetime
    try:
        self.log("Using PicturePub-specific advanced search with date fields", "info")
        # Navigate to search page
        page.goto(f"{forum_url}/search/", wait_until='networkidle')
        page.wait_for_timeout(2000)

        # Find the form that has date input fields (advanced form)
        forms = page.locator('form[action="/search/search"]').all()
        advanced_form = None
        for form in forms:
            # Check if this form has date fields
            newer_input = form.locator('input[name="c[newer_than]"]')
            if newer_input.count() > 0:
                # This is the advanced form with date fields
                advanced_form = form
                self.log("Found PicturePub advanced search form with date fields", "info")

                # Fill keywords in THIS form
                keywords = form.locator('input[name="keywords"]')
                if keywords.count() > 0:
                    keywords.fill(search_query)
                    self.log(f"Filled keywords: {search_query}", "info")

                # Fill newer_than date
                if newer_date:
                    # Convert date format from MM/DD/YYYY to YYYY-MM-DD
                    date_obj = datetime.strptime(newer_date, '%m/%d/%Y')
                    formatted_date = date_obj.strftime('%Y-%m-%d')
                    newer_input.fill(formatted_date)
                    self.log(f"Set newer_than date: {formatted_date}", "info")

                # Fill older_than date if provided
                if older_date:
                    older_input = form.locator('input[name="c[older_than]"]')
                    if older_input.count() > 0:
                        date_obj = datetime.strptime(older_date, '%m/%d/%Y')
                        formatted_date = date_obj.strftime('%Y-%m-%d')
                        older_input.fill(formatted_date)
                        self.log(f"Set older_than date: {formatted_date}", "info")

                # Check titles only (optional - skip if blocked)
                try:
                    titles_checkbox = form.locator('input[name="c[title_only]"]')
                    if titles_checkbox.count() > 0:
                        # Try to check with force to bypass overlays
                        titles_checkbox.check(force=True)
                        self.log("Checked 'Search titles only'", "info")
                except Exception:
                    self.log("Could not check titles only checkbox (optional)", "debug")

                # Submit this form (JS submit avoids needing a visible button)
                form.evaluate('form => form.submit()')
                self.log("Submitted PicturePub advanced search form", "info")
                break

        if not advanced_form:
            # NOTE(review): no explicit fallback submit happens here — the
            # method just waits and returns whatever URL the page is on.
            self.log("Could not find PicturePub advanced form, using simple search", "warning")

        # Wait for results
        page.wait_for_timeout(5000)
        final_url = page.url
        return final_url
    except Exception as e:
        self.log(f"PicturePub search error: {e}", "error")
        return None
def _perform_phun_search(self, page, forum_url: str, newer_date: str,
                         search_query: str, cloudflare_enabled: bool,
                         forum_name: str) -> str:
    """
    Perform phun.org-specific search using direct URL to avoid Cloudflare form challenge.
    phun.org uses an older XenForo theme that triggers Cloudflare on form submissions.

    The search results are scraped on this same page and cached on
    self._phun_search_results so the caller does not navigate twice.

    Args:
        page: An open Playwright page.
        forum_url: Base URL of the forum.
        newer_date: Optional date string (YYYY-MM-DD, MM/DD/YYYY, or
            DD/MM/YYYY) converted to a Unix timestamp for XenForo 1.x.
        search_query: Keywords, URL-encoded into the search URL.
        cloudflare_enabled: Forwarded to _navigate_with_cloudflare().
        forum_name: Display name for logging/navigation.

    Returns a special marker ("PHUN_RESULTS_READY") with results to avoid
    double navigation, or None on failure.
    """
    from urllib.parse import quote_plus, urljoin
    try:
        self.log("Using phun.org direct URL search (bypasses Cloudflare form challenge)", "info")
        # Build direct search URL - phun.org/XenForo 1.x format
        # Uses keywords=, order=, title_only=1, date=UNIX_TS (no c[] wrapper)
        encoded_query = quote_plus(search_query)
        # Convert newer_date to Unix timestamp for XenForo 1.x date parameter
        date_param = ""
        if newer_date:
            try:
                from datetime import datetime
                # Try multiple date formats
                dt = None
                for fmt in ["%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"]:
                    try:
                        dt = datetime.strptime(newer_date, fmt)
                        break
                    except ValueError:
                        continue
                if dt:
                    unix_ts = int(dt.timestamp())
                    date_param = f"&date={unix_ts}"
                    self.log(f"Filtering to posts newer than: {newer_date} (ts={unix_ts})", "info")
                else:
                    self.log(f"Failed to parse date {newer_date}", "warning")
            except Exception as e:
                self.log(f"Failed to parse date {newer_date}: {e}", "warning")

        # XenForo 1.x format: keywords, order, title_only, date (no c[] wrapper)
        search_url = f"{forum_url}/search/search?keywords={encoded_query}&order=date&title_only=1{date_param}"
        self.log(f"Direct search URL: {search_url}", "debug")

        # Navigate with Cloudflare support
        if not self._navigate_with_cloudflare(page, search_url, forum_name, cloudflare_enabled):
            self.log(f"Failed to navigate to search results for {forum_name}", "error")
            return None
        page.wait_for_timeout(3000)
        final_url = page.url
        self.log(f"phun.org search result URL: {final_url}", "info")

        # Scrape results directly on this page (avoid double navigation/Cloudflare)
        results = []
        # Debug: check what selectors are available
        phun_count = page.locator('li.searchResult h3 a').count()
        xf2_count = page.locator('.contentRow-title a').count()
        thread_links_count = page.locator('a[href*="/threads/"]').count()
        self.log(f"phun.org selector counts: li.searchResult={phun_count}, contentRow={xf2_count}, threads={thread_links_count}", "debug")

        # Try phun.org-specific selector first
        if phun_count > 0:
            thread_links = page.locator('li.searchResult h3 a').all()
            self.log(f"Found {len(thread_links)} phun.org-style search results", "info")
            for link in thread_links:
                try:
                    result = {
                        'title': link.inner_text(),
                        'url': link.get_attribute('href'),
                        'author': 'Unknown',
                        'date': datetime.now().isoformat()
                    }
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(forum_url, result['url'])
                    # Only keep genuine thread links
                    if result.get('url') and '/threads/' in result['url']:
                        result['thread_id'] = self._extract_thread_id(result['url'])
                        results.append(result)
                        self.log(f"Added: {result['title'][:50]}", "debug")
                except Exception as e:
                    self.log(f"Error parsing result: {e}", "debug")
        # Try XenForo 2.x selector
        elif xf2_count > 0:
            thread_links = page.locator('.contentRow-title a').all()
            self.log(f"Found {len(thread_links)} XenForo 2.x search results", "info")
            for link in thread_links:
                try:
                    result = {
                        'title': link.inner_text(),
                        'url': link.get_attribute('href'),
                        'author': 'Unknown',
                        'date': datetime.now().isoformat()
                    }
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(forum_url, result['url'])
                    if result.get('url') and '/threads/' in result['url']:
                        result['thread_id'] = self._extract_thread_id(result['url'])
                        results.append(result)
                except Exception as e:
                    self.log(f"Error parsing result: {e}", "debug")
        # Fallback: find any thread links
        elif thread_links_count > 0:
            thread_links = page.locator('a[href*="/threads/"]').all()
            self.log(f"Using fallback: found {len(thread_links)} thread links", "info")
            for link in thread_links:
                try:
                    href = link.get_attribute('href')
                    title = link.inner_text().strip()
                    # len(title) > 5 filters out icon/pagination links
                    if href and title and len(title) > 5:
                        result = {
                            'title': title,
                            'url': href if href.startswith('http') else urljoin(forum_url, href),
                            'author': 'Unknown',
                            'date': datetime.now().isoformat()
                        }
                        result['thread_id'] = self._extract_thread_id(result['url'])
                        if result not in results:
                            results.append(result)
                except Exception:
                    pass

        self.log(f"phun.org search found {len(results)} threads", "info")
        # Store results and return special marker
        self._phun_search_results = results
        return "PHUN_RESULTS_READY"
    except Exception as e:
        self.log(f"phun.org search error: {e}", "error")
        return None
def _scrape_search_results(self, search_url: str, context=None) -> List[Dict]:
    """Scrape search results page with support for multiple forum types.

    Tries a cascade of selectors: phun.org-style (li.searchResult h3 a),
    XenForo 2.x (.contentRow-title), XenForo 1.x (ol.searchResults),
    vBulletin (li.searchResult), then a generic thread-link fallback.

    Args:
        search_url: URL of the search results page to scrape.
        context: Optional Playwright BrowserContext to open the page in;
            falls back to self.context (same thread only) or a locally
            created browser.

    Returns:
        A list of dicts with at least 'title' and 'url' keys; most branches
        also set 'thread_id', 'author', and 'date'. Empty list on error.
    """
    results = []
    browser = None
    page = None
    local_playwright = None
    try:
        # Check thread safety before using self.context - Playwright contexts
        # cannot be shared across threads (causes "Cannot switch to a different thread" error)
        import threading
        current_thread_id = threading.current_thread().ident
        context_thread_id = getattr(self, '_context_thread_id', None)
        can_use_self_context = (self.context and context_thread_id == current_thread_id)

        # Use existing context if available (from login session) AND in same thread
        if can_use_self_context:
            page = self.context.new_page()
        elif context:
            page = context.new_page()
        else:
            local_playwright = sync_playwright().start()
            browser = local_playwright.chromium.launch(
                headless=self.headless,
                executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None
            )
            page = browser.new_page(user_agent=self.user_agent)

        # Use 'load' instead of 'networkidle' - phun.org has many ads/trackers that prevent networkidle
        try:
            page.goto(search_url, wait_until='load', timeout=30000)
            page.wait_for_timeout(2000)  # Brief wait for dynamic content
        except Exception as nav_error:
            self.log(f"Navigation timeout, trying domcontentloaded: {nav_error}", "warning")
            page.goto(search_url, wait_until='domcontentloaded', timeout=30000)

        # Use pre-set forum type or detect it
        # NOTE(review): forum_type is only logged here; the branching below
        # is driven by which selectors actually match the page.
        if hasattr(self, 'forum_type') and self.forum_type:
            forum_type = self.forum_type
            self.log(f"Using pre-set forum type: {forum_type.value}", "info")
        else:
            forum_type = self.authenticator.detect_forum_type(page) if hasattr(self, 'authenticator') else ForumType.UNKNOWN
            self.log(f"Detected forum type: {forum_type.value}", "info")

        # Debug: check what selectors are available
        phun_count = page.locator('li.searchResult h3 a').count()
        xf2_count = page.locator('.contentRow-title a').count()
        block_count = page.locator('.block-row a').count()
        thread_links_count = page.locator('a[href*="/threads/"]').count()
        self.log(f"Selector counts: li.searchResult={phun_count}, contentRow={xf2_count}, block-row={block_count}, threads={thread_links_count}", "debug")

        # Debug: save HTML snippet to file for analysis
        if thread_links_count == 0:
            try:
                html_snippet = page.content()[:5000]
                with open('/tmp/phun_debug.html', 'w') as f:
                    f.write(html_snippet)
                self.log("Saved HTML snippet to /tmp/phun_debug.html", "debug")
            except Exception:
                pass

        # phun.org / XenForo with listBlock structure (older theme)
        if phun_count > 0:
            thread_links = page.locator('li.searchResult h3 a').all()
            self.log(f"Found {len(thread_links)} phun.org-style search results", "info")
            for link in thread_links:
                result = {}
                try:
                    result['title'] = link.inner_text()
                    result['url'] = link.get_attribute('href')
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(search_url, result['url'])
                    if result.get('url'):
                        result['thread_id'] = self._extract_thread_id(result['url'])
                    result['author'] = 'Unknown'
                    result['date'] = datetime.now().isoformat()
                    if result.get('url') and result.get('title'):
                        # Only include thread links, skip post links
                        if '/threads/' in result['url']:
                            results.append(result)
                            self.log(f"Added result: {result['title'][:50]}", "debug")
                        else:
                            self.log(f"Skipped (not a thread): {result.get('url', 'No URL')}", "debug")
                except Exception as e:
                    self.log(f"Error parsing search result: {e}", "debug")
                    continue
        # XenForo 2.x - contentRow structure
        elif page.locator('.contentRow-title a').count() > 0:
            # Look for all thread links in search results
            thread_links = page.locator('.contentRow-title a').all()
            self.log(f"Found {len(thread_links)} XenForo 2.x search results", "info")
            for link in thread_links:
                result = {}
                # Extract title and URL
                try:
                    result['title'] = link.inner_text()
                    result['url'] = link.get_attribute('href')
                    self.log(f"Raw URL: {result['url']}", "debug")
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(search_url, result['url'])
                        self.log(f"Processed URL: {result['url']}", "debug")
                    # Extract thread ID from URL
                    if result.get('url'):
                        result['thread_id'] = self._extract_thread_id(result['url'])
                    # For XenForo search results, we may not have all metadata
                    # but we have title and URL which is enough
                    result['author'] = 'Unknown'
                    result['date'] = datetime.now().isoformat()
                    # Accept any URL that looks like it could be a thread
                    if result.get('url') and result.get('title'):
                        # Skip obvious non-thread links
                        skip_patterns = ['/members/', '/forums/', '/search/', '/login', '/register']
                        if not any(p in result['url'] for p in skip_patterns):
                            results.append(result)
                            self.log(f"Added result: {result['title'][:50]}", "debug")
                        else:
                            self.log(f"Skipped (non-thread pattern): {result.get('url', 'No URL')}", "debug")
                except Exception as e:
                    self.log(f"Error parsing search result: {e}", "debug")
                    continue
        # XenForo 1.x - ol.searchResults structure
        elif page.locator('ol.searchResults li').count() > 0:
            search_items = page.locator('ol.searchResults li').all()
            self.log(f"Found {len(search_items)} XenForo 1.x search results", "info")
            for item in search_items:
                result = {}
                title_elem = item.locator('h3.title a').first
                if title_elem:
                    result['title'] = title_elem.inner_text()
                    result['url'] = title_elem.get_attribute('href')
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(search_url, result['url'])
                    if result.get('url'):
                        result['thread_id'] = self._extract_thread_id(result['url'])
                meta_elem = item.locator('.meta').first
                if meta_elem:
                    result['author'] = meta_elem.inner_text().split(',')[0].strip()
                if result.get('url'):
                    results.append(result)
        # vBulletin structure
        elif page.locator('li.searchResult').count() > 0:
            search_items = page.locator('li.searchResult').all()
            self.log(f"Found {len(search_items)} vBulletin search results", "info")
            for item in search_items:
                result = {}
                title_elem = item.locator('h3 a').first
                if title_elem:
                    result['title'] = title_elem.inner_text()
                    result['url'] = title_elem.get_attribute('href')
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(search_url, result['url'])
                if result.get('url'):
                    results.append(result)
        # Generic fallback
        else:
            # Check if page explicitly says no results
            no_results_text = page.locator('text=/no results/i, text=/no threads found/i, text=/no matches/i').first
            if no_results_text:
                self.log("Search returned no results (detected 'no results' message)", "info")
                # Don't use generic parser when we know there are no results
            else:
                # Try to find any links that look like thread URLs
                thread_links = page.locator('a[href*="/threads/"], a[href*="/topic/"], a[href*="showthread"]').all()
                self.log(f"Using generic parser, found {len(thread_links)} potential threads", "info")
                for link in thread_links:
                    result = {
                        'title': link.inner_text(),
                        'url': link.get_attribute('href'),
                        'author': 'Unknown'
                    }
                    if result['url'] and not result['url'].startswith('http'):
                        result['url'] = urljoin(search_url, result['url'])
                    if result['url'] and result['title']:
                        results.append(result)

        # Only close if we created them locally (not using persistent context)
        if page and not self.context and not context:
            page.close()
            if browser:
                browser.close()
            if local_playwright:
                local_playwright.stop()
    except Exception as e:
        self.log(f"Error scraping search results: {e}", "error")
        if page and not self.context and not context:
            page.close()
            if browser:
                browser.close()
            if local_playwright:
                local_playwright.stop()
    return results
def _scrape_thread_impl(self, thread_url: str, context=None, saved_cookies=None) -> Optional[Dict]:
    """Implementation of thread scraping - runs in separate thread to avoid async context issues.

    Args:
        thread_url: URL of the thread to scrape.
        context: Optional Playwright BrowserContext to reuse.
        saved_cookies: Accepted for interface symmetry; not used in this
            method body.

    Returns:
        A dict with 'title', 'author', 'created_date', 'last_post_date' and
        a 'posts' list (each post has content/author/date/images), or None
        on error.
    """
    thread_data = {
        'title': '',
        'author': '',
        'created_date': None,
        'last_post_date': None,
        'posts': []
    }
    browser = None
    page = None
    local_playwright = None
    try:
        # Check thread safety before using self.context - Playwright contexts
        # cannot be shared across threads (causes "Cannot switch to a different thread" error)
        import threading
        current_thread_id = threading.current_thread().ident
        context_thread_id = getattr(self, '_context_thread_id', None)
        can_use_self_context = (self.context and context_thread_id == current_thread_id)

        # Use existing context if available (from login session) AND in same thread
        if can_use_self_context:
            page = self.context.new_page()
        elif context:
            # Use provided context
            page = context.new_page()
        else:
            # Create new context (always safe - new playwright instance per thread)
            local_playwright = sync_playwright().start()
            browser = local_playwright.chromium.launch(
                headless=self.headless,
                executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None
            )
            page = browser.new_page(user_agent=self.user_agent)

        page.goto(thread_url, wait_until='networkidle')

        # Extract thread info (forum-specific)
        title_elem = page.query_selector('h1, .thread-title')
        if title_elem:
            thread_data['title'] = title_elem.inner_text()

        # Extract posts based on forum type
        # XenForo 1.x uses li.message, XenForo 2.x uses article.message
        if 'xenforo' in page.content().lower() or 'xf' in page.content().lower():
            # Try XenForo 2 first (article.message), then XenForo 1 (li.message)
            posts = page.query_selector_all('article.message')
            if not posts:
                posts = page.query_selector_all('li.message')
        else:
            posts = page.query_selector_all('.post, .message, article')

        for post in posts:
            post_data = {}
            # Extract post content
            # XenForo 2: .message-body, XenForo 1: .messageContent, .messageText
            content_elem = post.query_selector('.message-body, .post-content, .messageContent, .messageText, .message-content')
            if content_elem:
                post_data['content'] = content_elem.inner_text()
            # Extract author
            author_elem = post.query_selector('.message-name, .author, .username')
            if author_elem:
                post_data['author'] = author_elem.inner_text()
            # Extract date (prefer machine-readable datetime attribute)
            date_elem = post.query_selector('time, .date, .timestamp')
            if date_elem:
                post_data['date'] = date_elem.get_attribute('datetime') or date_elem.inner_text()

            # Extract EXTERNAL image links (not inline forum attachments)
            images = []
            # Look for external image host links
            link_selectors = [
                'a[href*="imagebam"]',
                'a[href*="imgbox"]',
                'a[href*="imgur"]',
                'a[href*="postimg"]',
                'a[href*="imgbb"]',
                'a[href*="pixhost"]',
                'a[href*="imagevenue"]',
                'a[href*="catbox"]',
                'a[href*="fastdl.app"]',
                'a[href*="picturepub.net"]',
                'a[href*="imagetwist"]',
                'a.file-preview'
            ]
            for selector in link_selectors:
                links = post.query_selector_all(selector)
                for link in links:
                    href = link.get_attribute('href')
                    if href and href not in images:
                        # Make sure it's a full URL
                        if not href.startswith('http'):
                            href = urljoin(thread_url, href)
                        # Skip forum's internal attachments
                        if '/attachments/' not in href:
                            # Skip thumbnails (imgbox thumbs2, ImageBam thumbs, or _t.jpg/_t.png endings)
                            if ('thumbs' in href and ('imgbox.com' in href or 'imagebam.com' in href)) or href.endswith('_t.jpg') or href.endswith('_t.png'):
                                continue
                            images.append(href)

            # Also check for any external links that might be images
            all_links = post.query_selector_all('a[href^="http"]')
            for link in all_links:
                href = link.get_attribute('href')
                if href and '/attachments/' not in href:
                    # Check if it's an image host we support
                    if ImageHostHandler.identify_host(href) and href not in images:
                        images.append(href)
            post_data['images'] = images
            thread_data['posts'].append(post_data)

        # Extract last_post_date from the posts (use the most recent post's date)
        latest_date = None
        for post in thread_data['posts']:
            post_date_str = post.get('date')
            if post_date_str:
                try:
                    # Try ISO format first (datetime attribute)
                    if 'T' in str(post_date_str):
                        parsed_date = datetime.fromisoformat(post_date_str.replace('Z', '+00:00'))
                    else:
                        # Try common forum date formats
                        for fmt in ['%b %d, %Y at %I:%M %p', '%B %d, %Y', '%d %b %Y', '%Y-%m-%d', '%m/%d/%Y']:
                            try:
                                parsed_date = datetime.strptime(str(post_date_str).strip(), fmt)
                                break
                            except ValueError:
                                continue
                        else:
                            parsed_date = None
                    if parsed_date and (latest_date is None or parsed_date > latest_date):
                        latest_date = parsed_date
                except (ValueError, TypeError, AttributeError):
                    pass  # Invalid date format, skip this post
        if latest_date:
            thread_data['last_post_date'] = latest_date.isoformat()
            self.log(f"Extracted last_post_date: {latest_date.strftime('%Y-%m-%d %H:%M')}", "debug")

        # Only close if we created them locally (not using persistent context)
        if page and not self.context and not context:
            page.close()
            if browser:
                browser.close()
            if local_playwright:
                local_playwright.stop()
    except Exception as e:
        self.log(f"Error scraping thread: {e}", "error")
        if page and not self.context and not context:
            page.close()
            if browser:
                browser.close()
            if local_playwright:
                local_playwright.stop()
        return None
    return thread_data
def _scrape_thread(self, thread_url: str, context=None) -> Optional[Dict]:
    """Scrape a forum thread with authentication support.

    Unlike _scrape_thread_impl, this variant forces a fresh browser context
    for phun.org (cf_clearance cookies are fingerprint-bound) and routes
    navigation through the Cloudflare handler for that site.

    Args:
        thread_url: URL of the thread to scrape.
        context: Optional Playwright BrowserContext created by the caller
            in the same thread.

    Returns:
        Dict with 'title', 'author', 'created_date', 'last_post_date' and
        'posts' (content/author/date/images per post), or None on error.
    """
    thread_data = {
        'title': '',
        'author': '',
        'created_date': None,
        'last_post_date': None,
        'posts': []
    }
    browser = None
    page = None
    local_playwright = None
    try:
        # For phun.org, we need a fresh context with the correct user-agent
        # because cf_clearance cookies are tied to browser fingerprint
        use_fresh_context = 'phun.org' in thread_url

        # Use provided context first (passed from download_thread with thread-safe handling)
        # Only fall back to self.context if no context passed and not Cloudflare site
        # IMPORTANT: Check thread safety before using self.context - Playwright contexts
        # cannot be shared across threads (causes "Cannot switch to a different thread" error)
        import threading
        current_thread_id = threading.current_thread().ident
        context_thread_id = getattr(self, '_context_thread_id', None)
        can_use_self_context = (self.context and not use_fresh_context and context_thread_id == current_thread_id)

        if context and not use_fresh_context:
            # Use provided context (thread-safe - created in same thread)
            page = context.new_page()
        elif can_use_self_context:
            # Fall back to self.context only if in same thread (verified thread-safe)
            page = self.context.new_page()
        else:
            # Create new context (or forced for Cloudflare-protected sites)
            local_playwright = sync_playwright().start()
            browser = local_playwright.chromium.launch(
                headless=self.headless,
                executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None
            )
            # For Cloudflare-protected sites, use the stored user_agent from cookies
            # cf_clearance cookies are tied to browser fingerprint
            effective_user_agent = self.user_agent
            if 'phun.org' in thread_url:
                cf_handler = CloudflareHandler(
                    module_name="Forum.phun.org",
                    cookie_file="cookies/forum_cookies_phun.org.json",
                    flaresolverr_url=self.flaresolverr_url,
                    flaresolverr_enabled=self.flaresolverr_enabled,
                    user_agent=self.user_agent,
                    logger=self.logger
                )
                stored_ua = cf_handler.get_user_agent()
                if stored_ua:
                    effective_user_agent = stored_ua
                    self.log(f"Using stored user-agent for phun.org Cloudflare cookies", "debug")
            page = browser.new_page(user_agent=effective_user_agent)

        # Use Cloudflare bypass for phun.org
        if 'phun.org' in thread_url:
            # Always enable Cloudflare for phun.org - it requires it regardless of config
            cloudflare_enabled = True
            self.log(f"phun.org thread: forcing cloudflare_enabled=True", "debug")
            # Use 'load' instead of 'networkidle' to avoid timeout on ad-heavy pages
            if not self._navigate_with_cloudflare(page, thread_url, 'phun.org', cloudflare_enabled, wait_until='load', timeout=30000):
                self.log(f"Failed to navigate to thread with Cloudflare bypass: {thread_url}", "error")
                return thread_data
            page.wait_for_timeout(3000)  # Longer wait for Cloudflare
        else:
            # Use 'load' instead of 'networkidle' for other forums
            try:
                page.goto(thread_url, wait_until='load', timeout=30000)
                # Wait for post content to render (XenForo 2.x or 1.x)
                try:
                    page.wait_for_selector('article.message, li.message, .post, .message', timeout=10000)
                except Exception:
                    pass  # Timeout waiting for posts - page may have no posts or different structure
            except Exception as nav_error:
                self.log(f"Thread navigation timeout, trying domcontentloaded: {nav_error}", "warning")
                page.goto(thread_url, wait_until='domcontentloaded', timeout=30000)
                page.wait_for_timeout(3000)

        # Extract thread info (forum-specific)
        title_elem = page.query_selector('h1, .thread-title')
        if title_elem:
            thread_data['title'] = title_elem.inner_text()

        # Extract posts based on forum type
        # XenForo 1.x uses li.message, XenForo 2.x uses article.message
        html_content = page.content().lower()
        if 'xenforo' in html_content or 'xf' in html_content:
            # Try XenForo 2 first (article.message), then XenForo 1 (li.message)
            posts = page.query_selector_all('article.message')
            if not posts:
                posts = page.query_selector_all('li.message')
                self.log(f"XenForo 1.x detected, found {len(posts)} li.message posts", "debug")
            else:
                self.log(f"XenForo 2.x detected, found {len(posts)} article.message posts", "debug")
        else:
            posts = page.query_selector_all('.post, .message, article')
            self.log(f"Generic forum, found {len(posts)} posts", "debug")

        # Debug: check if we're hitting Cloudflare (only if no posts found)
        if not posts and ('just a moment' in html_content or 'cf-challenge' in html_content):
            self.log("WARNING: Thread page shows Cloudflare challenge!", "warning")
            # Save HTML for debugging
            try:
                with open('/tmp/phun_thread_debug.html', 'w') as f:
                    f.write(page.content()[:10000])
            except Exception:
                pass

        for post in posts:
            post_data = {}
            # Extract post content
            # XenForo 2: .message-body, XenForo 1: .messageContent, .messageText
            content_elem = post.query_selector('.message-body, .post-content, .messageContent, .messageText, .message-content')
            if content_elem:
                post_data['content'] = content_elem.inner_text()
            # Extract author
            author_elem = post.query_selector('.message-name, .author, .username')
            if author_elem:
                post_data['author'] = author_elem.inner_text()
            # Extract date
            date_elem = post.query_selector('time, .date, .timestamp')
            if date_elem:
                post_data['date'] = date_elem.get_attribute('datetime') or date_elem.inner_text()

            # Extract EXTERNAL image links (not inline forum attachments)
            images = []
            # Look for external image host links
            link_selectors = [
                'a[href*="imagebam"]',
                'a[href*="imgbox"]',
                'a[href*="imgur"]',
                'a[href*="postimg"]',
                'a[href*="imgbb"]',
                'a[href*="pixhost"]',
                'a[href*="imagevenue"]',
                'a[href*="catbox"]',
                'a[href*="fastdl.app"]',
                'a[href*="picturepub.net"]',
                'a[href*="imagetwist"]',
                'a.file-preview'
            ]
            for selector in link_selectors:
                links = post.query_selector_all(selector)
                for link in links:
                    href = link.get_attribute('href')
                    if href:
                        images.append(href)

            # Also look for direct image links in the content (but exclude thumbnails)
            img_tags = post.query_selector_all('img')
            for img in img_tags:
                src = img.get_attribute('src')
                if src:
                    # Skip ImageBam thumbnails (they're on thumbs*.imagebam.com)
                    if 'thumbs' in src and 'imagebam.com' in src:
                        continue
                    # Skip imgbox thumbnails (they're on thumbs2.imgbox.com or end with _t.jpg)
                    if ('thumbs' in src and 'imgbox.com' in src) or (src.endswith('_t.jpg') or src.endswith('_t.png')):
                        continue
                    # Skip ImageTwist thumbnail URLs - we get proper URLs from links
                    # Thumbnails are on i*.imagetwist.com/th/ which we can't convert properly
                    if 'imagetwist.com' in src and '/th/' in src:
                        continue
                    # Only add direct images from these hosts (not imagebam since we want the link not the thumb)
                    if any(host in src for host in ['imgbox', 'imgur', 'postimg', 'imgbb']):
                        images.append(src)
            if images:
                post_data['images'] = list(set(images))  # Remove duplicates
            thread_data['posts'].append(post_data)

        # Extract last_post_date from the posts (use the most recent post's date)
        latest_date = None
        for post in thread_data['posts']:
            post_date_str = post.get('date')
            if post_date_str:
                try:
                    # Try ISO format first (datetime attribute)
                    if 'T' in str(post_date_str):
                        parsed_date = datetime.fromisoformat(post_date_str.replace('Z', '+00:00'))
                    else:
                        # Try common forum date formats
                        for fmt in ['%b %d, %Y at %I:%M %p', '%B %d, %Y', '%d %b %Y', '%Y-%m-%d', '%m/%d/%Y']:
                            try:
                                parsed_date = datetime.strptime(str(post_date_str).strip(), fmt)
                                break
                            except ValueError:
                                continue
                        else:
                            parsed_date = None
                    if parsed_date and (latest_date is None or parsed_date > latest_date):
                        latest_date = parsed_date
                except (ValueError, TypeError, AttributeError):
                    pass  # Invalid date format, skip this post
        if latest_date:
            thread_data['last_post_date'] = latest_date.isoformat()
            self.log(f"Extracted last_post_date: {latest_date.strftime('%Y-%m-%d %H:%M')}", "debug")
    except Exception as e:
        self.log(f"Error scraping thread: {e}", "error")
        # NOTE(review): the finally block below also closes these; the
        # page.close() there is wrapped in try/except so the double close
        # is tolerated.
        if page:
            page.close()
        if browser:
            browser.close()
        if local_playwright:
            local_playwright.stop()
        return None
    finally:
        # Close only the page, keep context alive for reuse
        if page:
            try:
                page.close()
            except Exception:
                pass
        # Only close browser if we created it locally
        if browser:
            browser.close()
        if local_playwright:
            local_playwright.stop()
    return thread_data
src and 'imgbox.com' in src) or (src.endswith('_t.jpg') or src.endswith('_t.png')): continue # Skip ImageTwist thumbnail URLs - we get proper URLs from links # Thumbnails are on i*.imagetwist.com/th/ which we can't convert properly if 'imagetwist.com' in src and '/th/' in src: continue # Only add direct images from these hosts (not imagebam since we want the link not the thumb) if any(host in src for host in ['imgbox', 'imgur', 'postimg', 'imgbb']): images.append(src) if images: post_data['images'] = list(set(images)) # Remove duplicates thread_data['posts'].append(post_data) # Extract last_post_date from the posts (use the most recent post's date) latest_date = None for post in thread_data['posts']: post_date_str = post.get('date') if post_date_str: try: # Try ISO format first (datetime attribute) if 'T' in str(post_date_str): parsed_date = datetime.fromisoformat(post_date_str.replace('Z', '+00:00')) else: # Try common forum date formats for fmt in ['%b %d, %Y at %I:%M %p', '%B %d, %Y', '%d %b %Y', '%Y-%m-%d', '%m/%d/%Y']: try: parsed_date = datetime.strptime(str(post_date_str).strip(), fmt) break except ValueError: continue else: parsed_date = None if parsed_date and (latest_date is None or parsed_date > latest_date): latest_date = parsed_date except (ValueError, TypeError, AttributeError): pass # Invalid date format, skip this post if latest_date: thread_data['last_post_date'] = latest_date.isoformat() self.log(f"Extracted last_post_date: {latest_date.strftime('%Y-%m-%d %H:%M')}", "debug") except Exception as e: self.log(f"Error scraping thread: {e}", "error") if page: page.close() if browser: browser.close() if local_playwright: local_playwright.stop() return None finally: # Close only the page, keep context alive for reuse if page: try: page.close() except Exception: pass # Only close browser if we created it locally if browser: browser.close() if local_playwright: local_playwright.stop() return thread_data def _extract_thread_id(self, url: str) -> str: 
"""Extract thread ID from URL (forum-specific)""" # Try common patterns patterns = [ r'/threads?/([0-9]+)', r'/t/([0-9]+)', r'[?&]t=([0-9]+)', r'/topic/([0-9]+)', r'/viewtopic\.php\?.*t=([0-9]+)' ] for pattern in patterns: match = re.search(pattern, url) if match: return match.group(1) # Fallback to URL hash return hashlib.sha256(url.encode()).hexdigest() def _detect_forum(self, url: str) -> str: """Detect forum software from URL""" domain = urlparse(url).netloc # Check for common forum software if 'vbulletin' in url.lower() or '/showthread.php' in url: return 'vBulletin' elif 'phpbb' in url.lower() or '/viewtopic.php' in url: return 'phpBB' elif 'discourse' in url.lower() or '/t/' in url: return 'Discourse' elif 'xenforo' in url.lower() or '/threads/' in url: return 'XenForo' elif 'smf' in url.lower() or 'index.php?topic=' in url: return 'SMF' elif 'invision' in url.lower() or '/topic/' in url: return 'Invision' return domain def _extract_date_from_post(self, post: Dict, thread_data: Dict) -> Optional[datetime]: """Extract date from post or thread title""" import re from datetime import datetime # First try to extract from thread title title = thread_data.get('title', '') # Common date patterns in titles # Examples: "15.08.2025", "08/15/2025", "15-08-2025", "August 15, 2025" date_patterns = [ r'(\d{1,2})[\.\/\-](\d{1,2})[\.\/\-](\d{4})', # DD.MM.YYYY or MM/DD/YYYY r'(\d{4})[\-\/](\d{1,2})[\-\/](\d{1,2})', # YYYY-MM-DD r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})', # Month DD, YYYY r'(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{4})', # DD Mon YYYY ] for pattern in date_patterns: match = re.search(pattern, title) if match: try: # Parse based on pattern type if 'January' in pattern or 'February' in pattern: # Month name pattern month_str = match.group(1) day = int(match.group(2)) year = int(match.group(3)) month_map = { 'January': 1, 'February': 2, 'March': 3, 'April': 4, 
'May': 5, 'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12 } month = month_map[month_str] return datetime(year, month, day) elif match.group(1).isdigit(): groups = [int(g) for g in match.groups() if g.isdigit()] if len(groups) == 3: # Determine format based on values if groups[0] > 31: # YYYY-MM-DD return datetime(groups[0], groups[1], groups[2]) elif groups[2] > 31: # DD-MM-YYYY or MM-DD-YYYY # Assume DD.MM.YYYY for European format if '.' in title: return datetime(groups[2], groups[1], groups[0]) else: # Assume MM/DD/YYYY for US format return datetime(groups[2], groups[0], groups[1]) except Exception: pass # Fallback to post date if available if post.get('date'): try: # Parse various date formats date_str = post['date'] if isinstance(date_str, str): # Try ISO format first if 'T' in date_str: return datetime.fromisoformat(date_str.replace('Z', '+00:00')) # Try other formats for fmt in ['%Y-%m-%d', '%m/%d/%Y', '%d.%m.%Y']: try: return datetime.strptime(date_str, fmt) except Exception: continue except Exception: pass return None def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]: """Extract direct image URL from pixhost show URL Based on the working pixhost_fetch.py script that probes imgNN.pixhost.to hosts """ import re # Extract dir_id and filename from show URL show_pattern = re.compile(r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$", re.IGNORECASE) match = show_pattern.match(show_url) if not match: return None dir_id, filename = match.group(1), match.group(2) # Try common hosts (img1-120.pixhost.to) # Start with commonly used hosts common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100] for host_num in common_hosts: # Try different extensions base, dot, ext = filename.rpartition(".") extensions = [filename] # Try original first if dot: # Try common image extensions for alt_ext in ["jpg", "jpeg", "png", "webp", "gif"]: if alt_ext.lower() != ext.lower(): 
extensions.append(f"{base}.{alt_ext}") for fname in extensions: direct_url = f"https://img{host_num}.pixhost.to/images/{dir_id}/{fname}" # Quick check with HEAD request try: response = requests.head(direct_url, timeout=2, allow_redirects=True, headers={"User-Agent": self.user_agent}) if response.status_code == 200: content_type = response.headers.get('Content-Type', '') # Check if it's an image if 'image' in content_type and 'removed.png' not in response.url: self.log(f"Found pixhost image on img{host_num}", "debug") return direct_url except Exception: continue # If common hosts fail, return None and let regular download handle it return None def _get_image_filename(self, url: str, post_date: datetime = None) -> str: """Generate filename for image with optional date/time prefix Args: url: Image URL post_date: Optional datetime to prefix filename (format: YYYYMMDD_HHMMSS_) Returns: Filename like "20251215_195700_3.jpg" if post_date provided, else "3.jpg" """ # Try to get original filename parsed = urlparse(url) filename = os.path.basename(parsed.path) if not filename or '.' 
not in filename: # Generate from URL hash ext = '.jpg' # Default extension if '.png' in url.lower(): ext = '.png' elif '.gif' in url.lower(): ext = '.gif' elif '.webp' in url.lower(): ext = '.webp' filename = hashlib.sha256(url.encode()).hexdigest() + ext # Add date/time prefix if provided (makes filenames unique across downloads) if post_date: date_prefix = post_date.strftime('%Y%m%d_%H%M%S_') filename = date_prefix + filename return filename def download_forum_section(self, section_url: str, forum_name: str, max_pages: int = 10, max_threads: int = None, username: str = None, password: str = None) -> Dict: """ Download all threads from a forum section/category Args: section_url: URL of the forum section forum_name: Name of the forum max_pages: Maximum pages to scan max_threads: Maximum threads to download username: Login username (optional) password: Login password (optional) Returns: Dictionary with download results """ self.log(f"Downloading forum section: {section_url}", "info") results = { 'threads_found': 0, 'threads_downloaded': 0, 'errors': 0, 'thread_urls': [] } try: # Run in thread to avoid event loop conflicts def run_section_download(): with sync_playwright() as p: browser = p.chromium.launch( headless=self.headless, executable_path='/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome' if os.path.exists('/opt/media-downloader/.playwright/chromium-1187/chrome-linux/chrome') else None ) context = self._create_browser_context(browser) # Handle authentication - try cookies first if forum_name: # Always try to load existing cookies first if self.authenticator.load_cookies(context, forum_name): self.logged_in_forums[forum_name] = True self.log(f"Loaded saved cookies for {forum_name}", "debug") # Only login if we have credentials and no valid cookies elif username and password and forum_name not in self.logged_in_forums: temp_page = context.new_page() if self.authenticator.auto_login(temp_page, username, password, section_url): 
self.authenticator.save_cookies(temp_page, forum_name) self.logged_in_forums[forum_name] = True self.log(f"Logged in to {forum_name}", "success") temp_page.close() page = context.new_page() # Detect forum type page.goto(section_url) forum_type = self.authenticator.detect_forum_type(page) thread_urls = [] # Extract thread URLs based on forum type for page_num in range(1, max_pages + 1): if page_num > 1: # Navigate to next page (forum-specific) next_url = self._get_next_page_url(section_url, page_num, forum_type) if next_url: page.goto(next_url) else: break # Extract thread links based on forum type if forum_type == ForumType.XENOFORO: links = page.locator('h3.contentRow-title a, .structItem-title a').all() elif forum_type == ForumType.VBULLETIN: links = page.locator('a.title, .threadtitle a').all() elif forum_type == ForumType.PHPBB: links = page.locator('a.topictitle, .topic-title a').all() elif forum_type == ForumType.DISCOURSE: links = page.locator('.topic-list-item a.title').all() elif forum_type == ForumType.INVISION: links = page.locator('.ipsDataItem_title a, h4.ipsType_large a').all() elif forum_type == ForumType.MYBB: links = page.locator('.subject_new a, .subject_old a').all() elif forum_type == ForumType.SMF: links = page.locator('.subject a, span.preview a').all() else: # Generic fallback links = page.locator('a[href*="thread"], a[href*="topic"], a[href*="/t/"]').all() for link in links: href = link.get_attribute('href') if href: full_url = urljoin(section_url, href) if full_url not in thread_urls: thread_urls.append(full_url) if max_threads and len(thread_urls) >= max_threads: break if max_threads and len(thread_urls) >= max_threads: break self._apply_rate_limit() browser.close() results['threads_found'] = len(thread_urls) results['thread_urls'] = thread_urls return results # nest_asyncio is already applied at module level results = run_section_download() # Download each thread for i, thread_url in enumerate(results.get('thread_urls', []), 1): 
self.log(f"Downloading thread {i}/{len(results['thread_urls'])}: {thread_url}", "info") try: thread_result = self.download_thread( thread_url, forum_name=forum_name, username=username, password=password ) if thread_result.get('status') == 'success': results['threads_downloaded'] += 1 except Exception as e: self.log(f"Error downloading thread: {e}", "error") results['errors'] += 1 self._apply_rate_limit() except Exception as e: self.log(f"Error downloading forum section: {e}", "error") results['errors'] += 1 return results def _get_next_page_url(self, base_url: str, page_num: int, forum_type: ForumType) -> Optional[str]: """Generate next page URL based on forum type""" if forum_type == ForumType.XENOFORO: return f"{base_url}?page={page_num}" elif forum_type == ForumType.VBULLETIN: return f"{base_url}?page={page_num}" elif forum_type == ForumType.PHPBB: return f"{base_url}&start={(page_num-1)*25}" # Usually 25 topics per page elif forum_type == ForumType.DISCOURSE: return f"{base_url}?page={page_num}" elif forum_type == ForumType.INVISION: return f"{base_url}?page={page_num}" elif forum_type == ForumType.MYBB: return f"{base_url}?page={page_num}" elif forum_type == ForumType.SMF: return f"{base_url}.{(page_num-1)*20}" # Usually 20 topics per page return None def get_statistics(self) -> Dict: """Get downloader statistics""" stats = self.stats.copy() if self.use_database: conn = self._get_db_connection() cursor = conn.cursor() # Get database stats cursor.execute("SELECT COUNT(*) FROM threads") stats['total_threads'] = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM posts") stats['total_posts'] = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM images WHERE downloaded = TRUE") stats['total_images'] = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM searches WHERE active = TRUE") stats['active_searches'] = cursor.fetchone()[0] cursor.execute( "SELECT COUNT(*) FROM threads WHERE status = 'active' AND monitor_until > ?", 
(datetime.now().isoformat(),) ) stats['monitored_threads'] = cursor.fetchone()[0] conn.close() return stats # Example usage if __name__ == "__main__": from pathlib import Path # Use proper database path (in-memory for standalone testing) downloader = ForumDownloader( headless=True, show_progress=True, use_database=False, # Disable DB for standalone testing db_path=None, download_dir=str(Path(__file__).parent.parent / "forum_downloads") ) # Example: Login to forums (supports XenForo, vBulletin, phpBB, Discourse, Invision, MyBB, SMF) # The login method will auto-detect the forum type downloader.login( forum_name="MyForum", username="your_username", password="your_password", forum_url="https://forum.example.com" ) # Example: Monitor a search with authentication downloader.monitor_search( forum_name="MyForum", search_query="interesting topic", search_url="https://forum.example.com/search?q=interesting+topic", check_frequency_hours=6, auto_track_days=30, username="your_username", # Optional if already logged in password="your_password" # Optional if already logged in ) # Example: Download a thread with authentication downloader.download_thread( thread_url="https://forum.example.com/threads/12345", forum_name="MyForum", download_images=True, username="your_username", # Optional if already logged in password="your_password" # Optional if already logged in ) # Example: Download from private/members-only section # Authentication is required for these private_thread = downloader.download_thread( thread_url="https://forum.example.com/private/threads/67890", forum_name="MyForum", download_images=True, username="your_username", password="your_password" ) # Example: Update all monitored threads downloader.update_monitored_threads() # Show statistics stats = downloader.get_statistics() forum_logger.info("Statistics:") for key, value in stats.items(): forum_logger.info(f" {key}: {value}") # Supported forum types: # - XenForo (1.x and 2.x) # - vBulletin (3.x, 4.x, 5.x) # - phpBB 
(all versions) # - Discourse # - Invision Power Board (IPB) # - MyBB # - Simple Machines Forum (SMF) # The module will automatically detect and handle each forum type