#!/usr/bin/env python3
"""Bellazon Forum Thread Image Scraper.

Downloads all full-size images (and video attachments) from a Bellazon
forum thread.  Bellazon (an Invision Community board) renders post
thumbnails as ``<a href="FULL"><img src="THUMB"></a>``, so the full-size
URL comes from the wrapping anchor rather than from the ``<img>`` tag.

Usage:
    bellazon_scraper.py [THREAD_URL] [OUTPUT_DIR]
"""

import hashlib
import re
import sys
import time
from html import unescape
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests

# CLI arguments, with defaults pointing at the original target thread.
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"

# Browser-like headers; forums commonly serve degraded markup (or block
# requests outright) for clients with no User-Agent / Referer.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.bellazon.com/',
}

IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS

# URL substrings that identify forum chrome (avatars, emoji, UI sprites,
# theme assets) rather than post content.
SKIP_PATTERNS = [
    'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_', 'star_',
    '/js/', '/css/', 'button', 'logo', 'loading', 'spinner', 'pixel',
    'spacer', '/default_photo', 'profile_photo', '/skin_',
    '/set_resources/', 'screenshot',
]


def get_page_count(html: str) -> int:
    """Return the thread's page count parsed from a 'Page X of Y' marker.

    Falls back to 1 when the marker is absent (single-page thread, or a
    markup change upstream).
    """
    match = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    return int(match.group(1)) if match else 1


def is_media_url(url: str) -> bool:
    """True if the URL path ends in a known image/video extension."""
    parsed = urlparse(url)
    ext = Path(parsed.path).suffix.lower()
    return ext in MEDIA_EXTENSIONS


def should_skip(url: str) -> bool:
    """True if the URL matches any known forum-chrome pattern."""
    lower = url.lower()
    return any(skip in lower for skip in SKIP_PATTERNS)


def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size media URLs from a thread page's HTML.

    Priority: ``<a href=FULL><img src=THUMB>`` wrapping (the standard
    Bellazon pattern). Fallback: standalone non-thumbnail ``<img>`` tags,
    external image links, and ``attachment.php`` download links.

    Returns a de-duplicated, order-preserving list whose items are either
    a URL string or an ``(url, filename)`` tuple (for attachment links,
    where the link text carries the real filename).
    """
    images = []
    thumb_urls = set()  # thumbnails already covered by a full-size link

    # Pattern 1: <a href="full.jpg"><img src="thumb.jpg"> — Bellazon's
    # lightbox pattern where thumbnails link to the full-size image.
    # NOTE(review): regexes reconstructed — the '<a[^>' / '<img[^>' prefixes
    # were mangled out of the original text; verify against a live page.
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))
        if is_media_url(href) and not should_skip(href):
            full_url = urljoin(base_url, href)
            images.append(full_url)
            # Track the thumbnail so Pattern 2 does not re-add it.
            thumb_urls.add(urljoin(base_url, img_src))

    # Pattern 2: standalone <img> tags not wrapped in links to full-size.
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        # Skip if this is a thumbnail we already have the full version of.
        if full_url in thumb_urls:
            continue
        # Skip anything with _thumb or .thumb in the name.
        if '_thumb' in url or '.thumb.' in url:
            continue
        if is_media_url(url):
            images.append(full_url)

    # Pattern 3: links to external image files (hosted off-bellazon).
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if parsed.netloc and 'bellazon' not in parsed.netloc and is_media_url(url):
            images.append(url)

    # Pattern 4: forum attachments (attachment.php?id=XXX) whose link text
    # is the original filename, e.g. <a href="...attachment.php?id=1">B7A65853.MP4</a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        ext = Path(link_text).suffix.lower()
        if ext in MEDIA_EXTENSIONS:
            full_url = urljoin(base_url, href)
            images.append((full_url, link_text))  # tuple: (url, filename)

    # Deduplicate by URL, preserving first-seen order.
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


def download_media(item, output_dir: Path, session: requests.Session,
                   seen_hashes: set) -> bool:
    """Download one media item into *output_dir*.

    *item* is either a URL string or an ``(url, filename)`` tuple.
    Content is deduplicated by MD5 across the whole run via *seen_hashes*
    (mutated in place).  Returns True only when a new file was written;
    False for HTTP errors, non-media content types, tiny placeholder
    files (< 5000 bytes), duplicates, and exceptions.
    """
    if isinstance(item, tuple):
        url, orig_filename = item
    else:
        url, orig_filename = item, None
    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False
        content_type = resp.headers.get('content-type', '')
        if not any(t in content_type for t in ['image', 'video', 'octet-stream']):
            return False
        data = resp.content
        if len(data) < 5000:  # skip tiny files (icons/placeholders)
            return False
        file_hash = hashlib.md5(data).hexdigest()
        if file_hash in seen_hashes:
            return False
        seen_hashes.add(file_hash)

        # Prefer the forum-provided filename (attachments); otherwise use
        # the URL path's basename.  Sanitize to filesystem-safe characters.
        if orig_filename:
            filename = re.sub(r'[^\w\-_.]', '_', orig_filename)
        else:
            parsed = urlparse(url)
            filename = Path(parsed.path).name
            filename = re.sub(r'[^\w\-_.]', '_', filename)
            if not filename or filename == '_':
                filename = f"{file_hash}.jpg"

        filepath = output_dir / filename
        if filepath.exists():
            # Name collision with different content: suffix with hash prefix.
            filepath = output_dir / f"{filepath.stem}_{file_hash[:8]}{filepath.suffix}"
        filepath.write_bytes(data)
        return True
    except Exception as e:
        display = url[:80] if not orig_filename else orig_filename
        print(f"  Error: {display}: {e}", flush=True)
        return False


def main():
    """Walk every page of the thread and download all media found."""
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    session = requests.Session()
    session.headers.update(HEADERS)

    print(f"Fetching: {THREAD_URL}", flush=True)
    resp = session.get(THREAD_URL, timeout=30)
    resp.raise_for_status()
    total_pages = get_page_count(resp.text)
    print(f"Total pages: {total_pages}", flush=True)

    seen_hashes = set()
    total_downloaded = 0
    total_skipped = 0

    for page_num in range(1, total_pages + 1):
        if page_num == 1:
            # Reuse the page already fetched for the page-count probe.
            page_url = THREAD_URL
            html = resp.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                resp = session.get(page_url, timeout=30)
                resp.raise_for_status()
                html = resp.text
            except Exception as e:
                print(f"  Error fetching page {page_num}: {e}", flush=True)
                continue

        images = extract_images_from_html(html, page_url)
        page_dl = 0
        for img_url in images:
            if download_media(img_url, output_dir, session, seen_hashes):
                page_dl += 1
                total_downloaded += 1
            else:
                total_skipped += 1

        print(f"Page {page_num}/{total_pages}: {page_dl} downloaded "
              f"({len(images)} found, {total_downloaded} total)", flush=True)
        if page_num < total_pages:
            time.sleep(1)  # be polite to the forum between pages

    print(f"\nDone! {total_downloaded} images saved to {output_dir}", flush=True)
    print(f"Skipped: {total_skipped}", flush=True)


if __name__ == "__main__":
    main()