#!/usr/bin/env python3
"""
Bellazon Forum Thread Image Scraper

Downloads all full-size images from a Bellazon forum thread.
Bellazon uses <a href="full.jpg"><img src="full_thumb.jpg"></a> pattern.
"""

import re
import sys
import time
import hashlib
import requests
from pathlib import Path
from urllib.parse import urlparse, urljoin
from html import unescape
# Thread URL and output directory come from argv, with defaults for the
# original target thread.
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"

# Browser-like headers so the forum serves normal pages (many forums reject
# the default requests/urllib user agent).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.bellazon.com/',
}

# File extensions treated as downloadable media.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS

# URL substrings that mark forum chrome (avatars, emoji, UI assets) rather
# than post content; matched case-insensitively by should_skip().
SKIP_PATTERNS = [
    'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_',
    'star_', '/js/', '/css/', 'button', 'logo', 'loading',
    'spinner', 'pixel', 'spacer', '/default_photo',
    'profile_photo', '/skin_', '/set_resources/', 'screenshot',
]
def get_page_count(html: str) -> int:
    """Return the thread's total page count parsed from 'Page X of Y' text.

    Falls back to 1 when no pagination marker is present (single-page thread).
    """
    found = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    if found is None:
        return 1
    return int(found.group(1))
def is_media_url(url: str) -> bool:
    """True when the URL's path ends in a known image/video extension."""
    path = urlparse(url).path
    suffix = Path(path).suffix
    return suffix.lower() in MEDIA_EXTENSIONS
def should_skip(url: str) -> bool:
    """True when the URL matches any junk pattern (avatars, icons, UI chrome)."""
    haystack = url.lower()
    for pattern in SKIP_PATTERNS:
        if pattern in haystack:
            return True
    return False
def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size media URLs from one page of thread HTML.

    Four patterns are scanned, in priority order:
      1. <a href="full.jpg"><img src="thumb.jpg"></a> — the standard
         Bellazon thumbnail-links-to-full-image markup.
      2. Standalone <img src="..."> tags (skipping known thumbnails).
      3. Plain links to media files hosted off bellazon.
      4. Forum attachment links (attachment.php?id=N) whose link text
         carries a media filename.

    Returns a de-duplicated, order-preserving list whose items are either
    plain URL strings or (url, filename) tuples for attachments.
    """
    images = []
    thumb_urls = set()  # thumbnails seen in pattern 1, excluded from pattern 2

    # Pattern 1: anchor wrapping an image — the href is the full-size file.
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))

        if is_media_url(href) and not should_skip(href):
            images.append(urljoin(base_url, href))
            # Remember the thumbnail so pattern 2 doesn't re-add it.
            thumb_urls.add(urljoin(base_url, img_src))

    # Pattern 2: standalone <img> tags not wrapped in full-size links.
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        if full_url in thumb_urls:
            continue  # already collected the full-size version in pattern 1
        if '_thumb' in url or '.thumb.' in url:
            continue  # obvious thumbnail naming convention
        if is_media_url(url):
            images.append(full_url)

    # Pattern 3: direct links to media files hosted off bellazon.
    # BUGFIX: apply should_skip() here too, consistent with patterns 1 and 2,
    # so externally hosted avatar/icon/UI images are not collected.
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if (parsed.netloc and 'bellazon' not in parsed.netloc
                and is_media_url(url) and not should_skip(url)):
            images.append(url)

    # Pattern 4: forum attachments whose link text is the original filename,
    # e.g. <a href="...attachment.php?id=6887160">B7A65853...MP4</a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)</a>',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        if Path(link_text).suffix.lower() in MEDIA_EXTENSIONS:
            images.append((urljoin(base_url, href), link_text))  # (url, filename)

    # De-duplicate while preserving first-seen order; tuples key on their URL.
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
def download_media(item, output_dir: Path, session: requests.Session, seen_hashes: set) -> bool:
    """Download one media item; return True only when a new file was written.

    ``item`` is either a URL string or a (url, original_filename) tuple (the
    tuple form comes from forum attachment links). Content is de-duplicated
    across pages by MD5 via the shared ``seen_hashes`` set; non-media content
    types and tiny responses (icons/placeholders) are rejected. Errors are
    printed and swallowed — this is a best-effort scraper.
    """
    if isinstance(item, tuple):
        url, orig_filename = item
    else:
        url, orig_filename = item, None

    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False

        content_type = resp.headers.get('content-type', '')
        if not any(t in content_type for t in ['image', 'video', 'octet-stream']):
            return False

        data = resp.content
        if len(data) < 5000:  # Skip tiny files (icons/placeholders)
            return False

        # Content-hash dedupe: the same image is often reposted across pages.
        file_hash = hashlib.md5(data).hexdigest()
        if file_hash in seen_hashes:
            return False
        seen_hashes.add(file_hash)

        if orig_filename:
            filename = re.sub(r'[^\w\-_.]', '_', orig_filename)
        else:
            parsed = urlparse(url)
            filename = re.sub(r'[^\w\-_.]', '_', Path(parsed.path).name)
            if not filename or filename == '_':
                # BUGFIX: derive the fallback extension from the Content-Type
                # instead of always using '.jpg' — the response may be a
                # video or a non-JPEG image.
                if 'video' in content_type:
                    ext = '.mp4'
                elif 'png' in content_type:
                    ext = '.png'
                elif 'gif' in content_type:
                    ext = '.gif'
                else:
                    ext = '.jpg'
                filename = f"{file_hash}{ext}"

        filepath = output_dir / filename
        if filepath.exists():
            # Same name, different content: disambiguate with a hash prefix.
            filepath = output_dir / f"{filepath.stem}_{file_hash[:8]}{filepath.suffix}"

        filepath.write_bytes(data)
        return True

    except Exception as e:  # best-effort: report and move on to the next item
        display = url[:80] if not orig_filename else orig_filename
        print(f" Error: {display}: {e}", flush=True)
        return False
def main():
    """Walk every page of the thread and download all discovered media."""
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    session = requests.Session()
    session.headers.update(HEADERS)

    print(f"Fetching: {THREAD_URL}", flush=True)
    resp = session.get(THREAD_URL, timeout=30)
    resp.raise_for_status()

    total_pages = get_page_count(resp.text)
    print(f"Total pages: {total_pages}", flush=True)

    seen_hashes = set()
    total_downloaded = 0
    total_skipped = 0

    for page_num in range(1, total_pages + 1):
        # Page 1 was fetched above for the page count; reuse that response.
        if page_num == 1:
            page_url, html = THREAD_URL, resp.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                resp = session.get(page_url, timeout=30)
                resp.raise_for_status()
            except Exception as e:
                print(f" Error fetching page {page_num}: {e}", flush=True)
                continue
            html = resp.text

        found = extract_images_from_html(html, page_url)

        page_dl = 0
        for media_item in found:
            if download_media(media_item, output_dir, session, seen_hashes):
                page_dl += 1
                total_downloaded += 1
            else:
                total_skipped += 1

        print(f"Page {page_num}/{total_pages}: {page_dl} downloaded ({len(found)} found, {total_downloaded} total)", flush=True)

        # Be polite: pause between page fetches (skip after the final page).
        if page_num < total_pages:
            time.sleep(1)

    print(f"\nDone! {total_downloaded} images saved to {output_dir}", flush=True)
    print(f"Skipped: {total_skipped}", flush=True)
# Script entry point; thread URL and output dir may be given via sys.argv.
if __name__ == "__main__":
    main()