#!/usr/bin/env python3
"""
Bellazon Forum Thread Image Scraper
Downloads all full-size images from a Bellazon forum thread.
Bellazon wraps post thumbnails in links to the full-size file — the
<a href="full"><img src="thumb"> pattern — which is what the regexes in
extract_images_from_html target.
"""
import re
import sys
import time
import hashlib
import requests
from pathlib import Path
from urllib.parse import urlparse, urljoin
from html import unescape
# CLI: script.py [thread_url] [output_dir]; both fall back to the defaults below.
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"
# Browser-like request headers; the Referer helps pass the forum's hotlink checks.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.bellazon.com/',
}
# File extensions treated as downloadable media (matched case-insensitively).
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS
# URL substrings identifying forum chrome (avatars, icons, UI assets) to ignore.
SKIP_PATTERNS = [
'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_',
'star_', '/js/', '/css/', 'button', 'logo', 'loading',
'spinner', 'pixel', 'spacer', '/default_photo',
'profile_photo', '/skin_', '/set_resources/', 'screenshot',
]
def get_page_count(html: str) -> int:
    """Return the thread's total page count parsed from 'Page X of Y' text.

    Falls back to 1 when the pagination marker is absent (single-page thread).
    """
    found = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    if found is None:
        return 1
    return int(found.group(1))
def is_media_url(url: str) -> bool:
    """Return True when the URL path ends in a known image/video extension."""
    suffix = Path(urlparse(url).path).suffix
    return suffix.lower() in MEDIA_EXTENSIONS
def should_skip(url: str) -> bool:
    """Return True when the URL matches any forum-chrome pattern (avatars, icons, ...)."""
    haystack = url.lower()
    for pattern in SKIP_PATTERNS:
        if pattern in haystack:
            return True
    return False
def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size media URLs from one page of thread HTML.

    Priority: <a href="full-size"> anchors wrapping <img src="thumbnail"> tags.
    Fallback: standalone <img> tags (non-thumbnail), off-site media links, and
    attachment.php downloads whose link text is a media filename.

    Returns a de-duplicated, order-preserving list whose items are either plain
    URL strings or (url, filename) tuples for attachment links.
    """
    images = []
    thumb_urls = set()  # track thumbnails so we don't add them as standalone
    # Pattern 1: <a href="full"><img src="thumb">
    # This catches the bellazon pattern where thumbnails link to full images.
    # NOTE: the original file's regexes had their '<a[^>' / '<img[^>' prefixes
    # stripped (HTML sanitization damage); restored here.
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))
        if is_media_url(href) and not should_skip(href):
            full_url = urljoin(base_url, href)
            images.append(full_url)
            # Track the thumbnail so we skip it later
            thumb_urls.add(urljoin(base_url, img_src))
    # Pattern 2: standalone <img> tags not wrapped in links to full-size
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        # Skip if this is a thumbnail we already have the full version of
        if full_url in thumb_urls:
            continue
        # Skip anything with _thumb or .thumb in the name
        if '_thumb' in url or '.thumb.' in url:
            continue
        if is_media_url(url):
            images.append(full_url)
    # Pattern 3: links to external image files (not bellazon)
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if parsed.netloc and 'bellazon' not in parsed.netloc and is_media_url(url):
            images.append(url)
    # Pattern 4: forum attachments (attachment.php?id=XXX) with video/image
    # filenames as link text, e.g. B7A65853...MP4
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        ext = Path(link_text).suffix.lower()
        if ext in MEDIA_EXTENSIONS:
            full_url = urljoin(base_url, href)
            images.append((full_url, link_text))  # tuple: (url, filename)
    # Deduplicate preserving order
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
def download_media(item, output_dir: Path, session: requests.Session, seen_hashes: set) -> bool:
    """Fetch one media item and write it into output_dir.

    `item` is either a bare URL string or a (url, filename) tuple produced by
    the attachment pattern. Returns True only when a new file was written;
    HTTP errors, non-media content types, tiny placeholder files (< 5000
    bytes) and MD5 duplicates all return False.
    """
    orig_filename = None
    if isinstance(item, tuple):
        url, orig_filename = item
    else:
        url = item
    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False
        # Accept only responses the server labels as media (or generic binary).
        content_type = resp.headers.get('content-type', '')
        if ('image' not in content_type and 'video' not in content_type
                and 'octet-stream' not in content_type):
            return False
        data = resp.content
        if len(data) < 5000:  # Skip tiny files (icons/placeholders)
            return False
        # Deduplicate by content hash across the whole run.
        digest = hashlib.md5(data).hexdigest()
        if digest in seen_hashes:
            return False
        seen_hashes.add(digest)
        # Build a filesystem-safe filename from the attachment name or URL path.
        if orig_filename:
            filename = re.sub(r'[^\w\-_.]', '_', orig_filename)
        else:
            filename = re.sub(r'[^\w\-_.]', '_', Path(urlparse(url).path).name)
        if not filename or filename == '_':
            filename = f"{digest}.jpg"
        target = output_dir / filename
        # Avoid clobbering an existing file: suffix with part of the hash.
        if target.exists():
            target = output_dir / f"{target.stem}_{digest[:8]}{target.suffix}"
        target.write_bytes(data)
        return True
    except Exception as e:
        display = orig_filename or url[:80]
        print(f" Error: {display}: {e}", flush=True)
        return False
def main():
    """Crawl every page of the thread and download all media found."""
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    session = requests.Session()
    session.headers.update(HEADERS)
    print(f"Fetching: {THREAD_URL}", flush=True)
    first_resp = session.get(THREAD_URL, timeout=30)
    first_resp.raise_for_status()
    total_pages = get_page_count(first_resp.text)
    print(f"Total pages: {total_pages}", flush=True)
    seen_hashes = set()
    total_downloaded = 0
    total_skipped = 0
    for page_num in range(1, total_pages + 1):
        # Page 1 was already fetched above; later pages use the /page/N/ form.
        if page_num == 1:
            page_url = THREAD_URL
            html = first_resp.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                page_resp = session.get(page_url, timeout=30)
                page_resp.raise_for_status()
            except Exception as e:
                print(f" Error fetching page {page_num}: {e}", flush=True)
                continue
            html = page_resp.text
        images = extract_images_from_html(html, page_url)
        page_dl = 0
        for item in images:
            if download_media(item, output_dir, session, seen_hashes):
                page_dl += 1
                total_downloaded += 1
            else:
                total_skipped += 1
        print(f"Page {page_num}/{total_pages}: {page_dl} downloaded ({len(images)} found, {total_downloaded} total)", flush=True)
        # Be polite to the server between page fetches.
        if page_num < total_pages:
            time.sleep(1)
    print(f"\nDone! {total_downloaded} images saved to {output_dir}", flush=True)
    print(f"Skipped: {total_skipped}", flush=True)
# Entry point guard: allows importing this module without triggering a crawl.
if __name__ == "__main__":
    main()