Files
media-downloader/scripts/bellazon_scraper.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

226 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
Bellazon Forum Thread Image Scraper
Downloads all full-size images from a Bellazon forum thread.
Bellazon uses <a href="full.jpg"><img src="full_thumb.jpg"></a> pattern.
"""
import re
import sys
import time
import hashlib
import requests
from pathlib import Path
from urllib.parse import urlparse, urljoin
from html import unescape
# CLI usage: bellazon_scraper.py [THREAD_URL] [OUTPUT_DIR]
# The defaults below are used when the arguments are omitted.
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"
# Browser-like request headers so the forum serves normal HTML pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.bellazon.com/',
}
# File extensions treated as downloadable media (see is_media_url).
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS
# URL substrings identifying forum chrome (avatars, icons, UI assets)
# rather than post content; matched case-insensitively in should_skip().
SKIP_PATTERNS = [
    'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_',
    'star_', '/js/', '/css/', 'button', 'logo', 'loading',
    'spinner', 'pixel', 'spacer', '/default_photo',
    'profile_photo', '/skin_', '/set_resources/', 'screenshot',
]
def get_page_count(html: str) -> int:
    """Parse the thread's total page count from its 'Page X of N' marker.

    Returns 1 when no pagination text is found (single-page thread).
    """
    found = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    if found is None:
        return 1
    return int(found.group(1))
def is_media_url(url: str) -> bool:
    """Report whether the URL's path ends in a recognized image/video extension."""
    path = urlparse(url).path
    return Path(path).suffix.lower() in MEDIA_EXTENSIONS
def should_skip(url: str) -> bool:
    """Report whether the URL matches any known non-content pattern (icons, avatars, UI chrome)."""
    lowered = url.lower()
    for pattern in SKIP_PATTERNS:
        if pattern in lowered:
            return True
    return False
def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size media URLs from one forum page's HTML.

    Returns an order-preserving, deduplicated list whose items are either
    absolute URL strings, or (url, filename) tuples for attachment.php links
    whose display name carries the real filename.

    Priority: <a href="full.jpg"> wrapping <img src="thumb.jpg">
    Fallback: standalone <img src="image.jpg"> (non-thumb)
    """
    images = []
    thumb_urls = set()  # thumbnails already covered by a full-size link
    # Pattern 1: <a href="full-size"><img src="thumb"></a>
    # This catches the bellazon pattern where thumbnails link to full images
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))
        if is_media_url(href) and not should_skip(href):
            images.append(urljoin(base_url, href))
            # Remember the thumbnail so Pattern 2 does not re-add it.
            thumb_urls.add(urljoin(base_url, img_src))
    # Pattern 2: standalone <img> tags not wrapped in links to full-size
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        # Skip if this is a thumbnail we already have the full version of
        if full_url in thumb_urls:
            continue
        # Skip anything with _thumb or .thumb in the name
        if '_thumb' in url or '.thumb.' in url:
            continue
        if is_media_url(url):
            images.append(full_url)
    # Pattern 3: links to external image files (not bellazon).
    # BUGFIX: these previously bypassed SKIP_PATTERNS, so externally hosted
    # junk (logos, avatars, screenshots, ...) was queued for download; apply
    # the same filter used by the other patterns.
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if (parsed.netloc and 'bellazon' not in parsed.netloc
                and is_media_url(url) and not should_skip(url)):
            images.append(url)
    # Pattern 4: Forum attachments (attachment.php?id=XXX) with video/image filenames
    # e.g. <a href="...attachment.php?id=6887160">B7A65853...MP4</a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)</a>',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        if Path(link_text).suffix.lower() in MEDIA_EXTENSIONS:
            images.append((urljoin(base_url, href), link_text))  # tuple: (url, filename)
    # Deduplicate preserving order (first occurrence wins).
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
def download_media(item, output_dir: Path, session: requests.Session, seen_hashes: set) -> bool:
    """Fetch one media item and write it into *output_dir*.

    *item* is either a plain URL string, or a (url, original_filename) tuple
    for forum attachments whose filename comes from the link text.
    Returns True only when a new, non-duplicate file was saved.
    """
    if isinstance(item, tuple):
        url, orig_filename = item
    else:
        url, orig_filename = item, None
    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False
        ctype = resp.headers.get('content-type', '')
        # Only keep responses that look like media (or opaque binary downloads).
        if 'image' not in ctype and 'video' not in ctype and 'octet-stream' not in ctype:
            return False
        data = resp.content
        # Tiny payloads are icons/placeholders, not real media.
        if len(data) < 5000:
            return False
        # Content hash dedupes re-posted copies of the same file.
        digest = hashlib.md5(data).hexdigest()
        if digest in seen_hashes:
            return False
        seen_hashes.add(digest)
        if orig_filename:
            name = re.sub(r'[^\w\-_.]', '_', orig_filename)
        else:
            name = re.sub(r'[^\w\-_.]', '_', Path(urlparse(url).path).name)
        if not name or name == '_':
            name = f"{digest}.jpg"
        target = output_dir / name
        # Name collision with a different file: disambiguate via hash prefix.
        if target.exists():
            target = output_dir / f"{target.stem}_{digest[:8]}{target.suffix}"
        target.write_bytes(data)
        return True
    except Exception as e:
        display = orig_filename if orig_filename else url[:80]
        print(f" Error: {display}: {e}", flush=True)
        return False
def main():
    """Walk every page of the thread, harvesting and downloading all media."""
    dest = Path(OUTPUT_DIR)
    dest.mkdir(parents=True, exist_ok=True)
    session = requests.Session()
    session.headers.update(HEADERS)
    print(f"Fetching: {THREAD_URL}", flush=True)
    first = session.get(THREAD_URL, timeout=30)
    first.raise_for_status()
    total_pages = get_page_count(first.text)
    print(f"Total pages: {total_pages}", flush=True)
    seen_hashes = set()
    total_downloaded = 0
    total_skipped = 0
    for page_num in range(1, total_pages + 1):
        # Page 1 was already fetched above; later pages use the /page/N/ path.
        if page_num == 1:
            page_url, html = THREAD_URL, first.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                reply = session.get(page_url, timeout=30)
                reply.raise_for_status()
            except Exception as e:
                print(f" Error fetching page {page_num}: {e}", flush=True)
                continue
            html = reply.text
        images = extract_images_from_html(html, page_url)
        page_dl = 0
        for img_url in images:
            if download_media(img_url, dest, session, seen_hashes):
                page_dl += 1
                total_downloaded += 1
            else:
                total_skipped += 1
        print(f"Page {page_num}/{total_pages}: {page_dl} downloaded ({len(images)} found, {total_downloaded} total)", flush=True)
        # Be polite to the forum between page fetches.
        if page_num < total_pages:
            time.sleep(1)
    print(f"\nDone! {total_downloaded} images saved to {dest}", flush=True)
    print(f"Skipped: {total_skipped}", flush=True)
if __name__ == "__main__":
    main()