#!/usr/bin/env python3
|
|
"""
|
|
Bellazon Forum Thread Image Scraper
|
|
|
|
Downloads all full-size images from a Bellazon forum thread.
|
|
Bellazon uses <a href="full.jpg"><img src="full_thumb.jpg"></a> pattern.
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
import time
|
|
import hashlib
|
|
import requests
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse, urljoin
|
|
from html import unescape
|
|
|
|
# Thread to scrape and output directory; both overridable via argv[1]/argv[2].
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"

# Browser-like request headers so the forum serves normal desktop pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.bellazon.com/',
}

# File extensions treated as downloadable media (compared lowercase).
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS

# URL substrings that mark forum chrome (avatars, icons, UI assets) rather
# than post content; any URL containing one of these is ignored.
SKIP_PATTERNS = [
    'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_',
    'star_', '/js/', '/css/', 'button', 'logo', 'loading',
    'spinner', 'pixel', 'spacer', '/default_photo',
    'profile_photo', '/skin_', '/set_resources/', 'screenshot',
]
|
|
|
|
|
|
def get_page_count(html: str) -> int:
    """Return the thread's total page count from its "Page X of N" text.

    Falls back to 1 when no pagination marker is present (single-page thread).
    """
    found = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    if found is None:
        return 1
    return int(found.group(1))
|
|
|
|
|
|
def is_media_url(url: str) -> bool:
    """Return True when the URL's path ends in a known media extension.

    Only the path component is examined, so query strings don't interfere.
    """
    suffix = Path(urlparse(url).path).suffix
    return suffix.lower() in MEDIA_EXTENSIONS
|
|
|
|
|
|
def should_skip(url: str) -> bool:
    """Return True when the URL matches any known non-content pattern.

    Case-insensitive substring check against SKIP_PATTERNS (forum chrome:
    avatars, icons, UI assets).
    """
    haystack = url.lower()
    for pattern in SKIP_PATTERNS:
        if pattern in haystack:
            return True
    return False
|
|
|
|
|
|
def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size media URLs from a thread page's HTML.

    Four extraction passes, in priority order:
      1. ``<a href="full.jpg"><img src="thumb.jpg"></a>`` -- the Bellazon/IPB
         pattern where a thumbnail links to the full-size image.
      2. Standalone ``<img>`` tags, skipping thumbnails captured in pass 1.
      3. Links to media files hosted off-site (non-bellazon domains).
      4. Forum attachment links (``attachment.php?id=N``) whose link text
         carries a media filename.

    Returns a de-duplicated, order-preserving list whose items are either a
    URL string or, for attachments, a ``(url, filename)`` tuple.
    """
    images = []
    thumb_urls = set()  # thumbnails already covered by a full-size link

    # Pass 1: <a href="full-size"><img src="thumb"></a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))

        if is_media_url(href) and not should_skip(href):
            images.append(urljoin(base_url, href))
            # Remember the thumbnail so pass 2 doesn't re-add it.
            thumb_urls.add(urljoin(base_url, img_src))

    # Pass 2: standalone <img> tags not wrapped in full-size links.
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        if full_url in thumb_urls:
            continue  # already have the full-size version from pass 1
        # IPB names thumbnails with "_thumb" / ".thumb." infixes.
        if '_thumb' in url or '.thumb.' in url:
            continue
        if is_media_url(url):
            images.append(full_url)

    # Pass 3: direct links to media files hosted off-site.
    # Fix: apply should_skip() here too, consistent with passes 1 and 2, so
    # external URLs matching junk patterns (avatars, icons, spinners) are
    # filtered instead of slipping through.
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if (parsed.netloc and 'bellazon' not in parsed.netloc
                and is_media_url(url) and not should_skip(url)):
            images.append(url)

    # Pass 4: forum attachments with media filenames as link text,
    # e.g. <a href="...attachment.php?id=6887160">B7A65853...MP4</a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)</a>',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        if Path(link_text).suffix.lower() in MEDIA_EXTENSIONS:
            # Keep the human-readable filename for saving to disk.
            images.append((urljoin(base_url, href), link_text))

    # De-duplicate while preserving first-seen order; tuples key on the URL.
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
|
|
|
|
|
|
def download_media(item, output_dir: Path, session: requests.Session, seen_hashes: set) -> bool:
    """Fetch one media item and save it to ``output_dir``.

    ``item`` is either a bare URL string or a ``(url, filename)`` tuple from
    a forum attachment link. Duplicate content across calls is detected via
    MD5 through the shared ``seen_hashes`` set. Returns True only when a new
    file was actually written.
    """
    url, orig_filename = item if isinstance(item, tuple) else (item, None)

    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False

        # Reject HTML pages or redirects masquerading as media.
        content_type = resp.headers.get('content-type', '')
        if not any(t in content_type for t in ['image', 'video', 'octet-stream']):
            return False

        data = resp.content
        if len(data) < 5000:  # Skip tiny files (icons/placeholders)
            return False

        # Content-level dedup: identical bytes behind different URLs count once.
        file_hash = hashlib.md5(data).hexdigest()
        if file_hash in seen_hashes:
            return False
        seen_hashes.add(file_hash)

        # Disk name: attachment link text when present, else the URL path leaf.
        raw_name = orig_filename if orig_filename else Path(urlparse(url).path).name
        filename = re.sub(r'[^\w\-_.]', '_', raw_name)
        if not filename or filename == '_':
            filename = f"{file_hash}.jpg"

        filepath = output_dir / filename
        if filepath.exists():
            # Name collision with different content: disambiguate with a hash prefix.
            filepath = output_dir / f"{filepath.stem}_{file_hash[:8]}{filepath.suffix}"

        filepath.write_bytes(data)
        return True

    except Exception as e:
        display = orig_filename if orig_filename else url[:80]
        print(f" Error: {display}: {e}", flush=True)
        return False
|
|
|
|
|
|
def main():
    """Crawl every page of THREAD_URL and download all media to OUTPUT_DIR."""
    dest = Path(OUTPUT_DIR)
    dest.mkdir(parents=True, exist_ok=True)

    session = requests.Session()
    session.headers.update(HEADERS)

    print(f"Fetching: {THREAD_URL}", flush=True)
    resp = session.get(THREAD_URL, timeout=30)
    resp.raise_for_status()

    page_total = get_page_count(resp.text)
    print(f"Total pages: {page_total}", flush=True)

    seen_hashes = set()
    downloaded = 0
    skipped = 0

    for page_num in range(1, page_total + 1):
        # Page 1 was already fetched above; later pages use IPB's /page/N/ URLs.
        if page_num == 1:
            page_url, html = THREAD_URL, resp.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                resp = session.get(page_url, timeout=30)
                resp.raise_for_status()
                html = resp.text
            except Exception as e:
                print(f" Error fetching page {page_num}: {e}", flush=True)
                continue

        found = extract_images_from_html(html, page_url)
        page_hits = 0
        for item in found:
            if download_media(item, dest, session, seen_hashes):
                page_hits += 1
                downloaded += 1
            else:
                skipped += 1

        print(f"Page {page_num}/{page_total}: {page_hits} downloaded ({len(found)} found, {downloaded} total)", flush=True)

        # Be polite to the server between page fetches.
        if page_num < page_total:
            time.sleep(1)

    print(f"\nDone! {downloaded} images saved to {dest}", flush=True)
    print(f"Skipped: {skipped}", flush=True)
|
|
|
|
|
|
# Script entry point: run the scraper when executed directly.
if __name__ == "__main__":
    main()
|