#!/usr/bin/env python3
"""
Bellazon Forum Thread Image Scraper

Downloads all full-size images from a Bellazon forum thread.
Bellazon uses <a href="full.jpg"><img src="full_thumb.jpg"></a> pattern.
"""

import re
import sys
import time
import hashlib
import requests
from pathlib import Path
from urllib.parse import urlparse, urljoin
from html import unescape
# Thread URL and output directory come from argv, with defaults for the
# original target thread.
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"

# Browser-like headers so the forum serves normal pages (many forums reject
# the default requests/urllib user agent).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.bellazon.com/',
}

# File extensions treated as downloadable media.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS

# URL substrings that mark forum chrome (avatars, emoji, UI assets) rather
# than post content; matched case-insensitively by should_skip().
SKIP_PATTERNS = [
    'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_',
    'star_', '/js/', '/css/', 'button', 'logo', 'loading',
    'spinner', 'pixel', 'spacer', '/default_photo',
    'profile_photo', '/skin_', '/set_resources/', 'screenshot',
]
def get_page_count(html: str) -> int:
    """Return the thread's total page count parsed from 'Page X of Y' text.

    Falls back to 1 when no pagination marker is present (single-page thread).
    """
    found = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    if found is None:
        return 1
    return int(found.group(1))
def is_media_url(url: str) -> bool:
    """True when the URL's path ends in a known image/video extension."""
    path = urlparse(url).path
    suffix = Path(path).suffix
    return suffix.lower() in MEDIA_EXTENSIONS
def should_skip(url: str) -> bool:
    """True when the URL matches any junk pattern (avatars, icons, UI chrome)."""
    haystack = url.lower()
    for pattern in SKIP_PATTERNS:
        if pattern in haystack:
            return True
    return False
def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size media URLs from one page of thread HTML.

    Four patterns are scanned, in priority order:
      1. <a href="full.jpg"><img src="thumb.jpg"></a> — the standard
         Bellazon thumbnail-links-to-full-image markup.
      2. Standalone <img src="..."> tags (skipping known thumbnails).
      3. Plain links to media files hosted off bellazon.
      4. Forum attachment links (attachment.php?id=N) whose link text
         carries a media filename.

    Returns a de-duplicated, order-preserving list whose items are either
    plain URL strings or (url, filename) tuples for attachments.
    """
    images = []
    thumb_urls = set()  # thumbnails seen in pattern 1, excluded from pattern 2

    # Pattern 1: anchor wrapping an image — the href is the full-size file.
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))

        if is_media_url(href) and not should_skip(href):
            images.append(urljoin(base_url, href))
            # Remember the thumbnail so pattern 2 doesn't re-add it.
            thumb_urls.add(urljoin(base_url, img_src))

    # Pattern 2: standalone <img> tags not wrapped in full-size links.
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        if full_url in thumb_urls:
            continue  # already collected the full-size version in pattern 1
        if '_thumb' in url or '.thumb.' in url:
            continue  # obvious thumbnail naming convention
        if is_media_url(url):
            images.append(full_url)

    # Pattern 3: direct links to media files hosted off bellazon.
    # BUGFIX: apply should_skip() here too, consistent with patterns 1 and 2,
    # so externally hosted avatar/icon/UI images are not collected.
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if (parsed.netloc and 'bellazon' not in parsed.netloc
                and is_media_url(url) and not should_skip(url)):
            images.append(url)

    # Pattern 4: forum attachments whose link text is the original filename,
    # e.g. <a href="...attachment.php?id=6887160">B7A65853...MP4</a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)</a>',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        if Path(link_text).suffix.lower() in MEDIA_EXTENSIONS:
            images.append((urljoin(base_url, href), link_text))  # (url, filename)

    # De-duplicate while preserving first-seen order; tuples key on their URL.
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
def download_media(item, output_dir: Path, session: requests.Session, seen_hashes: set) -> bool:
    """Download one media item; return True only when a new file was written.

    ``item`` is either a URL string or a (url, original_filename) tuple (the
    tuple form comes from forum attachment links). Content is de-duplicated
    across pages by MD5 via the shared ``seen_hashes`` set; non-media content
    types and tiny responses (icons/placeholders) are rejected. Errors are
    printed and swallowed — this is a best-effort scraper.
    """
    if isinstance(item, tuple):
        url, orig_filename = item
    else:
        url, orig_filename = item, None

    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False

        content_type = resp.headers.get('content-type', '')
        if not any(t in content_type for t in ['image', 'video', 'octet-stream']):
            return False

        data = resp.content
        if len(data) < 5000:  # Skip tiny files (icons/placeholders)
            return False

        # Content-hash dedupe: the same image is often reposted across pages.
        file_hash = hashlib.md5(data).hexdigest()
        if file_hash in seen_hashes:
            return False
        seen_hashes.add(file_hash)

        if orig_filename:
            filename = re.sub(r'[^\w\-_.]', '_', orig_filename)
        else:
            parsed = urlparse(url)
            filename = re.sub(r'[^\w\-_.]', '_', Path(parsed.path).name)
            if not filename or filename == '_':
                # BUGFIX: derive the fallback extension from the Content-Type
                # instead of always using '.jpg' — the response may be a
                # video or a non-JPEG image.
                if 'video' in content_type:
                    ext = '.mp4'
                elif 'png' in content_type:
                    ext = '.png'
                elif 'gif' in content_type:
                    ext = '.gif'
                else:
                    ext = '.jpg'
                filename = f"{file_hash}{ext}"

        filepath = output_dir / filename
        if filepath.exists():
            # Same name, different content: disambiguate with a hash prefix.
            filepath = output_dir / f"{filepath.stem}_{file_hash[:8]}{filepath.suffix}"

        filepath.write_bytes(data)
        return True

    except Exception as e:  # best-effort: report and move on to the next item
        display = url[:80] if not orig_filename else orig_filename
        print(f" Error: {display}: {e}", flush=True)
        return False
def main():
    """Walk every page of the thread and download all discovered media."""
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    session = requests.Session()
    session.headers.update(HEADERS)

    print(f"Fetching: {THREAD_URL}", flush=True)
    resp = session.get(THREAD_URL, timeout=30)
    resp.raise_for_status()

    total_pages = get_page_count(resp.text)
    print(f"Total pages: {total_pages}", flush=True)

    seen_hashes = set()
    total_downloaded = 0
    total_skipped = 0

    for page_num in range(1, total_pages + 1):
        # Page 1 was fetched above for the page count; reuse that response.
        if page_num == 1:
            page_url, html = THREAD_URL, resp.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                resp = session.get(page_url, timeout=30)
                resp.raise_for_status()
            except Exception as e:
                print(f" Error fetching page {page_num}: {e}", flush=True)
                continue
            html = resp.text

        found = extract_images_from_html(html, page_url)

        page_dl = 0
        for media_item in found:
            if download_media(media_item, output_dir, session, seen_hashes):
                page_dl += 1
                total_downloaded += 1
            else:
                total_skipped += 1

        print(f"Page {page_num}/{total_pages}: {page_dl} downloaded ({len(found)} found, {total_downloaded} total)", flush=True)

        # Be polite: pause between page fetches (skip after the final page).
        if page_num < total_pages:
            time.sleep(1)

    print(f"\nDone! {total_downloaded} images saved to {output_dir}", flush=True)
    print(f"Skipped: {total_skipped}", flush=True)
# Script entry point; thread URL and output dir may be given via sys.argv.
if __name__ == "__main__":
    main()