679 lines
26 KiB
Python
679 lines
26 KiB
Python
"""
|
|
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
|
|
|
|
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
|
|
to produce Post/Attachment objects for the paid content system.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Post, Attachment
|
|
|
|
|
|
class RedditClient(LoggingMixin):
    """
    Client for fetching Reddit subreddit content via gallery-dl.

    gallery-dl downloads files during fetch, so attachments come with local_path
    already set. The sync handler moves files to their final location.
    """

    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'

    def __init__(self, unified_db=None, log_callback=None):
        # unified_db: optional DB handle, used only to look up stored Reddit
        # cookies in _get_cookies_file(). log_callback is forwarded to the
        # LoggingMixin logger setup.
        self._init_logger('PaidContent', log_callback, default_module='Reddit')
        self.unified_db = unified_db
        # Prefer gallery-dl found on PATH; fall back to the known venv binary.
        self.gallery_dl_path = shutil.which('gallery-dl') or '/opt/media-downloader/venv/bin/gallery-dl'
|
|
|
|
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
|
|
"""Get basic subreddit info by checking the Reddit JSON API.
|
|
|
|
Returns dict with creator_id and creator_name.
|
|
"""
|
|
import urllib.request
|
|
import urllib.error
|
|
|
|
try:
|
|
# Quick check via Reddit's public JSON endpoint
|
|
url = f'https://www.reddit.com/r/{subreddit}/about.json'
|
|
req = urllib.request.Request(url, headers={
|
|
'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
|
|
})
|
|
with urllib.request.urlopen(req, timeout=15) as resp:
|
|
data = json.loads(resp.read().decode())
|
|
|
|
sub_data = data.get('data', {})
|
|
display_name = sub_data.get('display_name', subreddit)
|
|
title = sub_data.get('title', '')
|
|
|
|
# Extract icon — community_icon is higher res, icon_img is fallback
|
|
icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
|
|
# HTML entities in URLs
|
|
icon_url = icon_url.replace('&', '&') if icon_url else None
|
|
|
|
# Extract banner — banner_background_image is the main one
|
|
banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
|
|
banner_url = banner_url.split('?')[0] if banner_url else None
|
|
if banner_url:
|
|
banner_url = banner_url.replace('&', '&')
|
|
|
|
# Build bio from title + public description
|
|
public_desc = sub_data.get('public_description', '')
|
|
bio_parts = []
|
|
if title:
|
|
bio_parts.append(title)
|
|
if public_desc and public_desc != title:
|
|
bio_parts.append(public_desc)
|
|
subscribers = sub_data.get('subscribers')
|
|
if subscribers:
|
|
bio_parts.append(f"{subscribers:,} subscribers")
|
|
bio = ' — '.join(bio_parts) if bio_parts else None
|
|
|
|
# Subreddit creation date
|
|
created_utc = sub_data.get('created_utc')
|
|
joined_date = None
|
|
if created_utc:
|
|
try:
|
|
joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
|
|
except (ValueError, OSError):
|
|
pass
|
|
|
|
# Use the subreddit title as display name (e.g. "Reddit Pics")
|
|
# Fall back to r/name format if no title
|
|
friendly_name = title if title else f'r/{display_name}'
|
|
|
|
return {
|
|
'creator_id': display_name.lower(),
|
|
'creator_name': f'r/{display_name}',
|
|
'display_name': friendly_name,
|
|
'bio': bio,
|
|
'joined_date': joined_date,
|
|
'profile_image_url': icon_url or None,
|
|
'banner_image_url': banner_url or None,
|
|
}
|
|
|
|
except urllib.error.HTTPError as e:
|
|
if e.code == 404:
|
|
self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
|
|
return None
|
|
elif e.code == 403:
|
|
# Private/quarantined — still exists, return basic info
|
|
self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
|
|
return {
|
|
'creator_id': subreddit.lower(),
|
|
'creator_name': f'r/{subreddit}',
|
|
}
|
|
else:
|
|
self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
|
|
# Return basic info and let sync verify
|
|
return {
|
|
'creator_id': subreddit.lower(),
|
|
'creator_name': f'r/{subreddit}',
|
|
}
|
|
except Exception as e:
|
|
self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
|
|
return None
|
|
|
|
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
|
|
progress_callback=None) -> tuple:
|
|
"""Fetch posts and download media from a subreddit using gallery-dl.
|
|
|
|
Args:
|
|
subreddit: Subreddit name (without r/)
|
|
since_date: ISO date string; skip posts older than this
|
|
max_posts: Maximum posts to fetch (0 = unlimited)
|
|
progress_callback: Optional callable(downloaded_count, skipped_count, latest_file)
|
|
for live progress updates
|
|
|
|
Returns:
|
|
Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
|
|
when done moving files. Returns ([], None) on failure.
|
|
"""
|
|
temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
|
|
|
|
try:
|
|
downloaded = self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
|
|
progress_callback=progress_callback)
|
|
|
|
if not downloaded:
|
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
return [], None
|
|
|
|
# Group files by post using metadata sidecars
|
|
grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)
|
|
|
|
if not grouped:
|
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
return [], None
|
|
|
|
posts = []
|
|
for post_id, post_data in grouped.items():
|
|
attachments = []
|
|
for file_path in post_data['files']:
|
|
ext = file_path.suffix.lower()
|
|
file_type = self._detect_file_type(ext)
|
|
|
|
attachments.append(Attachment(
|
|
name=file_path.name,
|
|
file_type=file_type,
|
|
extension=ext,
|
|
server_path=str(file_path), # temp path, will be moved
|
|
download_url=None, # Already downloaded
|
|
file_size=file_path.stat().st_size if file_path.exists() else None,
|
|
))
|
|
|
|
if not attachments:
|
|
continue
|
|
|
|
post = Post(
|
|
post_id=post_id,
|
|
service_id=self.SERVICE_ID,
|
|
platform=self.PLATFORM,
|
|
creator_id=subreddit.lower(),
|
|
title=post_data.get('title'),
|
|
content=post_data.get('title'),
|
|
published_at=post_data.get('date'),
|
|
attachments=attachments,
|
|
)
|
|
posts.append(post)
|
|
|
|
self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
|
|
return posts, temp_dir
|
|
|
|
except Exception as e:
|
|
self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
|
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
return [], None
|
|
|
|
    def run_gallery_dl(self, subreddit: str, temp_dir: str,
                       since_date: str = None, max_posts: int = 0,
                       progress_callback=None, batch_callback=None,
                       batch_size: int = 50) -> dict:
        """Run gallery-dl to download media from a subreddit.

        Streams stdout line-by-line. Calls progress_callback for status updates
        and batch_callback with lists of new file paths for incremental processing.

        Args:
            subreddit: Subreddit name (without r/); /new/ listing is scraped
            temp_dir: Directory gallery-dl downloads into (-d)
            since_date: ISO date string; its YYYY-MM-DD prefix feeds a
                gallery-dl --filter expression
            max_posts: Maximum posts to fetch via --range (0 = unlimited)
            progress_callback: Called with (dl_count, skip_count, total_seen)
            batch_callback: Called with (new_files: List[Path]) every batch_size files
            batch_size: How many files to accumulate before calling batch_callback

        Returns:
            Dict with dl_count, skip_count, total.
        """
        import time

        # Use a separate download archive for paid content reddit
        archive_dir = '/opt/media-downloader/data/cache'
        os.makedirs(archive_dir, exist_ok=True)
        archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

        cmd = [
            self.gallery_dl_path,
            '--write-metadata',
            '--download-archive', archive_path,
            '-d', temp_dir,
        ]

        # REST API mode to avoid shared OAuth rate limits
        cmd.extend(['-o', 'extractor.reddit.api=rest'])

        # Limit posts (0 = unlimited)
        if max_posts > 0:
            cmd.extend(['--range', f'1-{max_posts}'])

        # Date filtering
        if since_date:
            try:
                cutoff = since_date[:10]  # YYYY-MM-DD
                cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
            except (ValueError, IndexError):
                pass

        cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

        # Check for Reddit cookies file
        cookies_file = self._get_cookies_file()
        if cookies_file:
            # Cookies are stored as JSON; gallery-dl wants Netscape format
            temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
            if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
                cmd.extend(['--cookies', temp_cookie_file])

        self.log(f"Running gallery-dl for r/{subreddit}", 'info')
        self.log(f"Command: {' '.join(cmd)}", 'debug')

        dl_count = 0
        skip_count = 0
        pending_files = []  # media paths awaiting the next batch_callback flush

        try:
            # NOTE(review): stderr is PIPEd but only read after the process
            # exits; a very chatty stderr could fill the pipe buffer and stall
            # gallery-dl — confirm output volume stays small in practice.
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )

            start_time = time.time()
            timeout_secs = 7200  # 2 hours

            while True:
                # Wall-clock deadline is only checked between line reads, so a
                # single blocking readline() can outlive it slightly.
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                    break

                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break  # EOF and process has exited
                if not line:
                    continue

                line = line.strip()
                if not line:
                    continue

                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    # Ignore metadata sidecars; queue only real media files
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)

                total = dl_count + skip_count
                # Throttle progress updates to every 5th file seen
                if progress_callback and total % 5 == 0:
                    progress_callback(dl_count, skip_count, total)

                # Flush batch for processing
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()

            proc.wait()

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            # Exit codes 1/4/5 are tolerated here (presumably partial-failure
            # conditions — confirm against gallery-dl docs); anything else is
            # logged with a stderr excerpt.
            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr = proc.stderr.read()
                self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

        except Exception as e:
            self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')

        self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
        return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
|
|
|
def _group_files_by_post(self, files: List[Path], temp_dir: str,
|
|
subreddit: str) -> Dict[str, Dict]:
|
|
"""Group downloaded files by Reddit post ID using metadata JSON sidecars.
|
|
|
|
Adapted from reddit_community_monitor.py:_group_files_by_post
|
|
|
|
Returns:
|
|
Dict mapping reddit_post_id -> {
|
|
'files': [Path],
|
|
'title': str,
|
|
'date': str,
|
|
'source_url': str
|
|
}
|
|
"""
|
|
posts: Dict[str, Dict] = {}
|
|
|
|
for file_path in files:
|
|
# Look for matching metadata JSON sidecar
|
|
json_path = file_path.with_suffix(file_path.suffix + '.json')
|
|
if not json_path.exists():
|
|
json_path = file_path.with_suffix('.json')
|
|
|
|
metadata = {}
|
|
if json_path.exists():
|
|
try:
|
|
with open(json_path, 'r', encoding='utf-8') as f:
|
|
metadata = json.load(f)
|
|
except (json.JSONDecodeError, Exception) as e:
|
|
self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
|
|
|
|
# Extract Reddit post ID
|
|
reddit_post_id = None
|
|
for key in ('id', 'reddit_id', 'parent_id'):
|
|
if key in metadata:
|
|
reddit_post_id = str(metadata[key])
|
|
break
|
|
|
|
if not reddit_post_id:
|
|
# Filename-based fallback: subreddit_postid_num.ext
|
|
parts = file_path.stem.split('_')
|
|
if len(parts) >= 2:
|
|
reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
|
|
else:
|
|
reddit_post_id = file_path.stem
|
|
|
|
# Extract post date
|
|
post_date = None
|
|
if 'date' in metadata:
|
|
date_val = metadata['date']
|
|
if isinstance(date_val, str):
|
|
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
|
|
try:
|
|
utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
|
|
post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
|
|
break
|
|
except ValueError:
|
|
continue
|
|
if not post_date:
|
|
post_date = date_val
|
|
elif isinstance(date_val, (int, float)):
|
|
try:
|
|
post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
|
|
except (ValueError, OSError):
|
|
pass
|
|
|
|
if not post_date and 'created_utc' in metadata:
|
|
try:
|
|
post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
|
|
except (ValueError, OSError):
|
|
pass
|
|
|
|
if not post_date:
|
|
post_date = datetime.now().isoformat()
|
|
|
|
title = metadata.get('title', metadata.get('description', ''))
|
|
sub = metadata.get('subreddit', subreddit)
|
|
source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''
|
|
|
|
if reddit_post_id not in posts:
|
|
posts[reddit_post_id] = {
|
|
'files': [],
|
|
'title': title,
|
|
'date': post_date,
|
|
'source_url': source_url,
|
|
}
|
|
|
|
posts[reddit_post_id]['files'].append(file_path)
|
|
|
|
return posts
|
|
|
|
def _get_cookies_file(self) -> Optional[str]:
|
|
"""Get Reddit cookies JSON from the scrapers table if configured."""
|
|
if not self.unified_db:
|
|
return None
|
|
|
|
try:
|
|
with self.unified_db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
|
|
)
|
|
row = cursor.fetchone()
|
|
if row and row[0]:
|
|
return row[0]
|
|
except Exception as e:
|
|
self.log(f"Could not load Reddit cookies: {e}", 'debug')
|
|
|
|
return None
|
|
|
|
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
|
|
"""Convert JSON cookies array to Netscape cookie file format."""
|
|
try:
|
|
cookies = json.loads(cookies_json)
|
|
if not isinstance(cookies, list):
|
|
return False
|
|
|
|
with open(output_path, 'w') as f:
|
|
f.write("# Netscape HTTP Cookie File\n")
|
|
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
|
|
for cookie in cookies:
|
|
domain = cookie.get('domain', '')
|
|
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
|
path = cookie.get('path', '/')
|
|
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
|
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
|
|
if expires is None:
|
|
expires = 0
|
|
expires = str(int(float(expires)))
|
|
name = cookie.get('name', '')
|
|
value = cookie.get('value', '')
|
|
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
|
|
|
|
return True
|
|
except Exception as e:
|
|
self.log(f"Failed to write Netscape cookie file: {e}", 'error')
|
|
return False
|
|
|
|
    def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                              before_ts: int = None,
                              progress_callback=None) -> List[Dict]:
        """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

        Paginates through the full archive using created_utc ascending order.
        Rate-limited to ~1 request per 2 seconds.

        Args:
            subreddit: Subreddit name (without r/)
            after_ts: Unix timestamp to start from (0 = beginning of time)
            before_ts: Unix timestamp to stop at (None = no upper limit)
            progress_callback: Optional callable(fetched_count, message)

        Returns:
            List of dicts: [{id, title, created_utc, url, is_gallery}, ...]
        """
        import time
        import urllib.request
        import urllib.error

        base_url = 'https://api.pullpush.io/reddit/search/submission/'
        all_posts = []
        current_after = after_ts  # pagination cursor: created_utc of last post seen
        page = 0

        while True:
            # size=100 with ascending created_utc sort lets `after` act as
            # the pagination cursor.
            params = (
                f'subreddit={subreddit}'
                f'&size=100'
                f'&sort=asc'
                f'&sort_type=created_utc'
                f'&after={current_after}'
            )
            if before_ts is not None:
                params += f'&before={before_ts}'

            url = f'{base_url}?{params}'
            page += 1

            try:
                req = urllib.request.Request(url, headers={
                    'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
                })
                with urllib.request.urlopen(req, timeout=30) as resp:
                    data = json.loads(resp.read().decode())
            except urllib.error.HTTPError as e:
                if e.code == 429:
                    # Rate limited: back off and retry the same page
                    self.log(f"Pullpush rate limited, waiting 5s...", 'warning')
                    time.sleep(5)
                    continue
                self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
                break
            except Exception as e:
                self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
                break

            posts = data.get('data', [])
            if not posts:
                break

            # Keep only the fields the backfill pipeline needs
            for post in posts:
                all_posts.append({
                    'id': post.get('id', ''),
                    'title': post.get('title', ''),
                    'created_utc': post.get('created_utc', 0),
                    'url': post.get('url', ''),
                    'is_gallery': post.get('is_gallery', False),
                    'selftext': post.get('selftext', ''),
                })

            last_ts = posts[-1].get('created_utc', 0)

            if progress_callback:
                progress_callback(len(all_posts),
                                  f"Fetched {len(all_posts)} post IDs (page {page})")

            # Handle stuck pagination — same timestamp repeating
            # NOTE(review): bumping past a repeated timestamp could skip
            # other posts sharing that exact created_utc — confirm acceptable.
            if last_ts <= current_after:
                current_after = last_ts + 1
            else:
                current_after = last_ts

            # If we got fewer than 100, we've reached the end
            if len(posts) < 100:
                break

            # Rate limit: 2s between requests
            time.sleep(2)

        self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
        return all_posts
|
|
|
|
    def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                            progress_callback=None, batch_callback=None,
                            batch_size: int = 50) -> dict:
        """Run gallery-dl with --input-file to download specific Reddit post URLs.

        Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
        instead of scraping a subreddit listing.

        Args:
            urls_file: Path to file containing one URL per line
            temp_dir: Directory for gallery-dl to download into
            progress_callback: Called with (dl_count, skip_count, total_seen)
            batch_callback: Called with (new_files: List[Path]) every batch_size files
            batch_size: How many files to accumulate before calling batch_callback

        Returns:
            Dict with dl_count, skip_count, total.
        """
        import time

        # Same archive as normal Reddit paid content sync
        archive_dir = '/opt/media-downloader/data/cache'
        os.makedirs(archive_dir, exist_ok=True)
        archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

        cmd = [
            self.gallery_dl_path,
            '--write-metadata',
            '--download-archive', archive_path,
            '-d', temp_dir,
            '-o', 'extractor.reddit.api=rest',
            '--input-file', urls_file,
        ]

        # Check for Reddit cookies file
        cookies_file = self._get_cookies_file()
        if cookies_file:
            # Cookies are stored as JSON; gallery-dl wants Netscape format
            temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
            if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
                cmd.extend(['--cookies', temp_cookie_file])

        self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
        self.log(f"Command: {' '.join(cmd)}", 'debug')

        dl_count = 0
        skip_count = 0
        pending_files = []  # media paths awaiting the next batch_callback flush

        try:
            # NOTE(review): stderr is PIPEd but only read after exit; a very
            # chatty stderr could fill the pipe buffer and stall gallery-dl.
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )

            start_time = time.time()
            timeout_secs = 14400  # 4 hours for backfill (can be large)

            while True:
                # Deadline is only checked between line reads
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log("gallery-dl backfill timed out", 'error')
                    break

                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break  # EOF and process has exited
                if not line:
                    continue

                line = line.strip()
                if not line:
                    continue

                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    # Ignore metadata sidecars; queue only real media files
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)

                total = dl_count + skip_count
                # Unlike run_gallery_dl(), progress is reported on every file
                if progress_callback:
                    progress_callback(dl_count, skip_count, total)

                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()

            proc.wait()

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            # Exit codes 1/4/5 are tolerated (presumably partial-failure
            # conditions — confirm against gallery-dl docs)
            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr = proc.stderr.read()
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

        except Exception as e:
            self.log(f"gallery-dl backfill failed: {e}", 'error')

        self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
        return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
|
|
|
@staticmethod
|
|
def _detect_file_type(ext: str) -> str:
|
|
"""Detect file type from extension."""
|
|
ext = ext.lower().lstrip('.')
|
|
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
|
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
|
|
|
|
if ext in image_exts:
|
|
return 'image'
|
|
elif ext in video_exts:
|
|
return 'video'
|
|
return 'unknown'
|