678
modules/paid_content/reddit_client.py
Normal file
678
modules/paid_content/reddit_client.py
Normal file
@@ -0,0 +1,678 @@
|
||||
"""
|
||||
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
|
||||
|
||||
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
|
||||
to produce Post/Attachment objects for the paid content system.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class RedditClient(LoggingMixin):
    """
    Client for fetching Reddit subreddit content via gallery-dl.

    gallery-dl downloads files during fetch, so attachments come with local_path
    already set. The sync handler moves files to their final location.
    """

    # Identifiers written into Post.service_id / Post.platform for every
    # post produced by this client.
    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging, keep the DB handle, and locate the gallery-dl binary.

    Args:
        unified_db: Optional database wrapper exposing get_connection();
            used to read stored Reddit cookies.
        log_callback: Optional callable forwarded to the logging mixin.
    """
    self._init_logger('PaidContent', log_callback, default_module='Reddit')
    self.unified_db = unified_db
    # Prefer whatever gallery-dl is on PATH; otherwise fall back to the
    # binary bundled with the media-downloader virtualenv.
    found = shutil.which('gallery-dl')
    self.gallery_dl_path = found if found else '/opt/media-downloader/venv/bin/gallery-dl'
|
||||
|
||||
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info by checking the Reddit JSON API.

    Args:
        subreddit: Subreddit name (without the r/ prefix).

    Returns:
        Dict with at least creator_id and creator_name. When the subreddit
        is public, also includes display_name, bio, joined_date,
        profile_image_url and banner_image_url. None if the subreddit does
        not exist (404) or the lookup fails entirely.
    """
    import html
    import urllib.request
    import urllib.error

    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())

        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')

        # Extract icon — community_icon is higher res, icon_img is fallback.
        # Reddit HTML-escapes these URLs (e.g. '&amp;'), so decode entities.
        # BUG FIX: the previous replace call was a no-op and left URLs escaped.
        icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
        icon_url = html.unescape(icon_url) if icon_url else None

        # Extract banner — banner_background_image is the main one
        banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
        banner_url = banner_url.split('?')[0] if banner_url else None
        if banner_url:
            banner_url = html.unescape(banner_url)

        # Build bio from title + public description (+ subscriber count)
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        bio = ' — '.join(bio_parts) if bio_parts else None

        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError):
                pass

        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'

        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url or None,
            'banner_image_url': banner_url or None,
        }

    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        elif e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        else:
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            # Return basic info and let sync verify
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
|
||||
|
||||
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
|
||||
progress_callback=None) -> tuple:
|
||||
"""Fetch posts and download media from a subreddit using gallery-dl.
|
||||
|
||||
Args:
|
||||
subreddit: Subreddit name (without r/)
|
||||
since_date: ISO date string; skip posts older than this
|
||||
max_posts: Maximum posts to fetch (0 = unlimited)
|
||||
progress_callback: Optional callable(downloaded_count, skipped_count, latest_file)
|
||||
for live progress updates
|
||||
|
||||
Returns:
|
||||
Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
|
||||
when done moving files. Returns ([], None) on failure.
|
||||
"""
|
||||
temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
|
||||
|
||||
try:
|
||||
downloaded = self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
|
||||
progress_callback=progress_callback)
|
||||
|
||||
if not downloaded:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
return [], None
|
||||
|
||||
# Group files by post using metadata sidecars
|
||||
grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)
|
||||
|
||||
if not grouped:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
return [], None
|
||||
|
||||
posts = []
|
||||
for post_id, post_data in grouped.items():
|
||||
attachments = []
|
||||
for file_path in post_data['files']:
|
||||
ext = file_path.suffix.lower()
|
||||
file_type = self._detect_file_type(ext)
|
||||
|
||||
attachments.append(Attachment(
|
||||
name=file_path.name,
|
||||
file_type=file_type,
|
||||
extension=ext,
|
||||
server_path=str(file_path), # temp path, will be moved
|
||||
download_url=None, # Already downloaded
|
||||
file_size=file_path.stat().st_size if file_path.exists() else None,
|
||||
))
|
||||
|
||||
if not attachments:
|
||||
continue
|
||||
|
||||
post = Post(
|
||||
post_id=post_id,
|
||||
service_id=self.SERVICE_ID,
|
||||
platform=self.PLATFORM,
|
||||
creator_id=subreddit.lower(),
|
||||
title=post_data.get('title'),
|
||||
content=post_data.get('title'),
|
||||
published_at=post_data.get('date'),
|
||||
attachments=attachments,
|
||||
)
|
||||
posts.append(post)
|
||||
|
||||
self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
|
||||
return posts, temp_dir
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
return [], None
|
||||
|
||||
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: Optional ISO date string; posts before it are filtered out
        max_posts: Cap via gallery-dl --range (0 = unlimited)
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time

    # Use a separate download archive for paid content reddit so this
    # feature's dedup state doesn't mix with other gallery-dl users.
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]

    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])

    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])

    # Date filtering — gallery-dl evaluates the --filter expression per post,
    # with `date` available as a datetime in its filter namespace.
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
        except (ValueError, IndexError):
            pass

    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

    # Check for Reddit cookies file; written as a Netscape-format file
    # inside temp_dir so it's cleaned up with everything else.
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []  # files awaiting the next batch_callback flush

    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        # NOTE(review): stderr is piped but only read after exit; a very
        # chatty stderr could fill the pipe and stall gallery-dl — confirm
        # output volume stays small in practice.

        start_time = time.time()
        timeout_secs = 7200  # 2 hours

        while True:
            # Hard wall-clock timeout; kill the child rather than hang forever.
            if time.time() - start_time > timeout_secs:
                proc.kill()
                self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                break

            line = proc.stdout.readline()
            # Empty read + exited process means stdout is drained.
            if not line and proc.poll() is not None:
                break
            if not line:
                continue

            line = line.strip()
            if not line:
                continue

            if line.startswith('# '):
                # Skipped file (already in archive)
                skip_count += 1
            else:
                # Downloaded file — gallery-dl prints the full path
                dl_count += 1
                file_path = Path(line)
                # Exclude metadata sidecars from the batch stream.
                if file_path.exists() and not file_path.name.endswith('.json'):
                    pending_files.append(file_path)

            # Throttled progress updates: every 5th file seen.
            total = dl_count + skip_count
            if progress_callback and total % 5 == 0:
                progress_callback(dl_count, skip_count, total)

            # Flush batch for processing
            if batch_callback and len(pending_files) >= batch_size:
                batch_callback(list(pending_files))
                pending_files.clear()

        proc.wait()

        # Final batch
        if batch_callback and pending_files:
            batch_callback(list(pending_files))
            pending_files.clear()

        # Final (unthrottled) progress update with the end totals.
        if progress_callback:
            progress_callback(dl_count, skip_count, dl_count + skip_count)

        # gallery-dl exit codes: treat partial-failure codes (1, 4, 5)
        # as acceptable since some posts may simply be unavailable.
        returncode = proc.returncode
        if returncode not in (None, 0, 1, 4, 5):
            stderr = proc.stderr.read()
            self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
            if stderr:
                self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        # Best-effort: report the failure but still return whatever counts
        # were accumulated before the error.
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')

    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
def _group_files_by_post(self, files: List[Path], temp_dir: str,
                         subreddit: str) -> Dict[str, Dict]:
    """Group downloaded files by Reddit post ID using metadata JSON sidecars.

    Adapted from reddit_community_monitor.py:_group_files_by_post

    Args:
        files: Downloaded media file paths.
        temp_dir: Download directory (currently unused; kept for signature
            compatibility with callers).
        subreddit: Fallback subreddit name when metadata lacks one.

    Returns:
        Dict mapping reddit_post_id -> {
            'files': [Path],
            'title': str,
            'date': str,
            'source_url': str
        }
    """
    posts: Dict[str, Dict] = {}

    for file_path in files:
        # Look for matching metadata JSON sidecar — gallery-dl may write
        # either name.ext.json or name.json depending on configuration.
        json_path = file_path.with_suffix(file_path.suffix + '.json')
        if not json_path.exists():
            json_path = file_path.with_suffix('.json')

        metadata = {}
        if json_path.exists():
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
            # BUG FIX: previously `except (json.JSONDecodeError, Exception)`
            # — the tuple was redundant/misleading since Exception already
            # covers JSONDecodeError (and OSError from open()).
            except Exception as e:
                self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')

        # Extract Reddit post ID from metadata, preferring the direct id.
        reddit_post_id = None
        for key in ('id', 'reddit_id', 'parent_id'):
            if key in metadata:
                reddit_post_id = str(metadata[key])
                break

        if not reddit_post_id:
            # Filename-based fallback: subreddit_postid_num.ext
            # NOTE(review): assumes gallery-dl's default reddit filename
            # layout — verify against the configured filename template.
            parts = file_path.stem.split('_')
            if len(parts) >= 2:
                reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
            else:
                reddit_post_id = file_path.stem

        # Extract post date: try several string formats (interpreted as UTC,
        # converted to local time), then numeric timestamps.
        post_date = None
        if 'date' in metadata:
            date_val = metadata['date']
            if isinstance(date_val, str):
                for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                    try:
                        utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
                        post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                        break
                    except ValueError:
                        continue
                if not post_date:
                    # Unknown format — keep the raw string rather than drop it.
                    post_date = date_val
            elif isinstance(date_val, (int, float)):
                try:
                    post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
                except (ValueError, OSError):
                    pass

        if not post_date and 'created_utc' in metadata:
            try:
                post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
            except (ValueError, OSError):
                pass

        if not post_date:
            # Last resort: current local time (naive) so the post still sorts.
            post_date = datetime.now().isoformat()

        title = metadata.get('title', metadata.get('description', ''))
        sub = metadata.get('subreddit', subreddit)
        source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''

        # First file for a post establishes its title/date/source_url.
        if reddit_post_id not in posts:
            posts[reddit_post_id] = {
                'files': [],
                'title': title,
                'date': post_date,
                'source_url': source_url,
            }

        posts[reddit_post_id]['files'].append(file_path)

    return posts
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
    """Get Reddit cookies JSON from the scrapers table if configured."""
    if self.unified_db is None:
        return None

    query = ("SELECT cookies FROM scrapers "
             "WHERE name = 'reddit' AND cookies IS NOT NULL")
    try:
        with self.unified_db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            result = cur.fetchone()
        if result and result[0]:
            return result[0]
    except Exception as exc:
        # Cookies are optional — a failed lookup just means no cookies.
        self.log(f"Could not load Reddit cookies: {exc}", 'debug')

    return None
|
||||
|
||||
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
    """Convert JSON cookies array to Netscape cookie file format.

    Args:
        cookies_json: JSON string holding a list of cookie dicts
            (browser-export style keys).
        output_path: Destination path for the Netscape-format file.

    Returns:
        True on success, False if the JSON is not a list or writing fails.
    """
    try:
        parsed = json.loads(cookies_json)
        if not isinstance(parsed, list):
            return False

        with open(output_path, 'w') as fh:
            fh.write("# Netscape HTTP Cookie File\n")
            fh.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
            for entry in parsed:
                dom = entry.get('domain', '')
                # Leading dot on the domain means the cookie covers subdomains.
                subdomains = 'TRUE' if dom.startswith('.') else 'FALSE'
                cookie_path = entry.get('path', '/')
                secure_flag = 'TRUE' if entry.get('secure', False) else 'FALSE'
                # Browser exports vary in which expiry key they use.
                raw_expiry = entry.get('expirationDate', entry.get('expiry', entry.get('expires', 0)))
                if raw_expiry is None:
                    raw_expiry = 0
                expiry = str(int(float(raw_expiry)))
                fields = [dom, subdomains, cookie_path, secure_flag, expiry,
                          entry.get('name', ''), entry.get('value', '')]
                fh.write("\t".join(fields) + "\n")

        return True
    except Exception as exc:
        self.log(f"Failed to write Netscape cookie file: {exc}", 'error')
        return False
|
||||
|
||||
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the full archive using created_utc ascending order.
    Rate-limited to ~1 request per 2 seconds.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.request
    import urllib.error

    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    all_posts = []
    current_after = after_ts
    page = 0
    # ROBUSTNESS FIX: the previous 429 handling retried forever; cap
    # consecutive rate-limit retries so a blocked client cannot spin
    # indefinitely. The counter resets on any successful request.
    rate_limit_strikes = 0
    max_rate_limit_strikes = 10

    while True:
        params = (
            f'subreddit={subreddit}'
            f'&size=100'
            f'&sort=asc'
            f'&sort_type=created_utc'
            f'&after={current_after}'
        )
        if before_ts is not None:
            params += f'&before={before_ts}'

        url = f'{base_url}?{params}'
        page += 1

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                rate_limit_strikes += 1
                if rate_limit_strikes > max_rate_limit_strikes:
                    self.log(f"Pullpush rate limited {rate_limit_strikes} times in a row for r/{subreddit}; giving up", 'error')
                    break
                self.log("Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break

        rate_limit_strikes = 0  # successful request resets the strike count

        posts = data.get('data', [])
        if not posts:
            break

        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })

        last_ts = posts[-1].get('created_utc', 0)

        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")

        # Handle stuck pagination — same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts

        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break

        # Rate limit: 2s between requests
        time.sleep(2)

    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
|
||||
|
||||
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time

    # Same archive as normal Reddit paid content sync, so backfill and
    # regular syncs never re-download each other's files.
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []  # files awaiting the next batch_callback flush

    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        # NOTE(review): as in run_gallery_dl, stderr is only drained after
        # exit; a very chatty stderr could fill the pipe — confirm.

        start_time = time.time()
        timeout_secs = 14400  # 4 hours for backfill (can be large)

        while True:
            # Hard wall-clock timeout; kill the child rather than hang forever.
            if time.time() - start_time > timeout_secs:
                proc.kill()
                self.log("gallery-dl backfill timed out", 'error')
                break

            line = proc.stdout.readline()
            # Empty read + exited process means stdout is drained.
            if not line and proc.poll() is not None:
                break
            if not line:
                continue

            line = line.strip()
            if not line:
                continue

            if line.startswith('# '):
                # Skipped file (already in the shared archive)
                skip_count += 1
            else:
                # Downloaded file — gallery-dl prints the full path;
                # exclude metadata sidecars from the batch stream.
                dl_count += 1
                file_path = Path(line)
                if file_path.exists() and not file_path.name.endswith('.json'):
                    pending_files.append(file_path)

            # Backfill reports progress on every file (no throttling).
            total = dl_count + skip_count
            if progress_callback:
                progress_callback(dl_count, skip_count, total)

            if batch_callback and len(pending_files) >= batch_size:
                batch_callback(list(pending_files))
                pending_files.clear()

        proc.wait()

        # Final batch
        if batch_callback and pending_files:
            batch_callback(list(pending_files))
            pending_files.clear()

        if progress_callback:
            progress_callback(dl_count, skip_count, dl_count + skip_count)

        # gallery-dl partial-failure codes (1, 4, 5) are acceptable here.
        returncode = proc.returncode
        if returncode not in (None, 0, 1, 4, 5):
            stderr = proc.stderr.read()
            self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
            if stderr:
                self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        # Best-effort: report but still return accumulated counts.
        self.log(f"gallery-dl backfill failed: {e}", 'error')

    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
@staticmethod
|
||||
def _detect_file_type(ext: str) -> str:
|
||||
"""Detect file type from extension."""
|
||||
ext = ext.lower().lstrip('.')
|
||||
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
||||
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
|
||||
|
||||
if ext in image_exts:
|
||||
return 'image'
|
||||
elif ext in video_exts:
|
||||
return 'video'
|
||||
return 'unknown'
|
||||
Reference in New Issue
Block a user