24 KiB
24 KiB
Code Review - Specific Fix Examples
This document provides concrete code examples for implementing the recommended fixes from the comprehensive code review.
1. FIX: Token Exposure in URLs
Current Code (web/frontend/src/lib/api.ts:558-568)
// "Before" example, intentionally left insecure: the auth token is read from
// localStorage and appended to the thumbnail URL as a `token` query parameter.
getMediaThumbnailUrl(filePath: string, mediaType: 'image' | 'video') {
  const token = localStorage.getItem('auth_token')
  // Empty string when nobody is logged in, so the URL is still well-formed.
  const tokenParam = token ? `&token=${encodeURIComponent(token)}` : ''
  return `${API_BASE}/media/thumbnail?file_path=${encodeURIComponent(filePath)}&media_type=${mediaType}${tokenParam}`
}
Recommended Fix
// Ask the backend for a preview ticket for `filePath`, so that the auth token
// itself never has to be placed in a media URL.
async getMediaPreviewTicket(filePath: string): Promise<{ticket: string}> {
  const payload = { file_path: filePath }
  return this.post('/media/preview-ticket', payload)
}
// Frontend uses ticket (short-lived, single-use)
//
// Builds a thumbnail URL that is authorised by a preview ticket instead of the
// auth token. Resolves to '' when there is no auth token in localStorage.
// The method is async because the ticket has to be requested from the backend
// first, so callers must await the result.
async getMediaThumbnailUrl(filePath: string, mediaType: 'image' | 'video'): Promise<string> {
  const token = localStorage.getItem('auth_token')
  if (!token) return ''
  // Request ticket instead of embedding token.
  // getMediaPreviewTicket resolves to `{ ticket }`, so unwrap the string before
  // interpolating it (interpolating the object would yield "[object Object]").
  const { ticket } = await this.getMediaPreviewTicket(filePath)
  const query = [
    `file_path=${encodeURIComponent(filePath)}`,
    `media_type=${mediaType}`,
    `ticket=${encodeURIComponent(ticket)}`,
  ].join('&')
  return `${API_BASE}/media/thumbnail?${query}`
}
// Always include Authorization header for critical operations
//
// Returns the headers for JSON API calls. When an auth token is stored it is
// sent as a Bearer token in the Authorization header, not as a URL parameter.
private getAuthHeaders(): HeadersInit {
  const token = localStorage.getItem('auth_token')
  // Typed as a plain string map: HeadersInit is a union that also covers
  // Headers and [string, string][], neither of which can be indexed by name.
  // A Record<string, string> is still assignable to the HeadersInit return type.
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  }
  if (token) {
    headers['Authorization'] = `Bearer ${token}` // Use header, not URL param
  }
  return headers
}
Backend Implementation
# In api.py
class PreviewTicketRequest(BaseModel):
    """JSON body of POST /api/media/preview-ticket."""

    # Path of the media file the ticket will authorise.
    file_path: str


@app.post("/api/media/preview-ticket")
async def create_preview_ticket(
    request: PreviewTicketRequest,
    current_user: Dict = Depends(get_current_user)
) -> Dict:
    """Create short-lived, single-use ticket for media preview

    The file path is read from the JSON request body, which is what the
    frontend's ``post('/media/preview-ticket', {file_path})`` sends. A bare
    ``file_path: str`` parameter would be read from the query string instead.

    Args:
        request: Body carrying the ``file_path`` to authorise.
        current_user: Authenticated user; its ``username`` is stored on the ticket.

    Returns:
        ``{'ticket': <token>}`` where the token is a random URL-safe string.
    """
    # NOTE(review): assumes pydantic's BaseModel is already imported in api.py
    # (other request models in this document are pydantic models) - confirm.
    import secrets
    import time

    ticket = secrets.token_urlsafe(32)
    expiry = time.time() + 300  # 5 minutes
    # Store in Redis or in-memory cache
    preview_tickets[ticket] = {
        'file_path': request.file_path,
        'user': current_user['username'],
        'expiry': expiry,
        'used': False
    }
    return {'ticket': ticket}
@app.get("/api/media/thumbnail")
async def get_thumbnail(
    file_path: str,
    media_type: str,
    ticket: Optional[str] = None,
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)
) -> StreamingResponse:
    """Serve thumbnail with ticket or authorization header

    Authentication is attempted with the Authorization header first. The
    ticket is only consulted when the header did not authenticate, so a
    header-authenticated request neither consumes a ticket nor fails because
    of a stale one.

    Raises:
        HTTPException: 401 when the ticket is expired or already used, or
            when neither mechanism authenticates; 403 when the ticket was
            issued for a different file.
    """
    # Local import: the ticket expiry check below needs `time`.
    import time

    auth_user = None
    # Try authorization header first
    # NOTE(review): presumably `security` is configured not to reject requests
    # without a header (otherwise ticket-only requests never get here) - confirm.
    if credentials:
        payload = app_state.auth.verify_session(credentials.credentials)
        if payload:
            auth_user = payload
    # Or use ticket
    if auth_user is None and ticket and ticket in preview_tickets:
        ticket_data = preview_tickets[ticket]
        if time.time() > ticket_data['expiry']:
            raise HTTPException(status_code=401, detail="Ticket expired")
        if ticket_data['used']:
            raise HTTPException(status_code=401, detail="Ticket already used")
        # A ticket only authorises the file it was issued for; without this
        # check any valid ticket could be replayed against an arbitrary path.
        if ticket_data['file_path'] != file_path:
            raise HTTPException(status_code=403, detail="Ticket not valid for this file")
        auth_user = {'username': ticket_data['user']}
        preview_tickets[ticket]['used'] = True
    if not auth_user:
        raise HTTPException(status_code=401, detail="Not authenticated")
    # ... rest of implementation
2. FIX: Path Traversal Vulnerability
Problem Code (api.py file handling)
# UNSAFE - vulnerable to path traversal
# ("Before" example: the query parameter is opened as a path with no validation.)
file_path = request.query_params.get('file_path')
with open(file_path, 'rb') as f: # Could be /etc/passwd!
    return FileResponse(f)
Recommended Fix
from pathlib import Path
import os
# Safe path validation utility
def validate_file_path(file_path: str, allowed_base: str = None) -> Path:
    """
    Validate file path is within allowed directory.
    Prevents ../../../etc/passwd style attacks.

    Both paths are resolved (symlinks and ``..`` collapsed) before comparing.

    Args:
        file_path: Path supplied by the client.
        allowed_base: Directory the file must live under; defaults to
            '/opt/media-downloader/downloads' when None.

    Returns:
        The resolved path of an existing regular file inside ``allowed_base``.

    Raises:
        HTTPException: 400 if the path cannot be resolved at all, 403 if it is
            outside ``allowed_base`` or is not a regular file, 404 if it does
            not exist.
    """
    if allowed_base is None:
        allowed_base = '/opt/media-downloader/downloads'
    # Convert to absolute paths
    try:
        requested_path = Path(file_path).resolve()
        base_path = Path(allowed_base).resolve()
    except (ValueError, OSError, RuntimeError):
        # e.g. an embedded NUL byte or a symlink loop; report it as a client
        # error instead of letting it surface as an unhandled exception.
        raise HTTPException(status_code=400, detail="Invalid file path") from None
    # Check if requested path is within base directory
    try:
        requested_path.relative_to(base_path)
    except ValueError:
        raise HTTPException(
            status_code=403,
            detail="Access denied - path traversal detected"
        ) from None
    # Check file exists
    if not requested_path.exists():
        raise HTTPException(status_code=404, detail="File not found")
    # Check it's a file, not directory
    if not requested_path.is_file():
        raise HTTPException(status_code=403, detail="Invalid file")
    return requested_path
# Safe endpoint implementation
@app.get("/api/media/preview")
async def get_media_preview(
    file_path: str,
    current_user: Dict = Depends(get_current_user)
) -> FileResponse:
    """Serve media file with safe path validation

    The requested path is passed through validate_file_path() and a
    FileResponse is built for the path it returns.

    Raises:
        HTTPException: re-raised unchanged when raised inside the try block;
            any other exception is logged and replaced by a 500 response.
    """
    try:
        safe_path = validate_file_path(file_path)
        return FileResponse(safe_path)
    except HTTPException:
        # Listed before the generic handler so these keep their own status codes.
        raise
    except Exception as e:
        logger.error(f"Error serving file: {e}")
        raise HTTPException(status_code=500, detail="Error serving file")
3. FIX: CSRF Protection
Add CSRF Middleware
# In api.py
# NOTE(review): Starlette itself does not appear to ship a
# `starlette.middleware.csrf` module. A CSRFMiddleware with a similar interface
# is published as the separate `starlette-csrf` package (imported as
# `starlette_csrf`), and its keyword names/types may differ from the ones used
# below - confirm the import path and arguments before copying this snippet.
from starlette.middleware.csrf import CSRFMiddleware
app.add_middleware(
    CSRFMiddleware,
    secret_key=SESSION_SECRET_KEY,
    safe_methods=['GET', 'HEAD', 'OPTIONS'],
    exempt_urls=['/api/auth/login', '/api/auth/logout'], # Public endpoints
)
Frontend Implementation
// web/frontend/src/lib/api.ts
//
// POST `data` as JSON to `${API_BASE}${endpoint}` with the auth headers and the
// CSRF token attached, and resolve to the parsed JSON response.
// Throws Error(`API error: <statusText>`) on any non-2xx response; a 401 also
// triggers handleUnauthorized() first.
async post<T>(endpoint: string, data?: any): Promise<T> {
  // Get CSRF token from cookie or meta tag
  const csrfToken = this.getCSRFToken()
  // Only an absent payload (undefined/null) omits the body. Falsy payloads
  // such as 0, false or '' are valid JSON values and must still be sent.
  const hasBody = data !== undefined && data !== null
  const response = await fetch(`${API_BASE}${endpoint}`, {
    method: 'POST',
    headers: {
      ...this.getAuthHeaders(),
      'X-CSRFToken': csrfToken, // Include CSRF token
    },
    body: hasBody ? JSON.stringify(data) : undefined,
  })
  if (!response.ok) {
    if (response.status === 401) {
      this.handleUnauthorized()
    }
    throw new Error(`API error: ${response.statusText}`)
  }
  // 204 No Content has no body, so response.json() would reject.
  if (response.status === 204) {
    return undefined as T
  }
  return response.json()
}
// Look up the CSRF token: first in <meta name="csrf-token">, then in the
// `csrftoken` cookie. Returns '' when neither is present.
private getCSRFToken(): string {
  // Try to get from meta tag
  const meta = document.querySelector('meta[name="csrf-token"]')
  if (meta) {
    return meta.getAttribute('content') || ''
  }
  // Or from cookie
  const prefix = 'csrftoken='
  const csrfCookie = document.cookie.split('; ').find(c => c.startsWith(prefix))
  if (!csrfCookie) {
    return ''
  }
  // Take everything after the first '=': the value itself may contain '='
  // (e.g. base64 padding), which split('=')[1] would silently cut off.
  return csrfCookie.slice(prefix.length)
}
4. FIX: Subprocess Command Injection
Vulnerable Code (modules/tiktok_module.py:294)
# DANGEROUS - username not escaped
username = "test'; rm -rf /; echo '"
output_dir = "/downloads"
# This could execute arbitrary commands!
# NOTE(review): as written, a command *string* is passed to subprocess.run
# without shell=True; on POSIX such a string is used as the program name and is
# not parsed by a shell. The injection described here requires shell=True (or
# os.system) - confirm against the real call in modules/tiktok_module.py.
cmd = f"yt-dlp -o '%(title)s.%(ext)s' https://www.tiktok.com/@{username}"
result = subprocess.run(cmd, capture_output=True, text=True, cwd=output_dir)
Recommended Fix
import subprocess
import shlex
from typing import List
def safe_run_command(cmd: List[str], cwd: str = None, *, timeout: float = 300, **kwargs) -> subprocess.CompletedProcess:
    """
    Safely run command with list-based arguments (prevents injection).
    Never use shell=True with user input.

    Args:
        cmd: Program and arguments as a sequence; a single command string is
            rejected because it invites shell-style quoting mistakes.
        cwd: Working directory for the child process, or None to inherit.
        timeout: Seconds to wait before giving up (default 300).
        **kwargs: Extra keyword arguments forwarded to ``subprocess.run``.
            ``capture_output`` and ``text`` default to True but may be
            overridden; ``shell=True`` is refused.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished command. A non-zero
        exit status is NOT an error here; callers check ``returncode``.

    Raises:
        ValueError: if ``cmd`` is a string, ``shell=True`` is requested, the
            command times out, or it cannot be started. The original exception
            is chained as ``__cause__``.
    """
    if isinstance(cmd, (str, bytes)):
        raise ValueError("Command must be a list of arguments, not a string")
    if kwargs.get('shell'):
        raise ValueError("shell=True is not allowed")
    # Defaults rather than fixed arguments, so a caller-supplied value does not
    # collide with them ("got multiple values for keyword argument").
    kwargs.setdefault('capture_output', True)
    kwargs.setdefault('text', True)
    try:
        # Use list form - much safer than string form
        return subprocess.run(list(cmd), cwd=cwd, timeout=timeout, **kwargs)
    except subprocess.TimeoutExpired as e:
        raise ValueError("Command timed out") from e
    except Exception as e:
        raise ValueError(f"Command failed: {e}") from e
# Usage with validation
def download_tiktok_video(username: str, output_dir: str) -> bool:
    """Download TikTok video safely

    Args:
        username: Account name, with or without a leading '@'.
        output_dir: Directory yt-dlp runs in (and therefore writes to).

    Returns:
        True when yt-dlp exits with status 0, False when it fails or cannot
        be run (the failure is logged).

    Raises:
        ValueError: if ``username`` is empty, longer than 100 characters, or
            contains anything other than ASCII letters, digits, '.', '_', '-'.
    """
    import re

    # Validate input
    if not username or len(username) > 100:
        raise ValueError("Invalid username")
    # A leading '@' is accepted for convenience; the URL template adds its own.
    handle = username.lstrip('@')
    # Reject instead of silently stripping characters: stripping would turn
    # "a;b" into "ab" and download a different account than the one requested.
    # NOTE(review): '.' is allowed on the assumption that TikTok handles may
    # contain periods - confirm against the platform's rules.
    if not re.fullmatch(r'[A-Za-z0-9._-]+', handle):
        raise ValueError("Invalid username")
    # Build command as list (safer)
    cmd = [
        'yt-dlp',
        '-o', '%(title)s.%(ext)s',
        f'https://www.tiktok.com/@{handle}'
    ]
    try:
        result = safe_run_command(cmd, cwd=output_dir)
        if result.returncode != 0:
            logger.error(f"yt-dlp error: {result.stderr}")
            return False
        return True
    except Exception as e:
        logger.error(f"Failed to download TikTok: {e}")
        return False
5. FIX: Input Validation on Config
Current Vulnerable Code (api.py:349-351)
@app.put("/api/config")
async def update_config(
    config: ConfigUpdate, # Raw dict, no validation
    current_user: Dict = Depends(get_current_user)
):
    """Update configuration"""
    # "Before" example: config.config is merged into app_state.config as received.
    app_state.config.update(config.config)
    return {"success": True}
Recommended Fix with Validation
from pydantic import BaseModel, Field, validator
from typing import Optional, Dict, Any
# Define validated config schemas
class PlatformConfig(BaseModel):
    """Per-platform settings accepted by the config endpoint.

    All ranges are enforced by the Field constraints. A separate validator
    re-checking ``check_interval_hours`` for 1-24 was dropped: it ran after the
    constraint had already rejected every out-of-range value, so it could
    never raise.
    """

    enabled: bool = True
    # gt=0 with an int type means 1-24 hours.
    check_interval_hours: int = Field(gt=0, le=24)
    max_retries: int = Field(ge=1, le=10)
    timeout_seconds: int = Field(gt=0, le=3600)
class MediaDownloaderConfig(BaseModel):
    """General downloader settings accepted by the config endpoint.

    Numeric ranges are enforced by the Field constraints. The former
    ``validate_concurrent`` validator repeated the 1-20 check behind
    ``Field(ge=1, le=20)`` and could never raise, so it was removed.
    """

    download_path: str
    max_concurrent_downloads: int = Field(ge=1, le=20)
    enable_deduplication: bool = True
    enable_face_recognition: bool = False
    recycle_bin_enabled: bool = True
    recycle_bin_retention_days: int = Field(ge=1, le=365)

    @validator('download_path')
    def validate_path(cls, v):
        """Require ``download_path`` to be an existing directory."""
        from pathlib import Path

        p = Path(v)
        if not p.exists():
            raise ValueError('Download path does not exist')
        if not p.is_dir():
            raise ValueError('Download path must be a directory')
        return str(p)
class ConfigUpdate(BaseModel):
    """Request body for a configuration update; every section is optional."""

    # Sections left out of the request default to None.
    instagram: Optional[PlatformConfig] = None
    tiktok: Optional[PlatformConfig] = None
    forums: Optional[PlatformConfig] = None
    general: Optional[MediaDownloaderConfig] = None
# Safe endpoint with validation
@app.put("/api/config")
async def update_config(
    update: ConfigUpdate,  # Automatically validated by Pydantic
    current_user: Dict = Depends(get_current_user)
) -> Dict:
    """Update configuration with validation

    Each section present in the request (and not null) is saved through
    ``app_state.settings.set`` and then applied to ``app_state.config``.

    Returns:
        A dict with ``success``, ``message`` and ``updated_keys`` (the section
        names that were present in the request).

    Raises:
        HTTPException: 500 if saving or applying a section fails. Payload
            validation errors never reach this function; the request model
            rejects them first.
    """
    # update.dict() also converts the nested section models into plain dicts,
    # so calling .dict() on its values fails with AttributeError. Use it only
    # to learn which sections were sent, and read the models via getattr.
    requested_keys = list(update.dict(exclude_unset=True))
    sections = {}
    for key in requested_keys:
        section = getattr(update, key)
        if section is not None:
            sections[key] = section.dict()

    # Log who made the change
    logger.info(f"User {current_user['username']} updating config: {requested_keys}")

    try:
        for key, values in sections.items():
            # Save to database first, so the in-memory config never holds a
            # value that failed to persist.
            app_state.settings.set(
                key,
                values,
                category=key,
                updated_by=current_user['username']
            )
            # Merge with existing config (separate copy from the one saved above)
            app_state.config[key] = dict(values)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Config update failed: {e}")
        # A failure here is a server-side problem, not an invalid payload, and
        # the exception text is kept out of the response.
        raise HTTPException(
            status_code=500,
            detail="Failed to update configuration"
        )

    return {
        "success": True,
        "message": "Configuration updated successfully",
        "updated_keys": requested_keys
    }
6. FIX: JSON Metadata Search Performance
Current Inefficient Code (unified_database.py:576-590)
def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]:
    """Get download record by Instagram media ID"""
    with self.get_connection() as conn:
        cursor = conn.cursor()
        # This causes FULL TABLE SCAN on large datasets!
        # Both patterns begin with '%', i.e. they match anywhere in the metadata text.
        pattern1 = f'%"media_id": "{media_id}"%'
        pattern2 = f'%"media_id"%{media_id}%'
        cursor.execute('''
SELECT * FROM downloads
WHERE platform = ?
AND (metadata LIKE ? OR metadata LIKE ?)
LIMIT 1
''', (platform, pattern1, pattern2))
        # (The excerpt ends here; the result handling is not shown.)
Recommended Fix - Option 1: Separate Column
# Schema modification (add once)
def _init_database(self):
    """Initialize database with optimized schema

    Adds a ``media_id`` column to ``downloads`` (if missing) and an index on
    ``(media_id, platform)`` covering rows whose ``media_id`` is not NULL.
    Safe to call repeatedly.

    Raises:
        sqlite3.OperationalError: for any failure other than the column
            already existing (e.g. the ``downloads`` table is missing).
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()
        # Add separate column for media_id (indexed)
        try:
            cursor.execute("ALTER TABLE downloads ADD COLUMN media_id TEXT")
        except sqlite3.OperationalError as e:
            # Only "column already exists" is expected here; anything else
            # (missing table, locked database, ...) must not be hidden.
            if "duplicate column name" not in str(e).lower():
                raise
        # NOTE(review): rows written before this migration keep media_id NULL,
        # so lookups by the new column presumably miss them until they are
        # backfilled from metadata - confirm whether a backfill step exists.
        # Create efficient index
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_media_id_platform
            ON downloads(media_id, platform)
            WHERE media_id IS NOT NULL
        ''')
        conn.commit()
def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]:
    """Get download record by Instagram media ID (fast)

    Looks the record up by the dedicated ``media_id`` column and returns the
    first matching row as a dict, or None when nothing matches.
    """
    with self.get_connection() as conn:
        cur = conn.cursor()
        params = (platform, media_id)
        # Equality match on the media_id column instead of a LIKE scan
        cur.execute('''
SELECT id, url, platform, source, content_type,
filename, file_path, post_date, download_date,
file_size, file_hash, metadata
FROM downloads
WHERE platform = ? AND media_id = ?
LIMIT 1
''', params)
        record = cur.fetchone()
        return dict(record) if record else None
def record_download(self, media_id: str = None, **kwargs):
    """Record download with media_id extracted to separate column"""
    # ... existing code ...
    # NOTE(review): cursor, url_hash, url, platform, ..., metadata are not
    # defined in this excerpt; presumably the elided code above provides them.
    cursor.execute('''
INSERT INTO downloads (
url_hash, url, platform, source, content_type,
filename, file_path, file_size, file_hash,
post_date, status, error_message, metadata, media_id
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
        url_hash, url, platform, source, content_type,
        filename, file_path, file_size, file_hash,
        post_date.isoformat() if post_date else None,
        status, error_message,
        json.dumps(metadata) if metadata else None,
        media_id # Store separately for fast lookup
    ))
Recommended Fix - Option 2: JSON_EXTRACT (requires SQLite's JSON functions, which are built in by default since SQLite 3.38)
# Uses SQLite's JSON functions to compare the exact media_id key instead of LIKE patterns.
# NOTE(review): without an expression index on the JSON_EXTRACT expression this
# presumably still visits every row of the platform - confirm with EXPLAIN QUERY PLAN.
def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]:
    """Get download record by Instagram media ID using JSON_EXTRACT

    Returns the first matching row as a dict, or None when nothing matches.
    The ``metadata`` value is decoded from JSON when possible and left as
    stored otherwise.
    """
    with self.get_connection() as conn:
        cur = conn.cursor()
        cur.execute('''
SELECT id, url, platform, source, content_type,
filename, file_path, post_date, download_date,
file_size, file_hash, metadata
FROM downloads
WHERE platform = ?
AND JSON_EXTRACT(metadata, '$.media_id') = ?
LIMIT 1
''', (platform, media_id))
        row = cur.fetchone()
        if not row:
            return None
        record = dict(row)
        raw_metadata = record.get('metadata')
        if raw_metadata:
            # Parse metadata; json.JSONDecodeError is a ValueError subclass.
            try:
                record['metadata'] = json.loads(raw_metadata)
            except (ValueError, TypeError):
                pass
        return record
7. FIX: Bare Exception Handlers
Problematic Code (fastdl_module.py, media-downloader.py)
# A bare except also catches KeyboardInterrupt and SystemExit.
except: # Too broad!
    break
Recommended Fix
import sqlite3
import requests
from requests.exceptions import RequestException, Timeout, ConnectionError
# Be specific about which exceptions to catch
try:
# ... code that might fail ...
download_file(url)
except (RequestException, Timeout, ConnectionError) as e:
# Handle network errors
logger.warning(f"Network error downloading {url}: {e}")
if isinstance(e, Timeout):
# Retry with longer timeout
continue
else:
# Skip this file
break
except sqlite3.OperationalError as e:
# Handle database errors specifically
if "database is locked" in str(e):
logger.warning("Database locked, retrying...")
time.sleep(1)
continue
else:
logger.error(f"Database error: {e}")
raise
except (OSError, IOError) as e:
# Handle file system errors
logger.error(f"File system error: {e}")
break
except Exception as e:
# Only catch unexpected errors as last resort
logger.error(f"Unexpected error: {type(e).__name__}: {e}", exc_info=True)
break
8. FIX: Async File I/O
Current Blocking Code (web/backend/api.py)
# This blocks the async event loop!
# ("Before" example; `processed_image` comes from the elided processing step.)
@app.get("/api/media/thumbnail")
async def get_thumbnail(file_path: str):
    # Synchronous file I/O blocks other requests
    with open(file_path, 'rb') as f:
        image = Image.open(f)
        # ... process image ...
    return FileResponse(processed_image)
Recommended Fix with aiofiles
import aiofiles
from PIL import Image
import io
@app.get("/api/media/thumbnail")
async def get_thumbnail(
    file_path: str,
    media_type: str,
    current_user: Dict = Depends(get_current_user_media)
) -> StreamingResponse:
    """Serve thumbnail efficiently without blocking

    The file is read with aiofiles and the image work runs in the default
    executor, so the event loop stays free for other requests.

    Raises:
        HTTPException: whatever validate_file_path() raises for a rejected
            path; 404 if the file disappears before it is read; 500 for any
            other failure while creating the thumbnail.
    """
    # Local import: this snippet's import list does not include asyncio.
    import asyncio

    try:
        # Same path check as the media preview endpoint, so this route cannot
        # be used to read files outside the download directory.
        # NOTE(review): assumes validate_file_path lives in the same module - confirm.
        safe_path = validate_file_path(file_path)
        # Use aiofiles for non-blocking file I/O
        async with aiofiles.open(safe_path, 'rb') as f:
            file_data = await f.read()
        # Offload CPU-bound image processing to thread pool.
        # get_running_loop() is the call intended for use inside a coroutine.
        loop = asyncio.get_running_loop()
        thumbnail = await loop.run_in_executor(
            None,  # Use default executor (ThreadPoolExecutor)
            _create_thumbnail,
            file_data,
            media_type
        )
        return StreamingResponse(
            io.BytesIO(thumbnail),
            media_type="image/jpeg"
        )
    except HTTPException:
        # Keep the status code chosen by the path validation.
        raise
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="File not found")
    except Exception as e:
        logger.error(f"Error creating thumbnail: {e}")
        raise HTTPException(status_code=500, detail="Error creating thumbnail")
def _create_thumbnail(file_data: bytes, media_type: str) -> bytes:
    """CPU-bound function to create thumbnail

    Args:
        file_data: Raw bytes of the source image.
        media_type: Not used by this function.
            NOTE(review): video input presumably needs a frame extracted
            before it reaches this function - confirm with the caller.

    Returns:
        JPEG-encoded bytes of a thumbnail that fits within 200x200.

    Raises:
        Exception: any error from decoding or encoding is logged and re-raised.
    """
    try:
        # The context manager releases the decoder's resources when done.
        with Image.open(io.BytesIO(file_data)) as image:
            image.thumbnail((200, 200))
            # JPEG has no alpha channel or palette; saving e.g. an RGBA or P
            # image (typical for PNG/GIF sources) raises unless converted first.
            if image.mode in ('RGB', 'L'):
                encodable = image
            else:
                encodable = image.convert('RGB')
            output = io.BytesIO()
            encodable.save(output, format='JPEG', quality=85)
            return output.getvalue()
    except Exception as e:
        logger.error(f"Thumbnail creation failed: {e}")
        raise
9. FIX: Adapter Duplication
Current Duplicated Code (unified_database.py:1708-2080)
# FastDLDatabaseAdapter
# (Outline only: the method bodies are elided, so this excerpt is not runnable as-is.)
class FastDLDatabaseAdapter:
    def __init__(self, unified_db: UnifiedDatabase):
        self.db = unified_db
        self.platform = 'fastdl'

    def is_already_downloaded(self, media_id: str) -> bool:
        # ... 20+ lines of duplicate code ...

    def record_download(self, media_id: str, username: str, **kwargs):
        # ... 30+ lines of duplicate code ...
# TikTokDatabaseAdapter (similar structure)
# ToolzuDatabaseAdapter (similar structure)
# CoppermineDatabaseAdapter (similar structure)
# ... and more
Recommended Fix: Generic Base Adapter
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
class BaseDatabaseAdapter(ABC):
    """Generic adapter for unified database compatibility

    Holds the lookup and record logic once. Subclasses supply the
    platform-specific pieces: ``_id_key``, ``_build_url``, ``build_metadata``
    and ``get_identifier``.
    """

    def __init__(self, unified_db: "UnifiedDatabase", platform: str):
        self.db = unified_db
        self.platform = platform

    @abstractmethod
    def get_identifier(self, data: Dict[str, Any]) -> str:
        """Extract unique identifier from data"""
        pass

    @abstractmethod
    def build_metadata(self, data: Dict[str, Any]) -> Dict:
        """Build platform-specific metadata"""
        pass

    def is_already_downloaded(self, identifier: str) -> bool:
        """Check if content is already downloaded

        Searches this platform's ``downloads.metadata`` text for the
        ``"<id key>": "<identifier>"`` pair and returns True if a row has it.
        """
        import json

        # Encode key and value with json.dumps so quotes or non-ASCII
        # characters in the identifier are written the same way json.dumps
        # writes them into stored metadata.
        needle = f'{json.dumps(self._id_key())}: {json.dumps(identifier)}'
        # Escape LIKE wildcards: otherwise '%' or '_' inside an identifier
        # matches other records. The backslash must be escaped first.
        for special in ('\\', '%', '_'):
            needle = needle.replace(special, '\\' + special)
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT 1 FROM downloads
                WHERE platform = ? AND metadata LIKE ? ESCAPE '\\'
                LIMIT 1
            ''', (self.platform, f'%{needle}%'))
            return cursor.fetchone() is not None

    @abstractmethod
    def _id_key(self) -> str:
        """Return the metadata key for identifier"""
        pass

    def record_download(
        self,
        identifier: str,
        source: str,
        **kwargs
    ) -> bool:
        """Record download with platform-specific data

        Args:
            identifier: Platform identifier of the content; stored in the
                metadata under ``_id_key()``.
            source: Passed through as the record's ``source``.
            **kwargs: Optional ``content_type`` (default 'post'), ``filename``,
                ``file_path`` and ``post_date``; all of kwargs is also handed
                to ``build_metadata``.

        Returns:
            Whatever ``self.db.record_download`` returns.
        """
        url = self._build_url(identifier, source, kwargs)
        metadata = self.build_metadata({
            **kwargs,
            self._id_key(): identifier
        })
        # Calculate file hash if provided
        file_hash = None
        if kwargs.get('file_path'):
            try:
                file_hash = UnifiedDatabase.get_file_hash(kwargs['file_path'])
            except Exception:
                # Best effort: the download is still recorded without a hash.
                pass
        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=source,
            content_type=kwargs.get('content_type', 'post'),
            filename=kwargs.get('filename'),
            file_path=kwargs.get('file_path'),
            file_hash=file_hash,
            post_date=kwargs.get('post_date'),
            metadata=metadata
        )

    @abstractmethod
    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        """Build URL for the content"""
        pass
# Concrete implementations
class FastDLDatabaseAdapter(BaseDatabaseAdapter):
    """Adapter for the 'fastdl' platform; records are keyed by ``media_id``."""

    def __init__(self, unified_db: UnifiedDatabase):
        super().__init__(unified_db, 'fastdl')

    def _id_key(self) -> str:
        return 'media_id'

    def get_identifier(self, data: Dict) -> str:
        return data.get('media_id', '')

    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        # Prefer an explicit download URL; otherwise use a placeholder URL.
        download_url = kwargs.get('download_url')
        if download_url:
            return download_url
        return f"instagram://{identifier}"

    def build_metadata(self, data: Dict) -> Dict:
        # media_id and source come first; remaining fields (except file_path)
        # follow and may overwrite 'source' if the caller passed one.
        metadata = {'media_id': data.get('media_id'), 'source': 'fastdl'}
        for key, value in data.items():
            if key not in ['media_id', 'file_path']:
                metadata[key] = value
        return metadata
class TikTokDatabaseAdapter(BaseDatabaseAdapter):
    """Adapter for the 'tiktok' platform; records are keyed by ``video_id``."""

    def __init__(self, unified_db: UnifiedDatabase):
        super().__init__(unified_db, 'tiktok')

    def _id_key(self) -> str:
        return 'video_id'

    def get_identifier(self, data: Dict) -> str:
        return data.get('video_id', '')

    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        # `source` is used as the account segment of the video URL.
        return f"https://www.tiktok.com/@{source}/video/{identifier}"

    def build_metadata(self, data: Dict) -> Dict:
        # video_id first, then every other field as given.
        metadata = {'video_id': data.get('video_id')}
        for key, value in data.items():
            if key != 'video_id':
                metadata[key] = value
        return metadata
class SnapchatDatabaseAdapter(BaseDatabaseAdapter):
    """Adapter for the 'snapchat' platform; records are keyed by ``story_id``."""

    def __init__(self, unified_db: UnifiedDatabase):
        super().__init__(unified_db, 'snapchat')

    def _id_key(self) -> str:
        return 'story_id'

    def get_identifier(self, data: Dict) -> str:
        return data.get('story_id', '')

    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        # `or` instead of a .get() default, so url=None or '' also falls back
        # to the placeholder (same pattern as FastDLDatabaseAdapter._build_url).
        return kwargs.get('url') or f"snapchat://{identifier}"

    def build_metadata(self, data: Dict) -> Dict:
        # Shallow copy so the caller's dict is not shared with the stored metadata.
        return data.copy()
# ... similar for other platforms ...
Summary
These code examples provide concrete implementations for the major security, performance, and quality issues identified in the review. The fixes follow Python/TypeScript best practices and can be implemented incrementally.
Start with security fixes (sections 1-5), then move to performance (sections 6-8), then code quality (section 9).