# Code Review - Specific Fix Examples This document provides concrete code examples for implementing the recommended fixes from the comprehensive code review. ## 1. FIX: Token Exposure in URLs ### Current Code (web/frontend/src/lib/api.ts:558-568) ```typescript getMediaThumbnailUrl(filePath: string, mediaType: 'image' | 'video') { const token = localStorage.getItem('auth_token') const tokenParam = token ? `&token=${encodeURIComponent(token)}` : '' return `${API_BASE}/media/thumbnail?file_path=${encodeURIComponent(filePath)}&media_type=${mediaType}${tokenParam}` } ``` ### Recommended Fix ```typescript // Ask the backend for a short-lived preview ticket instead of putting the auth token in the URL async getMediaPreviewTicket(filePath: string): Promise<{ticket: string}> { return this.post('/media/preview-ticket', { file_path: filePath }) } // Frontend uses ticket (short-lived, single-use); async because the ticket must be fetched first async getMediaThumbnailUrl(filePath: string, mediaType: 'image' | 'video'): Promise<string> { const token = localStorage.getItem('auth_token') if (!token) return '' // Request ticket instead of embedding token const { ticket } = await this.getMediaPreviewTicket(filePath) return `${API_BASE}/media/thumbnail?file_path=${encodeURIComponent(filePath)}&media_type=${mediaType}&ticket=${encodeURIComponent(ticket)}` } // Always include Authorization header for critical operations private getAuthHeaders(): HeadersInit { const token = localStorage.getItem('auth_token') const headers: Record<string, string> = { 'Content-Type': 'application/json', } if (token) { headers['Authorization'] = `Bearer ${token}` // Use header, not URL param } return headers } ``` ### Backend Implementation ```python # In api.py from fastapi import Body @app.post("/api/media/preview-ticket") async def create_preview_ticket( file_path: str = Body(..., embed=True), # Read from the JSON body the frontend posts, not from the query string current_user: Dict = Depends(get_current_user) ) -> Dict: """Create short-lived, single-use ticket for media preview""" import secrets import time ticket = secrets.token_urlsafe(32) expiry = time.time() + 300 # 5 minutes # Store in Redis or in-memory cache preview_tickets[ticket] = { 'file_path':
file_path, 'user': current_user['username'], 'expiry': expiry, 'used': False } return {'ticket': ticket} @app.get("/api/media/thumbnail") async def get_thumbnail( file_path: str, media_type: str, ticket: Optional[str] = None, credentials: Optional[HTTPAuthorizationCredentials] = Depends(security) ) -> StreamingResponse: """Serve thumbnail with ticket or authorization header""" import time auth_user = None # NOTE: `security` must not auto-reject requests without an Authorization header (e.g. HTTPBearer(auto_error=False)), otherwise ticket-only requests never reach this code # Try authorization header first if credentials: payload = app_state.auth.verify_session(credentials.credentials) if payload: auth_user = payload # Or use ticket if ticket and ticket in preview_tickets: ticket_data = preview_tickets[ticket] if time.time() > ticket_data['expiry']: raise HTTPException(status_code=401, detail="Ticket expired") if ticket_data['used']: raise HTTPException(status_code=401, detail="Ticket already used") # A ticket is only valid for the file it was issued for if ticket_data['file_path'] != file_path: raise HTTPException(status_code=403, detail="Ticket not valid for this file") auth_user = {'username': ticket_data['user']} preview_tickets[ticket]['used'] = True if not auth_user: raise HTTPException(status_code=401, detail="Not authenticated") # ... rest of implementation ``` --- ## 2. FIX: Path Traversal Vulnerability ### Problem Code (api.py file handling) ```python # UNSAFE - vulnerable to path traversal file_path = request.query_params.get('file_path') with open(file_path, 'rb') as f: # Could be /etc/passwd! return FileResponse(f) ``` ### Recommended Fix ```python from pathlib import Path import os # Safe path validation utility def validate_file_path(file_path: str, allowed_base: str = None) -> Path: """ Validate file path is within allowed directory. Prevents ../../../etc/passwd style attacks.
""" if allowed_base is None: allowed_base = '/opt/media-downloader/downloads' # Convert to absolute paths requested_path = Path(file_path).resolve() base_path = Path(allowed_base).resolve() # Check if requested path is within base directory try: requested_path.relative_to(base_path) except ValueError: raise HTTPException( status_code=403, detail="Access denied - path traversal detected" ) # Check file exists if not requested_path.exists(): raise HTTPException(status_code=404, detail="File not found") # Check it's a file, not directory if not requested_path.is_file(): raise HTTPException(status_code=403, detail="Invalid file") return requested_path # Safe endpoint implementation @app.get("/api/media/preview") async def get_media_preview( file_path: str, current_user: Dict = Depends(get_current_user) ) -> FileResponse: """Serve media file with safe path validation""" try: safe_path = validate_file_path(file_path) return FileResponse(safe_path) except HTTPException: raise except Exception as e: logger.error(f"Error serving file: {e}") raise HTTPException(status_code=500, detail="Error serving file") ``` --- ## 3. FIX: CSRF Protection ### Add CSRF Middleware ```python # In api.py import re # Starlette has no built-in CSRF middleware; this one comes from the third-party `starlette-csrf` package from starlette_csrf import CSRFMiddleware app.add_middleware( CSRFMiddleware, secret=SESSION_SECRET_KEY, safe_methods={'GET', 'HEAD', 'OPTIONS'}, # exempt_urls takes compiled regex patterns, not plain strings exempt_urls=[re.compile(r'^/api/auth/login$'), re.compile(r'^/api/auth/logout$')], # Public endpoints ) ``` ### Frontend Implementation ```typescript // web/frontend/src/lib/api.ts async post<T>(endpoint: string, data?: any): Promise<T> { // Get CSRF token from cookie or meta tag const csrfToken = this.getCSRFToken() const response = await fetch(`${API_BASE}${endpoint}`, { method: 'POST', headers: { ...this.getAuthHeaders(), 'X-CSRFToken': csrfToken, // Include CSRF token }, body: data ?
JSON.stringify(data) : undefined, }) if (!response.ok) { if (response.status === 401) { this.handleUnauthorized() } throw new Error(`API error: ${response.statusText}`) } return response.json() } private getCSRFToken(): string { // Try to get from meta tag const meta = document.querySelector('meta[name="csrf-token"]') if (meta) { return meta.getAttribute('content') || '' } // Or from cookie const cookies = document.cookie.split('; ') const csrfCookie = cookies.find(c => c.startsWith('csrftoken=')) return csrfCookie ? csrfCookie.split('=')[1] : '' } ``` --- ## 4. FIX: Subprocess Command Injection ### Vulnerable Code (modules/tiktok_module.py:294) ```python # DANGEROUS - username not escaped username = "test'; rm -rf /; echo '" output_dir = "/downloads" # This could execute arbitrary commands! cmd = f"yt-dlp -o '%(title)s.%(ext)s' https://www.tiktok.com/@{username}" result = subprocess.run(cmd, capture_output=True, text=True, cwd=output_dir) ``` ### Recommended Fix ```python import subprocess import shlex from typing import List def safe_run_command(cmd: List[str], cwd: str = None, **kwargs) -> subprocess.CompletedProcess: """ Safely run command with list-based arguments (prevents injection). Never use shell=True with user input. 
""" try: # Use list form - much safer than string form result = subprocess.run( cmd, cwd=cwd, capture_output=True, text=True, timeout=300, **kwargs ) return result except subprocess.TimeoutExpired: raise ValueError("Command timed out") except Exception as e: raise ValueError(f"Command failed: {e}") # Usage with validation def download_tiktok_video(username: str, output_dir: str) -> bool: """Download TikTok video safely""" # Validate input if not username or len(username) > 100: raise ValueError("Invalid username") # Remove dangerous characters safe_username = ''.join(c for c in username if c.isalnum() or c in '@_-') # Build command as list (safer) cmd = [ 'yt-dlp', '-o', '%(title)s.%(ext)s', f'https://www.tiktok.com/@{safe_username}' ] try: result = safe_run_command(cmd, cwd=output_dir) if result.returncode != 0: logger.error(f"yt-dlp error: {result.stderr}") return False return True except Exception as e: logger.error(f"Failed to download TikTok: {e}") return False ``` --- ## 5. FIX: Input Validation on Config ### Current Vulnerable Code (api.py:349-351) ```python @app.put("/api/config") async def update_config( config: ConfigUpdate, # Raw dict, no validation current_user: Dict = Depends(get_current_user) ): """Update configuration""" app_state.config.update(config.config) return {"success": True} ``` ### Recommended Fix with Validation ```python from pydantic import BaseModel, Field, validator from typing import Optional, Dict, Any # Define validated config schemas class PlatformConfig(BaseModel): enabled: bool = True check_interval_hours: int = Field(gt=0, le=24) max_retries: int = Field(ge=1, le=10) timeout_seconds: int = Field(gt=0, le=3600) @validator('check_interval_hours') def validate_interval(cls, v): if v < 1 or v > 24: raise ValueError('Interval must be 1-24 hours') return v class MediaDownloaderConfig(BaseModel): download_path: str max_concurrent_downloads: int = Field(ge=1, le=20) enable_deduplication: bool = True enable_face_recognition: bool = False 
recycle_bin_enabled: bool = True recycle_bin_retention_days: int = Field(ge=1, le=365) @validator('max_concurrent_downloads') def validate_concurrent(cls, v): if v < 1 or v > 20: raise ValueError('Max concurrent downloads must be 1-20') return v @validator('download_path') def validate_path(cls, v): from pathlib import Path p = Path(v) if not p.exists(): raise ValueError('Download path does not exist') if not p.is_dir(): raise ValueError('Download path must be a directory') return str(p) class ConfigUpdate(BaseModel): instagram: Optional[PlatformConfig] = None tiktok: Optional[PlatformConfig] = None forums: Optional[PlatformConfig] = None general: Optional[MediaDownloaderConfig] = None # Safe endpoint with validation @app.put("/api/config") async def update_config( update: ConfigUpdate, # Automatically validated by Pydantic current_user: Dict = Depends(get_current_user) ) -> Dict: """Update configuration with validation""" try: # .dict() recursively converts nested models, so every value below is already a plain dict config_dict = update.dict(exclude_unset=True) # Log who made the change logger.info(f"User {current_user['username']} updating config: {list(config_dict.keys())}") # Merge with existing config for key, value in config_dict.items(): if value is not None: app_state.config[key] = value # Save to database for key, value in config_dict.items(): if value is not None: app_state.settings.set( key, value, category=key, updated_by=current_user['username'] ) return { "success": True, "message": "Configuration updated successfully", "updated_keys": list(config_dict.keys()) } except Exception as e: logger.error(f"Config update failed: {e}") raise HTTPException( status_code=400, detail=f"Invalid configuration: {str(e)}" ) ``` --- ## 6.
FIX: JSON Metadata Search Performance ### Current Inefficient Code (unified_database.py:576-590) ```python def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]: """Get download record by Instagram media ID""" with self.get_connection() as conn: cursor = conn.cursor() # This causes FULL TABLE SCAN on large datasets! pattern1 = f'%"media_id": "{media_id}"%' pattern2 = f'%"media_id"%{media_id}%' cursor.execute(''' SELECT * FROM downloads WHERE platform = ? AND (metadata LIKE ? OR metadata LIKE ?) LIMIT 1 ''', (platform, pattern1, pattern2)) ``` ### Recommended Fix - Option 1: Separate Column ```python # Schema modification (add once) def _init_database(self): """Initialize database with optimized schema""" with self.get_connection() as conn: cursor = conn.cursor() # Add separate column for media_id (indexed) try: cursor.execute("ALTER TABLE downloads ADD COLUMN media_id TEXT") except sqlite3.OperationalError: pass # Column already exists # Create efficient index cursor.execute(''' CREATE INDEX IF NOT EXISTS idx_media_id_platform ON downloads(media_id, platform) WHERE media_id IS NOT NULL ''') conn.commit() def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]: """Get download record by Instagram media ID (fast)""" with self.get_connection() as conn: cursor = conn.cursor() # Now uses fast index instead of LIKE scan cursor.execute(''' SELECT id, url, platform, source, content_type, filename, file_path, post_date, download_date, file_size, file_hash, metadata FROM downloads WHERE platform = ? AND media_id = ? LIMIT 1 ''', (platform, media_id)) row = cursor.fetchone() if row: return dict(row) return None def record_download(self, media_id: str = None, **kwargs): """Record download with media_id extracted to separate column""" # ... existing code ... 
cursor.execute(''' INSERT INTO downloads ( url_hash, url, platform, source, content_type, filename, file_path, file_size, file_hash, post_date, status, error_message, metadata, media_id ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( url_hash, url, platform, source, content_type, filename, file_path, file_size, file_hash, post_date.isoformat() if post_date else None, status, error_message, json.dumps(metadata) if metadata else None, media_id # Store separately for fast lookup )) ``` ### Recommended Fix - Option 2: JSON_EXTRACT (if using SQLite 3.38+) ```python # Uses SQLite's built-in JSON functions (more efficient than LIKE) def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]: """Get download record by Instagram media ID using JSON_EXTRACT""" with self.get_connection() as conn: cursor = conn.cursor() cursor.execute(''' SELECT id, url, platform, source, content_type, filename, file_path, post_date, download_date, file_size, file_hash, metadata FROM downloads WHERE platform = ? AND JSON_EXTRACT(metadata, '$.media_id') = ? LIMIT 1 ''', (platform, media_id)) row = cursor.fetchone() if row: result = dict(row) # Parse metadata if result.get('metadata'): try: result['metadata'] = json.loads(result['metadata']) except (ValueError, TypeError, json.JSONDecodeError): pass return result return None ``` --- ## 7. FIX: Bare Exception Handlers ### Problematic Code (fastdl_module.py, media-downloader.py) ```python except: # Too broad! break ``` ### Recommended Fix ```python import sqlite3 import requests from requests.exceptions import RequestException, Timeout, ConnectionError # Be specific about which exceptions to catch try: # ... code that might fail ... 
download_file(url) except (RequestException, Timeout, ConnectionError) as e: # Handle network errors logger.warning(f"Network error downloading {url}: {e}") if isinstance(e, Timeout): # Retry with longer timeout continue else: # Skip this file break except sqlite3.OperationalError as e: # Handle database errors specifically if "database is locked" in str(e): logger.warning("Database locked, retrying...") time.sleep(1) continue else: logger.error(f"Database error: {e}") raise except (OSError, IOError) as e: # Handle file system errors logger.error(f"File system error: {e}") break except Exception as e: # Only catch unexpected errors as last resort logger.error(f"Unexpected error: {type(e).__name__}: {e}", exc_info=True) break ``` --- ## 8. FIX: Async File I/O ### Current Blocking Code (web/backend/api.py) ```python # This blocks the async event loop! @app.get("/api/media/thumbnail") async def get_thumbnail(file_path: str): # Synchronous file I/O blocks other requests with open(file_path, 'rb') as f: image = Image.open(f) # ... process image ... 
return FileResponse(processed_image) ``` ### Recommended Fix with aiofiles ```python import asyncio import aiofiles from PIL import Image import io @app.get("/api/media/thumbnail") async def get_thumbnail( file_path: str, media_type: str, current_user: Dict = Depends(get_current_user_media) ) -> StreamingResponse: """Serve thumbnail efficiently without blocking""" try: # Reuse validate_file_path() from section 2 so a raw user-supplied path is never opened safe_path = validate_file_path(file_path) # Use aiofiles for non-blocking file I/O async with aiofiles.open(safe_path, 'rb') as f: file_data = await f.read() # Offload CPU-bound image processing to thread pool loop = asyncio.get_running_loop() thumbnail = await loop.run_in_executor( None, # Use default executor (ThreadPoolExecutor) _create_thumbnail, file_data, media_type ) return StreamingResponse( io.BytesIO(thumbnail), media_type="image/jpeg" ) except HTTPException: raise except FileNotFoundError: raise HTTPException(status_code=404, detail="File not found") except Exception as e: logger.error(f"Error creating thumbnail: {e}") raise HTTPException(status_code=500, detail="Error creating thumbnail") def _create_thumbnail(file_data: bytes, media_type: str) -> bytes: """CPU-bound function to create thumbnail""" try: image = Image.open(io.BytesIO(file_data)) # JPEG has no alpha channel or palette mode; convert first so RGBA/P images do not fail to save if image.mode != 'RGB': image = image.convert('RGB') image.thumbnail((200, 200)) output = io.BytesIO() image.save(output, format='JPEG', quality=85) return output.getvalue() except Exception as e: logger.error(f"Thumbnail creation failed: {e}") raise ``` --- ## 9. FIX: Adapter Duplication ### Current Duplicated Code (unified_database.py:1708-2080) ```python # FastDLDatabaseAdapter class FastDLDatabaseAdapter: def __init__(self, unified_db: UnifiedDatabase): self.db = unified_db self.platform = 'fastdl' def is_already_downloaded(self, media_id: str) -> bool: # ... 20+ lines of duplicate code ... def record_download(self, media_id: str, username: str, **kwargs): # ... 30+ lines of duplicate code ... # TikTokDatabaseAdapter (similar structure) # ToolzuDatabaseAdapter (similar structure) # CoppermineDatabaseAdapter (similar structure) # ...
and more ``` ### Recommended Fix: Generic Base Adapter ```python from abc import ABC, abstractmethod from typing import Any, Dict, Optional class BaseDatabaseAdapter(ABC): """Generic adapter for unified database compatibility""" def __init__(self, unified_db: UnifiedDatabase, platform: str): self.db = unified_db self.platform = platform @abstractmethod def get_identifier(self, data: Dict[str, Any]) -> str: """Extract unique identifier from data""" pass @abstractmethod def build_metadata(self, data: Dict[str, Any]) -> Dict: """Build platform-specific metadata""" pass def is_already_downloaded(self, identifier: str) -> bool: """Check if content is already downloaded""" with self.db.get_connection() as conn: cursor = conn.cursor() cursor.execute(''' SELECT 1 FROM downloads WHERE platform = ? AND metadata LIKE ? LIMIT 1 ''', (self.platform, f'%"{self._id_key()}": "{identifier}"%')) return cursor.fetchone() is not None @abstractmethod def _id_key(self) -> str: """Return the metadata key for identifier""" pass def record_download( self, identifier: str, source: str, **kwargs ) -> bool: """Record download with platform-specific data""" url = self._build_url(identifier, source, kwargs) metadata = self.build_metadata({ **kwargs, self._id_key(): identifier }) # Calculate file hash if provided file_hash = None if kwargs.get('file_path'): try: file_hash = UnifiedDatabase.get_file_hash(kwargs['file_path']) except Exception: pass return self.db.record_download( url=url, platform=self.platform, source=source, content_type=kwargs.get('content_type', 'post'), filename=kwargs.get('filename'), file_path=kwargs.get('file_path'), file_hash=file_hash, post_date=kwargs.get('post_date'), metadata=metadata ) @abstractmethod def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str: """Build URL for the content""" pass # Concrete implementations class FastDLDatabaseAdapter(BaseDatabaseAdapter): def __init__(self, unified_db: UnifiedDatabase): super().__init__(unified_db, 
'fastdl') def _id_key(self) -> str: return 'media_id' def get_identifier(self, data: Dict) -> str: return data.get('media_id', '') def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str: return kwargs.get('download_url') or f"instagram://{identifier}" def build_metadata(self, data: Dict) -> Dict: return { 'media_id': data.get('media_id'), 'source': 'fastdl', **{k: v for k, v in data.items() if k not in ['media_id', 'file_path']} } class TikTokDatabaseAdapter(BaseDatabaseAdapter): def __init__(self, unified_db: UnifiedDatabase): super().__init__(unified_db, 'tiktok') def _id_key(self) -> str: return 'video_id' def get_identifier(self, data: Dict) -> str: return data.get('video_id', '') def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str: return f"https://www.tiktok.com/@{source}/video/{identifier}" def build_metadata(self, data: Dict) -> Dict: return { 'video_id': data.get('video_id'), **{k: v for k, v in data.items() if k != 'video_id'} } class SnapchatDatabaseAdapter(BaseDatabaseAdapter): def __init__(self, unified_db: UnifiedDatabase): super().__init__(unified_db, 'snapchat') def _id_key(self) -> str: return 'story_id' def get_identifier(self, data: Dict) -> str: return data.get('story_id', '') def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str: return kwargs.get('url', f"snapchat://{identifier}") def build_metadata(self, data: Dict) -> Dict: return data.copy() # ... similar for other platforms ... ``` --- ## Summary These code examples provide concrete implementations for the major security, performance, and quality issues identified in the review. The fixes follow Python/TypeScript best practices and can be implemented incrementally. Start with security fixes (sections 1-5), then move to performance (sections 6-8), then code quality (section 9).