Files
media-downloader/docs/archive/CODE_REVIEW_FIX_EXAMPLES.md
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

24 KiB

Code Review - Specific Fix Examples

This document provides concrete code examples for implementing the recommended fixes from the comprehensive code review.

1. FIX: Token Exposure in URLs

Current Code (web/frontend/src/lib/api.ts:558-568)

/**
 * "Before" example: builds the thumbnail URL for a media file.
 *
 * The auth token read from localStorage is appended to the URL as a `token`
 * query parameter, which is the exposure this section sets out to remove.
 */
getMediaThumbnailUrl(filePath: string, mediaType: 'image' | 'video') {
    const token = localStorage.getItem('auth_token')
    // The parameter is omitted entirely when no token is stored.
    const tokenParam = token ? `&token=${encodeURIComponent(token)}` : ''
    return `${API_BASE}/media/thumbnail?file_path=${encodeURIComponent(filePath)}&media_type=${mediaType}${tokenParam}`
}
// Recommended fix: the backend issues a short-lived, single-use preview ticket,
// so the long-lived auth token never has to appear in a URL.
/** Request a preview ticket for `filePath` by POSTing to /media/preview-ticket. */
async getMediaPreviewTicket(filePath: string): Promise<{ticket: string}> {
    const payload = { file_path: filePath }
    return this.post('/media/preview-ticket', payload)
}

// Frontend uses ticket (short-lived, single-use)
/**
 * Build a thumbnail URL authorised by a preview ticket instead of the auth token.
 *
 * Resolves to '' when no auth token is stored. The method is async because
 * obtaining the ticket is a network round-trip, so callers must await the
 * result before assigning it to an `src` attribute.
 */
async getMediaThumbnailUrl(filePath: string, mediaType: 'image' | 'video'): Promise<string> {
    const token = localStorage.getItem('auth_token')
    if (!token) return ''

    // Request ticket instead of embedding token. The response is an object,
    // so the ticket string has to be unwrapped before it goes into the URL.
    const { ticket } = await this.getMediaPreviewTicket(filePath)
    const query = [
        `file_path=${encodeURIComponent(filePath)}`,
        `media_type=${mediaType}`,
        `ticket=${encodeURIComponent(ticket)}`,
    ].join('&')
    return `${API_BASE}/media/thumbnail?${query}`
}

// Always include Authorization header for critical operations
/**
 * Build the default request headers: JSON content type plus, when a token is
 * stored, a Bearer `Authorization` header.
 */
private getAuthHeaders(): HeadersInit {
    // Typed as a plain record: `HeadersInit` is a union that also admits
    // `Headers` and tuple arrays, so it cannot be indexed by key under strict
    // type-checking. A string record is still assignable to `HeadersInit`.
    const headers: Record<string, string> = {
        'Content-Type': 'application/json',
    }
    const token = localStorage.getItem('auth_token')
    if (token) {
        headers['Authorization'] = `Bearer ${token}`  // Use header, not URL param
    }
    return headers
}

Backend Implementation

# In api.py

@app.post("/api/media/preview-ticket")
async def create_preview_ticket(
    file_path: str,
    current_user: Dict = Depends(get_current_user)
) -> Dict:
    """Create a short-lived, single-use ticket for a media preview.

    The ticket is a random URL-safe string valid for five minutes. It is
    stored together with the requested path and the requesting user's name.

    Returns:
        ``{'ticket': <ticket string>}``.

    NOTE(review): a bare ``str`` parameter is normally read from the query
    string, while the frontend example sends ``file_path`` in a JSON body --
    confirm how the parameter is meant to be supplied.
    """
    import secrets
    import time

    now = time.time()

    # Purge expired tickets so the store does not grow without bound.
    # Assumes preview_tickets is a dict-like in-process store -- TODO confirm
    # if a Redis-backed store is used instead.
    expired = [key for key, data in preview_tickets.items() if data['expiry'] < now]
    for key in expired:
        del preview_tickets[key]

    ticket = secrets.token_urlsafe(32)

    # Store in Redis or in-memory cache
    preview_tickets[ticket] = {
        'file_path': file_path,
        'user': current_user['username'],
        'expiry': now + 300,  # 5 minutes
        'used': False
    }

    return {'ticket': ticket}

@app.get("/api/media/thumbnail")
async def get_thumbnail(
    file_path: str,
    media_type: str,
    ticket: Optional[str] = None,
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)
) -> StreamingResponse:
    """Serve a thumbnail, authenticated by Authorization header or preview ticket.

    The Authorization header is tried first. A ticket is only consulted (and
    consumed) when the header did not authenticate the request, so a stale
    ticket cannot reject an otherwise valid session.

    Raises:
        HTTPException: 401 when neither credential is valid, or when the ticket
            is expired or already used; 403 when the ticket was issued for a
            different file.
    """
    import time  # local import: the ticket-creation snippet only imports it function-locally

    auth_user = None

    # Try authorization header first
    if credentials:
        payload = app_state.auth.verify_session(credentials.credentials)
        if payload:
            auth_user = payload

    # Or use ticket
    if auth_user is None and ticket and ticket in preview_tickets:
        ticket_data = preview_tickets[ticket]
        if time.time() > ticket_data['expiry']:
            raise HTTPException(status_code=401, detail="Ticket expired")
        if ticket_data['used']:
            raise HTTPException(status_code=401, detail="Ticket already used")
        # A ticket only authorises the file it was issued for; without this
        # check any valid ticket would unlock every path.
        if ticket_data['file_path'] != file_path:
            raise HTTPException(status_code=403, detail="Ticket not valid for this file")
        # Mark as consumed before serving so the ticket stays single-use.
        preview_tickets[ticket]['used'] = True
        auth_user = {'username': ticket_data['user']}

    if not auth_user:
        raise HTTPException(status_code=401, detail="Not authenticated")

    # ... rest of implementation

2. FIX: Path Traversal Vulnerability

Problem Code (api.py file handling)

# UNSAFE - vulnerable to path traversal
# "Before" fragment: the client-supplied query parameter is used directly as a
# filesystem path, with no check that it stays inside an allowed directory.
file_path = request.query_params.get('file_path')
with open(file_path, 'rb') as f:  # Could be /etc/passwd!
    return FileResponse(f)
from pathlib import Path
import os

# Safe path validation utility
def validate_file_path(file_path: str, allowed_base: str = None) -> Path:
    """
    Validate that a file path is within the allowed directory.
    Prevents ../../../etc/passwd style attacks.

    Both paths are resolved (symlinks and ``..`` collapsed) before comparison.
    A relative ``file_path`` is resolved against the current working directory,
    not against ``allowed_base``.

    Args:
        file_path: Path requested by the client.
        allowed_base: Directory the path must live under; defaults to
            '/opt/media-downloader/downloads'.

    Returns:
        The resolved path of an existing regular file inside ``allowed_base``.

    Raises:
        HTTPException: 403 if the path is malformed, escapes the base directory
            or is not a regular file; 404 if it does not exist.
    """
    if allowed_base is None:
        allowed_base = '/opt/media-downloader/downloads'

    base_path = Path(allowed_base).resolve()

    # Resolving can itself fail on hostile input (embedded NUL byte, non-string
    # value, symlink loop). Treat those as denied rather than letting them
    # surface as an unhandled server error.
    try:
        requested_path = Path(file_path).resolve()
        requested_path.relative_to(base_path)
    except (TypeError, ValueError, OSError, RuntimeError):
        raise HTTPException(
            status_code=403,
            detail="Access denied - path traversal detected"
        ) from None

    # Check file exists
    if not requested_path.exists():
        raise HTTPException(status_code=404, detail="File not found")

    # Check it's a file, not directory
    if not requested_path.is_file():
        raise HTTPException(status_code=403, detail="Invalid file")

    return requested_path

# Safe endpoint implementation
@app.get("/api/media/preview")
async def get_media_preview(
    file_path: str,
    current_user: Dict = Depends(get_current_user)
) -> FileResponse:
    """Return the requested media file after validating its path.

    HTTP errors raised by the path validation are passed through unchanged;
    any other failure is logged and reported as a 500.
    """
    try:
        return FileResponse(validate_file_path(file_path))
    except HTTPException:
        # Validation already picked the right status code.
        raise
    except Exception as exc:
        logger.error(f"Error serving file: {exc}")
        raise HTTPException(status_code=500, detail="Error serving file")

3. FIX: CSRF Protection

Add CSRF Middleware

# In api.py

# NOTE(review): Starlette's own middleware package does not appear to ship a
# `csrf` module -- confirm which package is meant to provide CSRFMiddleware.
# If the third-party `starlette-csrf` package is intended, the import would be
# `from starlette_csrf import CSRFMiddleware`, and the keyword names and types
# below should be checked against its documentation (presumably `secret=` and
# compiled-regex `exempt_urls`) -- TODO confirm.
from starlette.middleware.csrf import CSRFMiddleware

# Register CSRF protection for the whole application.
app.add_middleware(
    CSRFMiddleware,
    secret_key=SESSION_SECRET_KEY,
    safe_methods=['GET', 'HEAD', 'OPTIONS'],  # methods passed to the middleware as "safe"
    exempt_urls=['/api/auth/login', '/api/auth/logout'],  # Public endpoints
)

Frontend Implementation

// web/frontend/src/lib/api.ts

/**
 * POST `data` as JSON to `endpoint` and return the parsed JSON response.
 *
 * Sends the auth headers plus an `X-CSRFToken` header. A 401 response
 * triggers `handleUnauthorized()`; every non-OK response then throws.
 */
async post<T>(endpoint: string, data?: any): Promise<T> {
    // Get CSRF token from cookie or meta tag
    const csrfToken = this.getCSRFToken()
    const headers = {
        ...this.getAuthHeaders(),
        'X-CSRFToken': csrfToken,  // Include CSRF token
    }
    const body = data ? JSON.stringify(data) : undefined

    const response = await fetch(`${API_BASE}${endpoint}`, { method: 'POST', headers, body })

    if (response.ok) {
        return response.json()
    }
    if (response.status === 401) {
        this.handleUnauthorized()
    }
    throw new Error(`API error: ${response.statusText}`)
}

/**
 * Return the CSRF token from the `csrf-token` meta tag, falling back to the
 * `csrftoken` cookie. Returns '' when neither is present.
 */
private getCSRFToken(): string {
    // Try to get from meta tag
    const meta = document.querySelector('meta[name="csrf-token"]')
    if (meta) {
        return meta.getAttribute('content') || ''
    }

    // Or from cookie
    const prefix = 'csrftoken='
    const entry = document.cookie.split('; ').find(c => c.startsWith(prefix))
    if (!entry) {
        return ''
    }

    // Take everything after the first '=': the value itself may contain '='
    // (e.g. base64 padding), which splitting on '=' would truncate.
    const raw = entry.slice(prefix.length)
    try {
        return decodeURIComponent(raw)
    } catch {
        // Malformed percent-encoding: fall back to the raw cookie value.
        return raw
    }
}

4. FIX: Subprocess Command Injection

Vulnerable Code (modules/tiktok_module.py:294)

# DANGEROUS - username not escaped
# "Before" example: the username is interpolated into a single command string.
username = "test'; rm -rf /; echo '"
output_dir = "/downloads"

# This could execute arbitrary commands!
# NOTE(review): the injection needs a shell to interpret `cmd`. As written here
# (a string and no shell=True), POSIX subprocess.run treats the whole string as
# the program name rather than running it through a shell. The real call site
# presumably passes shell=True -- confirm against modules/tiktok_module.py.
cmd = f"yt-dlp -o '%(title)s.%(ext)s' https://www.tiktok.com/@{username}"
result = subprocess.run(cmd, capture_output=True, text=True, cwd=output_dir)
import subprocess
import shlex
from typing import List

def safe_run_command(cmd: List[str], cwd: str = None, **kwargs) -> subprocess.CompletedProcess:
    """
    Safely run command with list-based arguments (prevents injection).
    Never use shell=True with user input.

    Args:
        cmd: Program and arguments as a list; no shell is involved.
        cwd: Working directory for the child process, or None to inherit.
        **kwargs: Extra ``subprocess.run`` options. These override the
            defaults used here (``capture_output=True``, ``text=True``,
            ``timeout=300``), so callers can e.g. pass a different timeout.

    Returns:
        The ``CompletedProcess``. A non-zero exit code is NOT an error here;
        callers must check ``returncode``.

    Raises:
        ValueError: if ``shell=True`` is requested, the command times out, or
            the process cannot be started.
    """
    if kwargs.get('shell'):
        raise ValueError("shell=True is not allowed")

    # Merge rather than hard-code the keywords: passing the same keyword both
    # explicitly and via **kwargs raises TypeError.
    options = {'capture_output': True, 'text': True, 'timeout': 300}
    options.update(kwargs)

    try:
        # Use list form - much safer than string form
        return subprocess.run(cmd, cwd=cwd, **options)
    except subprocess.TimeoutExpired as exc:
        raise ValueError("Command timed out") from exc
    except Exception as exc:
        raise ValueError(f"Command failed: {exc}") from exc

# Usage with validation
def download_tiktok_video(username: str, output_dir: str) -> bool:
    """Download a TikTok user's videos with yt-dlp, without shell involvement.

    Args:
        username: TikTok handle, with or without a leading '@'.
        output_dir: Directory yt-dlp runs in (and therefore writes to).

    Returns:
        True if yt-dlp exited with code 0, False on a non-zero exit or any
        error while running it.

    Raises:
        ValueError: if the username is empty, longer than 100 characters, or
            contains no usable characters once sanitised.
    """
    # Validate input
    if not username or len(username) > 100:
        raise ValueError("Invalid username")

    # Remove dangerous characters. The '@' is added by the URL template below,
    # so it is dropped here to avoid "@@name". '.' is kept because stripping it
    # would silently turn "user.name" into a different account.
    # NOTE(review): allowed set assumes handles are ASCII letters, digits,
    # '.', '_' and '-' -- confirm against TikTok's current rules.
    safe_username = ''.join(
        c for c in username.lstrip('@')
        if (c.isascii() and c.isalnum()) or c in '._-'
    )
    if not safe_username:
        # Nothing usable left; do not fall through to "https://www.tiktok.com/@".
        raise ValueError("Invalid username")

    # Build command as list (safer)
    cmd = [
        'yt-dlp',
        '-o', '%(title)s.%(ext)s',
        f'https://www.tiktok.com/@{safe_username}'
    ]

    try:
        result = safe_run_command(cmd, cwd=output_dir)
    except Exception as e:
        logger.error(f"Failed to download TikTok: {e}")
        return False

    if result.returncode != 0:
        logger.error(f"yt-dlp error: {result.stderr}")
        return False

    return True

5. FIX: Input Validation on Config

Current Vulnerable Code (api.py:349-351)

@app.put("/api/config")
async def update_config(
    config: ConfigUpdate,  # Raw dict, no validation
    current_user: Dict = Depends(get_current_user)
):
    """Update configuration

    "Before" example: the submitted ``config.config`` mapping is merged
    straight into ``app_state.config`` with no per-field validation.
    """
    app_state.config.update(config.config)
    return {"success": True}
from pydantic import BaseModel, Field, validator
from typing import Optional, Dict, Any

# Define validated config schemas
class PlatformConfig(BaseModel):
    """Validated per-platform settings.

    ``enabled`` defaults to True; the three numeric fields declare no default.
    """
    enabled: bool = True
    check_interval_hours: int = Field(gt=0, le=24)
    max_retries: int = Field(ge=1, le=10)
    timeout_seconds: int = Field(gt=0, le=3600)
    
    # NOTE(review): this re-checks the same 1-24 range that the Field()
    # constraint above already declares; it mainly supplies a custom message.
    @validator('check_interval_hours')
    def validate_interval(cls, v):
        if v < 1 or v > 24:
            raise ValueError('Interval must be 1-24 hours')
        return v

class MediaDownloaderConfig(BaseModel):
    """Validated general settings for the downloader."""
    download_path: str
    max_concurrent_downloads: int = Field(ge=1, le=20)
    enable_deduplication: bool = True
    enable_face_recognition: bool = False
    recycle_bin_enabled: bool = True
    recycle_bin_retention_days: int = Field(ge=1, le=365)
    
    # NOTE(review): repeats the 1-20 bound already declared on the field above.
    @validator('max_concurrent_downloads')
    def validate_concurrent(cls, v):
        if v < 1 or v > 20:
            raise ValueError('Max concurrent downloads must be 1-20')
        return v
    
    @validator('download_path')
    def validate_path(cls, v):
        # Touches the filesystem at validation time: the path must already
        # exist and be a directory on the machine running the API.
        from pathlib import Path
        p = Path(v)
        if not p.exists():
            raise ValueError('Download path does not exist')
        if not p.is_dir():
            raise ValueError('Download path must be a directory')
        return str(p)

class ConfigUpdate(BaseModel):
    """Request body for a configuration update.

    Every section is optional and defaults to None, so a request may carry
    any subset of them.
    """
    instagram: Optional[PlatformConfig] = None
    tiktok: Optional[PlatformConfig] = None
    forums: Optional[PlatformConfig] = None
    general: Optional[MediaDownloaderConfig] = None

# Safe endpoint with validation
@app.put("/api/config")
async def update_config(
    update: ConfigUpdate,  # Automatically validated by Pydantic
    current_user: Dict = Depends(get_current_user)
) -> Dict:
    """Apply a validated, partial configuration update.

    Only sections present in the request and not None are applied. Each one
    is saved through ``app_state.settings`` and then mirrored into
    ``app_state.config``.

    Returns:
        A dict with ``success``, ``message`` and ``updated_keys`` (the section
        names present in the request).

    Raises:
        HTTPException: 500 if saving or applying the configuration fails. The
            request body has already passed model validation by the time this
            function runs, so a failure here is not a client error.
    """
    username = current_user['username']

    # .dict() serialises nested models recursively, so every value here is
    # already a plain dict (or None) and must not be .dict()-ed again.
    config_dict = update.dict(exclude_unset=True)
    changes = {key: value for key, value in config_dict.items() if value is not None}

    # Log who made the change
    logger.info(f"User {username} updating config: {list(config_dict.keys())}")

    try:
        # Save to database first so the in-memory config never gets ahead of
        # what was actually persisted.
        for key, value in changes.items():
            app_state.settings.set(
                key,
                value,
                category=key,
                updated_by=username
            )

        # Merge with existing config
        for key, value in changes.items():
            app_state.config[key] = value
    except Exception as e:
        logger.error(f"Config update failed: {e}")
        raise HTTPException(
            status_code=500,
            detail="Failed to save configuration"
        ) from e

    return {
        "success": True,
        "message": "Configuration updated successfully",
        "updated_keys": list(config_dict.keys())
    }

6. FIX: JSON Metadata Search Performance

Current Inefficient Code (unified_database.py:576-590)

def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]:
    """Get download record by Instagram media ID

    "Before" example: searches the JSON text of the ``metadata`` column with
    two LIKE patterns. The excerpt ends after the query is executed; the
    fetch/return step is not shown here.
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()
        
        # This causes FULL TABLE SCAN on large datasets!
        # pattern1 matches the exact `"media_id": "<id>"` text; pattern2 is a
        # looser match: the id anywhere after the `"media_id"` key.
        # In LIKE patterns '%' and '_' inside media_id act as wildcards.
        pattern1 = f'%"media_id": "{media_id}"%'
        pattern2 = f'%"media_id"%{media_id}%'
        
        cursor.execute('''
            SELECT * FROM downloads
            WHERE platform = ?
            AND (metadata LIKE ? OR metadata LIKE ?)
            LIMIT 1
        ''', (platform, pattern1, pattern2))
# Recommended fix, option 1: add a dedicated, indexed media_id column (one-time schema change)
def _init_database(self):
    """Initialize database with optimized schema"""
    with self.get_connection() as conn:
        cursor = conn.cursor()
        
        # Add separate column for media_id (indexed)
        try:
            cursor.execute("ALTER TABLE downloads ADD COLUMN media_id TEXT")
        except sqlite3.OperationalError:
            pass  # Column already exists
        
        # Create efficient index
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_media_id_platform
            ON downloads(media_id, platform)
            WHERE media_id IS NOT NULL
        ''')
        conn.commit()

def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]:
    """Look up one download row by platform and the dedicated media_id column.

    Returns the selected columns as a dict, or None when nothing matches.
    The ``metadata`` value is returned as stored (not JSON-decoded).
    """
    with self.get_connection() as conn:
        cur = conn.cursor()

        # Equality match on the media_id column instead of a LIKE scan
        cur.execute('''
            SELECT id, url, platform, source, content_type,
                   filename, file_path, post_date, download_date,
                   file_size, file_hash, metadata
            FROM downloads
            WHERE platform = ? AND media_id = ?
            LIMIT 1
        ''', (platform, media_id))

        match = cur.fetchone()
        return dict(match) if match else None

def record_download(self, media_id: str = None, **kwargs):
    """Record download with media_id extracted to separate column

    Excerpt only: ``cursor`` and the other values bound below (``url_hash``,
    ``url``, ``platform``, ...) come from the elided "existing code" section.
    ``media_id`` is written to its own column in addition to the JSON-encoded
    ``metadata``.
    """
    # ... existing code ...
    cursor.execute('''
        INSERT INTO downloads (
            url_hash, url, platform, source, content_type,
            filename, file_path, file_size, file_hash,
            post_date, status, error_message, metadata, media_id
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        url_hash, url, platform, source, content_type,
        filename, file_path, file_size, file_hash,
        # Stored as an ISO-8601 string, or NULL when no post date is known
        post_date.isoformat() if post_date else None,
        status, error_message,
        json.dumps(metadata) if metadata else None,
        media_id  # Store separately for fast lookup
    ))
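# Recommended fix, option 2 (no schema change): match on SQLite's JSON_EXTRACT instead of LIKE.
# This gives an exact match on metadata.media_id, but it still has to examine every row for the
# platform unless an expression index on JSON_EXTRACT(metadata, '$.media_id') is also created.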
def get_download_by_media_id(self, media_id: str, platform: str = 'fastdl') -> Optional[Dict]:
    """Look up one download row whose JSON metadata has the given media_id.

    Returns the selected columns as a dict, with ``metadata`` decoded from
    JSON when possible (left as stored if decoding fails), or None when no
    row matches.
    """
    with self.get_connection() as conn:
        cur = conn.cursor()

        cur.execute('''
            SELECT id, url, platform, source, content_type,
                   filename, file_path, post_date, download_date,
                   file_size, file_hash, metadata
            FROM downloads
            WHERE platform = ?
            AND JSON_EXTRACT(metadata, '$.media_id') = ?
            LIMIT 1
        ''', (platform, media_id))

        match = cur.fetchone()
        if not match:
            return None

        record = dict(match)
        raw_metadata = record.get('metadata')
        if raw_metadata:
            # json.JSONDecodeError is a ValueError subclass, so it is covered.
            try:
                record['metadata'] = json.loads(raw_metadata)
            except (ValueError, TypeError):
                pass
        return record

7. FIX: Bare Exception Handlers

Problematic Code (fastdl_module.py, media-downloader.py)

except:  # Too broad!
    break
import sqlite3
import requests
from requests.exceptions import RequestException, Timeout, ConnectionError

# Be specific about which exceptions to catch
# NOTE: this fragment is meant to sit inside a retry/download loop that is not
# shown; the `continue` and `break` statements refer to that enclosing loop.
try:
    # ... code that might fail ...
    download_file(url)
    
# Timeout and ConnectionError (both imported from requests.exceptions above,
# so ConnectionError here is NOT the builtin) are subclasses of
# RequestException; listing them only documents intent.
except (RequestException, Timeout, ConnectionError) as e:
    # Handle network errors
    logger.warning(f"Network error downloading {url}: {e}")
    if isinstance(e, Timeout):
        # Retry with longer timeout
        # NOTE(review): nothing in this fragment raises the timeout or caps
        # the number of retries -- presumably handled by the enclosing loop.
        continue
    else:
        # Skip this file
        break

except sqlite3.OperationalError as e:
    # Handle database errors specifically
    if "database is locked" in str(e):
        logger.warning("Database locked, retrying...")
        # `time` is not among the imports shown with this snippet.
        time.sleep(1)
        continue
    else:
        logger.error(f"Database error: {e}")
        raise

# IOError is an alias of OSError in Python 3. requests' RequestException
# derives from IOError as well, so the network handler must stay above this one.
except (OSError, IOError) as e:
    # Handle file system errors
    logger.error(f"File system error: {e}")
    break

except Exception as e:
    # Only catch unexpected errors as last resort
    logger.error(f"Unexpected error: {type(e).__name__}: {e}", exc_info=True)
    break

8. FIX: Async File I/O

Current Blocking Code (web/backend/api.py)

# This blocks the async event loop!
@app.get("/api/media/thumbnail")
async def get_thumbnail(file_path: str):
    """"Before" example: thumbnail endpoint doing synchronous work in a coroutine.

    The file is opened and decoded with blocking calls directly inside the
    async handler. ``processed_image`` comes from the elided processing step.
    """
    # Synchronous file I/O blocks other requests
    with open(file_path, 'rb') as f:
        image = Image.open(f)
        # ... process image ...
        return FileResponse(processed_image)
import aiofiles
from PIL import Image
import io

@app.get("/api/media/thumbnail")
async def get_thumbnail(
    file_path: str,
    media_type: str,
    current_user: Dict = Depends(get_current_user_media)
) -> StreamingResponse:
    """Serve thumbnail efficiently without blocking

    The path is validated with ``validate_file_path`` (the utility from the
    path-traversal section), read with aiofiles, and the image work is pushed
    to the default executor so the event loop stays free.

    Raises:
        HTTPException: 403/404 from path validation, 404 if the file
            disappears before it is read, 500 if thumbnail creation fails.
    """
    import asyncio  # local import: not among the imports shown with this snippet

    # Reject anything outside the download root before touching the file.
    # Called outside the try block so its 403/404 responses are not turned
    # into a generic 500 by the catch-all below.
    safe_path = validate_file_path(file_path)

    try:
        # Use aiofiles for non-blocking file I/O
        async with aiofiles.open(safe_path, 'rb') as f:
            file_data = await f.read()

        # Offload CPU-bound image processing to thread pool.
        # get_running_loop() is the supported way to get the loop in a coroutine.
        loop = asyncio.get_running_loop()
        thumbnail = await loop.run_in_executor(
            None,  # Use default executor (ThreadPoolExecutor)
            _create_thumbnail,
            file_data,
            media_type
        )

        return StreamingResponse(
            io.BytesIO(thumbnail),
            media_type="image/jpeg"
        )

    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="File not found")
    except Exception as e:
        logger.error(f"Error creating thumbnail: {e}")
        raise HTTPException(status_code=500, detail="Error creating thumbnail")

def _create_thumbnail(file_data: bytes, media_type: str) -> bytes:
    """CPU-bound function to create thumbnail

    Decodes ``file_data`` as an image, shrinks it to fit within 200x200 and
    returns it encoded as JPEG. ``media_type`` is accepted for the caller's
    convenience but is not used here.

    Raises:
        Exception: any decoding or encoding error is logged and re-raised.
    """
    try:
        image = Image.open(io.BytesIO(file_data))
        image.thumbnail((200, 200))

        # JPEG cannot store alpha or palette data, so PNG/GIF/WebP sources in
        # modes such as RGBA, LA or P must be converted before saving.
        if image.mode not in ('RGB', 'L', 'CMYK'):
            image = image.convert('RGB')

        output = io.BytesIO()
        image.save(output, format='JPEG', quality=85)
        return output.getvalue()

    except Exception as e:
        logger.error(f"Thumbnail creation failed: {e}")
        raise

9. FIX: Adapter Duplication

Current Duplicated Code (unified_database.py:1708-2080)

# FastDLDatabaseAdapter
class FastDLDatabaseAdapter:
    def __init__(self, unified_db: UnifiedDatabase):
        self.db = unified_db
        self.platform = 'fastdl'
    
    def is_already_downloaded(self, media_id: str) -> bool:
        # ... 20+ lines of duplicate code ...
    
    def record_download(self, media_id: str, username: str, **kwargs):
        # ... 30+ lines of duplicate code ...

# TikTokDatabaseAdapter (similar structure)
# ToolzuDatabaseAdapter (similar structure)
# CoppermineDatabaseAdapter (similar structure)
# ... and more
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional

class BaseDatabaseAdapter(ABC):
    """Generic adapter for unified database compatibility

    Holds the shared "already downloaded?" and "record download" logic.
    Subclasses supply the platform name plus four hooks: the metadata key
    that holds the identifier, how to extract it, how to build the metadata
    dict, and how to build the content URL.
    """

    def __init__(self, unified_db: UnifiedDatabase, platform: str):
        self.db = unified_db
        self.platform = platform

    @abstractmethod
    def get_identifier(self, data: Dict[str, Any]) -> str:
        """Extract unique identifier from data"""

    @abstractmethod
    def build_metadata(self, data: Dict[str, Any]) -> Dict:
        """Build platform-specific metadata"""

    @abstractmethod
    def _id_key(self) -> str:
        """Return the metadata key for identifier"""

    @abstractmethod
    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        """Build URL for the content"""

    def is_already_downloaded(self, identifier: str) -> bool:
        """Check if content is already downloaded

        Looks for the ``"<key>": "<identifier>"`` pair inside the JSON text of
        the ``metadata`` column, restricted to this adapter's platform.
        """
        import json  # local import: only typing/abc imports are shown with this snippet

        # Build the needle the way json.dumps serialises a key/value pair, then
        # escape LIKE's wildcards. Without escaping, '_' and '%' in the key
        # (e.g. "media_id") or identifier match arbitrary characters and
        # produce false positives.
        needle = f'{json.dumps(self._id_key())}: {json.dumps(identifier)}'
        escaped = (
            needle.replace('\\', '\\\\')
            .replace('%', '\\%')
            .replace('_', '\\_')
        )

        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT 1 FROM downloads
                WHERE platform = ? AND metadata LIKE ? ESCAPE '\\'
                LIMIT 1
            ''', (self.platform, f'%{escaped}%'))
            return cursor.fetchone() is not None

    def record_download(
        self,
        identifier: str,
        source: str,
        **kwargs
    ) -> bool:
        """Record download with platform-specific data

        Args:
            identifier: Platform-specific content id; also stored in the
                metadata under ``_id_key()``.
            source: Account or source name the content came from.
            **kwargs: Optional ``content_type`` (default 'post'), ``filename``,
                ``file_path`` and ``post_date``, plus any extra values the
                subclass wants in the metadata.

        Returns:
            Whatever the underlying ``record_download`` of the unified
            database returns.
        """
        url = self._build_url(identifier, source, kwargs)
        metadata = self.build_metadata({
            **kwargs,
            self._id_key(): identifier
        })

        # Calculate file hash if provided
        file_hash = None
        if kwargs.get('file_path'):
            try:
                file_hash = UnifiedDatabase.get_file_hash(kwargs['file_path'])
            except Exception:
                # Deliberately best-effort: a missing hash must not stop the
                # download from being recorded.
                pass

        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=source,
            content_type=kwargs.get('content_type', 'post'),
            filename=kwargs.get('filename'),
            file_path=kwargs.get('file_path'),
            file_hash=file_hash,
            post_date=kwargs.get('post_date'),
            metadata=metadata
        )

# Concrete implementations
class FastDLDatabaseAdapter(BaseDatabaseAdapter):
    """Adapter for the 'fastdl' platform; content is identified by ``media_id``."""

    def __init__(self, unified_db: UnifiedDatabase):
        super().__init__(unified_db, 'fastdl')

    def _id_key(self) -> str:
        return 'media_id'

    def get_identifier(self, data: Dict) -> str:
        return data.get('media_id', '')

    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        # Use the real download URL when one was given, else a synthetic one.
        download_url = kwargs.get('download_url')
        return download_url if download_url else f"instagram://{identifier}"

    def build_metadata(self, data: Dict) -> Dict:
        # Fixed keys first, then every other value except the file path.
        metadata = {
            'media_id': data.get('media_id'),
            'source': 'fastdl',
        }
        metadata.update(
            (key, value)
            for key, value in data.items()
            if key not in ('media_id', 'file_path')
        )
        return metadata

class TikTokDatabaseAdapter(BaseDatabaseAdapter):
    """Adapter for the 'tiktok' platform; content is identified by ``video_id``."""

    def __init__(self, unified_db: UnifiedDatabase):
        super().__init__(unified_db, 'tiktok')

    def _id_key(self) -> str:
        return 'video_id'

    def get_identifier(self, data: Dict) -> str:
        return data.get('video_id', '')

    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        # The source is the account name; the identifier is the video id.
        profile = f"https://www.tiktok.com/@{source}"
        return f"{profile}/video/{identifier}"

    def build_metadata(self, data: Dict) -> Dict:
        # video_id goes first, followed by every other value as supplied.
        metadata = {'video_id': data.get('video_id')}
        metadata.update(
            (key, value) for key, value in data.items() if key != 'video_id'
        )
        return metadata

class SnapchatDatabaseAdapter(BaseDatabaseAdapter):
    """Adapter for the 'snapchat' platform; content is identified by ``story_id``."""

    def __init__(self, unified_db: UnifiedDatabase):
        super().__init__(unified_db, 'snapchat')

    def _id_key(self) -> str:
        return 'story_id'

    def get_identifier(self, data: Dict) -> str:
        return data.get('story_id', '')

    def _build_url(self, identifier: str, source: str, kwargs: Dict) -> str:
        # `or` rather than a .get() default: a caller passing url=None or ''
        # must still get the synthetic URL, matching the FastDL adapter.
        return kwargs.get('url') or f"snapchat://{identifier}"

    def build_metadata(self, data: Dict) -> Dict:
        # Shallow copy so later changes by the caller do not alter the record.
        return data.copy()

# ... similar for other platforms ...

Summary

These code examples provide concrete implementations for the major security, performance, and quality issues identified in the review. The fixes follow Python/TypeScript best practices and can be implemented incrementally.

Start with the security fixes (sections 1-5), then move to performance (sections 6 and 8), then code quality (sections 7 and 9).