# media-downloader/modules/paid_content/file_host_downloader.py

"""
Download files from external file hosting services
Supports: Bunkr, Pixeldrain, Gofile, Cyberdrop
"""

import asyncio
import re
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, parse_qs

import aiohttp

from modules.base_module import LoggingMixin, RateLimitMixin


class FileHostDownloader(LoggingMixin, RateLimitMixin):
    """
    Download files from various file hosting services.
    Used for manual import of PPV content.

    A URL is mapped to a host key via SUPPORTED_HOSTS, and the download is
    then dispatched to the method named ``_download_<host key>``. Every
    handler returns a dict with at least the keys 'success', 'files' and
    'error'.
    """

    # Host key -> domains served by that host. The key doubles as the suffix
    # of the handler method name (download_url looks up `_download_<key>`),
    # so adding a host here requires adding a matching method.
    SUPPORTED_HOSTS = {
        'bunkr': ['bunkr.sk', 'bunkr.si', 'bunkr.la', 'bunkrr.ru', 'bunkr.ph', 'bunkr.is', 'bunkr.ac', 'bunkr.cr'],
        'pixeldrain': ['pixeldrain.com'],
        'gofile': ['gofile.io'],
        'cyberdrop': ['cyberdrop.me', 'cyberdrop.to', 'cyberdrop.cc'],
        'fileditch': ['fileditchfiles.me', 'fileditch.me'],
    }

    # Bunkr CDN servers (food-themed) - try in order.
    # Candidate URLs are built as https://<cdn>/<file uuid>; the list is used
    # both when resolving a direct URL from a file page and as a fallback
    # while downloading.
    BUNKR_CDNS = [
        'i-soup.bunkr.ru',
        'i-burger.bunkr.ru',
        'i-pizza.bunkr.ru',
        'i-taco.bunkr.ru',
        'i-fries.bunkr.ru',
        'i-hotdog.bunkr.ru',
        'i-nachos.bunkr.ru',
        'i-sushi.bunkr.ru',
        'i-ramen.bunkr.ru',
        'i-curry.bunkr.ru',
        'i-kebab.bunkr.ru',
        'i-pasta.bunkr.ru',
        'i-steak.bunkr.ru',
        'i-salad.bunkr.ru',
        'i-sandwich.bunkr.ru',
        'i-waffle.bunkr.ru',
        'i-pancake.bunkr.ru',
        'i-donut.bunkr.ru',
        'i-cookie.bunkr.ru',
        'i-cake.bunkr.ru',
        'i-bacon.bunkr.ru',
        'i-cheese.bunkr.ru',
        'i-chicken.bunkr.ru',
        'i-fish.bunkr.ru',
        'i-noodle.bunkr.ru',
        'i-rice.bunkr.ru',
        'i-bread.bunkr.ru',
        'burger.bunkr.ru',
        'pizza.bunkr.ru',
        'milkshake.bunkr.ru',
    ]

    def __init__(self, log_callback=None, progress_callback=None):
        """
        Set up logging, rate limiting and the optional progress hook.

        Args:
            log_callback: Forwarded to the logging mixin's `_init_logger`.
            progress_callback: Optional callable invoked during downloads
                with (downloaded_bytes, total_bytes, filename).
        """
        self._init_logger('PaidContent', log_callback, default_module='FileHost')
        # NOTE(review): delay units are defined by RateLimitMixin - presumably seconds; confirm.
        self._init_rate_limiter(min_delay=1, max_delay=3)
        self.progress_callback = progress_callback  # Called with (downloaded_bytes, total_bytes, filename)

    def detect_host(self, url: str) -> Optional[str]:
        """Detect which file host a URL belongs to"""
        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower().replace('www.', '')

            for host, domains in self.SUPPORTED_HOSTS.items():
                if domain in domains:
                    return host
        except Exception:
            pass
        return None

    def is_supported_url(self, url: str) -> bool:
        """Check if URL is from a supported file host"""
        return self.detect_host(url) is not None

    async def download_url(self, url: str, save_dir: Path) -> Dict:
        """
        Download file(s) from URL
        Returns: {'success': bool, 'files': [paths], 'error': str}
        """
        host = self.detect_host(url)
        if not host:
            return {'success': False, 'files': [], 'error': 'Unsupported host'}

        handler = getattr(self, f'_download_{host}', None)
        if not handler:
            return {'success': False, 'files': [], 'error': f'No handler for {host}'}

        try:
            save_dir = Path(save_dir)
            save_dir.mkdir(parents=True, exist_ok=True)
            return await handler(url, save_dir)
        except Exception as e:
            self.log(f"Error downloading from {host}: {e}", 'error')
            return {'success': False, 'files': [], 'error': str(e)}

    async def _download_pixeldrain(self, url: str, save_dir: Path) -> Dict:
        """Download from Pixeldrain"""
        # Extract file ID from URL
        # Format: https://pixeldrain.com/u/FILEID or /l/LISTID

        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')

        if len(path_parts) < 2:
            return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}

        url_type, file_id = path_parts[0], path_parts[1]

        files = []
        timeout = aiohttp.ClientTimeout(total=300)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            if url_type == 'u':
                # Single file
                api_url = f"https://pixeldrain.com/api/file/{file_id}/info"
                async with session.get(api_url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                    info = await resp.json()

                download_url = f"https://pixeldrain.com/api/file/{file_id}"
                filename = info.get('name', f'{file_id}.bin')
                save_path = save_dir / self._sanitize_filename(filename)

                await self._download_file(session, download_url, save_path)
                files.append(str(save_path))

            elif url_type == 'l':
                # List (album)
                api_url = f"https://pixeldrain.com/api/list/{file_id}"
                async with session.get(api_url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                    data = await resp.json()

                for i, item in enumerate(data.get('files', [])):
                    self._delay_between_items()
                    item_id = item['id']
                    filename = item.get('name', f'{i:03d}_{item_id}.bin')
                    download_url = f"https://pixeldrain.com/api/file/{item_id}"
                    save_path = save_dir / self._sanitize_filename(filename)

                    try:
                        await self._download_file(session, download_url, save_path)
                        files.append(str(save_path))
                    except Exception as e:
                        self.log(f"Failed to download {filename}: {e}", 'warning')

        return {'success': True, 'files': files, 'error': None}

    async def _download_gofile(self, url: str, save_dir: Path) -> Dict:
        """Download from Gofile"""
        # Extract content ID from URL
        # Format: https://gofile.io/d/CONTENTID

        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')

        if len(path_parts) < 2 or path_parts[0] != 'd':
            return {'success': False, 'files': [], 'error': 'Invalid Gofile URL'}

        content_id = path_parts[1]

        files = []
        timeout = aiohttp.ClientTimeout(total=300)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Create guest account token (POST request required since API change)
            async with session.post('https://api.gofile.io/accounts') as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': 'Failed to get Gofile token'}
                account_data = await resp.json()
                if account_data.get('status') != 'ok':
                    return {'success': False, 'files': [], 'error': f"Gofile API error: {account_data.get('status')}"}
                token = account_data.get('data', {}).get('token')

            if not token:
                return {'success': False, 'files': [], 'error': 'No Gofile token received'}

            # Get content info
            # Gofile requires x-website-token header (changed from query param in 2024)
            headers = {
                'Authorization': f'Bearer {token}',
                'x-website-token': '4fd6sg89d7s6',
            }
            api_url = f"https://api.gofile.io/contents/{content_id}"

            async with session.get(api_url, headers=headers) as resp:
                if resp.status == 401:
                    return {'success': False, 'files': [], 'error': 'Gofile authentication failed - websiteToken may have changed'}
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to get content: {resp.status}'}
                content_data = await resp.json()

            if content_data.get('status') == 'error-notPremium':
                return {'success': False, 'files': [], 'error': 'Gofile requires premium account for API access - try direct download'}
            if content_data.get('status') != 'ok':
                error = content_data.get('data', {}).get('message', content_data.get('status', 'Unknown error'))
                return {'success': False, 'files': [], 'error': error}

            contents = content_data.get('data', {}).get('children', {})

            for item_id, item in contents.items():
                if item.get('type') != 'file':
                    continue

                self._delay_between_items()
                download_url = item.get('link')
                filename = item.get('name', f'{item_id}.bin')
                save_path = save_dir / self._sanitize_filename(filename)

                try:
                    await self._download_file(session, download_url, save_path, headers=headers)
                    files.append(str(save_path))
                except Exception as e:
                    self.log(f"Failed to download {filename}: {e}", 'warning')

        return {'success': True, 'files': files, 'error': None}

    async def _download_cyberdrop(self, url: str, save_dir: Path) -> Dict:
        """
        Download an album or a single file from Cyberdrop.

        Supported URL formats:
            https://cyberdrop.me/a/ALBUMID  (album: file links are scraped
                                             from the album page HTML)
            anything else                   (treated as a direct file URL)

        Returns:
            {'success': bool, 'files': [paths], 'error': str or None}, plus a
            'failed' list when individual album files could not be saved.
            'success' is True only when at least one file was written, the
            same convention the Bunkr handler uses.

        Raises:
            Exception: when a single-file download fails (download_url turns
                this into an error result).
        """
        files = []
        failed = []
        timeout = aiohttp.ClientTimeout(total=300)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            parsed = urlparse(url)
            path_parts = parsed.path.strip('/').split('/')

            if len(path_parts) >= 2 and path_parts[0] == 'a':
                # Album
                async with session.get(url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                    html = await resp.text()

                # Parse file links from HTML
                # Pattern: href="https://fs-XXX.cyberdrop.to/FILE"
                cdn_pattern = r'href="(https://[a-z0-9-]+\.cyberdrop\.[a-z]+/[^"]+)"'
                # The same link can occur more than once in the markup;
                # de-duplicate (keeping page order) so each file is fetched
                # only once.
                file_urls = list(dict.fromkeys(re.findall(cdn_pattern, html)))

                for i, file_url in enumerate(file_urls):
                    self._delay_between_items()
                    filename = file_url.split('/')[-1].split('?')[0]
                    if not filename:
                        filename = f'{i:03d}.bin'
                    save_path = save_dir / self._sanitize_filename(filename)

                    try:
                        await self._download_file(session, file_url, save_path)
                        files.append(str(save_path))
                    except Exception as e:
                        self.log(f"Failed to download {filename}: {e}", 'warning')
                        failed.append(filename)

            else:
                # Single file or direct CDN link
                filename = parsed.path.split('/')[-1] or 'download.bin'
                save_path = save_dir / self._sanitize_filename(filename)

                await self._download_file(session, url, save_path)
                files.append(str(save_path))

        result = {'success': len(files) > 0, 'files': files, 'error': None}
        if failed:
            result['failed'] = failed
            result['error'] = f'{len(failed)} files failed to download'
        elif not files:
            result['error'] = 'No files found in Cyberdrop album'
        return result

    async def _download_bunkr(self, url: str, save_dir: Path) -> Dict:
        """
        Download an album or a single file from Bunkr with CDN fallback.

        Supported URL formats:
            https://bunkr.sk/a/ALBUMID  (album: /f/ links are scraped from
                                         the album page HTML)
            anything else               (treated as a single file page,
                                         e.g. /f/FILEID or /v/VIDEOID)

        Each file page is resolved to a direct URL and file UUID; the UUID
        lets the downloader retry on the other hosts in BUNKR_CDNS.

        Returns:
            {'success': bool, 'files': [paths], 'error': str or None}, plus a
            'failed' list when individual album files could not be saved.
            'success' is True only when at least one file was written.

        Raises:
            Exception: when a single-file download fails (download_url turns
                this into an error result).
        """
        files = []
        failed = []
        timeout = aiohttp.ClientTimeout(total=600)  # Increased for large files

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            parsed = urlparse(url)
            path_parts = parsed.path.strip('/').split('/')

            if len(path_parts) >= 2 and path_parts[0] == 'a':
                # Album page
                async with session.get(url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                    html = await resp.text()

                # Parse file links from HTML - look for /f/ links
                file_pattern = r'href="(/f/[^"]+)"'
                # The same /f/ link can occur more than once in the markup;
                # de-duplicate (keeping page order) so each file is resolved
                # and downloaded only once.
                matches = list(dict.fromkeys(re.findall(file_pattern, html)))

                self.log(f"Found {len(matches)} files in Bunkr album", 'info')

                for i, file_path in enumerate(matches):
                    self._delay_between_items()

                    # Make absolute URL
                    file_url = f"https://{parsed.netloc}{file_path}"

                    # Get direct download URL and file UUID
                    direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, file_url)
                    if not direct_url:
                        self.log(f"Could not get direct URL for {file_url}", 'warning')
                        failed.append(file_url)
                        continue

                    filename = direct_url.split('/')[-1].split('?')[0]
                    if not filename:
                        filename = f'{i:03d}.bin'
                    save_path = save_dir / self._sanitize_filename(filename)

                    try:
                        await self._download_file(session, direct_url, save_path,
                                                  try_cdn_fallback=True, file_uuid=file_uuid)
                        files.append(str(save_path))
                        self.log(f"Downloaded: {filename}", 'info')
                    except Exception as e:
                        self.log(f"Failed to download {filename}: {e}", 'warning')
                        failed.append(filename)

            else:
                # Single file page
                direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, url)
                if not direct_url:
                    return {'success': False, 'files': [], 'error': 'Could not get direct download URL'}

                filename = direct_url.split('/')[-1].split('?')[0] or 'download.bin'
                save_path = save_dir / self._sanitize_filename(filename)

                await self._download_file(session, direct_url, save_path,
                                          try_cdn_fallback=True, file_uuid=file_uuid)
                files.append(str(save_path))

        result = {'success': len(files) > 0, 'files': files, 'error': None}
        if failed:
            result['failed'] = failed
            result['error'] = f'{len(failed)} files failed to download'
        elif not files:
            # Only reachable for an album page without any /f/ links; give
            # the caller a reason instead of success=False with error=None.
            result['error'] = 'No files found in Bunkr album'
        return result

    async def _get_bunkr_direct_url_with_uuid(self, session: aiohttp.ClientSession, page_url: str) -> tuple:
        """
        Resolve a Bunkr file page to a (direct_url, file_uuid) pair.

        The page HTML is searched for a "<uuid>.<ext>" file identifier and
        for a link to a *.bunkr.ru host. The first embedded link that answers
        a HEAD probe with 200 wins; otherwise, when a UUID was found, each
        host in BUNKR_CDNS is probed in order.

        Returns:
            (url, uuid) on success, (None, uuid) when no candidate responded,
            and (None, None) when the page could not be fetched or an error
            occurred.
        """
        uuid_regexes = (
            r'data-v="([a-f0-9-]{36}\.[a-z0-9]+)"',
            r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\.[a-z0-9]+)',
        )
        embedded_link_regexes = (
            r'href="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            r'src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            r'data-src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
        )

        try:
            async with session.get(page_url) as response:
                if response.status != 200:
                    return None, None
                page = await response.text()

            # First UUID pattern that matches wins; the generator stops
            # searching as soon as one does.
            uuid_hit = next(filter(None, (re.search(rx, page) for rx in uuid_regexes)), None)
            file_uuid = uuid_hit.group(1) if uuid_hit else None

            # Prefer a CDN URL already embedded in the page.
            for rx in embedded_link_regexes:
                hit = re.search(rx, page)
                if hit is None:
                    continue
                candidate = hit.group(1)
                if await self._check_url_accessible(session, candidate):
                    return candidate, file_uuid

            # Nothing usable in the page and no UUID to build CDN URLs from.
            if not file_uuid:
                return None, file_uuid

            self.log(f"Found file UUID: {file_uuid}, trying CDNs...", 'debug')
            for cdn in self.BUNKR_CDNS:
                cdn_url = f"https://{cdn}/{file_uuid}"
                if await self._check_url_accessible(session, cdn_url):
                    self.log(f"Found working CDN: {cdn}", 'debug')
                    return cdn_url, file_uuid

            return None, file_uuid
        except Exception as exc:
            self.log(f"Error getting Bunkr direct URL: {exc}", 'warning')
            return None, None

    async def _check_url_accessible(self, session: aiohttp.ClientSession, url: str) -> bool:
        """
        Probe a URL with a HEAD request (following redirects, 10 s budget).

        Returns True only for a final status of 200; any error while probing
        counts as not accessible.
        """
        try:
            probe_timeout = aiohttp.ClientTimeout(total=10)
            async with session.head(url, allow_redirects=True, timeout=probe_timeout) as response:
                reachable = response.status == 200
        except Exception:
            return False
        return reachable

    async def _download_fileditch(self, url: str, save_dir: Path) -> Dict:
        """Download from FileDitch (Cloudflare-protected)"""
        from modules.cloudflare_handler import CloudflareHandler

        # Extract filename from URL: file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4
        parsed = urlparse(url)
        params = parse_qs(parsed.query)
        file_path = params.get('f', [''])[0]
        if not file_path:
            return {'success': False, 'files': [], 'error': 'Invalid FileDitch URL - no file parameter'}

        filename = file_path.rsplit('/', 1)[-1] if '/' in file_path else file_path
        if not filename:
            return {'success': False, 'files': [], 'error': 'Could not extract filename from URL'}

        save_path = save_dir / self._sanitize_filename(filename)

        # Use CloudflareHandler to get cookies via FlareSolverr
        cf_handler = CloudflareHandler(
            module_name='FileDitch',
            flaresolverr_url='http://localhost:8191/v1',
            flaresolverr_enabled=True,
        )

        self.log('Bypassing Cloudflare for FileDitch via FlareSolverr...', 'info')
        if not cf_handler.get_cookies_via_flaresolverr(url):
            return {'success': False, 'files': [], 'error': 'Failed to bypass Cloudflare for FileDitch'}

        cookies = cf_handler.get_cookies_dict()
        user_agent = cf_handler.get_user_agent()

        # Download with the obtained cookies
        timeout = aiohttp.ClientTimeout(total=3600)
        cookie_jar = aiohttp.CookieJar()
        headers = {'User-Agent': user_agent or 'Mozilla/5.0'}

        async with aiohttp.ClientSession(timeout=timeout, cookie_jar=cookie_jar, headers=headers) as session:
            # Set cookies on session
            for name, value in cookies.items():
                cookie_jar.update_cookies({name: value}, response_url=url)

            await self._download_file(session, url, save_path, headers=headers)

        return {'success': True, 'files': [str(save_path)], 'error': None}

    async def _download_file(self, session: aiohttp.ClientSession, url: str,
                            save_path: Path, headers: Optional[Dict] = None,
                            try_cdn_fallback: bool = False, file_uuid: Optional[str] = None) -> None:
        """
        Download a single file with streaming and optional CDN fallback.

        The body is streamed into a sibling "<name>.part" file which is
        renamed onto `save_path` only after the transfer completed, so a
        failed or cancelled download never leaves a truncated file under the
        final name and never destroys a file that already exists there.

        Args:
            session: Open aiohttp session used for the request(s).
            url: Primary download URL.
            save_path: Final destination; parent directories are created.
            headers: Optional per-request headers.
            try_cdn_fallback: When True and `file_uuid` is given, every host
                in BUNKR_CDNS is tried in order after `url`.
            file_uuid: Bunkr "<uuid>.<ext>" identifier used to build the
                fallback URLs.

        Raises:
            Exception: when none of the candidate URLs produced the file.
        """
        save_path.parent.mkdir(parents=True, exist_ok=True)
        part_path = save_path.with_name(save_path.name + '.part')

        urls_to_try = [url]

        # If CDN fallback enabled and we have a file UUID, add alternate CDNs
        if try_cdn_fallback and file_uuid:
            for cdn in self.BUNKR_CDNS:
                alt_url = f"https://{cdn}/{file_uuid}"
                if alt_url != url:
                    urls_to_try.append(alt_url)

        last_error = None
        for try_url in urls_to_try:
            try:
                self.log(f"Downloading: {save_path.name} from {try_url[:60]}...", 'info')
                async with session.get(try_url, headers=headers) as resp:
                    if resp.status != 200:
                        last_error = f"HTTP {resp.status}"
                        self.log(f"Download failed: {save_path.name} - {last_error}", 'warning')
                        continue

                    try:
                        downloaded = await self._stream_to_part_file(resp, part_path, save_path.name)
                        # Atomic on the same filesystem: the final name only
                        # ever points at a complete file.
                        part_path.replace(save_path)
                    except BaseException:
                        # BaseException so task cancellation also cleans up.
                        try:
                            part_path.unlink()
                        except OSError:
                            pass  # Nothing was written, or it is already gone
                        raise

                self.log(f"Downloaded: {save_path.name} ({downloaded // (1024*1024)}MB)", 'info')
                return  # Success
            except Exception as e:
                # Some exceptions (e.g. timeouts) have an empty message; fall
                # back to the type name so the error stays informative.
                last_error = str(e) or type(e).__name__
                self.log(f"Download error: {save_path.name} - {last_error}", 'warning')
                # Fall through to the next candidate URL

        raise Exception(f"Download failed after trying {len(urls_to_try)} URLs: {last_error}")

    async def _stream_to_part_file(self, resp, part_path: Path, display_name: str) -> int:
        """Stream a response body into `part_path`, reporting progress; return the byte count."""
        total_size = int(resp.headers.get('content-length', 0))
        downloaded = 0
        last_log_pct = 0

        with open(part_path, 'wb') as f:
            async for chunk in resp.content.iter_chunked(65536):  # 64KB chunks
                f.write(chunk)
                downloaded += len(chunk)

                # Percentages are only meaningful when the server sent a length
                if total_size <= 0:
                    continue

                # Log and callback progress every 2%
                pct = int(downloaded * 100 / total_size)
                if pct < last_log_pct + 2:
                    continue

                self.log(f"  {display_name}: {pct}% ({downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB)", 'info')
                last_log_pct = pct
                if self.progress_callback:
                    try:
                        self.progress_callback(downloaded, total_size, display_name)
                    except Exception:
                        pass  # Don't fail download due to callback error

        return downloaded

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem"""
        if not filename:
            return 'download.bin'
        # Remove/replace invalid characters
        filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
        filename = filename.strip('. ')
        return filename or 'download.bin'

    @classmethod
    def get_supported_domains(cls) -> List[str]:
        """Get list of all supported domains"""
        domains = []
        for host_domains in cls.SUPPORTED_HOSTS.values():
            domains.extend(host_domains)
        return domains