# ohm_streaming/app/downloaders/video_players/vidzy.py

"""Vidzy video hosting service downloader"""
import logging
import asyncio
import re
import subprocess
import os
from typing import Optional
from .base import BaseVideoPlayer
from bs4 import BeautifulSoup
from app.utils import sanitize_filename

logger = logging.getLogger(__name__)


class VidzyDownloader(BaseVideoPlayer):
    """
    Downloader for Vidzy video hosting service.

    Vidzy is a video hosting platform used by various anime streaming sites.
    Uses heavy JavaScript obfuscation, so Playwright is required.
    """

    def can_handle(self, url: str) -> bool:
        """Check if this downloader can handle the given URL"""
        return "vidzy" in url.lower()

    async def get_download_link(
        self,
        url: str,
        target_filename: Optional[str] = None
    ) -> tuple[str, str]:
        """
        Extract direct download link and filename from Vidzy URL.

        Playwright extraction is attempted first; static HTML parsing is the
        fallback. When the extracted URL is an HLS playlist (contains
        ``.m3u8``), the stream is downloaded and remuxed to MP4 with ffmpeg
        and the first tuple element is the local file path instead of a
        remote URL.

        Args:
            url: The Vidzy video player URL. May be given in the
                pipe-separated form ``video_url|anime_url|episode_title``,
                in which case only the first field is used.
            target_filename: Optional filename override

        Returns:
            Tuple of (download_url, filename). ``filename`` always ends in
            ``.mp4`` (case-insensitive check).

        Raises:
            ValueError: If no video URL can be extracted or any step fails.
                The original exception is attached as ``__cause__``.
        """
        try:
            # Extract actual Vidzy URL from pipe-separated format if present
            # Format: video_url|anime_url|episode_title
            if '|' in url:
                url = url.split('|')[0].strip()
                logger.debug(f"Extracted Vidzy URL from pipe format: {url}")

            logger.info(f"Fetching Vidzy URL: {url}")

            # Try using Playwright first (Vidzy uses heavy JS obfuscation)
            video_url = await self._extract_with_playwright(url)

            if not video_url:
                # Fallback to static HTML parsing
                logger.warning("Playwright extraction failed, trying static parsing...")
                video_url = await self._extract_static(url)

            if not video_url:
                raise ValueError("Could not extract video URL from Vidzy")

            logger.info(f"Successfully extracted Vidzy URL: {video_url[:100]}...")

            # Generate filename
            if target_filename:
                filename = sanitize_filename(target_filename)
            else:
                # Last path segment of the media URL, without the query string
                filename = video_url.split('/')[-1].split('?')[0]
                if not filename or len(filename) < 5:
                    filename = "vidzy_video.mp4"
                filename = sanitize_filename(filename)

            # Ensure .mp4 extension; compare case-insensitively so that
            # "Episode.MP4" is not turned into "Episode.MP4.mp4".
            if not filename.lower().endswith('.mp4'):
                filename += '.mp4'

            # Check if it's an M3U8 playlist (HLS stream)
            if '.m3u8' in video_url.lower():
                logger.info("Detected M3U8 stream, will download with ffmpeg")

                # Headers forwarded to ffmpeg for the playlist/segment requests
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Referer': 'https://vidzy.org/',
                }

                mp4_path = await self._download_m3u8_as_mp4(video_url, filename, headers)
                logger.info(f"Successfully extracted Vidzy download link: {filename}")
                return mp4_path, filename

            # It's a direct MP4 link
            logger.info(f"Successfully extracted Vidzy download link: {filename}")
            return video_url, filename

        except Exception as e:
            logger.error(f"Error extracting Vidzy download link: {e}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Failed to extract download link from Vidzy: {str(e)}") from e

    async def _extract_with_playwright(self, url: str) -> Optional[str]:
        """
        Extract video URL using Playwright with network interception.

        Loads the page in headless Chromium, records every request whose URL
        looks like media (``.m3u8``, ``.mp4`` or ``master``), and additionally
        asks the page's JavaScript context for the player source.

        Args:
            url: The Vidzy player page to open.

        Returns:
            The best candidate URL: a ``master.m3u8`` playlist if one was
            seen, otherwise any ``.m3u8`` URL, otherwise the first captured
            URL. ``None`` if Playwright is not installed, nothing was
            captured, or any error occurred (errors are logged, not raised).
        """
        # Raw string: the embedded JS regex uses backslashes (\s, \/, \.) that
        # are not valid Python escape sequences in a normal string literal.
        extract_js = r"""
            () => {
                // Check if videojs is available
                if (typeof videojs !== 'undefined' && videojs.players) {
                    // Get all players
                    const players = Object.values(videojs.players);
                    if (players.length > 0) {
                        const player = players[0];

                        // Try to get source from player
                        if (player.currentSrc()) {
                            return player.currentSrc();
                        }

                        // Try to get sources array
                        if (player.currentSources() && player.currentSources().length > 0) {
                            return player.currentSources()[0].src;
                        }
                    }
                }

                // Check all video elements
                const videos = document.querySelectorAll('video');
                for (let v of videos) {
                    if (v.src) {
                        return v.src;
                    }
                    const sources = v.querySelectorAll('source');
                    for (let s of sources) {
                        if (s.src) {
                            return s.src;
                        }
                    }
                }

                // Look for sources in scripts (VideoJS config)
                const scripts = document.querySelectorAll('script');
                for (let script of scripts) {
                    const text = script.textContent;
                    // Look for sources array with .m3u8 URLs
                    const sourcesMatch = text.match(/sources\s*:\s*\[\s*\{\s*src\s*:\s*['"](https?:\/\/[^'"]+\.m3u8[^'"]*)['"]/i);
                    if (sourcesMatch) {
                        return sourcesMatch[1];
                    }
                }

                return null;
            }
        """

        try:
            from playwright.async_api import async_playwright

            logger.info("Launching Playwright for Vidzy...")

            video_urls = []

            async def handle_request(route):
                """Record media-looking request URLs, then let the request proceed."""
                req_url = route.request.url
                lowered = req_url.lower()

                # Look for video files (HLS streams and MP4s)
                if any(ext in lowered for ext in ['.m3u8', '.mp4', 'master']):
                    if 'vidzy' not in lowered or 'master' in lowered:
                        logger.info(f"🎥 Captured video URL: {req_url[:100]}...")
                        video_urls.append(req_url)

                await route.continue_()

            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
                )

                # try/finally so the browser is closed even if a step below raises
                try:
                    context = await browser.new_context(
                        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
                    )

                    page = await context.new_page()

                    # Set up request interception
                    await page.route('**', handle_request)

                    logger.info("Navigating to Vidzy page...")

                    try:
                        await page.goto(url, wait_until='domcontentloaded', timeout=30000)
                    except Exception as e:
                        # Best effort: requests captured before the failure are still usable
                        logger.warning(f"Navigation warning: {e}")

                    # Wait for page to load and initialize player
                    logger.info("Waiting for video player to load...")
                    await asyncio.sleep(5)

                    # Try JavaScript extraction from VideoJS player
                    try:
                        js_result = await page.evaluate(extract_js)

                        if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
                            logger.info("Found video URL via JavaScript evaluation")
                            video_urls.append(js_result)
                    except Exception as e:
                        logger.warning(f"JS extraction error: {e}")

                    # Wait more for network requests
                    await asyncio.sleep(3)
                finally:
                    await browser.close()

            # De-duplicate while keeping capture order
            unique_urls = list(dict.fromkeys(video_urls))
            if not unique_urls:
                logger.warning("❌ No video URLs found via Playwright")
                return None

            logger.info(f"✅ Found {len(unique_urls)} video URL(s)")

            # Prefer the master playlist, then any other HLS playlist
            for marker in ('master.m3u8', '.m3u8'):
                for candidate in unique_urls:
                    if marker in candidate:
                        logger.info(f"Using HLS playlist: {candidate[:100]}...")
                        return candidate

            # Fall back to first URL
            return unique_urls[0]

        except ImportError:
            logger.warning("Playwright not installed, falling back to static parsing")
            return None
        except Exception as e:
            logger.warning(f"Playwright error: {e}")
            return None

    async def _extract_static(self, url: str) -> Optional[str]:
        """
        Static HTML parsing fallback.

        Fetches the page with ``self.client`` and looks for a media URL in
        three places, in order: the first ``<video src>``, the first
        ``<source src>``, then a regex scan of the raw HTML for ``.m3u8`` or
        ``/master`` URLs.

        Args:
            url: The Vidzy player page to fetch. Also used as the base for
                resolving relative ``src`` attributes.

        Returns:
            An absolute media URL, or ``None`` if nothing was found or any
            error occurred (errors are logged, not raised).
        """
        from urllib.parse import urljoin

        try:
            response = await self.client.get(url)
            response.raise_for_status()
            html = response.text

            soup = BeautifulSoup(html, 'lxml')

            # Methods 1 and 2: src attribute of the first <video>, then <source>
            for tag_name in ('video', 'source'):
                tag = soup.find(tag_name)
                src = tag.get('src') if tag else None
                if src:
                    logger.info(f"Found video source from <{tag_name}> tag")
                    # src may be relative ("/v/a.mp4") or protocol-relative
                    # ("//cdn/a.m3u8"); absolute URLs pass through unchanged.
                    return urljoin(url, src)

            # Method 3: Search entire HTML for .m3u8 URLs (Vidzy uses HLS)
            html_patterns = [
                r'(https?://[^\s<>"\'`]+\.m3u8[^\s<>"\'`]*)',
                r'(https?://[^\s<>"\'`]+/master[^\s<>"\'`]*)',
            ]

            for pattern in html_patterns:
                for match in re.findall(pattern, html):
                    # Filter out obvious false positives:
                    # accept URLs with 'master' or from video hosts
                    if 'master' in match.lower() or any(host in match for host in ['hls', 'video', 'stream']):
                        logger.info(f"Found video URL in HTML: {match[:100]}...")
                        return match

            logger.warning("Static parsing failed to find video URL")
            return None

        except Exception as e:
            logger.warning(f"Static parsing error: {e}")
            return None

    async def _download_m3u8_as_mp4(self, m3u8_url: str, filename: str, headers: dict, download_dir: str = "downloads") -> str:
        """
        Download M3U8 stream and convert to MP4 using ffmpeg.

        ffmpeg copies the streams without re-encoding (``-c copy``) and its
        output is written to ``<output>.log`` next to the MP4. The blocking
        ffmpeg call runs in a worker thread so the event loop stays
        responsive while the download is in progress.

        Args:
            m3u8_url: URL of the HLS playlist.
            filename: Name of the MP4 file to create inside ``download_dir``.
            headers: HTTP headers to send with ffmpeg's requests.
            download_dir: Target directory; created if missing.

        Returns:
            Path of the created MP4 file. A file larger than 1000 bytes is
            accepted even when ffmpeg exited non-zero or timed out.

        Raises:
            Exception: If ffmpeg is not installed, times out without usable
                output, or produces no usable output file.
        """
        # Create downloads directory if it doesn't exist
        os.makedirs(download_dir, exist_ok=True)

        output_path = os.path.join(download_dir, filename)

        def _usable_output_size() -> Optional[int]:
            """Return the output size in bytes if it exceeds 1000 bytes, else None."""
            if os.path.exists(output_path):
                size = os.path.getsize(output_path)
                if size > 1000:  # At least 1KB
                    return size
            return None

        cmd = ['ffmpeg']
        if headers:
            # ffmpeg takes ONE -headers value with CRLF-terminated lines;
            # repeating the option would keep only the last header.
            header_blob = ''.join(f'{key}: {value}\r\n' for key, value in headers.items())
            cmd += ['-headers', header_blob]
        cmd += [
            '-i', m3u8_url,
            '-c', 'copy',
            '-bsf:a', 'aac_adtstoasc',
            '-y',
            output_path
        ]

        try:
            logger.info("Downloading M3U8 with ffmpeg...")
            logger.info(f"URL: {m3u8_url[:80]}...")
            logger.info(f"Output: {output_path}")

            # Run ffmpeg without capturing output to avoid buffering issues
            # Use a log file instead
            log_path = output_path + '.log'
            with open(log_path, 'w') as log_file:
                # subprocess.run blocks; run it off the event loop thread.
                result = await asyncio.to_thread(
                    subprocess.run,
                    cmd,
                    stdout=log_file,
                    stderr=log_file,
                    timeout=600,  # 10 minutes for very long videos
                )

            if result.returncode != 0:
                logger.warning(f"ffmpeg exited with code {result.returncode}, see {log_path}")

            # Check if file was created even if ffmpeg had issues
            file_size = _usable_output_size()
            if file_size is not None:
                logger.info(f"✅ Download complete: {file_size / (1024*1024):.2f} MB")
                return output_path

            # If we get here, something went wrong
            raise Exception("FFmpeg failed - no output file created")

        except subprocess.TimeoutExpired as e:
            # Check if file was created despite timeout
            file_size = _usable_output_size()
            if file_size is not None:
                logger.warning(f"⚠️  Timeout but file created: {file_size / (1024*1024):.2f} MB")
                return output_path
            raise Exception("FFmpeg timeout (10 minutes) - video too large") from e

        except FileNotFoundError as e:
            raise Exception("ffmpeg not found - please install ffmpeg: apt install ffmpeg") from e
        except Exception as e:
            raise Exception(f"Error downloading M3U8: {str(e)}") from e