feat: Add series TV support with Vidzy HLS downloads and duplicate prevention

Major improvements:
- Series TV support via FS7 provider with dedicated search endpoint
- Vidzy downloader now uses Playwright to get past JS obfuscation and ffmpeg to download HLS streams
- Episode files are now named properly (Series Title - Episode X) instead of master.m3u8.mp4
- Duplicate download prevention: checks existing tasks before creating new ones
- Removed host preference system in favor of intelligent URL-based detection

Technical changes:
- Vidzy: Added Playwright extraction and M3U8→MP4 conversion with ffmpeg
- FS7: Episodes now use pipe format (video_url|series_url|episode_title)
- DownloadManager: Extract target_filename from pipe URL and prevent duplicates
- UI: New Series tab with search, recommendations, and releases sections
- Anime-Sama: Removed hardcoded host preferences, uses site's URL order

Generated with [Claude Code](https://claude.com/claude-code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <[email protected]>
Co-Authored-By: Happy <[email protected]>
2026-01-25 20:42:29 +00:00
parent 5e50081b58
commit c1c31d7685
17 changed files with 938 additions and 219 deletions
@@ -30,6 +30,25 @@ class DownloadManager:
        return list(self.tasks.values())

    def create_task(self, request: DownloadRequest) -> DownloadTask:
+        # Check for existing tasks with the same URL
+        # Extract actual URL from pipe-separated format
+        url_to_check = request.url.split('|')[0] if '|' in request.url else request.url
+
+        # Look for existing non-failed tasks with the same URL
+        for existing_task in self.tasks.values():
+            existing_url = existing_task.url.split('|')[0] if '|' in existing_task.url else existing_task.url
+
+            # If same URL and task is not failed/cancelled/completed
+            if existing_url == url_to_check and existing_task.status not in [
+                DownloadStatus.FAILED,
+                DownloadStatus.CANCELLED,
+                DownloadStatus.COMPLETED
+            ]:
+                logger.info(f"Duplicate download detected: {url_to_check[:80]}...")
+                logger.info(f"Returning existing task: {existing_task.id}")
+                return existing_task
+
+        # No duplicate found, create new task
        task_id = str(uuid.uuid4())
        task = DownloadTask(
            id=task_id,
@@ -103,7 +122,17 @@ class DownloadManager:

                # Get downloader and extract link
                downloader = get_downloader(task.url)
-                download_url, filename = await downloader.get_download_link(task.url)
+
+                # Extract episode title from pipe-separated URL if present
+                # Format: video_url|anime_page_url|episode_title
+                target_filename = None
+                if '|' in task.url:
+                    parts = task.url.split('|')
+                    if len(parts) >= 3:
+                        target_filename = parts[2].strip()
+                        logger.debug(f"Extracted target filename from pipe: {target_filename}")
+
+                download_url, filename = await downloader.get_download_link(task.url, target_filename)

                logger.info(f"Download URL: {download_url[:100] if len(download_url) > 100 else download_url}")
                logger.debug(f"Downloader filename: {filename}")
@@ -424,7 +424,7 @@ class AnimeSamaDownloader(BaseAnimeSite):
            filename = target_filename if target_filename else temp_filename

            print(f"[ANIME-SAMA] Got video: {filename}")
-            print(f"[ANIME-SAMA] Video URL: {video_url[:100]}...")
+            print(f"[ANIME-SAMA] Video URL: {video_url[:100] if video_url else 'None'}...")

            # Return the direct video URL
            # The download_manager will handle the actual download
@@ -432,7 +432,8 @@ class AnimeSamaDownloader(BaseAnimeSite):

        except Exception as e:
            print(f"[ANIME-SAMA] Lpayer extraction error: {e}")
-            raise Exception(f"Error extracting from lpayer: {str(e)}")
+            # Re-raise with clearer message
+            raise Exception(f"Lpayer player not supported - this video host requires manual download. Try another host (VidMoly, SendVid, Sibnet). Error: {str(e)}")

    async def _extract_from_player(self, player_url: str) -> str | None:
        """Try to extract direct video URL from player iframe"""
@@ -783,7 +784,8 @@ class AnimeSamaDownloader(BaseAnimeSite):

                        print(f"[ANIME-SAMA] Detected format {'A (source-based)' if is_format_a else 'B (episode-based)'} - eps1 has {len(eps1_urls)} URLs")

-                        host_preference = ['sibnet.ru', 'vidmoly', 'sendvid', 'lpayer']
+                        # No more host preference! Just collect all available URLs for each episode
+                        # The download system will automatically detect and use the appropriate downloader
                        all_episodes_by_number = {}

                        if is_format_a:
@@ -797,48 +799,36 @@ class AnimeSamaDownloader(BaseAnimeSite):
                                    if episode_num not in all_episodes_by_number:
                                        all_episodes_by_number[episode_num] = []

-                                    # Determine host preference score (lower = better)
-                                    host_score = len(host_preference)
-                                    for i, host in enumerate(host_preference):
-                                        if host in url.lower():
-                                            host_score = i
-                                            break
-
-                                    all_episodes_by_number[episode_num].append((host_score, url))
+                                    all_episodes_by_number[episode_num].append(url)
                        else:
                            # Format B: Each epsX is an episode, containing multiple sources
                            for eps_num, urls_text in eps_matches:
                                episode_num = str(eps_num).zfill(2)
                                episode_urls = re.findall(r"'(https?://[^']+)'", urls_text)

-                                for url in episode_urls:
-                                    if episode_num not in all_episodes_by_number:
-                                        all_episodes_by_number[episode_num] = []
+                                if episode_num not in all_episodes_by_number:
+                                    all_episodes_by_number[episode_num] = []

-                                    # Determine host preference score (lower = better)
-                                    host_score = len(host_preference)
-                                    for i, host in enumerate(host_preference):
-                                        if host in url.lower():
-                                            host_score = i
-                                            break
+                                all_episodes_by_number[episode_num].extend(episode_urls)

-                                    all_episodes_by_number[episode_num].append((host_score, url))
-
-                        # For each episode, use the best available URL (lowest score = best host)
+                        # For each episode, use the first available URL
+                        # (they are usually already in order of preference on the site)
                        for episode_num in sorted(all_episodes_by_number.keys()):
-                            sorted_urls = sorted(all_episodes_by_number[episode_num], key=lambda x: x[0])
-                            best_url = sorted_urls[0][1]  # Get the URL with lowest score (best host)
+                            available_urls = all_episodes_by_number[episode_num]

+                            # Use the first available URL (the site usually lists them in preference order)
+                            episode_url = available_urls[0]
                            episode_title = f'Episode {episode_num}'
-                            combined_url = f"{best_url}|{anime_url}|{episode_title}"
+                            combined_url = f"{episode_url}|{anime_url}|{episode_title}"

                            episodes.append({
                                'episode': episode_num,
                                'url': combined_url,
-                                'title': episode_title
+                                'title': episode_title,
+                                'available_hosts': len(available_urls)  # Store count of available hosts
                            })

-                        print(f"[ANIME-SAMA] Found {len(episodes)} episodes (prioritizing {host_preference})")
+                        print(f"[ANIME-SAMA] Found {len(episodes)} episodes")
                        return episodes

                except Exception as e:
@@ -89,6 +89,10 @@ class FS7Downloader(BaseSeriesSite):
                        continue
                    title = text

+                # Clean up title: remove "affiche" suffix and clean extra whitespace
+                title = re.sub(r'\s+affiche$', '', title, flags=re.IGNORECASE).strip()
+                title = re.sub(r'\s+', ' ', title)  # Normalize whitespace
+
                # Extract cover image
                img = item.find('img')
                cover_image = img.get('src', '') if img else ''
@@ -135,6 +139,12 @@ class FS7Downloader(BaseSeriesSite):
            soup = BeautifulSoup(html, 'lxml')
            episodes = []

+            # Get series title for episode naming
+            title_elem = soup.find('h1')
+            series_title = title_elem.get_text(strip=True) if title_elem else "Series"
+            # Clean up title: remove "affiche" suffix
+            series_title = re.sub(r'\s+affiche$', '', series_title, flags=re.IGNORECASE).strip()
+
            # FS7 stores episode data in JavaScript div elements
            # Format: <div data-ep="1" data-vidzy="..." data-uqload="..." data-netu="..." data-voe="..."></div>
            episode_divs = soup.find_all('div', attrs={'data-ep': True})
@@ -144,17 +154,28 @@ class FS7Downloader(BaseSeriesSite):

                # Try different video players in order of preference
                video_url = None
+                host_name = None
                for player in ['data-vidzy', 'data-uqload', 'data-voe', 'data-netu']:
                    player_url = div.get(player, '').strip()
                    if player_url:
                        video_url = player_url
-                        logger.debug(f"Found episode {ep_num} on {player}")
+                        # Extract host name from attribute name
+                        host_name = player.replace('data-', '').title()
+                        logger.debug(f"Found episode {ep_num} on {host_name}")
                        break

                if video_url and ep_num:
+                    # Create episode title for filename
+                    episode_title = f"{series_title} - Episode {ep_num}"
+
+                    # Use pipe-separated format: video_url|anime_url|episode_title
+                    combined_url = f"{video_url}|{anime_url}|{episode_title}"
+
                    episodes.append({
                        'episode': ep_num,
-                        'url': video_url
+                        'url': combined_url,
+                        'title': episode_title,
+                        'host': host_name or 'Unknown'
                    })

            # Sort by episode number
@@ -193,6 +214,9 @@ class FS7Downloader(BaseSeriesSite):
            title = soup.find('h1')
            title = title.get_text(strip=True) if title else "Unknown"

+            # Clean up title: remove "affiche" suffix
+            title = re.sub(r'\s+affiche$', '', title, flags=re.IGNORECASE).strip()
+
            # Extract description/synopsis
            description_elem = soup.find('div', class_='full-text')
            description = description_elem.get_text(strip=True) if description_elem else ""
@@ -1,5 +1,9 @@
 """Vidzy video hosting service downloader"""
 import logging
+import asyncio
+import re
+import subprocess
+import os
 from typing import Optional
 from .base import BaseVideoPlayer
 from bs4 import BeautifulSoup
@@ -13,6 +17,7 @@ class VidzyDownloader(BaseVideoPlayer):
    Downloader for Vidzy video hosting service.

    Vidzy is a video hosting platform used by various anime streaming sites.
+    Uses heavy JavaScript obfuscation, so Playwright is required.
    """

    def can_handle(self, url: str) -> bool:
@@ -35,9 +40,206 @@ class VidzyDownloader(BaseVideoPlayer):
            Tuple of (download_url, filename)
        """
        try:
+            # Extract actual Vidzy URL from pipe-separated format if present
+            # Format: video_url|anime_url|episode_title
+            if '|' in url:
+                url = url.split('|')[0].strip()
+                logger.debug(f"Extracted Vidzy URL from pipe format: {url}")
+
            logger.info(f"Fetching Vidzy URL: {url}")

-            # Fetch the page
+            # Try using Playwright first (Vidzy uses heavy JS obfuscation)
+            video_url = await self._extract_with_playwright(url)
+
+            if not video_url:
+                # Fallback to static HTML parsing
+                logger.warning("Playwright extraction failed, trying static parsing...")
+                video_url = await self._extract_static(url)
+
+            if not video_url:
+                raise ValueError(f"Could not extract video URL from Vidzy")
+
+            logger.info(f"Successfully extracted Vidzy URL: {video_url[:100]}...")
+
+            # Generate filename
+            if target_filename:
+                filename = sanitize_filename(target_filename)
+            else:
+                # Try to extract filename from URL
+                filename = video_url.split('/')[-1].split('?')[0]
+                if not filename or len(filename) < 5:
+                    filename = "vidzy_video.mp4"
+                filename = sanitize_filename(filename)
+
+            # Ensure .mp4 extension
+            if not filename.endswith('.mp4'):
+                filename += '.mp4'
+
+            # Check if it's an M3U8 playlist (HLS stream)
+            if '.m3u8' in video_url:
+                logger.info(f"Detected M3U8 stream, will download with ffmpeg")
+
+                # Download and convert M3U8 to MP4 directly
+                headers = {
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+                    'Referer': 'https://vidzy.org/',
+                }
+
+                mp4_path = await self._download_m3u8_as_mp4(video_url, filename, headers)
+                logger.info(f"Successfully extracted Vidzy download link: {filename}")
+                return mp4_path, filename
+
+            # It's a direct MP4 link
+            logger.info(f"Successfully extracted Vidzy download link: {filename}")
+            return video_url, filename
+
+        except Exception as e:
+            logger.error(f"Error extracting Vidzy download link: {e}")
+            raise ValueError(f"Failed to extract download link from Vidzy: {str(e)}")
+
+    async def _extract_with_playwright(self, url: str) -> Optional[str]:
+        """Extract the video URL by loading the page in headless Chromium.
+
+        Candidate URLs are gathered from intercepted network requests and
+        from a JavaScript probe of the loaded page (VideoJS players,
+        <video>/<source> elements, inline script configuration).
+
+        Args:
+            url: Vidzy page URL to load.
+
+        Returns:
+            The preferred candidate (a master.m3u8 playlist, then any .m3u8,
+            then the first URL captured), or None when nothing was found,
+            Playwright is not installed, or an error occurred.
+        """
+        try:
+            from playwright.async_api import async_playwright
+
+            logger.info("Launching Playwright for Vidzy...")
+
+            captured_urls = []
+
+            async with async_playwright() as p:
+                browser = await p.chromium.launch(
+                    headless=True,
+                    args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
+                )
+
+                # Close the browser even if navigation or evaluation raises.
+                try:
+                    context = await browser.new_context(
+                        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
+                    )
+
+                    page = await context.new_page()
+
+                    # Record every request that looks like a video resource,
+                    # then let the request proceed untouched.
+                    async def handle_request(route):
+                        req_url = route.request.url
+                        lowered = req_url.lower()
+
+                        # Look for video files (HLS streams and MP4s)
+                        if any(ext in lowered for ext in ['.m3u8', '.mp4', 'master']):
+                            if 'vidzy' not in lowered or 'master' in lowered:
+                                logger.info(f"🎥 Captured video URL: {req_url[:100]}...")
+                                captured_urls.append(req_url)
+
+                        await route.continue_()
+
+                    await page.route('**', handle_request)
+
+                    logger.info("Navigating to Vidzy page...")
+
+                    # A navigation error is not fatal: requests captured so
+                    # far may already contain the stream URL.
+                    try:
+                        await page.goto(url, wait_until='domcontentloaded', timeout=30000)
+                    except Exception as e:
+                        logger.warning(f"Navigation warning: {e}")
+
+                    # Wait for page to load and initialize player
+                    logger.info("Waiting for video player to load...")
+                    await asyncio.sleep(5)
+
+                    # Try JavaScript extraction from VideoJS player
+                    try:
+                        js_result = await page.evaluate("""
+                        () => {
+                            // Check if videojs is available
+                            if (typeof videojs !== 'undefined' && videojs.players) {
+                                // Get all players
+                                const players = Object.values(videojs.players);
+                                if (players.length > 0) {
+                                    const player = players[0];
+
+                                    // Try to get source from player
+                                    if (player.currentSrc()) {
+                                        return player.currentSrc();
+                                    }
+
+                                    // Try to get sources array
+                                    if (player.currentSources() && player.currentSources().length > 0) {
+                                        return player.currentSources()[0].src;
+                                    }
+                                }
+                            }
+
+                            // Check all video elements
+                            const videos = document.querySelectorAll('video');
+                            for (let v of videos) {
+                                if (v.src) {
+                                    return v.src;
+                                }
+                                const sources = v.querySelectorAll('source');
+                                for (let s of sources) {
+                                    if (s.src) {
+                                        return s.src;
+                                    }
+                                }
+                            }
+
+                            // Look for sources in scripts (VideoJS config)
+                            const scripts = document.querySelectorAll('script');
+                            for (let script of scripts) {
+                                const text = script.textContent;
+                                // Look for sources array with .m3u8 URLs
+                                const sourcesMatch = text.match(/sources\s*:\s*\[\s*\{\s*src\s*:\s*['"](https?:\/\/[^'"]+\.m3u8[^'"]*)['"]/i);
+                                if (sourcesMatch) {
+                                    return sourcesMatch[1];
+                                }
+                            }
+
+                            return null;
+                        }
+                    """)
+
+                        if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
+                            logger.info(f"Found video URL via JavaScript evaluation")
+                            captured_urls.append(js_result)
+                    except Exception as e:
+                        logger.warning(f"JS extraction error: {e}")
+
+                    # Wait more for network requests
+                    await asyncio.sleep(3)
+                finally:
+                    await browser.close()
+
+            # dict.fromkeys de-duplicates while keeping capture order.
+            unique_urls = list(dict.fromkeys(captured_urls))
+            if not unique_urls:
+                logger.warning("❌ No video URLs found via Playwright")
+                return None
+
+            logger.info(f"✅ Found {len(unique_urls)} video URL(s)")
+
+            best_url = self._select_video_url(unique_urls)
+            if best_url and '.m3u8' in best_url:
+                logger.info(f"Using HLS playlist: {best_url[:100]}...")
+            return best_url
+
+        except ImportError:
+            logger.warning("Playwright not installed, falling back to static parsing")
+            return None
+        except Exception as e:
+            logger.warning(f"Playwright error: {e}")
+            return None
+
+    @staticmethod
+    def _select_video_url(candidates: list) -> Optional[str]:
+        """Pick the best URL: master.m3u8 first, then any .m3u8, then the first."""
+        if not candidates:
+            return None
+
+        # The master playlist references every quality variant, so it beats
+        # a single variant playlist that may have been captured earlier.
+        for marker in ('master.m3u8', '.m3u8'):
+            for candidate in candidates:
+                if marker in candidate:
+                    return candidate
+
+        return candidates[0]
+
+    async def _extract_static(self, url: str) -> Optional[str]:
+        """Static HTML parsing fallback"""
+        try:
            response = await self.client.get(url)
            response.raise_for_status()
            html = response.text
@@ -47,65 +249,96 @@ class VidzyDownloader(BaseVideoPlayer):
            # Method 1: Look for video source in <video> tag
            video_tag = soup.find('video')
            if video_tag and video_tag.get('src'):
-                download_url = video_tag['src']
                logger.info(f"Found video source from <video> tag")
-            else:
-                # Method 2: Look for source in <source> tag
-                source_tag = soup.find('source')
-                if source_tag and source_tag.get('src'):
-                    download_url = source_tag['src']
-                    logger.info(f"Found video source from <source> tag")
-                else:
-                    # Method 3: Look for video URL in JavaScript
-                    # Vidzy often stores the video URL in a JavaScript variable
-                    scripts = soup.find_all('script')
-                    for script in scripts:
-                        if script.string:
-                            # Look for patterns like 'file:"URL"' or 'file: "URL"'
-                            import re
-                            patterns = [
-                                r'file\s*:\s*["\']([^"\']+\.mp4[^"\']*)["\']',
-                                r'source\s*:\s*["\']([^"\']+\.mp4[^"\']*)["\']',
-                                r'videoUrl\s*:\s*["\']([^"\']+)["\']',
-                                r'"url"\s*:\s*["\']([^"\']+\.mp4[^"\']*)["\']',
-                            ]
-                            for pattern in patterns:
-                                match = re.search(pattern, script.string)
-                                if match:
-                                    download_url = match.group(1)
-                                    logger.info(f"Found video source from JavaScript")
-                                    break
-                            if 'download_url' in locals():
-                                break
+                return video_tag['src']

-                    if 'download_url' not in locals():
-                        raise ValueError("Could not find video URL in page")
+            # Method 2: Look for source in <source> tag
+            source_tag = soup.find('source')
+            if source_tag and source_tag.get('src'):
+                logger.info(f"Found video source from <source> tag")
+                return source_tag['src']

-            # Ensure URL is absolute
-            if not download_url.startswith('http'):
-                if download_url.startswith('//'):
-                    download_url = 'https:' + download_url
-                else:
-                    from urllib.parse import urljoin
-                    download_url = urljoin(url, download_url)
+            # Method 3: Search entire HTML for .m3u8 URLs (Vidzy uses HLS)
+            html_patterns = [
+                r'(https?://[^\s<>"\'`]+\.m3u8[^\s<>"\'`]*)',
+                r'(https?://[^\s<>"\'`]+/master[^\s<>"\'`]*)',
+            ]

-            # Generate filename
-            if target_filename:
-                filename = sanitize_filename(target_filename)
-            else:
-                # Try to extract filename from URL
-                filename = download_url.split('/')[-1].split('?')[0]
-                if not filename or len(filename) < 5:
-                    filename = "vidzy_video.mp4"
-                filename = sanitize_filename(filename)
+            for pattern in html_patterns:
+                matches = re.findall(pattern, html)
+                if matches:
+                    # Filter out obvious false positives
+                    for match in matches:
+                        # Accept URLs with 'master' or from video hosts
+                        if 'master' in match.lower() or any(host in match for host in ['hls', 'video', 'stream']):
+                            logger.info(f"Found video URL in HTML: {match[:100]}...")
+                            return match

-            # Ensure .mp4 extension
-            if not filename.endswith('.mp4'):
-                filename += '.mp4'
-
-            logger.info(f"Successfully extracted Vidzy download link: {filename}")
-            return download_url, filename
+            logger.warning("Static parsing failed to find video URL")
+            return None

        except Exception as e:
-            logger.error(f"Error extracting Vidzy download link: {e}")
-            raise ValueError(f"Failed to extract download link from Vidzy: {str(e)}")
+            logger.warning(f"Static parsing error: {e}")
+            return None
+
+    async def _download_m3u8_as_mp4(self, m3u8_url: str, filename: str, headers: dict, download_dir: str = "downloads") -> str:
+        """Download an HLS (M3U8) stream and remux it into an MP4 with ffmpeg.
+
+        Args:
+            m3u8_url: URL of the HLS playlist handed to ffmpeg as input.
+            filename: Name of the output file created inside ``download_dir``.
+            headers: HTTP headers ffmpeg should send with its requests.
+            download_dir: Directory for the output file; created if missing.
+
+        Returns:
+            Path of the MP4 file that was written.
+
+        Raises:
+            Exception: If ffmpeg is not installed, times out without leaving
+                a usable file, or produces no usable output.
+        """
+        # Create downloads directory if it doesn't exist
+        os.makedirs(download_dir, exist_ok=True)
+
+        output_path = os.path.join(download_dir, filename)
+        log_path = output_path + '.log'
+
+        # ffmpeg keeps only the last occurrence of -headers, so every header
+        # has to be passed in a single CRLF-terminated string.
+        header_blob = ''.join(f'{key}: {value}\r\n' for key, value in headers.items())
+        header_args = ['-headers', header_blob] if header_blob else []
+
+        cmd = [
+            'ffmpeg',
+            *header_args,
+            '-i', m3u8_url,
+            '-c', 'copy',
+            '-bsf:a', 'aac_adtstoasc',
+            '-y',
+            output_path
+        ]
+
+        def output_size() -> int:
+            # 0 when the file is missing, so callers need a single comparison.
+            return os.path.getsize(output_path) if os.path.exists(output_path) else 0
+
+        def run_ffmpeg() -> int:
+            # Send ffmpeg output to a log file instead of a pipe to avoid
+            # buffering issues.
+            with open(log_path, 'w') as log_file:
+                completed = subprocess.run(
+                    cmd,
+                    stdout=log_file,
+                    stderr=log_file,
+                    timeout=600  # 10 minutes for very long videos
+                )
+            return completed.returncode
+
+        # Drop any file left by an earlier run: ffmpeg would overwrite it
+        # anyway (-y), and a stale file must not be mistaken for success
+        # when ffmpeg fails before writing anything.
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+        logger.info(f"Downloading M3U8 with ffmpeg...")
+        logger.info(f"URL: {m3u8_url[:80]}...")
+        logger.info(f"Output: {output_path}")
+
+        try:
+            # subprocess.run blocks; run it in a worker thread so the event
+            # loop stays responsive during the download.
+            returncode = await asyncio.get_running_loop().run_in_executor(None, run_ffmpeg)
+        except subprocess.TimeoutExpired:
+            # Keep whatever was written before the timeout if it is usable
+            file_size = output_size()
+            if file_size > 1000:  # At least 1KB
+                logger.warning(f"⚠️  Timeout but file created: {file_size / (1024*1024):.2f} MB")
+                return output_path
+            raise Exception("FFmpeg timeout (10 minutes) - video too large")
+        except FileNotFoundError:
+            raise Exception("ffmpeg not found - please install ffmpeg: apt install ffmpeg")
+        except Exception as e:
+            raise Exception(f"Error downloading M3U8: {str(e)}") from e
+
+        # Accept the file even if ffmpeg reported issues, as long as it has
+        # real content.
+        file_size = output_size()
+        if file_size > 1000:  # At least 1KB
+            logger.info(f"✅ Download complete: {file_size / (1024*1024):.2f} MB")
+            return output_path
+
+        raise Exception(
+            f"FFmpeg failed (exit code {returncode}) - no output file created, see {log_path}"
+        )