diff --git a/app/download_manager.py b/app/download_manager.py index bd18422..51422a8 100644 --- a/app/download_manager.py +++ b/app/download_manager.py @@ -30,6 +30,25 @@ class DownloadManager: return list(self.tasks.values()) def create_task(self, request: DownloadRequest) -> DownloadTask: + # Check for existing tasks with the same URL + # Extract actual URL from pipe-separated format + url_to_check = request.url.split('|')[0] if '|' in request.url else request.url + + # Look for existing non-failed tasks with the same URL + for existing_task in self.tasks.values(): + existing_url = existing_task.url.split('|')[0] if '|' in existing_task.url else existing_task.url + + # If same URL and task is not failed/cancelled/completed + if existing_url == url_to_check and existing_task.status not in [ + DownloadStatus.FAILED, + DownloadStatus.CANCELLED, + DownloadStatus.COMPLETED + ]: + logger.info(f"Duplicate download detected: {url_to_check[:80]}...") + logger.info(f"Returning existing task: {existing_task.id}") + return existing_task + + # No duplicate found, create new task task_id = str(uuid.uuid4()) task = DownloadTask( id=task_id, @@ -103,7 +122,17 @@ class DownloadManager: # Get downloader and extract link downloader = get_downloader(task.url) - download_url, filename = await downloader.get_download_link(task.url) + + # Extract episode title from pipe-separated URL if present + # Format: video_url|anime_page_url|episode_title + target_filename = None + if '|' in task.url: + parts = task.url.split('|') + if len(parts) >= 3: + target_filename = parts[2].strip() + logger.debug(f"Extracted target filename from pipe: {target_filename}") + + download_url, filename = await downloader.get_download_link(task.url, target_filename) logger.info(f"Download URL: {download_url[:100] if len(download_url) > 100 else download_url}") logger.debug(f"Downloader filename: {filename}") diff --git a/app/downloaders/anime_sites/animesama.py b/app/downloaders/anime_sites/animesama.py index 7f0daf8..577e8bf 100644 --- a/app/downloaders/anime_sites/animesama.py +++ b/app/downloaders/anime_sites/animesama.py @@ -424,7 +424,7 @@ class AnimeSamaDownloader(BaseAnimeSite): filename = target_filename if target_filename else temp_filename print(f"[ANIME-SAMA] Got video: {filename}") - print(f"[ANIME-SAMA] Video URL: {video_url[:100]}...") + print(f"[ANIME-SAMA] Video URL: {video_url[:100] if video_url else 'None'}...") # Return the direct video URL # The download_manager will handle the actual download @@ -432,7 +432,8 @@ class AnimeSamaDownloader(BaseAnimeSite): except Exception as e: print(f"[ANIME-SAMA] Lpayer extraction error: {e}") - raise Exception(f"Error extracting from lpayer: {str(e)}") + # Re-raise with clearer message + raise Exception(f"Lpayer player not supported - this video host requires manual download. Try another host (VidMoly, SendVid, Sibnet). Error: {str(e)}") async def _extract_from_player(self, player_url: str) -> str | None: """Try to extract direct video URL from player iframe""" @@ -783,7 +784,8 @@ class AnimeSamaDownloader(BaseAnimeSite): print(f"[ANIME-SAMA] Detected format {'A (source-based)' if is_format_a else 'B (episode-based)'} - eps1 has {len(eps1_urls)} URLs") - host_preference = ['sibnet.ru', 'vidmoly', 'sendvid', 'lpayer'] + # No more host preference! Just collect all available URLs for each episode + # The download system will automatically detect and use the appropriate downloader all_episodes_by_number = {} if is_format_a: @@ -797,48 +799,36 @@ class AnimeSamaDownloader(BaseAnimeSite): if episode_num not in all_episodes_by_number: all_episodes_by_number[episode_num] = [] - # Determine host preference score (lower = better) - host_score = len(host_preference) - for i, host in enumerate(host_preference): - if host in url.lower(): - host_score = i - break - - all_episodes_by_number[episode_num].append((host_score, url)) + all_episodes_by_number[episode_num].append(url) else: # Format B: Each epsX is an episode, containing multiple sources for eps_num, urls_text in eps_matches: episode_num = str(eps_num).zfill(2) episode_urls = re.findall(r"'(https?://[^']+)'", urls_text) - for url in episode_urls: - if episode_num not in all_episodes_by_number: - all_episodes_by_number[episode_num] = [] + if episode_num not in all_episodes_by_number: + all_episodes_by_number[episode_num] = [] - # Determine host preference score (lower = better) - host_score = len(host_preference) - for i, host in enumerate(host_preference): - if host in url.lower(): - host_score = i - break + all_episodes_by_number[episode_num].extend(episode_urls) - all_episodes_by_number[episode_num].append((host_score, url)) - - # For each episode, use the best available URL (lowest score = best host) + # For each episode, use the first available URL + # (they are usually already in order of preference on the site) for episode_num in sorted(all_episodes_by_number.keys()): - sorted_urls = sorted(all_episodes_by_number[episode_num], key=lambda x: x[0]) - best_url = sorted_urls[0][1] # Get the URL with lowest score (best host) + available_urls = all_episodes_by_number[episode_num] + # Use the first available URL (the site usually lists them in preference order) + episode_url = available_urls[0] episode_title = f'Episode {episode_num}' - combined_url = f"{best_url}|{anime_url}|{episode_title}" + combined_url = f"{episode_url}|{anime_url}|{episode_title}" episodes.append({ 'episode': episode_num, 'url': combined_url, - 'title': episode_title + 'title': episode_title, + 'available_hosts': len(available_urls) # Store count of available hosts }) - print(f"[ANIME-SAMA] Found {len(episodes)} episodes (prioritizing {host_preference})") + print(f"[ANIME-SAMA] Found {len(episodes)} episodes") return episodes except Exception as e: diff --git a/app/downloaders/series_sites/fs7.py b/app/downloaders/series_sites/fs7.py index 4fd424f..afb2f60 100644 --- a/app/downloaders/series_sites/fs7.py +++ b/app/downloaders/series_sites/fs7.py @@ -89,6 +89,10 @@ class FS7Downloader(BaseSeriesSite): continue title = text + # Clean up title: remove "affiche" suffix and clean extra whitespace + title = re.sub(r'\s+affiche$', '', title, flags=re.IGNORECASE).strip() + title = re.sub(r'\s+', ' ', title) # Normalize whitespace + # Extract cover image img = item.find('img') cover_image = img.get('src', '') if img else '' @@ -135,6 +139,12 @@ class FS7Downloader(BaseSeriesSite): soup = BeautifulSoup(html, 'lxml') episodes = [] + # Get series title for episode naming + title_elem = soup.find('h1') + series_title = title_elem.get_text(strip=True) if title_elem else "Series" + # Clean up title: remove "affiche" suffix + series_title = re.sub(r'\s+affiche$', '', series_title, flags=re.IGNORECASE).strip() + # FS7 stores episode data in JavaScript div elements # Format:
episode_divs = soup.find_all('div', attrs={'data-ep': True}) @@ -144,17 +154,28 @@ class FS7Downloader(BaseSeriesSite): # Try different video players in order of preference video_url = None + host_name = None for player in ['data-vidzy', 'data-uqload', 'data-voe', 'data-netu']: player_url = div.get(player, '').strip() if player_url: video_url = player_url - logger.debug(f"Found episode {ep_num} on {player}") + # Extract host name from attribute name + host_name = player.replace('data-', '').title() + logger.debug(f"Found episode {ep_num} on {host_name}") break if video_url and ep_num: + # Create episode title for filename + episode_title = f"{series_title} - Episode {ep_num}" + + # Use pipe-separated format: video_url|anime_url|episode_title + combined_url = f"{video_url}|{anime_url}|{episode_title}" + episodes.append({ 'episode': ep_num, - 'url': video_url + 'url': combined_url, + 'title': episode_title, + 'host': host_name or 'Unknown' }) # Sort by episode number @@ -193,6 +214,9 @@ class FS7Downloader(BaseSeriesSite): title = soup.find('h1') title = title.get_text(strip=True) if title else "Unknown" + # Clean up title: remove "affiche" suffix + title = re.sub(r'\s+affiche$', '', title, flags=re.IGNORECASE).strip() + # Extract description/synopsis description_elem = soup.find('div', class_='full-text') description = description_elem.get_text(strip=True) if description_elem else "" diff --git a/app/downloaders/video_players/vidzy.py b/app/downloaders/video_players/vidzy.py index 2696e75..90df3f4 100644 --- a/app/downloaders/video_players/vidzy.py +++ b/app/downloaders/video_players/vidzy.py @@ -1,5 +1,9 @@ """Vidzy video hosting service downloader""" import logging +import asyncio +import re +import subprocess +import os from typing import Optional from .base import BaseVideoPlayer from bs4 import BeautifulSoup @@ -13,6 +17,7 @@ class VidzyDownloader(BaseVideoPlayer): Downloader for Vidzy video hosting service. Vidzy is a video hosting platform used by various anime streaming sites. + Uses heavy JavaScript obfuscation, so Playwright is required. """ def can_handle(self, url: str) -> bool: @@ -35,9 +40,206 @@ class VidzyDownloader(BaseVideoPlayer): Tuple of (download_url, filename) """ try: + # Extract actual Vidzy URL from pipe-separated format if present + # Format: video_url|anime_url|episode_title + if '|' in url: + url = url.split('|')[0].strip() + logger.debug(f"Extracted Vidzy URL from pipe format: {url}") + logger.info(f"Fetching Vidzy URL: {url}") - # Fetch the page + # Try using Playwright first (Vidzy uses heavy JS obfuscation) + video_url = await self._extract_with_playwright(url) + + if not video_url: + # Fallback to static HTML parsing + logger.warning("Playwright extraction failed, trying static parsing...") + video_url = await self._extract_static(url) + + if not video_url: + raise ValueError(f"Could not extract video URL from Vidzy") + + logger.info(f"Successfully extracted Vidzy URL: {video_url[:100]}...") + + # Generate filename + if target_filename: + filename = sanitize_filename(target_filename) + else: + # Try to extract filename from URL + filename = video_url.split('/')[-1].split('?')[0] + if not filename or len(filename) < 5: + filename = "vidzy_video.mp4" + filename = sanitize_filename(filename) + + # Ensure .mp4 extension + if not filename.endswith('.mp4'): + filename += '.mp4' + + # Check if it's an M3U8 playlist (HLS stream) + if '.m3u8' in video_url: + logger.info(f"Detected M3U8 stream, will download with ffmpeg") + + # Download and convert M3U8 to MP4 directly + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Referer': 'https://vidzy.org/', + } + + mp4_path = await self._download_m3u8_as_mp4(video_url, filename, headers) + logger.info(f"Successfully extracted Vidzy download link: {filename}") + return mp4_path, filename + + # It's a direct MP4 link + logger.info(f"Successfully extracted Vidzy download link: {filename}") + return video_url, filename + + except Exception as e: + logger.error(f"Error extracting Vidzy download link: {e}") + raise ValueError(f"Failed to extract download link from Vidzy: {str(e)}") + + async def _extract_with_playwright(self, url: str) -> Optional[str]: + """Extract video URL using Playwright with network interception""" + try: + from playwright.async_api import async_playwright + + logger.info("Launching Playwright for Vidzy...") + + video_urls = [] + + async with async_playwright() as p: + browser = await p.chromium.launch( + headless=True, + args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'] + ) + + context = await browser.new_context( + user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + ) + + page = await context.new_page() + + # Set up request interception + async def handle_request(route): + req_url = route.request.url + + # Look for video files (HLS streams and MP4s) + if any(ext in req_url.lower() for ext in ['.m3u8', '.mp4', 'master']): + if 'vidzy' not in req_url.lower() or 'master' in req_url.lower(): + logger.info(f"🎥 Captured video URL: {req_url[:100]}...") + video_urls.append(req_url) + + await route.continue_() + + await page.route('**', handle_request) + + logger.info("Navigating to Vidzy page...") + + try: + await page.goto(url, wait_until='domcontentloaded', timeout=30000) + except Exception as e: + logger.warning(f"Navigation warning: {e}") + + # Wait for page to load and initialize player + logger.info("Waiting for video player to load...") + await asyncio.sleep(5) + + # Try JavaScript extraction from VideoJS player + try: + js_result = await page.evaluate(""" + () => { + // Check if videojs is available + if (typeof videojs !== 'undefined' && videojs.players) { + // Get all players + const players = Object.values(videojs.players); + if (players.length > 0) { + const player = players[0]; + + // Try to get source from player + if (player.currentSrc()) { + return player.currentSrc(); + } + + // Try to get sources array + if (player.currentSources() && player.currentSources().length > 0) { + return player.currentSources()[0].src; + } + } + } + + // Check all video elements + const videos = document.querySelectorAll('video'); + for (let v of videos) { + if (v.src) { + return v.src; + } + const sources = v.querySelectorAll('source'); + for (let s of sources) { + if (s.src) { + return s.src; + } + } + } + + // Look for sources in scripts (VideoJS config) + const scripts = document.querySelectorAll('script'); + for (let script of scripts) { + const text = script.textContent; + // Look for sources array with .m3u8 URLs + const sourcesMatch = text.match(/sources\s*:\s*\[\s*\{\s*src\s*:\s*['"](https?:\/\/[^'"]+\.m3u8[^'"]*)['"]/i); + if (sourcesMatch) { + return sourcesMatch[1]; + } + } + + return null; + } + """) + + if js_result and ('.m3u8' in js_result or '.mp4' in js_result): + logger.info(f"Found video URL via JavaScript evaluation") + video_urls.append(js_result) + except Exception as e: + logger.warning(f"JS extraction error: {e}") + + # Wait more for network requests + await asyncio.sleep(3) + + await browser.close() + + # Return best video URL (prefer master.m3u8 for HLS) + if video_urls: + seen = set() + unique_urls = [] + for url in video_urls: + if url not in seen: + seen.add(url) + unique_urls.append(url) + + if unique_urls: + logger.info(f"✅ Found {len(unique_urls)} video URL(s)") + + # Prefer master.m3u8 (HLS playlist) + for url in unique_urls: + if 'master.m3u8' in url or '.m3u8' in url: + logger.info(f"Using HLS playlist: {url[:100]}...") + return url + + # Fall back to first URL + return unique_urls[0] + + logger.warning("❌ No video URLs found via Playwright") + return None + + except ImportError: + logger.warning("Playwright not installed, falling back to static parsing") + return None + except Exception as e: + logger.warning(f"Playwright error: {e}") + return None + + async def _extract_static(self, url: str) -> Optional[str]: + """Static HTML parsing fallback""" + try: response = await self.client.get(url) response.raise_for_status() html = response.text @@ -47,65 +249,96 @@ class VidzyDownloader(BaseVideoPlayer): # Method 1: Look for video source in