async def _health_check_job(self):
    """Scheduled job: refresh the health status of every provider.

    Delegates to ``self.providers_mgr.check_all_health()``. Any exception is
    logged and swallowed, so a failing run never propagates out of the job.
    """
    try:
        logger.info("Running scheduled provider health check...")
        await self.providers_mgr.check_all_health()
    except Exception as e:
        # exc_info=True keeps the traceback, matching the error handling of
        # the episode-check job (_check_job).
        logger.error(f"Error in health check job: {e}", exc_info=True)


async def trigger_health_check_now(self):
    """Manually run a provider health check now.

    Calls the providers manager directly rather than ``_health_check_job``:
    that job swallows every exception, which would make the ``except`` /
    ``raise`` below unreachable and hide failures from the caller.

    Raises:
        Exception: whatever ``check_all_health()`` raised, after logging it.
    """
    logger.info("Manually triggering provider health check...")
    try:
        await self.providers_mgr.check_all_health()
    except Exception as e:
        logger.error(f"Error in manual health check: {e}", exc_info=True)
        raise
class GenericScraper(BaseAnimeSite):
    """A scraper whose site-specific details come from a YAML config file.

    The config must contain ``id``, ``name`` and ``base_url``. ``mirrors``
    and ``search`` are optional; without a ``search`` section ``search()``
    returns an empty list.
    """

    def __init__(self, config_path: str):
        """Load the YAML config at *config_path* and create the HTTP client.

        Raises:
            ValueError: the file does not contain a YAML mapping.
            KeyError: ``id``, ``name`` or ``base_url`` is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        # An empty file makes safe_load() return None; fail with a clear
        # message instead of an opaque "NoneType is not subscriptable".
        if not isinstance(self.config, dict):
            raise ValueError(f"Provider config {config_path} must be a YAML mapping")

        self.id = self.config['id']
        self.name = self.config['name']
        self.base_url = self.config['base_url']
        # NOTE(review): mirrors are loaded but nothing in this class switches
        # active_url to one of them yet.
        self.mirrors = self.config.get('mirrors', [])

        # Current active base URL (can change if mirror found)
        self.active_url = self.base_url

        self.client = httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )

    def _parse_search_item(self, item, search_config: Dict[str, Any]) -> Optional[tuple]:
        """Extract ``(title, absolute_url, cover_image)`` from one result node.

        Returns ``None`` when the title node, the link node or the link's
        ``href`` is missing, so the caller can skip the item.
        """
        title_node = item.select_one(search_config['title_selector'])
        url_node = item.select_one(search_config['url_selector'])
        if not title_node or not url_node:
            return None

        href = url_node.get('href')
        if not href:
            # urljoin(base, None) returns the base URL itself, which would
            # produce a bogus result pointing at the site root.
            return None

        title = title_node.get_text(strip=True)
        anime_url = urljoin(self.active_url, href)

        img_node = item.select_one(search_config.get('image_selector', 'img'))
        cover_image = img_node.get('src') if img_node else None
        if cover_image:
            cover_image = urljoin(self.active_url, cover_image)

        return title, anime_url, cover_image

    async def search(self, query: str) -> List[AnimeSearchResult]:
        """Search the site using the selectors from the ``search`` config.

        Every parsed item is passed through the metadata enricher. Failures
        are logged and never raised: a broken item is skipped, and a failed
        request (including HTTP error statuses) yields an empty list.
        """
        search_config = self.config.get('search')
        if not search_config:
            logger.warning(f"No search config for {self.name}")
            return []

        search_path = search_config['path'].format(query=quote(query))
        url = urljoin(self.active_url, search_path)

        try:
            response = await self.client.get(url)
            # Without this a 4xx/5xx error page is parsed like a normal page
            # and the failure shows up only as "no results".
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

            results = []
            container = search_config.get('container_selector')
            items = soup.select(container) if container else [soup]

            # Acquired lazily on the first usable item, then reused.
            enricher = None

            for item in items:
                try:
                    parsed = self._parse_search_item(item, search_config)
                    if parsed is None:
                        continue
                    title, anime_url, cover_image = parsed

                    # Initial metadata from scraper
                    meta_dict = {
                        "poster_image": cover_image,
                        "status": "Unknown"
                    }

                    # Enrich with Kitsu via global service
                    if enricher is None:
                        enricher = await get_metadata_enricher()
                    metadata = await enricher.enrich_metadata(meta_dict, title, anime_url)

                    results.append(AnimeSearchResult(
                        title=title,
                        url=anime_url,
                        cover_image=metadata.poster_image or cover_image,
                        type="search_result",
                        metadata=metadata
                    ))
                except Exception as e:
                    logger.error(f"Error parsing search result item: {e}")

            return results

        except Exception as e:
            logger.error(f"Search failed for {self.name}: {e}")
            return []

    async def get_episodes(self, anime_url: str) -> List[Dict[str, Any]]:
        """Return the episode list for *anime_url*.

        This default implementation always returns an empty list; sites with
        more complex logic are expected to specialise it.
        """
        # Default implementation for simple sites
        # For complex sites like Anime-Sama, we might still need a specialized subclass
        # but driven by the YAML config for base parameters.
        return []

    async def check_health(self) -> bool:
        """Return True when a test search for "One Piece" yields any result."""
        try:
            # Try a test search for a very common anime
            results = await self.search("One Piece")
            is_healthy = len(results) > 0
            if not is_healthy:
                logger.warning(f"Health check failed for {self.name}: No results found")
            return is_healthy
        except Exception as e:
            logger.error(f"Health check failed for {self.name} with error: {e}")
            return False

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
class ProvidersManager:
    """Registry and health manager for scraping providers."""

    def __init__(self, config_dir: str = "app/downloaders/providers_config"):
        """Load every ``*.yaml`` provider config found in *config_dir*."""
        self.config_dir = Path(config_dir)
        # provider id -> scraper instance
        self.providers: Dict[str, "GenericScraper"] = {}
        # provider id -> {"status": "unknown" | "up" | "down",
        #                 "last_check": ISO timestamp or None,
        #                 "error": message or None}
        self.health_status: Dict[str, Dict] = {}
        self._load_providers()

    def _load_providers(self):
        """Load all providers from YAML configs.

        A config that fails to load is logged and skipped; it never prevents
        the remaining providers from being registered.
        """
        if not self.config_dir.exists():
            logger.warning(f"Providers config directory not found: {self.config_dir}")
            return

        # sorted(): glob order depends on the filesystem, and the order
        # decides which config wins when two files declare the same id.
        for config_file in sorted(self.config_dir.glob("*.yaml")):
            try:
                scraper = GenericScraper(str(config_file))
                if scraper.id in self.providers:
                    logger.warning(
                        f"Duplicate provider id '{scraper.id}' in {config_file}; "
                        "replacing the previously loaded provider"
                    )
                self.providers[scraper.id] = scraper
                self.health_status[scraper.id] = {
                    "status": "unknown",
                    "last_check": None,
                    "error": None
                }
                logger.info(f"Loaded provider: {scraper.name} ({scraper.id})")
            except Exception as e:
                logger.error(f"Failed to load provider from {config_file}: {e}", exc_info=True)

    async def check_all_health(self):
        """Check health of all registered providers concurrently."""
        logger.info("Checking health of all providers...")
        tasks = [
            self._check_single_health(provider_id, scraper)
            for provider_id, scraper in self.providers.items()
        ]

        await asyncio.gather(*tasks)
        logger.info("Provider health check complete")

    async def _check_single_health(self, provider_id: str, scraper: "GenericScraper"):
        """Check health of a single provider and update its status entry.

        Never raises: an exception from the scraper is recorded as "down"
        with the exception text as the error.
        """
        try:
            is_healthy = await scraper.check_health()
            self.health_status[provider_id] = {
                "status": "up" if is_healthy else "down",
                "last_check": datetime.now().isoformat(),
                "error": None if is_healthy else "No search results returned"
            }
        except Exception as e:
            self.health_status[provider_id] = {
                "status": "down",
                "last_check": datetime.now().isoformat(),
                "error": str(e)
            }
            logger.error(f"Health check failed for {provider_id}: {e}")

    def get_provider(self, provider_id: str) -> Optional["GenericScraper"]:
        """Return the provider registered under *provider_id*, or None."""
        return self.providers.get(provider_id)

    def get_active_providers(self) -> List["GenericScraper"]:
        """Return only providers that are UP or UNKNOWN.

        Iterates the provider registry itself, so a provider without a status
        entry counts as unknown (active) and a stale status entry can never
        cause a KeyError.
        """
        return [
            scraper for pid, scraper in self.providers.items()
            if self.health_status.get(pid, {}).get("status") != "down"
        ]

    def get_all_status(self) -> Dict[str, Dict]:
        """Return a snapshot of the health status of every provider.

        The result is a copy: callers can modify it without corrupting the
        manager's internal state.
        """
        return {pid: dict(status) for pid, status in self.health_status.items()}
@router.get("/providers/health")
async def get_providers_health():
    """Return the last recorded health status of every provider."""
    status_by_provider = providers_manager.get_all_status()
    return status_by_provider


@router.post("/providers/health/check")
async def trigger_providers_health_check(background_tasks: BackgroundTasks):
    """Queue a health check of all providers as a background task."""
    # Imported at call time rather than at module top
    # (presumably to avoid an import cycle — TODO confirm).
    from app.auto_download_scheduler import auto_download_scheduler as scheduler

    background_tasks.add_task(scheduler.trigger_health_check_now)
    return {"status": "Health check triggered in background"}
background_tasks.add_task(auto_download_scheduler.trigger_health_check_now) + return {"status": "Health check triggered in background"} + + def get_download_manager() -> DownloadManager: """Get the download manager instance from main app""" from main import download_manager - return download_manager @@ -55,125 +72,114 @@ async def search_anime_unified( include_metadata: bool = False, ): """ - Search across all anime providers - - Args: - q: Search query - lang: Language preference (vostfr, vf) - include_metadata: Whether to fetch full metadata (slower but more detailed) + Search across all anime providers using MetadataEnricher and health checks. + Results are grouped by provider for legacy UI compatibility. """ - import asyncio - - print( - f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})" - ) + print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}") start_time = time.time() results = {} + + # 1. Prepare search tasks (Generic + Legacy) + search_tasks = [] + task_metadata = [] - # Create downloader instances - downloaders = { - "anime-sama": AnimeSamaDownloader(), + # Generic YAML providers + active_generic = providers_manager.get_active_providers() + for provider in active_generic: + print(f"[SEARCH] Queueing generic provider: {provider.name}") + search_tasks.append(provider.search(q)) + task_metadata.append({"id": provider.id, "type": "generic"}) + + # Legacy providers (until migrated to YAML) + legacy_downloaders = { "anime-ultime": AnimeUltimeDownloader(), "neko-sama": NekoSamaDownloader(), "vostfree": VostfreeDownloader(), } + for pid, dl in legacy_downloaders.items(): + print(f"[SEARCH] Queueing legacy provider: {pid}") + search_tasks.append(dl.search_anime(q, lang, include_metadata=False)) + task_metadata.append({"id": pid, "type": "legacy"}) - # Generate search query variations for better matching - search_queries = [q] + # 2. 
Run searches in parallel + print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...") + all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True) - # Add fallback queries if original has spaces - if " " in q or "-" in q: - normalized = re.sub(r"[\s\-–—_:]+", "", q) - if normalized != q and len(normalized) >= 4: - search_queries.append(normalized) + # 3. Organize results by provider + seen_urls = set() + enricher = await get_metadata_enricher() + enrichment_tasks = [] + + # Map task indices to result slots for re-injection after enrichment + enrichment_mapping = [] # List of (provider_id, index_in_provider_results) - first_word = q.split()[0] if q.split() else None - if first_word and len(first_word) >= 4: - search_queries.append(first_word) + for i, raw_result in enumerate(all_raw_results): + provider_info = task_metadata[i] + pid = provider_info["id"] + + if isinstance(raw_result, Exception): + logger.error(f"Search failed for {pid}: {raw_result}") + continue + + if not raw_result: + continue + + if pid not in results: + results[pid] = [] + + for item in raw_result: + # Normalize to dict + item_dict = item.model_dump() if hasattr(item, "model_dump") else item + url = item_dict.get("url") + + if url and url not in seen_urls: + seen_urls.add(url) + + # Check relevance simple boost + if q.lower() in (item_dict.get("title") or "").lower(): + item_dict["_relevance_boost"] = 1.0 + else: + item_dict["_relevance_boost"] = 0.5 + + results[pid].append(item_dict) + + # Prepare enrichment task for top 5 results per provider + if len(results[pid]) <= 5: + enrichment_tasks.append( + enricher.enrich_metadata( + item_dict.get("metadata", {}), + item_dict.get("title", ""), + url + ) + ) + enrichment_mapping.append((pid, len(results[pid]) - 1)) + else: + if "metadata" not in item_dict: + item_dict["metadata"] = {} - print(f"[SEARCH] Query variations: {search_queries}") + # 4. 
Perform parallel enrichment + if enrichment_tasks: + print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...") + enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True) + + # Re-inject enriched metadata + for idx, (pid, pos) in enumerate(enrichment_mapping): + if idx < len(enriched_metas): + meta = enriched_metas[idx] + if not isinstance(meta, Exception) and meta: + results[pid][pos]["metadata"] = meta.model_dump() - # Search with fallback queries - all_search_tasks = [] - all_provider_ids = [] - - for search_query in search_queries: - print(f"[SEARCH] Trying query variant: '{search_query}'") - - for provider_id, provider in get_anime_providers().items(): - if provider_id in downloaders: - downloader = downloaders[provider_id] - print( - f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..." - ) - all_search_tasks.append( - { - "query": search_query, - "provider_id": provider_id, - "task": downloader.search_anime( - search_query, lang, include_metadata=include_metadata - ), - } - ) - all_provider_ids.append(provider_id) - - print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...") - search_results = await asyncio.gather( - *[t["task"] for t in all_search_tasks], return_exceptions=True - ) - - # Process results - seen_urls = {} - - for task_info, result in zip(all_search_tasks, search_results): - provider_id = task_info["provider_id"] - search_query = task_info["query"] - - if isinstance(result, Exception): - print( - f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}" - ) - elif result: - print( - f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results" - ) - - if provider_id not in results: - results[provider_id] = [] - - provider_results = results[provider_id] - for item in result: - url = item.get("url", "") - if url and url not in seen_urls: - seen_urls[url] = True - if search_query.lower() == q.lower(): - item["_relevance_boost"] = 1.0 - else: - 
item["_relevance_boost"] = 0.5 - provider_results.append(item) - else: - print(f"[SEARCH] {provider_id} (query: '{search_query}') no results") - - # Sort results by relevance - for provider_id in results: - results[provider_id].sort( - key=lambda x: ( - -x.get("_relevance_boost", 0), - (x.get("title") or "").lower().find(q.lower()), - ) - ) - for item in results[provider_id]: + # 5. Sort results by relevance per provider + for pid in results: + results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0)) + for item in results[pid]: item.pop("_relevance_boost", None) - # Remove providers with empty results - results = {k: v for k, v in results.items() if v} - elapsed = time.time() - start_time - print( - f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n" - ) - + total_found = sum(len(r) for r in results.values()) + print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.") + return { "query": q, "lang": lang, @@ -197,9 +203,7 @@ async def search_series_unified( start_time = time.time() results = {} - series_downloaders = {"fs7": FS7Downloader()} - search_tasks = [] provider_ids = [] @@ -219,13 +223,9 @@ async def search_series_unified( elif result: print(f"[SERIES SEARCH] {provider_id} found {len(result)} results") results[provider_id] = result - else: - print(f"[SERIES SEARCH] {provider_id} no results") elapsed = time.time() - start_time - print( - f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n" - ) + print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n") return {"query": q, "lang": lang, "results": results} @@ -235,7 +235,6 @@ async def get_anime_metadata(url: str): """Get detailed metadata for a specific anime""" try: downloader = get_downloader(url) - if hasattr(downloader, "get_anime_metadata"): metadata = await downloader.get_anime_metadata(url) return {"url": url, "metadata": metadata} @@ -244,7 
@router.get("/anime-sama/search")
async def search_anime_sama(
    q: str,
    lang: str = "vostfr",
):
    """Legacy endpoint: search for anime on anime-sama only."""
    matches = await AnimeSamaDownloader().search_anime(q, lang)
    return {"query": q, "lang": lang, "results": matches}
@router.get("/anime/seasons")
async def get_anime_seasons(url: str):
    """Return the seasons of the anime at *url*.

    When the downloader for this URL has no ``get_seasons`` method, an empty
    list is returned together with an explanatory message.
    """
    downloader = get_downloader(url)
    if not hasattr(downloader, "get_seasons"):
        return {"seasons": [], "message": "Season info not available for this provider"}

    found = await downloader.get_seasons(url)
    return {"seasons": found if found else []}
@router.post("/translate")
async def translate_text(request: Request):
    """Translate text from English to French using Google Translate.

    Expects a JSON object body with a ``text`` field. Only the first 5000
    characters are sent to the translation endpoint.

    Returns:
        ``{"translatedText": <str>, "status": "success"}``

    Raises:
        HTTPException: 400 when ``text`` is missing or empty (or the body is
            not a JSON object); 500 when the upstream call fails or returns
            an unexpected payload.
    """
    import httpx

    try:
        body = await request.json()
        # A JSON array/string/number has no .get(); treat it as "no text"
        # so the client gets a 400 rather than an AttributeError-driven 500.
        text = body.get("text", "") if isinstance(body, dict) else ""
        if not text:
            raise HTTPException(status_code=400, detail="Text is required")

        async with httpx.AsyncClient(timeout=30.0) as client:
            url = "https://translate.googleapis.com/translate_a/single"
            params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
            response = await client.get(url, params=params)
            if response.status_code == 200:
                data = response.json()
                if data and data[0]:
                    translated = "".join([item[0] for item in data[0] if item[0]])
                    return {"translatedText": translated, "status": "success"}
            raise HTTPException(status_code=500, detail="Translation failed")
    except HTTPException:
        # HTTPException is a subclass of Exception: without this clause the
        # deliberate 400 above is caught below and returned as a 500.
        raise
    except Exception as e:
        logger.error(f"Translation error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}") from e
@pytest.fixture
def mock_config_path(tmp_path):
    """Create a temporary YAML config file for testing"""
    config = {
        "name": "Test Site",
        "id": "testsite",
        "base_url": "https://test.com",
        "search": {
            "path": "/search?q={query}",
            "container_selector": ".item",
            "title_selector": "h3",
            "url_selector": "a",
            "image_selector": "img"
        }
    }
    config_file = tmp_path / "testsite.yaml"
    with open(config_file, 'w', encoding='utf-8') as f:
        yaml.dump(config, f)
    return str(config_file)


class TestGenericScraper:
    """Tests for GenericScraper driven by YAML"""

    def test_init_loads_config(self, mock_config_path):
        """The id, name and base URL are read from the YAML file."""
        scraper = GenericScraper(mock_config_path)
        assert scraper.name == "Test Site"
        assert scraper.id == "testsite"
        assert scraper.base_url == "https://test.com"

    @pytest.mark.asyncio
    async def test_search_logic(self, mock_config_path):
        """One matching ``.item`` node produces one enriched search result."""
        scraper = GenericScraper(mock_config_path)

        # Mock HTTP response
        # NOTE(review): the markup below is reconstructed from the fixture's
        # selectors (.item / h3 / a / img) and the assertions at the end of
        # this test; confirm it against the original fixture.
        mock_html = """
        <div class="results">
            <div class="item">
                <h3>Naruto</h3>
                <a href="/naruto-page">Link</a>
                <img src="/cover.jpg">
            </div>
        </div>
        """

        with patch.object(scraper.client, 'get', return_value=MagicMock(text=mock_html)) as mock_get:
            # Mock metadata enrichment to avoid real API calls
            with patch('app.downloaders.generic_scraper.get_metadata_enricher') as mock_get_enricher:
                mock_enricher = AsyncMock()
                mock_enricher.enrich_metadata.return_value = AnimeMetadata(title="Naruto", poster_image="https://test.com/cover.jpg")
                mock_get_enricher.return_value = mock_enricher

                results = await scraper.search("Naruto")

                assert len(results) == 1
                assert results[0].title == "Naruto"
                assert "test.com/naruto-page" in results[0].url
                assert results[0].cover_image == "https://test.com/cover.jpg"

    @pytest.mark.asyncio
    async def test_check_health_success(self, mock_config_path):
        """A non-empty test search means the provider is healthy."""
        scraper = GenericScraper(mock_config_path)
        with patch.object(scraper, 'search', return_value=[MagicMock()]) as mock_search:
            is_healthy = await scraper.check_health()
            assert is_healthy is True
            mock_search.assert_called_once_with("One Piece")

    @pytest.mark.asyncio
    async def test_check_health_failure(self, mock_config_path):
        """An empty test search means the provider is unhealthy."""
        scraper = GenericScraper(mock_config_path)
        with patch.object(scraper, 'search', return_value=[]) as mock_search:
            is_healthy = await scraper.check_health()
            assert is_healthy is False
class TestProvidersManager:
    """Tests for ProvidersManager"""

    def test_load_providers(self, tmp_path):
        """Every YAML file in the config directory becomes a provider."""
        providers_dir = tmp_path / "config"
        providers_dir.mkdir()

        # Two minimal provider configs: site0.yaml and site1.yaml
        for idx in range(2):
            site_config = {"name": f"Site {idx}", "id": f"site{idx}", "base_url": "http://test.com"}
            with open(providers_dir / f"site{idx}.yaml", 'w') as handle:
                yaml.dump(site_config, handle)

        manager = ProvidersManager(str(providers_dir))
        loaded = manager.providers
        assert len(loaded) == 2
        assert "site0" in loaded
        assert "site1" in loaded

    @pytest.mark.asyncio
    async def test_check_all_health(self, tmp_path):
        """A healthy scraper is recorded as "up" with a check timestamp."""
        providers_dir = tmp_path / "config"
        providers_dir.mkdir()
        site_config = {"name": "Site", "id": "site", "base_url": "http://test.com"}
        with open(providers_dir / "site.yaml", 'w') as handle:
            yaml.dump(site_config, handle)

        manager = ProvidersManager(str(providers_dir))

        # Replace the scraper's real health check with one reporting success
        with patch.object(manager.providers["site"], 'check_health', return_value=True):
            await manager.check_all_health()
            site_status = manager.health_status["site"]
            assert site_status["status"] == "up"
            assert site_status["last_check"] is not None
@pytest.mark.asyncio
async def test_router_search_unified_modern(mock_config_path):
    """Test the modernized unified search route in the router.

    Every legacy downloader the route instantiates is patched, so the test
    never touches the network.
    """
    from contextlib import ExitStack

    from app.routers.router_anime import search_anime_unified
    from app.providers_manager import providers_manager

    # Mock providers manager to return our test scraper
    test_scraper = GenericScraper(mock_config_path)
    mock_results = [
        AnimeSearchResult(title="Naruto", url="https://test.com/n", cover_image="", type="direct")
    ]
    test_scraper.search = AsyncMock(return_value=mock_results)

    mock_enricher = AsyncMock()
    mock_enricher.enrich_metadata = AsyncMock(return_value=AnimeMetadata(title="Naruto"))
    mock_enricher.enrich_search_results = AsyncMock(side_effect=lambda x: x)

    # The route builds all three of these; patching only one would leave the
    # other two as real downloaders.
    legacy_downloaders = (
        "AnimeUltimeDownloader",
        "NekoSamaDownloader",
        "VostfreeDownloader",
    )

    with ExitStack() as stack:
        stack.enter_context(
            patch.object(providers_manager, 'get_active_providers', return_value=[test_scraper])
        )

        # Patch legacy downloaders to return nothing
        for class_name in legacy_downloaders:
            mock_dl = stack.enter_context(patch(f'app.routers.router_anime.{class_name}'))
            mock_dl.return_value.search_anime = AsyncMock(return_value=[])

        # Patch metadata enricher
        mock_get_enricher = stack.enter_context(
            patch('app.routers.router_anime.get_metadata_enricher')
        )
        mock_get_enricher.return_value = mock_enricher

        response = await search_anime_unified("Naruto")

    assert "results" in response
    assert "testsite" in response["results"]
    assert response["results"]["testsite"][0]["title"] == "Naruto"