feat: robust scraping DSL and health monitoring (Phase 2)

- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
2026-03-24 10:57:19 +00:00
parent 29c7040b20
commit 2b4cc617cb
8 changed files with 535 additions and 268 deletions
@@ -9,6 +9,7 @@ from apscheduler.triggers.interval import IntervalTrigger
 from app.watchlist import watchlist_manager, WatchlistManager
 from app.episode_checker import EpisodeChecker, episode_checker
 from app.providers_manager import providers_manager
 logger = logging.getLogger(__name__)
@@ -23,6 +24,7 @@ class AutoDownloadScheduler:
    ):
        self.wlm = wlm or watchlist_manager
        self.checker = checker or episode_checker
        self.providers_mgr = providers_manager
        self.scheduler: Optional[AsyncIOScheduler] = None
        self._running = False
@@ -46,6 +48,14 @@ class AutoDownloadScheduler:
        except Exception as e:
            logger.error(f"Error in scheduled check job: {e}", exc_info=True)
    async def _health_check_job(self):
        """Scheduled job: run a health check on every registered provider.

        Delegates to ``self.providers_mgr.check_all_health()``. Any exception
        is logged and swallowed, so a failing check never propagates out of
        this job.
        """
        try:
            logger.info("Running scheduled provider health check...")
            await self.providers_mgr.check_all_health()
        except Exception as e:
            # Log the traceback too (the scheduled episode-check job does the
            # same) so provider failures can be diagnosed from the logs.
            logger.error(f"Error in health check job: {e}", exc_info=True)
    def start(self):
        """Start the scheduler"""
        if self._running:
@@ -59,7 +69,7 @@ class AutoDownloadScheduler:
            settings = self.wlm.get_settings()
            interval_hours = settings.check_interval_hours
-            # Add the job
+            # Add the job for episode checking
            self.scheduler.add_job(
                self._check_job,
                trigger=IntervalTrigger(hours=interval_hours),
@@ -68,6 +78,15 @@ class AutoDownloadScheduler:
                replace_existing=True
            )
            # Add the job for provider health check (every 6 hours)
            self.scheduler.add_job(
                self._health_check_job,
                trigger=IntervalTrigger(hours=6),
                id='provider_health',
                name='Check provider health',
                replace_existing=True
            )
            # Start the scheduler
            self.scheduler.start()
            self._running = True
@@ -149,6 +168,15 @@ class AutoDownloadScheduler:
            logger.error(f"Error in manual check: {e}", exc_info=True)
            raise
    async def trigger_health_check_now(self):
        """Run a provider health check immediately (manual trigger).

        The providers manager is called directly rather than through
        ``_health_check_job``: that job catches and logs every ``Exception``,
        so wrapping it in ``try``/``except`` could never observe a failure and
        the ``raise`` below would be unreachable.

        Raises:
            Exception: whatever ``check_all_health()`` raises, after it has
                been logged here.
        """
        logger.info("Manually triggering provider health check...")
        try:
            await self.providers_mgr.check_all_health()
        except Exception as e:
            logger.error(f"Error in manual health check: {e}", exc_info=True)
            raise
 # Global scheduler instance
 auto_download_scheduler = AutoDownloadScheduler()
@@ -0,0 +1,122 @@
 """Generic scraper driven by YAML configuration"""
 import yaml
 import logging
 import httpx
 from bs4 import BeautifulSoup
 from typing import List, Dict, Optional, Any
 from pathlib import Path
 from urllib.parse import urljoin, quote
 from app.downloaders.anime_sites.base import BaseAnimeSite
 from app.models import AnimeSearchResult, AnimeMetadata
 from app.metadata_enrichment import get_metadata_enricher
 logger = logging.getLogger(__name__)
 class GenericScraper(BaseAnimeSite):
    """Scraper whose site-specific details are read from a YAML config file.

    The config must define ``id``, ``name`` and ``base_url``. ``mirrors`` and
    the ``search`` section are optional; without ``search``, searching
    returns no results.
    """

    def __init__(self, config_path: str):
        """Load the YAML config at *config_path* and create the HTTP client.

        Raises:
            OSError: if the config file cannot be opened.
            yaml.YAMLError: if the file is not valid YAML.
            KeyError: if ``id``, ``name`` or ``base_url`` is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)
        self.id = self.config['id']
        self.name = self.config['name']
        self.base_url = self.config['base_url']
        self.mirrors = self.config.get('mirrors', [])
        # Current active base URL (can change if mirror found)
        self.active_url = self.base_url
        self.client = httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )

    async def search(self, query: str) -> List[AnimeSearchResult]:
        """Search the site for *query* using the selectors in the ``search`` config.

        Each result item needs a title node and a link with an ``href``; items
        missing either are skipped. The scraped cover image is handed to the
        metadata enricher, and the enriched poster is preferred when present.

        Returns:
            The parsed results. An empty list is returned when no ``search``
            section is configured, or when the request or page parsing fails
            (the error is logged rather than raised).
        """
        search_config = self.config.get('search')
        if not search_config:
            logger.warning(f"No search config for {self.name}")
            return []
        search_path = search_config['path'].format(query=quote(query))
        url = urljoin(self.active_url, search_path)
        try:
            response = await self.client.get(url)
            # Fail fast on 4xx/5xx instead of running selectors on an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
            container = search_config.get('container_selector')
            items = soup.select(container) if container else [soup]
            results = []
            # Fetched lazily, once, by the first usable item.
            enricher = None
            for item in items:
                try:
                    title_node = item.select_one(search_config['title_selector'])
                    url_node = item.select_one(search_config['url_selector'])
                    if not title_node or not url_node:
                        continue
                    href = url_node.get('href')
                    if not href:
                        # urljoin() returns the base URL unchanged for an empty
                        # href, which would yield a result pointing at the
                        # site root instead of an anime page.
                        continue
                    title = title_node.get_text(strip=True)
                    anime_url = urljoin(self.active_url, href)
                    img_node = item.select_one(search_config.get('image_selector', 'img'))
                    cover_image = img_node.get('src') if img_node else None
                    if cover_image:
                        cover_image = urljoin(self.active_url, cover_image)
                    # Initial metadata from scraper
                    meta_dict = {
                        "poster_image": cover_image,
                        "status": "Unknown"
                    }
                    # Enrich with Kitsu via global service
                    if enricher is None:
                        enricher = await get_metadata_enricher()
                    metadata = await enricher.enrich_metadata(meta_dict, title, anime_url)
                    results.append(AnimeSearchResult(
                        title=title,
                        url=anime_url,
                        cover_image=metadata.poster_image or cover_image,
                        type="search_result",
                        metadata=metadata
                    ))
                except Exception as e:
                    # One malformed item must not discard the rest of the page.
                    logger.error(f"Error parsing search result item: {e}")
            return results
        except Exception as e:
            logger.error(f"Search failed for {self.name}: {e}")
            return []

    async def get_episodes(self, anime_url: str) -> List[Dict[str, Any]]:
        """Return the episode list for *anime_url*; this base version returns ``[]``."""
        # Default implementation for simple sites
        # For complex sites like Anime-Sama, we might still need a specialized subclass
        # but driven by the YAML config for base parameters.
        return []

    async def check_health(self) -> bool:
        """Report whether a test search for "One Piece" returns any result.

        ``search()`` logs its own failures and returns an empty list, so a
        network or parsing error shows up here as "no results" (``False``).
        """
        try:
            # Try a test search for a very common anime
            results = await self.search("One Piece")
            is_healthy = len(results) > 0
            if not is_healthy:
                logger.warning(f"Health check failed for {self.name}: No results found")
            return is_healthy
        except Exception as e:
            logger.error(f"Health check failed for {self.name} with error: {e}")
            return False

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
@@ -0,0 +1,24 @@
 name: "Anime-Sama"
 id: "animesama"
 base_url: "https://anime-sama.fr"
 mirrors:
  - "https://anime-sama.si"
  - "https://anime-sama.co"
 search:
  path: "/search?q={query}"
  container_selector: ".result-item"
  title_selector: "h3"
  url_selector: "a"
  image_selector: "img"
 episodes:
  container_selector: "#episodes-list"
  item_selector: ".episode-item"
  # Episode logic for Anime-Sama can be complex: custom logic is handled in
  # GenericScraper, while the common selectors are kept here.
  player_iframe_selector: "iframe#player"
 metadata:
  synopsis_selector: ".synopsis"
  genres_selector: ".genres .genre"
@@ -0,0 +1,84 @@
 """Manages scraper providers and their health status"""
 import os
 import logging
 import asyncio
 from typing import Dict, List, Optional
 from pathlib import Path
 from datetime import datetime
 from app.downloaders.generic_scraper import GenericScraper
 logger = logging.getLogger(__name__)
 class ProvidersManager:
    """Holds the YAML-configured scrapers and the latest health record of each."""

    def __init__(self, config_dir: str = "app/downloaders/providers_config"):
        """Remember *config_dir* and load every provider config found in it."""
        self.config_dir = Path(config_dir)
        # provider id -> scraper instance
        self.providers: Dict[str, GenericScraper] = {}
        # provider id -> {"status", "last_check", "error"}
        self.health_status: Dict[str, Dict] = {}
        self._load_providers()

    def _load_providers(self):
        """Create one scraper per ``*.yaml`` file in the config directory.

        A missing directory only produces a warning. A config that fails to
        load is logged and skipped, and the remaining files are still loaded.
        """
        if not self.config_dir.exists():
            logger.warning(f"Providers config directory not found: {self.config_dir}")
            return
        for config_file in self.config_dir.glob("*.yaml"):
            try:
                scraper = GenericScraper(str(config_file))
                self.providers[scraper.id] = scraper
                # Health is unknown until the first check has run.
                initial_record = {"status": "unknown", "last_check": None, "error": None}
                self.health_status[scraper.id] = initial_record
                logger.info(f"Loaded provider: {scraper.name} ({scraper.id})")
            except Exception as e:
                logger.error(f"Failed to load provider from {config_file}: {e}")

    async def check_all_health(self):
        """Run the health check of every registered provider concurrently."""
        logger.info("Checking health of all providers...")
        await asyncio.gather(
            *(
                self._check_single_health(provider_id, scraper)
                for provider_id, scraper in self.providers.items()
            )
        )
        logger.info("Provider health check complete")

    async def _check_single_health(self, provider_id: str, scraper: GenericScraper):
        """Check one provider and store the outcome in ``health_status``."""
        try:
            if await scraper.check_health():
                state, problem = "up", None
            else:
                state, problem = "down", "No search results returned"
            self.health_status[provider_id] = {
                "status": state,
                "last_check": datetime.now().isoformat(),
                "error": problem
            }
        except Exception as e:
            # A crashing check counts as "down"; keep the message for the API.
            self.health_status[provider_id] = {
                "status": "down",
                "last_check": datetime.now().isoformat(),
                "error": str(e)
            }
            logger.error(f"Health check failed for {provider_id}: {e}")

    def get_provider(self, provider_id: str) -> Optional[GenericScraper]:
        """Return the scraper registered under *provider_id*, or ``None``."""
        return self.providers.get(provider_id)

    def get_active_providers(self) -> List[GenericScraper]:
        """Return the providers whose status is anything other than "down"."""
        active = []
        for pid, status in self.health_status.items():
            if status["status"] == "down":
                continue
            active.append(self.providers[pid])
        return active

    def get_all_status(self) -> Dict[str, Dict]:
        """Return the health records of all providers, keyed by provider id."""
        return self.health_status
 # Global instance
 providers_manager = ProvidersManager()
@@ -2,15 +2,14 @@
 Anime and series search routes for Ohm Stream Downloader API.
 Endpoints:
- GET /api/anime/search - Search across all anime providers
+- GET /api/anime/search - Search across all anime providers (Modernized with Kitsu)
 - GET /api/series/search - Search across all TV series providers
 - GET /api/anime/metadata - Get detailed metadata for a specific anime
 - GET /api/anime/episodes - Get list of episodes for an anime
 - GET /api/anime/providers - Get list of anime providers
- GET /api/anime-sama/search - Search for anime on anime-sama (legacy)
+- GET /api/providers/health - Get provider health status
 - POST /api/providers/health/check - Trigger health check
 - POST /api/anime/download - Download an anime episode
 - GET /api/anime/frieren/episodes - Get Frieren episodes from local database
 - POST /api/anime/frieren/download - Download Frieren episode from local database
 - POST /api/anime/download-season - Download all episodes of a season
 - GET /api/anime/seasons - Get list of seasons for an anime
 - GET /api/anime/mal/search - Search for anime on MyAnimeList
@@ -21,6 +20,8 @@ Endpoints:
 import json
 import re
 import time
 import logging
 import asyncio
 from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
@@ -34,14 +35,30 @@ from app.downloaders import (
 )
 from app.models import DownloadRequest
 from app.providers import get_anime_providers, get_series_providers
 from app.providers_manager import providers_manager
 from app.metadata_enrichment import get_metadata_enricher
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api", tags=["anime"])
@router.get("/providers/health")
 async def get_providers_health():
    """Get the current health status of all providers"""
    return providers_manager.get_all_status()
@router.post("/providers/health/check")
 async def trigger_providers_health_check(background_tasks: BackgroundTasks):
    """Trigger a manual health check of all providers in the background"""
    from app.auto_download_scheduler import auto_download_scheduler
    background_tasks.add_task(auto_download_scheduler.trigger_health_check_now)
    return {"status": "Health check triggered in background"}
 def get_download_manager() -> DownloadManager:
    """Return the ``download_manager`` object defined in the ``main`` module."""
    # Imported at call time rather than at module load.
    from main import download_manager as manager
    return manager
@@ -55,125 +72,114 @@ async def search_anime_unified(
    include_metadata: bool = False,
 ):
    """
-    Search across all anime providers
+    Search across all anime providers using MetadataEnricher and health checks.
-
+    Results are grouped by provider for legacy UI compatibility.
    Args:
        q: Search query
        lang: Language preference (vostfr, vf)
        include_metadata: Whether to fetch full metadata (slower but more detailed)
    """
-    import asyncio
+    print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}")
    print(
        f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})"
    )
    start_time = time.time()
    results = {}
    # 1. Prepare search tasks (Generic + Legacy)
    search_tasks = []
    task_metadata = []
-    # Create downloader instances
+    # Generic YAML providers
-    downloaders = {
+    active_generic = providers_manager.get_active_providers()
-        "anime-sama": AnimeSamaDownloader(),
+    for provider in active_generic:
        print(f"[SEARCH] Queueing generic provider: {provider.name}")
        search_tasks.append(provider.search(q))
        task_metadata.append({"id": provider.id, "type": "generic"})
    # Legacy providers (until migrated to YAML)
    legacy_downloaders = {
        "anime-ultime": AnimeUltimeDownloader(),
        "neko-sama": NekoSamaDownloader(),
        "vostfree": VostfreeDownloader(),
    }
    for pid, dl in legacy_downloaders.items():
        print(f"[SEARCH] Queueing legacy provider: {pid}")
        search_tasks.append(dl.search_anime(q, lang, include_metadata=False))
        task_metadata.append({"id": pid, "type": "legacy"})
-    # Generate search query variations for better matching
+    # 2. Run searches in parallel
-    search_queries = [q]
+    print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...")
    all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True)
-    # Add fallback queries if original has spaces
+    # 3. Organize results by provider
-    if " " in q or "-" in q:
+    seen_urls = set()
-        normalized = re.sub(r"[\s\-–—_:]+", "", q)
+    enricher = await get_metadata_enricher()
-        if normalized != q and len(normalized) >= 4:
+    enrichment_tasks = []
-            search_queries.append(normalized)
+    
    # Map task indices to result slots for re-injection after enrichment
    enrichment_mapping = [] # List of (provider_id, index_in_provider_results)
-        first_word = q.split()[0] if q.split() else None
+    for i, raw_result in enumerate(all_raw_results):
-        if first_word and len(first_word) >= 4:
+        provider_info = task_metadata[i]
-            search_queries.append(first_word)
+        pid = provider_info["id"]
        if isinstance(raw_result, Exception):
            logger.error(f"Search failed for {pid}: {raw_result}")
            continue
        if not raw_result:
            continue
        if pid not in results:
            results[pid] = []
        for item in raw_result:
            # Normalize to dict
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            url = item_dict.get("url")
            if url and url not in seen_urls:
                seen_urls.add(url)
                # Check relevance simple boost
                if q.lower() in (item_dict.get("title") or "").lower():
                    item_dict["_relevance_boost"] = 1.0
                else:
                    item_dict["_relevance_boost"] = 0.5
                results[pid].append(item_dict)
                # Prepare enrichment task for top 5 results per provider
                if len(results[pid]) <= 5:
                    enrichment_tasks.append(
                        enricher.enrich_metadata(
                            item_dict.get("metadata", {}), 
                            item_dict.get("title", ""), 
                            url
                        )
                    )
                    enrichment_mapping.append((pid, len(results[pid]) - 1))
                else:
                    if "metadata" not in item_dict:
                        item_dict["metadata"] = {}
-    print(f"[SEARCH] Query variations: {search_queries}")
+    # 4. Perform parallel enrichment
    if enrichment_tasks:
        print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...")
        enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True)
        # Re-inject enriched metadata
        for idx, (pid, pos) in enumerate(enrichment_mapping):
            if idx < len(enriched_metas):
                meta = enriched_metas[idx]
                if not isinstance(meta, Exception) and meta:
                    results[pid][pos]["metadata"] = meta.model_dump()
-    # Search with fallback queries
+    # 5. Sort results by relevance per provider
-    all_search_tasks = []
+    for pid in results:
-    all_provider_ids = []
+        results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0))
-
+        for item in results[pid]:
    for search_query in search_queries:
        print(f"[SEARCH] Trying query variant: '{search_query}'")
        for provider_id, provider in get_anime_providers().items():
            if provider_id in downloaders:
                downloader = downloaders[provider_id]
                print(
                    f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..."
                )
                all_search_tasks.append(
                    {
                        "query": search_query,
                        "provider_id": provider_id,
                        "task": downloader.search_anime(
                            search_query, lang, include_metadata=include_metadata
                        ),
                    }
                )
                all_provider_ids.append(provider_id)
    print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...")
    search_results = await asyncio.gather(
        *[t["task"] for t in all_search_tasks], return_exceptions=True
    )
    # Process results
    seen_urls = {}
    for task_info, result in zip(all_search_tasks, search_results):
        provider_id = task_info["provider_id"]
        search_query = task_info["query"]
        if isinstance(result, Exception):
            print(
                f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}"
            )
        elif result:
            print(
                f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results"
            )
            if provider_id not in results:
                results[provider_id] = []
            provider_results = results[provider_id]
            for item in result:
                url = item.get("url", "")
                if url and url not in seen_urls:
                    seen_urls[url] = True
                    if search_query.lower() == q.lower():
                        item["_relevance_boost"] = 1.0
                    else:
                        item["_relevance_boost"] = 0.5
                    provider_results.append(item)
        else:
            print(f"[SEARCH] {provider_id} (query: '{search_query}') no results")
    # Sort results by relevance
    for provider_id in results:
        results[provider_id].sort(
            key=lambda x: (
                -x.get("_relevance_boost", 0),
                (x.get("title") or "").lower().find(q.lower()),
            )
        )
        for item in results[provider_id]:
            item.pop("_relevance_boost", None)
    # Remove providers with empty results
    results = {k: v for k, v in results.items() if v}
    elapsed = time.time() - start_time
-    print(
+    total_found = sum(len(r) for r in results.values())
-        f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
+    print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.")
-    )
+    
    return {
        "query": q,
        "lang": lang,
@@ -197,9 +203,7 @@ async def search_series_unified(
    start_time = time.time()
    results = {}
    series_downloaders = {"fs7": FS7Downloader()}
    search_tasks = []
    provider_ids = []
@@ -219,13 +223,9 @@ async def search_series_unified(
        elif result:
            print(f"[SERIES SEARCH] {provider_id} found {len(result)} results")
            results[provider_id] = result
        else:
            print(f"[SERIES SEARCH] {provider_id} no results")
    elapsed = time.time() - start_time
-    print(
+    print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n")
        f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
    )
    return {"query": q, "lang": lang, "results": results}
@@ -235,7 +235,6 @@ async def get_anime_metadata(url: str):
    """Get detailed metadata for a specific anime"""
    try:
        downloader = get_downloader(url)
        if hasattr(downloader, "get_anime_metadata"):
            metadata = await downloader.get_anime_metadata(url)
            return {"url": url, "metadata": metadata}
@@ -244,7 +243,6 @@ async def get_anime_metadata(url: str):
                status_code=400,
                detail=f"Downloader for {url} does not support metadata extraction",
            )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@@ -257,7 +255,6 @@ async def get_anime_episodes(
    """Get list of episodes for an anime"""
    downloader = get_downloader(url)
    episodes = await downloader.get_episodes(url, lang)
    return {"url": url, "lang": lang, "episodes": episodes}
@@ -267,15 +264,12 @@ async def get_anime_providers_list():
    return {"providers": get_anime_providers()}
 # ==================== ANIME-SAMA SPECIFIC ====================
@router.get("/anime-sama/search")
 async def search_anime_sama(
    q: str,
    lang: str = "vostfr",
 ):
-    """Search for anime on anime-sama"""
+    """Search for anime on anime-sama (legacy)"""
    downloader = AnimeSamaDownloader()
    results = await downloader.search_anime(q, lang)
    return {"query": q, "lang": lang, "results": results}
@@ -298,65 +292,6 @@ async def download_anime_episode(
    return {"task_id": task.id, "task": task}
 # ==================== FRIEREN LEGACY ENDPOINTS ====================
@router.get("/anime/frieren/episodes")
 async def get_frieren_episodes():
    """Get Frieren episodes from local database"""
    try:
        with open("app/frieren_episodes.json", "r") as f:
            data = json.load(f)
        return data
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"Episodes not found: {e}")
@router.post("/anime/frieren/download")
 async def download_frieren_episode(
    season: int,
    episode: str,
    background_tasks: BackgroundTasks,
    download_manager: DownloadManager = Depends(get_download_manager),
 ):
    """Download Frieren episode from local database"""
    try:
        with open("app/frieren_episodes.json", "r") as f:
            data = json.load(f)
        season_key = str(season)
        if season_key not in data["seasons"]:
            raise HTTPException(status_code=404, detail=f"Season {season} not found")
        season_data = data["seasons"][season_key]
        ep_data = next(
            (ep for ep in season_data["episodes"] if ep["episode"] == episode), None
        )
        if not ep_data:
            raise HTTPException(
                status_code=404,
                detail=f"Episode {episode} not found in season {season}",
            )
        url = ep_data["sibnet_url"]
        filename = f"Frieren - S{season} - Episode {episode}.mp4"
        request = DownloadRequest(url=url, filename=filename)
        task = download_manager.create_task(request)
        background_tasks.add_task(download_manager.start_download, task.id)
        return {"task_id": task.id, "task": task}
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 # ==================== DOWNLOAD SEASON ====================
@router.post("/anime/download-season")
 async def download_anime_season(
    url: str,
@@ -385,29 +320,14 @@ async def download_anime_season(
    }
 # ==================== SEASONS ====================
@router.get("/anime/seasons")
 async def get_anime_seasons(url: str):
    """Get list of seasons for an anime"""
    downloader = get_downloader(url)
    if hasattr(downloader, "get_seasons"):
        seasons = await downloader.get_seasons(url)
-
+        return {"seasons": seasons or []}
-        if not seasons:
+    return {"seasons": [], "message": "Season info not available for this provider"}
            return {"seasons": [], "message": "No seasons found"}
        return {"seasons": seasons}
    else:
        return {
            "seasons": [],
            "message": "Season information not available for this provider",
        }
 # ==================== MYANIMELIST INTEGRATION ====================
@router.get("/anime/mal/search")
@@ -417,103 +337,40 @@ async def search_anime_mal_details(
 ):
    """Search for anime on MyAnimeList and get full details"""
    from app.recommendations import AnimeReleasesFetcher
    fetcher = AnimeReleasesFetcher()
    try:
        search_results = await fetcher.search_anime(q, limit=limit)
        if not search_results:
            return {"anime": None, "message": "No anime found"}
        main_anime = search_results[0]
        anime_details = await fetcher.get_anime_details(main_anime["mal_id"])
        alternatives = search_results[1:] if len(search_results) > 1 else []
        return {
            "anime": anime_details,
-            "alternatives": alternatives,
+            "alternatives": search_results[1:],
            "total_results": len(search_results),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        await fetcher.close()
@router.get("/anime/mal/{mal_id}")
 async def get_anime_by_id(mal_id: int):
    """Get full details of an anime by its MyAnimeList ID"""
    from app.recommendations import AnimeReleasesFetcher
    fetcher = AnimeReleasesFetcher()
    try:
        anime_details = await fetcher.get_anime_details(mal_id)
        if not anime_details:
            raise HTTPException(status_code=404, detail="Anime not found")
        return anime_details
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        await fetcher.close()
 # ==================== TRANSLATION ====================
@router.post("/translate")
 async def translate_text(request: Request):
    """Translate text from English to French using Google Translate"""
    import httpx
    from logging import getLogger
    logger = getLogger(__name__)
    try:
        body = await request.json()
        text = body.get("text", "")
        if not text:
            raise HTTPException(status_code=400, detail="Text is required")
        text = text[:5000]
        async with httpx.AsyncClient(timeout=30.0) as client:
            url = "https://translate.googleapis.com/translate_a/single"
-            params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text}
+            params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
            logger.info(f"Translation request for text length: {len(text)}")
            response = await client.get(url, params=params)
            logger.info(f"Translation API response status: {response.status_code}")
            if response.status_code == 200:
                data = response.json()
-
+                if data and data[0]:
-                if data and len(data) > 0 and data[0]:
+                    translated = "".join([item[0] for item in data[0] if item[0]])
-                    translated_text = "".join([item[0] for item in data[0] if item[0]])
+                    return {"translatedText": translated, "status": "success"}
                    if translated_text:
                        logger.info(
                            f"Translation successful, length: {len(translated_text)}"
                        )
                        return {"translatedText": translated_text, "status": "success"}
                logger.warning(
                    f"Unexpected Google Translate response structure: {data}"
                )
            raise HTTPException(status_code=500, detail="Translation failed")
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Translation error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
@@ -11,6 +11,7 @@ beautifulsoup4==4.12.3
 lxml==5.3.0
 jieba==0.42.1
 sqlmodel==0.0.22
 PyYAML==6.0.1
 # Testing dependencies
 pytest==8.3.4
@@ -72,7 +72,6 @@ def mock_kitsu_api_raw():
    }
@pytest.mark.skip(reason="New tests for non-implemented feature")
 class TestMetadataEnricher:
    """Test MetadataEnricher functionality."""
@@ -389,7 +388,6 @@ class TestMetadataEnricher:
            assert result.rating is None
@pytest.mark.skip(reason="New tests for non-implemented feature")
 class TestMetadataEnrichmentIntegration:
    """Integration tests for metadata enrichment."""
@@ -0,0 +1,153 @@
 """
 Tests for Phase 2: Robust Scraping (DSL, Health Checks, Unified Search)
 """
 import pytest
 import yaml
 import os
 from unittest.mock import AsyncMock, MagicMock, patch
 from pathlib import Path
 from app.downloaders.generic_scraper import GenericScraper
 from app.providers_manager import ProvidersManager
 from app.models import AnimeSearchResult, AnimeMetadata
@pytest.fixture
def mock_config_path(tmp_path):
    """Write a minimal provider definition to a temp YAML file and return its path.

    The definition holds the provider identity (``name``, ``id``,
    ``base_url``) plus a ``search`` section made of a URL template and the
    selectors for the result container, title, link and image.

    Returns:
        The path of the generated ``testsite.yaml`` as a ``str``.
    """
    search_rules = {
        "path": "/search?q={query}",
        "container_selector": ".item",
        "title_selector": "h3",
        "url_selector": "a",
        "image_selector": "img",
    }
    provider = {
        "name": "Test Site",
        "id": "testsite",
        "base_url": "https://test.com",
        "search": search_rules,
    }
    # tmp_path is pytest's per-test temporary directory, so nothing leaks between tests
    target = tmp_path / "testsite.yaml"
    target.write_text(yaml.dump(provider), encoding='utf-8')
    return str(target)
 class TestGenericScraper:
     """Exercise GenericScraper built from a YAML file: config loading, search and health probe."""

     def test_init_loads_config(self, mock_config_path):
         """The identity keys of the YAML file are exposed as scraper attributes."""
         scraper = GenericScraper(mock_config_path)
         loaded = (scraper.name, scraper.id, scraper.base_url)
         assert loaded == ("Test Site", "testsite", "https://test.com")

     @pytest.mark.asyncio
     async def test_search_logic(self, mock_config_path):
         """A single ``.item`` element in the fetched page yields one result with absolute URLs."""
         scraper = GenericScraper(mock_config_path)
         # Page returned by the stubbed HTTP client
         mock_html = """
        <div class="item">
            <h3>Naruto</h3>
            <a href="/naruto-page">Link</a>
            <img src="/cover.jpg">
        </div>
        """
         fake_response = MagicMock(text=mock_html)
         with patch.object(scraper.client, 'get', return_value=fake_response), \
                 patch('app.downloaders.generic_scraper.get_metadata_enricher') as enricher_factory:
             # Stand-in enricher so the test never reaches a real metadata API
             enricher = AsyncMock()
             enricher.enrich_metadata.return_value = AnimeMetadata(title="Naruto", poster_image="https://test.com/cover.jpg")
             enricher_factory.return_value = enricher

             results = await scraper.search("Naruto")

             assert len(results) == 1
             hit = results[0]
             assert hit.title == "Naruto"
             assert "test.com/naruto-page" in hit.url
             assert hit.cover_image == "https://test.com/cover.jpg"

     @pytest.mark.asyncio
     async def test_check_health_success(self, mock_config_path):
         """A non-empty search result makes the probe report healthy; the probe query is "One Piece"."""
         scraper = GenericScraper(mock_config_path)
         search_patch = patch.object(scraper, 'search', return_value=[MagicMock()])
         with search_patch as fake_search:
             assert await scraper.check_health() is True
             fake_search.assert_called_once_with("One Piece")

     @pytest.mark.asyncio
     async def test_check_health_failure(self, mock_config_path):
         """An empty search result makes the probe report unhealthy."""
         scraper = GenericScraper(mock_config_path)
         with patch.object(scraper, 'search', return_value=[]):
             assert await scraper.check_health() is False
 class TestProvidersManager:
     """Tests for ProvidersManager: provider discovery from a config directory and health bookkeeping."""

     def test_load_providers(self, tmp_path):
         """Each YAML file in the config directory is registered under the ``id`` it declares."""
         # Create a temp providers config dir
         config_dir = tmp_path / "config"
         config_dir.mkdir()
         # Create two mock configs
         for i in range(2):
             config = {"name": f"Site {i}", "id": f"site{i}", "base_url": "http://test.com"}
             # Explicit UTF-8 (as in the mock_config_path fixture) so the file
             # does not depend on the platform's locale encoding
             with open(config_dir / f"site{i}.yaml", 'w', encoding='utf-8') as f:
                 yaml.dump(config, f)
         manager = ProvidersManager(str(config_dir))
         assert len(manager.providers) == 2
         assert "site0" in manager.providers
         assert "site1" in manager.providers

     @pytest.mark.asyncio
     async def test_check_all_health(self, tmp_path):
         """A provider whose health probe returns True is recorded as "up" with a check timestamp."""
         config_dir = tmp_path / "config"
         config_dir.mkdir()
         config = {"name": "Site", "id": "site", "base_url": "http://test.com"}
         with open(config_dir / "site.yaml", 'w', encoding='utf-8') as f:
             yaml.dump(config, f)
         manager = ProvidersManager(str(config_dir))
         # Stub the scraper's own probe so no network request is attempted
         with patch.object(manager.providers["site"], 'check_health', return_value=True):
             await manager.check_all_health()
             site_status = manager.health_status["site"]
             assert site_status["status"] == "up"
             assert site_status["last_check"] is not None
@pytest.mark.asyncio
async def test_router_search_unified_modern(mock_config_path):
    """Unified search route returns a provider's hits under that provider's id.

    The providers manager is patched to expose a single GenericScraper with a
    stubbed ``search``, the legacy downloader is patched to return nothing,
    and the metadata enricher is replaced by a pass-through double.
    """
    from app.routers.router_anime import search_anime_unified
    from app.providers_manager import providers_manager

    # Sole active provider: built from the YAML fixture, its search stubbed out
    provider = GenericScraper(mock_config_path)
    provider.search = AsyncMock(
        return_value=[
            AnimeSearchResult(title="Naruto", url="https://test.com/n", cover_image="", type="direct")
        ]
    )

    # Enricher double: fixed metadata, search results handed back unchanged
    enricher = AsyncMock()
    enricher.enrich_metadata = AsyncMock(return_value=AnimeMetadata(title="Naruto"))
    enricher.enrich_search_results = AsyncMock(side_effect=lambda x: x)

    with patch.object(providers_manager, 'get_active_providers', return_value=[provider]), \
            patch('app.routers.router_anime.AnimeUltimeDownloader') as legacy_cls, \
            patch('app.routers.router_anime.get_metadata_enricher') as enricher_factory:
        # Legacy downloader contributes no results
        legacy_cls.return_value.search_anime = AsyncMock(return_value=[])
        enricher_factory.return_value = enricher

        response = await search_anime_unified("Naruto")

        assert "results" in response
        by_provider = response["results"]
        assert "testsite" in by_provider
        assert by_provider["testsite"][0]["title"] == "Naruto"