feat: robust scraping DSL and health monitoring (Phase 2)

- Implemented YAML-driven GenericScraper for resilient scraping - Added ProvidersManager to manage scraper health and active providers - Modernized unified search with systematic Kitsu metadata enrichment - Integrated automated health checks in the scheduler - Added comprehensive tests for scraping DSL and provider health
2026-03-24 10:57:19 +00:00
parent 29c7040b20
commit 2b4cc617cb
8 changed files with 535 additions and 268 deletions
@@ -2,15 +2,14 @@
 Anime and series search routes for Ohm Stream Downloader API.

 Endpoints:
- GET /api/anime/search - Search across all anime providers
+- GET /api/anime/search - Search across all anime providers (Modernized with Kitsu)
 - GET /api/series/search - Search across all TV series providers
 - GET /api/anime/metadata - Get detailed metadata for a specific anime
 - GET /api/anime/episodes - Get list of episodes for an anime
 - GET /api/anime/providers - Get list of anime providers
- GET /api/anime-sama/search - Search for anime on anime-sama (legacy)
+- GET /api/providers/health - Get provider health status
+- POST /api/providers/health/check - Trigger health check
 - POST /api/anime/download - Download an anime episode
- GET /api/anime/frieren/episodes - Get Frieren episodes from local database
- POST /api/anime/frieren/download - Download Frieren episode from local database
 - POST /api/anime/download-season - Download all episodes of a season
 - GET /api/anime/seasons - Get list of seasons for an anime
 - GET /api/anime/mal/search - Search for anime on MyAnimeList
@@ -21,6 +20,8 @@ Endpoints:
 import json
 import re
 import time
+import logging
+import asyncio

 from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request

@@ -34,14 +35,30 @@ from app.downloaders import (
 )
 from app.models import DownloadRequest
 from app.providers import get_anime_providers, get_series_providers
+from app.providers_manager import providers_manager
+from app.metadata_enrichment import get_metadata_enricher

+logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api", tags=["anime"])


+@router.get("/providers/health")
+async def get_providers_health():
+    """Get the current health status of all providers"""
+    return providers_manager.get_all_status()
+
+
+@router.post("/providers/health/check")
+async def trigger_providers_health_check(background_tasks: BackgroundTasks):
+    """Trigger a manual health check of all providers in the background"""
+    from app.auto_download_scheduler import auto_download_scheduler
+    background_tasks.add_task(auto_download_scheduler.trigger_health_check_now)
+    return {"status": "Health check triggered in background"}
+
+
 def get_download_manager() -> DownloadManager:
    """Get the download manager instance from main app"""
    from main import download_manager
-
    return download_manager


@@ -55,125 +72,114 @@ async def search_anime_unified(
    include_metadata: bool = False,
 ):
    """
-    Search across all anime providers
-
-    Args:
-        q: Search query
-        lang: Language preference (vostfr, vf)
-        include_metadata: Whether to fetch full metadata (slower but more detailed)
+    Search across all anime providers using MetadataEnricher and health checks.
+    Results are grouped by provider for legacy UI compatibility.
    """
-    import asyncio
-
-    print(
-        f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})"
-    )
+    print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}")
    start_time = time.time()

    results = {}
+    
+    # 1. Prepare search tasks (Generic + Legacy)
+    search_tasks = []
+    task_metadata = []

-    # Create downloader instances
-    downloaders = {
-        "anime-sama": AnimeSamaDownloader(),
+    # Generic YAML providers
+    active_generic = providers_manager.get_active_providers()
+    for provider in active_generic:
+        print(f"[SEARCH] Queueing generic provider: {provider.name}")
+        search_tasks.append(provider.search(q))
+        task_metadata.append({"id": provider.id, "type": "generic"})
+
+    # Legacy providers (until migrated to YAML)
+    legacy_downloaders = {
        "anime-ultime": AnimeUltimeDownloader(),
        "neko-sama": NekoSamaDownloader(),
        "vostfree": VostfreeDownloader(),
    }
+    for pid, dl in legacy_downloaders.items():
+        print(f"[SEARCH] Queueing legacy provider: {pid}")
+        search_tasks.append(dl.search_anime(q, lang, include_metadata=False))
+        task_metadata.append({"id": pid, "type": "legacy"})

-    # Generate search query variations for better matching
-    search_queries = [q]
+    # 2. Run searches in parallel
+    print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...")
+    all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True)

-    # Add fallback queries if original has spaces
-    if " " in q or "-" in q:
-        normalized = re.sub(r"[\s\-–—_:]+", "", q)
-        if normalized != q and len(normalized) >= 4:
-            search_queries.append(normalized)
+    # 3. Organize results by provider
+    seen_urls = set()
+    enricher = await get_metadata_enricher()
+    enrichment_tasks = []
+    
+    # Map task indices to result slots for re-injection after enrichment
+    enrichment_mapping = [] # List of (provider_id, index_in_provider_results)

-        first_word = q.split()[0] if q.split() else None
-        if first_word and len(first_word) >= 4:
-            search_queries.append(first_word)
+    for i, raw_result in enumerate(all_raw_results):
+        provider_info = task_metadata[i]
+        pid = provider_info["id"]
+        
+        if isinstance(raw_result, Exception):
+            logger.error(f"Search failed for {pid}: {raw_result}")
+            continue
+        
+        if not raw_result:
+            continue
+            
+        if pid not in results:
+            results[pid] = []
+            
+        for item in raw_result:
+            # Normalize to dict
+            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
+            url = item_dict.get("url")
+            
+            if url and url not in seen_urls:
+                seen_urls.add(url)
+                
+                # Check relevance simple boost
+                if q.lower() in (item_dict.get("title") or "").lower():
+                    item_dict["_relevance_boost"] = 1.0
+                else:
+                    item_dict["_relevance_boost"] = 0.5
+                
+                results[pid].append(item_dict)
+                
+                # Prepare enrichment task for top 5 results per provider
+                if len(results[pid]) <= 5:
+                    enrichment_tasks.append(
+                        enricher.enrich_metadata(
+                            item_dict.get("metadata", {}), 
+                            item_dict.get("title", ""), 
+                            url
+                        )
+                    )
+                    enrichment_mapping.append((pid, len(results[pid]) - 1))
+                else:
+                    if "metadata" not in item_dict:
+                        item_dict["metadata"] = {}

-    print(f"[SEARCH] Query variations: {search_queries}")
+    # 4. Perform parallel enrichment
+    if enrichment_tasks:
+        print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...")
+        enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True)
+        
+        # Re-inject enriched metadata
+        for idx, (pid, pos) in enumerate(enrichment_mapping):
+            if idx < len(enriched_metas):
+                meta = enriched_metas[idx]
+                if not isinstance(meta, Exception) and meta:
+                    results[pid][pos]["metadata"] = meta.model_dump()

-    # Search with fallback queries
-    all_search_tasks = []
-    all_provider_ids = []
-
-    for search_query in search_queries:
-        print(f"[SEARCH] Trying query variant: '{search_query}'")
-
-        for provider_id, provider in get_anime_providers().items():
-            if provider_id in downloaders:
-                downloader = downloaders[provider_id]
-                print(
-                    f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..."
-                )
-                all_search_tasks.append(
-                    {
-                        "query": search_query,
-                        "provider_id": provider_id,
-                        "task": downloader.search_anime(
-                            search_query, lang, include_metadata=include_metadata
-                        ),
-                    }
-                )
-                all_provider_ids.append(provider_id)
-
-    print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...")
-    search_results = await asyncio.gather(
-        *[t["task"] for t in all_search_tasks], return_exceptions=True
-    )
-
-    # Process results
-    seen_urls = {}
-
-    for task_info, result in zip(all_search_tasks, search_results):
-        provider_id = task_info["provider_id"]
-        search_query = task_info["query"]
-
-        if isinstance(result, Exception):
-            print(
-                f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}"
-            )
-        elif result:
-            print(
-                f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results"
-            )
-
-            if provider_id not in results:
-                results[provider_id] = []
-
-            provider_results = results[provider_id]
-            for item in result:
-                url = item.get("url", "")
-                if url and url not in seen_urls:
-                    seen_urls[url] = True
-                    if search_query.lower() == q.lower():
-                        item["_relevance_boost"] = 1.0
-                    else:
-                        item["_relevance_boost"] = 0.5
-                    provider_results.append(item)
-        else:
-            print(f"[SEARCH] {provider_id} (query: '{search_query}') no results")
-
-    # Sort results by relevance
-    for provider_id in results:
-        results[provider_id].sort(
-            key=lambda x: (
-                -x.get("_relevance_boost", 0),
-                (x.get("title") or "").lower().find(q.lower()),
-            )
-        )
-        for item in results[provider_id]:
+    # 5. Sort results by relevance per provider
+    for pid in results:
+        results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0))
+        for item in results[pid]:
            item.pop("_relevance_boost", None)

-    # Remove providers with empty results
-    results = {k: v for k, v in results.items() if v}
-
    elapsed = time.time() - start_time
-    print(
-        f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
-    )
-
+    total_found = sum(len(r) for r in results.values())
+    print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.")
+    
    return {
        "query": q,
        "lang": lang,
@@ -197,9 +203,7 @@ async def search_series_unified(
    start_time = time.time()

    results = {}
-
    series_downloaders = {"fs7": FS7Downloader()}
-
    search_tasks = []
    provider_ids = []

@@ -219,13 +223,9 @@ async def search_series_unified(
        elif result:
            print(f"[SERIES SEARCH] {provider_id} found {len(result)} results")
            results[provider_id] = result
-        else:
-            print(f"[SERIES SEARCH] {provider_id} no results")

    elapsed = time.time() - start_time
-    print(
-        f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
-    )
+    print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n")

    return {"query": q, "lang": lang, "results": results}

@@ -235,7 +235,6 @@ async def get_anime_metadata(url: str):
    """Get detailed metadata for a specific anime"""
    try:
        downloader = get_downloader(url)
-
        if hasattr(downloader, "get_anime_metadata"):
            metadata = await downloader.get_anime_metadata(url)
            return {"url": url, "metadata": metadata}
@@ -244,7 +243,6 @@ async def get_anime_metadata(url: str):
                status_code=400,
                detail=f"Downloader for {url} does not support metadata extraction",
            )
-
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@@ -257,7 +255,6 @@ async def get_anime_episodes(
    """Get list of episodes for an anime"""
    downloader = get_downloader(url)
    episodes = await downloader.get_episodes(url, lang)
-
    return {"url": url, "lang": lang, "episodes": episodes}


@@ -267,15 +264,12 @@ async def get_anime_providers_list():
    return {"providers": get_anime_providers()}


-# ==================== ANIME-SAMA SPECIFIC ====================
-
-
@router.get("/anime-sama/search")
 async def search_anime_sama(
    q: str,
    lang: str = "vostfr",
 ):
-    """Search for anime on anime-sama"""
+    """Search for anime on anime-sama (legacy)"""
    downloader = AnimeSamaDownloader()
    results = await downloader.search_anime(q, lang)
    return {"query": q, "lang": lang, "results": results}
@@ -298,65 +292,6 @@ async def download_anime_episode(
    return {"task_id": task.id, "task": task}


-# ==================== FRIEREN LEGACY ENDPOINTS ====================
-
-
-@router.get("/anime/frieren/episodes")
-async def get_frieren_episodes():
-    """Get Frieren episodes from local database"""
-    try:
-        with open("app/frieren_episodes.json", "r") as f:
-            data = json.load(f)
-        return data
-    except Exception as e:
-        raise HTTPException(status_code=404, detail=f"Episodes not found: {e}")
-
-
-@router.post("/anime/frieren/download")
-async def download_frieren_episode(
-    season: int,
-    episode: str,
-    background_tasks: BackgroundTasks,
-    download_manager: DownloadManager = Depends(get_download_manager),
-):
-    """Download Frieren episode from local database"""
-    try:
-        with open("app/frieren_episodes.json", "r") as f:
-            data = json.load(f)
-
-        season_key = str(season)
-        if season_key not in data["seasons"]:
-            raise HTTPException(status_code=404, detail=f"Season {season} not found")
-
-        season_data = data["seasons"][season_key]
-        ep_data = next(
-            (ep for ep in season_data["episodes"] if ep["episode"] == episode), None
-        )
-
-        if not ep_data:
-            raise HTTPException(
-                status_code=404,
-                detail=f"Episode {episode} not found in season {season}",
-            )
-
-        url = ep_data["sibnet_url"]
-        filename = f"Frieren - S{season} - Episode {episode}.mp4"
-
-        request = DownloadRequest(url=url, filename=filename)
-        task = download_manager.create_task(request)
-        background_tasks.add_task(download_manager.start_download, task.id)
-
-        return {"task_id": task.id, "task": task}
-
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
-
-
-# ==================== DOWNLOAD SEASON ====================
-
-
@router.post("/anime/download-season")
 async def download_anime_season(
    url: str,
@@ -385,29 +320,14 @@ async def download_anime_season(
    }


-# ==================== SEASONS ====================
-
-
@router.get("/anime/seasons")
 async def get_anime_seasons(url: str):
    """Get list of seasons for an anime"""
    downloader = get_downloader(url)
-
    if hasattr(downloader, "get_seasons"):
        seasons = await downloader.get_seasons(url)
-
-        if not seasons:
-            return {"seasons": [], "message": "No seasons found"}
-
-        return {"seasons": seasons}
-    else:
-        return {
-            "seasons": [],
-            "message": "Season information not available for this provider",
-        }
-
-
-# ==================== MYANIMELIST INTEGRATION ====================
+        return {"seasons": seasons or []}
+    return {"seasons": [], "message": "Season info not available for this provider"}


@router.get("/anime/mal/search")
@@ -417,103 +337,40 @@ async def search_anime_mal_details(
 ):
    """Search for anime on MyAnimeList and get full details"""
    from app.recommendations import AnimeReleasesFetcher
-
    fetcher = AnimeReleasesFetcher()
-
    try:
        search_results = await fetcher.search_anime(q, limit=limit)
-
        if not search_results:
            return {"anime": None, "message": "No anime found"}
-
        main_anime = search_results[0]
        anime_details = await fetcher.get_anime_details(main_anime["mal_id"])
-
-        alternatives = search_results[1:] if len(search_results) > 1 else []
-
        return {
            "anime": anime_details,
-            "alternatives": alternatives,
+            "alternatives": search_results[1:],
            "total_results": len(search_results),
        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
    finally:
        await fetcher.close()


-@router.get("/anime/mal/{mal_id}")
-async def get_anime_by_id(mal_id: int):
-    """Get full details of an anime by its MyAnimeList ID"""
-    from app.recommendations import AnimeReleasesFetcher
-
-    fetcher = AnimeReleasesFetcher()
-
-    try:
-        anime_details = await fetcher.get_anime_details(mal_id)
-
-        if not anime_details:
-            raise HTTPException(status_code=404, detail="Anime not found")
-
-        return anime_details
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-    finally:
-        await fetcher.close()
-
-
-# ==================== TRANSLATION ====================
-
-
@router.post("/translate")
 async def translate_text(request: Request):
    """Translate text from English to French using Google Translate"""
    import httpx
-    from logging import getLogger
-
-    logger = getLogger(__name__)
-
    try:
        body = await request.json()
        text = body.get("text", "")
-
        if not text:
            raise HTTPException(status_code=400, detail="Text is required")
-
-        text = text[:5000]
-
        async with httpx.AsyncClient(timeout=30.0) as client:
            url = "https://translate.googleapis.com/translate_a/single"
-            params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text}
-
-            logger.info(f"Translation request for text length: {len(text)}")
-
+            params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
            response = await client.get(url, params=params)
-
-            logger.info(f"Translation API response status: {response.status_code}")
-
            if response.status_code == 200:
                data = response.json()
-
-                if data and len(data) > 0 and data[0]:
-                    translated_text = "".join([item[0] for item in data[0] if item[0]])
-
-                    if translated_text:
-                        logger.info(
-                            f"Translation successful, length: {len(translated_text)}"
-                        )
-                        return {"translatedText": translated_text, "status": "success"}
-
-                logger.warning(
-                    f"Unexpected Google Translate response structure: {data}"
-                )
-
+                if data and data[0]:
+                    translated = "".join([item[0] for item in data[0] if item[0]])
+                    return {"translatedText": translated, "status": "success"}
            raise HTTPException(status_code=500, detail="Translation failed")
-
-    except HTTPException:
-        raise
    except Exception as e:
-        logger.error(f"Translation error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")