feat: robust scraping DSL and health monitoring (Phase 2)
CI / Test (Python 3.11) (push) Has been cancelled
CI / Test (Python 3.12) (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Type Check (push) Has been cancelled
CI / Summary (push) Has been cancelled

- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
This commit is contained in:
root
2026-03-24 10:57:19 +00:00
parent 29c7040b20
commit 2b4cc617cb
8 changed files with 535 additions and 268 deletions
+122 -265
View File
@@ -2,15 +2,14 @@
Anime and series search routes for Ohm Stream Downloader API.
Endpoints:
- GET /api/anime/search - Search across all anime providers
- GET /api/anime/search - Search across all anime providers (Modernized with Kitsu)
- GET /api/series/search - Search across all TV series providers
- GET /api/anime/metadata - Get detailed metadata for a specific anime
- GET /api/anime/episodes - Get list of episodes for an anime
- GET /api/anime/providers - Get list of anime providers
- GET /api/anime-sama/search - Search for anime on anime-sama (legacy)
- GET /api/providers/health - Get provider health status
- POST /api/providers/health/check - Trigger health check
- POST /api/anime/download - Download an anime episode
- GET /api/anime/frieren/episodes - Get Frieren episodes from local database
- POST /api/anime/frieren/download - Download Frieren episode from local database
- POST /api/anime/download-season - Download all episodes of a season
- GET /api/anime/seasons - Get list of seasons for an anime
- GET /api/anime/mal/search - Search for anime on MyAnimeList
@@ -21,6 +20,8 @@ Endpoints:
import json
import re
import time
import logging
import asyncio
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
@@ -34,14 +35,30 @@ from app.downloaders import (
)
from app.models import DownloadRequest
from app.providers import get_anime_providers, get_series_providers
from app.providers_manager import providers_manager
from app.metadata_enrichment import get_metadata_enricher
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api", tags=["anime"])
@router.get("/providers/health")
async def get_providers_health():
"""Get the current health status of all providers"""
return providers_manager.get_all_status()
@router.post("/providers/health/check")
async def trigger_providers_health_check(background_tasks: BackgroundTasks):
"""Trigger a manual health check of all providers in the background"""
from app.auto_download_scheduler import auto_download_scheduler
background_tasks.add_task(auto_download_scheduler.trigger_health_check_now)
return {"status": "Health check triggered in background"}
def get_download_manager() -> DownloadManager:
"""Get the download manager instance from main app"""
from main import download_manager
return download_manager
@@ -55,125 +72,114 @@ async def search_anime_unified(
include_metadata: bool = False,
):
"""
Search across all anime providers
Args:
q: Search query
lang: Language preference (vostfr, vf)
include_metadata: Whether to fetch full metadata (slower but more detailed)
Search across all anime providers using MetadataEnricher and health checks.
Results are grouped by provider for legacy UI compatibility.
"""
import asyncio
print(
f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})"
)
print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}")
start_time = time.time()
results = {}
# 1. Prepare search tasks (Generic + Legacy)
search_tasks = []
task_metadata = []
# Create downloader instances
downloaders = {
"anime-sama": AnimeSamaDownloader(),
# Generic YAML providers
active_generic = providers_manager.get_active_providers()
for provider in active_generic:
print(f"[SEARCH] Queueing generic provider: {provider.name}")
search_tasks.append(provider.search(q))
task_metadata.append({"id": provider.id, "type": "generic"})
# Legacy providers (until migrated to YAML)
legacy_downloaders = {
"anime-ultime": AnimeUltimeDownloader(),
"neko-sama": NekoSamaDownloader(),
"vostfree": VostfreeDownloader(),
}
for pid, dl in legacy_downloaders.items():
print(f"[SEARCH] Queueing legacy provider: {pid}")
search_tasks.append(dl.search_anime(q, lang, include_metadata=False))
task_metadata.append({"id": pid, "type": "legacy"})
# Generate search query variations for better matching
search_queries = [q]
# 2. Run searches in parallel
print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...")
all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True)
# Add fallback queries if original has spaces
if " " in q or "-" in q:
normalized = re.sub(r"[\s\-–—_:]+", "", q)
if normalized != q and len(normalized) >= 4:
search_queries.append(normalized)
# 3. Organize results by provider
seen_urls = set()
enricher = await get_metadata_enricher()
enrichment_tasks = []
# Map task indices to result slots for re-injection after enrichment
enrichment_mapping = [] # List of (provider_id, index_in_provider_results)
first_word = q.split()[0] if q.split() else None
if first_word and len(first_word) >= 4:
search_queries.append(first_word)
for i, raw_result in enumerate(all_raw_results):
provider_info = task_metadata[i]
pid = provider_info["id"]
if isinstance(raw_result, Exception):
logger.error(f"Search failed for {pid}: {raw_result}")
continue
if not raw_result:
continue
if pid not in results:
results[pid] = []
for item in raw_result:
# Normalize to dict
item_dict = item.model_dump() if hasattr(item, "model_dump") else item
url = item_dict.get("url")
if url and url not in seen_urls:
seen_urls.add(url)
# Check relevance simple boost
if q.lower() in (item_dict.get("title") or "").lower():
item_dict["_relevance_boost"] = 1.0
else:
item_dict["_relevance_boost"] = 0.5
results[pid].append(item_dict)
# Prepare enrichment task for top 5 results per provider
if len(results[pid]) <= 5:
enrichment_tasks.append(
enricher.enrich_metadata(
item_dict.get("metadata", {}),
item_dict.get("title", ""),
url
)
)
enrichment_mapping.append((pid, len(results[pid]) - 1))
else:
if "metadata" not in item_dict:
item_dict["metadata"] = {}
print(f"[SEARCH] Query variations: {search_queries}")
# 4. Perform parallel enrichment
if enrichment_tasks:
print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...")
enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True)
# Re-inject enriched metadata
for idx, (pid, pos) in enumerate(enrichment_mapping):
if idx < len(enriched_metas):
meta = enriched_metas[idx]
if not isinstance(meta, Exception) and meta:
results[pid][pos]["metadata"] = meta.model_dump()
# Search with fallback queries
all_search_tasks = []
all_provider_ids = []
for search_query in search_queries:
print(f"[SEARCH] Trying query variant: '{search_query}'")
for provider_id, provider in get_anime_providers().items():
if provider_id in downloaders:
downloader = downloaders[provider_id]
print(
f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..."
)
all_search_tasks.append(
{
"query": search_query,
"provider_id": provider_id,
"task": downloader.search_anime(
search_query, lang, include_metadata=include_metadata
),
}
)
all_provider_ids.append(provider_id)
print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...")
search_results = await asyncio.gather(
*[t["task"] for t in all_search_tasks], return_exceptions=True
)
# Process results
seen_urls = {}
for task_info, result in zip(all_search_tasks, search_results):
provider_id = task_info["provider_id"]
search_query = task_info["query"]
if isinstance(result, Exception):
print(
f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}"
)
elif result:
print(
f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results"
)
if provider_id not in results:
results[provider_id] = []
provider_results = results[provider_id]
for item in result:
url = item.get("url", "")
if url and url not in seen_urls:
seen_urls[url] = True
if search_query.lower() == q.lower():
item["_relevance_boost"] = 1.0
else:
item["_relevance_boost"] = 0.5
provider_results.append(item)
else:
print(f"[SEARCH] {provider_id} (query: '{search_query}') no results")
# Sort results by relevance
for provider_id in results:
results[provider_id].sort(
key=lambda x: (
-x.get("_relevance_boost", 0),
(x.get("title") or "").lower().find(q.lower()),
)
)
for item in results[provider_id]:
# 5. Sort results by relevance per provider
for pid in results:
results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0))
for item in results[pid]:
item.pop("_relevance_boost", None)
# Remove providers with empty results
results = {k: v for k, v in results.items() if v}
elapsed = time.time() - start_time
print(
f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
)
total_found = sum(len(r) for r in results.values())
print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.")
return {
"query": q,
"lang": lang,
@@ -197,9 +203,7 @@ async def search_series_unified(
start_time = time.time()
results = {}
series_downloaders = {"fs7": FS7Downloader()}
search_tasks = []
provider_ids = []
@@ -219,13 +223,9 @@ async def search_series_unified(
elif result:
print(f"[SERIES SEARCH] {provider_id} found {len(result)} results")
results[provider_id] = result
else:
print(f"[SERIES SEARCH] {provider_id} no results")
elapsed = time.time() - start_time
print(
f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
)
print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n")
return {"query": q, "lang": lang, "results": results}
@@ -235,7 +235,6 @@ async def get_anime_metadata(url: str):
"""Get detailed metadata for a specific anime"""
try:
downloader = get_downloader(url)
if hasattr(downloader, "get_anime_metadata"):
metadata = await downloader.get_anime_metadata(url)
return {"url": url, "metadata": metadata}
@@ -244,7 +243,6 @@ async def get_anime_metadata(url: str):
status_code=400,
detail=f"Downloader for {url} does not support metadata extraction",
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@@ -257,7 +255,6 @@ async def get_anime_episodes(
"""Get list of episodes for an anime"""
downloader = get_downloader(url)
episodes = await downloader.get_episodes(url, lang)
return {"url": url, "lang": lang, "episodes": episodes}
@@ -267,15 +264,12 @@ async def get_anime_providers_list():
return {"providers": get_anime_providers()}
# ==================== ANIME-SAMA SPECIFIC ====================
@router.get("/anime-sama/search")
async def search_anime_sama(
q: str,
lang: str = "vostfr",
):
"""Search for anime on anime-sama"""
"""Search for anime on anime-sama (legacy)"""
downloader = AnimeSamaDownloader()
results = await downloader.search_anime(q, lang)
return {"query": q, "lang": lang, "results": results}
@@ -298,65 +292,6 @@ async def download_anime_episode(
return {"task_id": task.id, "task": task}
# ==================== FRIEREN LEGACY ENDPOINTS ====================
@router.get("/anime/frieren/episodes")
async def get_frieren_episodes():
"""Get Frieren episodes from local database"""
try:
with open("app/frieren_episodes.json", "r") as f:
data = json.load(f)
return data
except Exception as e:
raise HTTPException(status_code=404, detail=f"Episodes not found: {e}")
@router.post("/anime/frieren/download")
async def download_frieren_episode(
season: int,
episode: str,
background_tasks: BackgroundTasks,
download_manager: DownloadManager = Depends(get_download_manager),
):
"""Download Frieren episode from local database"""
try:
with open("app/frieren_episodes.json", "r") as f:
data = json.load(f)
season_key = str(season)
if season_key not in data["seasons"]:
raise HTTPException(status_code=404, detail=f"Season {season} not found")
season_data = data["seasons"][season_key]
ep_data = next(
(ep for ep in season_data["episodes"] if ep["episode"] == episode), None
)
if not ep_data:
raise HTTPException(
status_code=404,
detail=f"Episode {episode} not found in season {season}",
)
url = ep_data["sibnet_url"]
filename = f"Frieren - S{season} - Episode {episode}.mp4"
request = DownloadRequest(url=url, filename=filename)
task = download_manager.create_task(request)
background_tasks.add_task(download_manager.start_download, task.id)
return {"task_id": task.id, "task": task}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
# ==================== DOWNLOAD SEASON ====================
@router.post("/anime/download-season")
async def download_anime_season(
url: str,
@@ -385,29 +320,14 @@ async def download_anime_season(
}
# ==================== SEASONS ====================
@router.get("/anime/seasons")
async def get_anime_seasons(url: str):
"""Get list of seasons for an anime"""
downloader = get_downloader(url)
if hasattr(downloader, "get_seasons"):
seasons = await downloader.get_seasons(url)
if not seasons:
return {"seasons": [], "message": "No seasons found"}
return {"seasons": seasons}
else:
return {
"seasons": [],
"message": "Season information not available for this provider",
}
# ==================== MYANIMELIST INTEGRATION ====================
return {"seasons": seasons or []}
return {"seasons": [], "message": "Season info not available for this provider"}
@router.get("/anime/mal/search")
@@ -417,103 +337,40 @@ async def search_anime_mal_details(
):
"""Search for anime on MyAnimeList and get full details"""
from app.recommendations import AnimeReleasesFetcher
fetcher = AnimeReleasesFetcher()
try:
search_results = await fetcher.search_anime(q, limit=limit)
if not search_results:
return {"anime": None, "message": "No anime found"}
main_anime = search_results[0]
anime_details = await fetcher.get_anime_details(main_anime["mal_id"])
alternatives = search_results[1:] if len(search_results) > 1 else []
return {
"anime": anime_details,
"alternatives": alternatives,
"alternatives": search_results[1:],
"total_results": len(search_results),
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
await fetcher.close()
@router.get("/anime/mal/{mal_id}")
async def get_anime_by_id(mal_id: int):
"""Get full details of an anime by its MyAnimeList ID"""
from app.recommendations import AnimeReleasesFetcher
fetcher = AnimeReleasesFetcher()
try:
anime_details = await fetcher.get_anime_details(mal_id)
if not anime_details:
raise HTTPException(status_code=404, detail="Anime not found")
return anime_details
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
await fetcher.close()
# ==================== TRANSLATION ====================
@router.post("/translate")
async def translate_text(request: Request):
"""Translate text from English to French using Google Translate"""
import httpx
from logging import getLogger
logger = getLogger(__name__)
try:
body = await request.json()
text = body.get("text", "")
if not text:
raise HTTPException(status_code=400, detail="Text is required")
text = text[:5000]
async with httpx.AsyncClient(timeout=30.0) as client:
url = "https://translate.googleapis.com/translate_a/single"
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text}
logger.info(f"Translation request for text length: {len(text)}")
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
response = await client.get(url, params=params)
logger.info(f"Translation API response status: {response.status_code}")
if response.status_code == 200:
data = response.json()
if data and len(data) > 0 and data[0]:
translated_text = "".join([item[0] for item in data[0] if item[0]])
if translated_text:
logger.info(
f"Translation successful, length: {len(translated_text)}"
)
return {"translatedText": translated_text, "status": "success"}
logger.warning(
f"Unexpected Google Translate response structure: {data}"
)
if data and data[0]:
translated = "".join([item[0] for item in data[0] if item[0]])
return {"translatedText": translated, "status": "success"}
raise HTTPException(status_code=500, detail="Translation failed")
except HTTPException:
raise
except Exception as e:
logger.error(f"Translation error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")