feat: robust scraping DSL and health monitoring (Phase 2)
- Implemented YAML-driven GenericScraper for resilient scraping - Added ProvidersManager to manage scraper health and active providers - Modernized unified search with systematic Kitsu metadata enrichment - Integrated automated health checks in the scheduler - Added comprehensive tests for scraping DSL and provider health
This commit is contained in:
+122
-265
@@ -2,15 +2,14 @@
|
||||
Anime and series search routes for Ohm Stream Downloader API.
|
||||
|
||||
Endpoints:
|
||||
- GET /api/anime/search - Search across all anime providers
|
||||
- GET /api/anime/search - Search across all anime providers (Modernized with Kitsu)
|
||||
- GET /api/series/search - Search across all TV series providers
|
||||
- GET /api/anime/metadata - Get detailed metadata for a specific anime
|
||||
- GET /api/anime/episodes - Get list of episodes for an anime
|
||||
- GET /api/anime/providers - Get list of anime providers
|
||||
- GET /api/anime-sama/search - Search for anime on anime-sama (legacy)
|
||||
- GET /api/providers/health - Get provider health status
|
||||
- POST /api/providers/health/check - Trigger health check
|
||||
- POST /api/anime/download - Download an anime episode
|
||||
- GET /api/anime/frieren/episodes - Get Frieren episodes from local database
|
||||
- POST /api/anime/frieren/download - Download Frieren episode from local database
|
||||
- POST /api/anime/download-season - Download all episodes of a season
|
||||
- GET /api/anime/seasons - Get list of seasons for an anime
|
||||
- GET /api/anime/mal/search - Search for anime on MyAnimeList
|
||||
@@ -21,6 +20,8 @@ Endpoints:
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import asyncio
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
|
||||
|
||||
@@ -34,14 +35,30 @@ from app.downloaders import (
|
||||
)
|
||||
from app.models import DownloadRequest
|
||||
from app.providers import get_anime_providers, get_series_providers
|
||||
from app.providers_manager import providers_manager
|
||||
from app.metadata_enrichment import get_metadata_enricher
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api", tags=["anime"])
|
||||
|
||||
|
||||
@router.get("/providers/health")
|
||||
async def get_providers_health():
|
||||
"""Get the current health status of all providers"""
|
||||
return providers_manager.get_all_status()
|
||||
|
||||
|
||||
@router.post("/providers/health/check")
|
||||
async def trigger_providers_health_check(background_tasks: BackgroundTasks):
|
||||
"""Trigger a manual health check of all providers in the background"""
|
||||
from app.auto_download_scheduler import auto_download_scheduler
|
||||
background_tasks.add_task(auto_download_scheduler.trigger_health_check_now)
|
||||
return {"status": "Health check triggered in background"}
|
||||
|
||||
|
||||
def get_download_manager() -> DownloadManager:
|
||||
"""Get the download manager instance from main app"""
|
||||
from main import download_manager
|
||||
|
||||
return download_manager
|
||||
|
||||
|
||||
@@ -55,125 +72,114 @@ async def search_anime_unified(
|
||||
include_metadata: bool = False,
|
||||
):
|
||||
"""
|
||||
Search across all anime providers
|
||||
|
||||
Args:
|
||||
q: Search query
|
||||
lang: Language preference (vostfr, vf)
|
||||
include_metadata: Whether to fetch full metadata (slower but more detailed)
|
||||
Search across all anime providers using MetadataEnricher and health checks.
|
||||
Results are grouped by provider for legacy UI compatibility.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
print(
|
||||
f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})"
|
||||
)
|
||||
print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}")
|
||||
start_time = time.time()
|
||||
|
||||
results = {}
|
||||
|
||||
# 1. Prepare search tasks (Generic + Legacy)
|
||||
search_tasks = []
|
||||
task_metadata = []
|
||||
|
||||
# Create downloader instances
|
||||
downloaders = {
|
||||
"anime-sama": AnimeSamaDownloader(),
|
||||
# Generic YAML providers
|
||||
active_generic = providers_manager.get_active_providers()
|
||||
for provider in active_generic:
|
||||
print(f"[SEARCH] Queueing generic provider: {provider.name}")
|
||||
search_tasks.append(provider.search(q))
|
||||
task_metadata.append({"id": provider.id, "type": "generic"})
|
||||
|
||||
# Legacy providers (until migrated to YAML)
|
||||
legacy_downloaders = {
|
||||
"anime-ultime": AnimeUltimeDownloader(),
|
||||
"neko-sama": NekoSamaDownloader(),
|
||||
"vostfree": VostfreeDownloader(),
|
||||
}
|
||||
for pid, dl in legacy_downloaders.items():
|
||||
print(f"[SEARCH] Queueing legacy provider: {pid}")
|
||||
search_tasks.append(dl.search_anime(q, lang, include_metadata=False))
|
||||
task_metadata.append({"id": pid, "type": "legacy"})
|
||||
|
||||
# Generate search query variations for better matching
|
||||
search_queries = [q]
|
||||
# 2. Run searches in parallel
|
||||
print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...")
|
||||
all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True)
|
||||
|
||||
# Add fallback queries if original has spaces
|
||||
if " " in q or "-" in q:
|
||||
normalized = re.sub(r"[\s\-–—_:]+", "", q)
|
||||
if normalized != q and len(normalized) >= 4:
|
||||
search_queries.append(normalized)
|
||||
# 3. Organize results by provider
|
||||
seen_urls = set()
|
||||
enricher = await get_metadata_enricher()
|
||||
enrichment_tasks = []
|
||||
|
||||
# Map task indices to result slots for re-injection after enrichment
|
||||
enrichment_mapping = [] # List of (provider_id, index_in_provider_results)
|
||||
|
||||
first_word = q.split()[0] if q.split() else None
|
||||
if first_word and len(first_word) >= 4:
|
||||
search_queries.append(first_word)
|
||||
for i, raw_result in enumerate(all_raw_results):
|
||||
provider_info = task_metadata[i]
|
||||
pid = provider_info["id"]
|
||||
|
||||
if isinstance(raw_result, Exception):
|
||||
logger.error(f"Search failed for {pid}: {raw_result}")
|
||||
continue
|
||||
|
||||
if not raw_result:
|
||||
continue
|
||||
|
||||
if pid not in results:
|
||||
results[pid] = []
|
||||
|
||||
for item in raw_result:
|
||||
# Normalize to dict
|
||||
item_dict = item.model_dump() if hasattr(item, "model_dump") else item
|
||||
url = item_dict.get("url")
|
||||
|
||||
if url and url not in seen_urls:
|
||||
seen_urls.add(url)
|
||||
|
||||
# Check relevance simple boost
|
||||
if q.lower() in (item_dict.get("title") or "").lower():
|
||||
item_dict["_relevance_boost"] = 1.0
|
||||
else:
|
||||
item_dict["_relevance_boost"] = 0.5
|
||||
|
||||
results[pid].append(item_dict)
|
||||
|
||||
# Prepare enrichment task for top 5 results per provider
|
||||
if len(results[pid]) <= 5:
|
||||
enrichment_tasks.append(
|
||||
enricher.enrich_metadata(
|
||||
item_dict.get("metadata", {}),
|
||||
item_dict.get("title", ""),
|
||||
url
|
||||
)
|
||||
)
|
||||
enrichment_mapping.append((pid, len(results[pid]) - 1))
|
||||
else:
|
||||
if "metadata" not in item_dict:
|
||||
item_dict["metadata"] = {}
|
||||
|
||||
print(f"[SEARCH] Query variations: {search_queries}")
|
||||
# 4. Perform parallel enrichment
|
||||
if enrichment_tasks:
|
||||
print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...")
|
||||
enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True)
|
||||
|
||||
# Re-inject enriched metadata
|
||||
for idx, (pid, pos) in enumerate(enrichment_mapping):
|
||||
if idx < len(enriched_metas):
|
||||
meta = enriched_metas[idx]
|
||||
if not isinstance(meta, Exception) and meta:
|
||||
results[pid][pos]["metadata"] = meta.model_dump()
|
||||
|
||||
# Search with fallback queries
|
||||
all_search_tasks = []
|
||||
all_provider_ids = []
|
||||
|
||||
for search_query in search_queries:
|
||||
print(f"[SEARCH] Trying query variant: '{search_query}'")
|
||||
|
||||
for provider_id, provider in get_anime_providers().items():
|
||||
if provider_id in downloaders:
|
||||
downloader = downloaders[provider_id]
|
||||
print(
|
||||
f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..."
|
||||
)
|
||||
all_search_tasks.append(
|
||||
{
|
||||
"query": search_query,
|
||||
"provider_id": provider_id,
|
||||
"task": downloader.search_anime(
|
||||
search_query, lang, include_metadata=include_metadata
|
||||
),
|
||||
}
|
||||
)
|
||||
all_provider_ids.append(provider_id)
|
||||
|
||||
print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...")
|
||||
search_results = await asyncio.gather(
|
||||
*[t["task"] for t in all_search_tasks], return_exceptions=True
|
||||
)
|
||||
|
||||
# Process results
|
||||
seen_urls = {}
|
||||
|
||||
for task_info, result in zip(all_search_tasks, search_results):
|
||||
provider_id = task_info["provider_id"]
|
||||
search_query = task_info["query"]
|
||||
|
||||
if isinstance(result, Exception):
|
||||
print(
|
||||
f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}"
|
||||
)
|
||||
elif result:
|
||||
print(
|
||||
f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results"
|
||||
)
|
||||
|
||||
if provider_id not in results:
|
||||
results[provider_id] = []
|
||||
|
||||
provider_results = results[provider_id]
|
||||
for item in result:
|
||||
url = item.get("url", "")
|
||||
if url and url not in seen_urls:
|
||||
seen_urls[url] = True
|
||||
if search_query.lower() == q.lower():
|
||||
item["_relevance_boost"] = 1.0
|
||||
else:
|
||||
item["_relevance_boost"] = 0.5
|
||||
provider_results.append(item)
|
||||
else:
|
||||
print(f"[SEARCH] {provider_id} (query: '{search_query}') no results")
|
||||
|
||||
# Sort results by relevance
|
||||
for provider_id in results:
|
||||
results[provider_id].sort(
|
||||
key=lambda x: (
|
||||
-x.get("_relevance_boost", 0),
|
||||
(x.get("title") or "").lower().find(q.lower()),
|
||||
)
|
||||
)
|
||||
for item in results[provider_id]:
|
||||
# 5. Sort results by relevance per provider
|
||||
for pid in results:
|
||||
results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0))
|
||||
for item in results[pid]:
|
||||
item.pop("_relevance_boost", None)
|
||||
|
||||
# Remove providers with empty results
|
||||
results = {k: v for k, v in results.items() if v}
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print(
|
||||
f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
|
||||
)
|
||||
|
||||
total_found = sum(len(r) for r in results.values())
|
||||
print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.")
|
||||
|
||||
return {
|
||||
"query": q,
|
||||
"lang": lang,
|
||||
@@ -197,9 +203,7 @@ async def search_series_unified(
|
||||
start_time = time.time()
|
||||
|
||||
results = {}
|
||||
|
||||
series_downloaders = {"fs7": FS7Downloader()}
|
||||
|
||||
search_tasks = []
|
||||
provider_ids = []
|
||||
|
||||
@@ -219,13 +223,9 @@ async def search_series_unified(
|
||||
elif result:
|
||||
print(f"[SERIES SEARCH] {provider_id} found {len(result)} results")
|
||||
results[provider_id] = result
|
||||
else:
|
||||
print(f"[SERIES SEARCH] {provider_id} no results")
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print(
|
||||
f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
|
||||
)
|
||||
print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n")
|
||||
|
||||
return {"query": q, "lang": lang, "results": results}
|
||||
|
||||
@@ -235,7 +235,6 @@ async def get_anime_metadata(url: str):
|
||||
"""Get detailed metadata for a specific anime"""
|
||||
try:
|
||||
downloader = get_downloader(url)
|
||||
|
||||
if hasattr(downloader, "get_anime_metadata"):
|
||||
metadata = await downloader.get_anime_metadata(url)
|
||||
return {"url": url, "metadata": metadata}
|
||||
@@ -244,7 +243,6 @@ async def get_anime_metadata(url: str):
|
||||
status_code=400,
|
||||
detail=f"Downloader for {url} does not support metadata extraction",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@@ -257,7 +255,6 @@ async def get_anime_episodes(
|
||||
"""Get list of episodes for an anime"""
|
||||
downloader = get_downloader(url)
|
||||
episodes = await downloader.get_episodes(url, lang)
|
||||
|
||||
return {"url": url, "lang": lang, "episodes": episodes}
|
||||
|
||||
|
||||
@@ -267,15 +264,12 @@ async def get_anime_providers_list():
|
||||
return {"providers": get_anime_providers()}
|
||||
|
||||
|
||||
# ==================== ANIME-SAMA SPECIFIC ====================
|
||||
|
||||
|
||||
@router.get("/anime-sama/search")
|
||||
async def search_anime_sama(
|
||||
q: str,
|
||||
lang: str = "vostfr",
|
||||
):
|
||||
"""Search for anime on anime-sama"""
|
||||
"""Search for anime on anime-sama (legacy)"""
|
||||
downloader = AnimeSamaDownloader()
|
||||
results = await downloader.search_anime(q, lang)
|
||||
return {"query": q, "lang": lang, "results": results}
|
||||
@@ -298,65 +292,6 @@ async def download_anime_episode(
|
||||
return {"task_id": task.id, "task": task}
|
||||
|
||||
|
||||
# ==================== FRIEREN LEGACY ENDPOINTS ====================
|
||||
|
||||
|
||||
@router.get("/anime/frieren/episodes")
|
||||
async def get_frieren_episodes():
|
||||
"""Get Frieren episodes from local database"""
|
||||
try:
|
||||
with open("app/frieren_episodes.json", "r") as f:
|
||||
data = json.load(f)
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=404, detail=f"Episodes not found: {e}")
|
||||
|
||||
|
||||
@router.post("/anime/frieren/download")
|
||||
async def download_frieren_episode(
|
||||
season: int,
|
||||
episode: str,
|
||||
background_tasks: BackgroundTasks,
|
||||
download_manager: DownloadManager = Depends(get_download_manager),
|
||||
):
|
||||
"""Download Frieren episode from local database"""
|
||||
try:
|
||||
with open("app/frieren_episodes.json", "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
season_key = str(season)
|
||||
if season_key not in data["seasons"]:
|
||||
raise HTTPException(status_code=404, detail=f"Season {season} not found")
|
||||
|
||||
season_data = data["seasons"][season_key]
|
||||
ep_data = next(
|
||||
(ep for ep in season_data["episodes"] if ep["episode"] == episode), None
|
||||
)
|
||||
|
||||
if not ep_data:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=f"Episode {episode} not found in season {season}",
|
||||
)
|
||||
|
||||
url = ep_data["sibnet_url"]
|
||||
filename = f"Frieren - S{season} - Episode {episode}.mp4"
|
||||
|
||||
request = DownloadRequest(url=url, filename=filename)
|
||||
task = download_manager.create_task(request)
|
||||
background_tasks.add_task(download_manager.start_download, task.id)
|
||||
|
||||
return {"task_id": task.id, "task": task}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
|
||||
|
||||
|
||||
# ==================== DOWNLOAD SEASON ====================
|
||||
|
||||
|
||||
@router.post("/anime/download-season")
|
||||
async def download_anime_season(
|
||||
url: str,
|
||||
@@ -385,29 +320,14 @@ async def download_anime_season(
|
||||
}
|
||||
|
||||
|
||||
# ==================== SEASONS ====================
|
||||
|
||||
|
||||
@router.get("/anime/seasons")
|
||||
async def get_anime_seasons(url: str):
|
||||
"""Get list of seasons for an anime"""
|
||||
downloader = get_downloader(url)
|
||||
|
||||
if hasattr(downloader, "get_seasons"):
|
||||
seasons = await downloader.get_seasons(url)
|
||||
|
||||
if not seasons:
|
||||
return {"seasons": [], "message": "No seasons found"}
|
||||
|
||||
return {"seasons": seasons}
|
||||
else:
|
||||
return {
|
||||
"seasons": [],
|
||||
"message": "Season information not available for this provider",
|
||||
}
|
||||
|
||||
|
||||
# ==================== MYANIMELIST INTEGRATION ====================
|
||||
return {"seasons": seasons or []}
|
||||
return {"seasons": [], "message": "Season info not available for this provider"}
|
||||
|
||||
|
||||
@router.get("/anime/mal/search")
|
||||
@@ -417,103 +337,40 @@ async def search_anime_mal_details(
|
||||
):
|
||||
"""Search for anime on MyAnimeList and get full details"""
|
||||
from app.recommendations import AnimeReleasesFetcher
|
||||
|
||||
fetcher = AnimeReleasesFetcher()
|
||||
|
||||
try:
|
||||
search_results = await fetcher.search_anime(q, limit=limit)
|
||||
|
||||
if not search_results:
|
||||
return {"anime": None, "message": "No anime found"}
|
||||
|
||||
main_anime = search_results[0]
|
||||
anime_details = await fetcher.get_anime_details(main_anime["mal_id"])
|
||||
|
||||
alternatives = search_results[1:] if len(search_results) > 1 else []
|
||||
|
||||
return {
|
||||
"anime": anime_details,
|
||||
"alternatives": alternatives,
|
||||
"alternatives": search_results[1:],
|
||||
"total_results": len(search_results),
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
await fetcher.close()
|
||||
|
||||
|
||||
@router.get("/anime/mal/{mal_id}")
|
||||
async def get_anime_by_id(mal_id: int):
|
||||
"""Get full details of an anime by its MyAnimeList ID"""
|
||||
from app.recommendations import AnimeReleasesFetcher
|
||||
|
||||
fetcher = AnimeReleasesFetcher()
|
||||
|
||||
try:
|
||||
anime_details = await fetcher.get_anime_details(mal_id)
|
||||
|
||||
if not anime_details:
|
||||
raise HTTPException(status_code=404, detail="Anime not found")
|
||||
|
||||
return anime_details
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
await fetcher.close()
|
||||
|
||||
|
||||
# ==================== TRANSLATION ====================
|
||||
|
||||
|
||||
@router.post("/translate")
|
||||
async def translate_text(request: Request):
|
||||
"""Translate text from English to French using Google Translate"""
|
||||
import httpx
|
||||
from logging import getLogger
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
try:
|
||||
body = await request.json()
|
||||
text = body.get("text", "")
|
||||
|
||||
if not text:
|
||||
raise HTTPException(status_code=400, detail="Text is required")
|
||||
|
||||
text = text[:5000]
|
||||
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
url = "https://translate.googleapis.com/translate_a/single"
|
||||
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text}
|
||||
|
||||
logger.info(f"Translation request for text length: {len(text)}")
|
||||
|
||||
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
|
||||
response = await client.get(url, params=params)
|
||||
|
||||
logger.info(f"Translation API response status: {response.status_code}")
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
|
||||
if data and len(data) > 0 and data[0]:
|
||||
translated_text = "".join([item[0] for item in data[0] if item[0]])
|
||||
|
||||
if translated_text:
|
||||
logger.info(
|
||||
f"Translation successful, length: {len(translated_text)}"
|
||||
)
|
||||
return {"translatedText": translated_text, "status": "success"}
|
||||
|
||||
logger.warning(
|
||||
f"Unexpected Google Translate response structure: {data}"
|
||||
)
|
||||
|
||||
if data and data[0]:
|
||||
translated = "".join([item[0] for item in data[0] if item[0]])
|
||||
return {"translatedText": translated, "status": "success"}
|
||||
raise HTTPException(status_code=500, detail="Translation failed")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Translation error: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
|
||||
|
||||
Reference in New Issue
Block a user