feat: robust scraping DSL and health monitoring (Phase 2)
CI / Test (Python 3.11) (push) Has been cancelled
CI / Test (Python 3.12) (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Type Check (push) Has been cancelled
CI / Summary (push) Has been cancelled

- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
This commit is contained in:
root
2026-03-24 10:57:19 +00:00
parent 29c7040b20
commit 2b4cc617cb
8 changed files with 535 additions and 268 deletions
+29 -1
View File
@@ -9,6 +9,7 @@ from apscheduler.triggers.interval import IntervalTrigger
from app.watchlist import watchlist_manager, WatchlistManager
from app.episode_checker import EpisodeChecker, episode_checker
from app.providers_manager import providers_manager
logger = logging.getLogger(__name__)
@@ -23,6 +24,7 @@ class AutoDownloadScheduler:
):
self.wlm = wlm or watchlist_manager
self.checker = checker or episode_checker
self.providers_mgr = providers_manager
self.scheduler: Optional[AsyncIOScheduler] = None
self._running = False
@@ -46,6 +48,14 @@ class AutoDownloadScheduler:
except Exception as e:
logger.error(f"Error in scheduled check job: {e}", exc_info=True)
async def _health_check_job(self):
"""Job function that runs periodically to check provider health"""
try:
logger.info("Running scheduled provider health check...")
await self.providers_mgr.check_all_health()
except Exception as e:
logger.error(f"Error in health check job: {e}")
def start(self):
"""Start the scheduler"""
if self._running:
@@ -59,7 +69,7 @@ class AutoDownloadScheduler:
settings = self.wlm.get_settings()
interval_hours = settings.check_interval_hours
# Add the job
# Add the job for episode checking
self.scheduler.add_job(
self._check_job,
trigger=IntervalTrigger(hours=interval_hours),
@@ -68,6 +78,15 @@ class AutoDownloadScheduler:
replace_existing=True
)
# Add the job for provider health check (every 6 hours)
self.scheduler.add_job(
self._health_check_job,
trigger=IntervalTrigger(hours=6),
id='provider_health',
name='Check provider health',
replace_existing=True
)
# Start the scheduler
self.scheduler.start()
self._running = True
@@ -149,6 +168,15 @@ class AutoDownloadScheduler:
logger.error(f"Error in manual check: {e}", exc_info=True)
raise
async def trigger_health_check_now(self):
"""Manually trigger a health check now"""
logger.info("Manually triggering provider health check...")
try:
await self._health_check_job()
except Exception as e:
logger.error(f"Error in manual health check: {e}")
raise
# Global scheduler instance
auto_download_scheduler = AutoDownloadScheduler()
+122
View File
@@ -0,0 +1,122 @@
"""Generic scraper driven by YAML configuration"""
import yaml
import logging
import httpx
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Any
from pathlib import Path
from urllib.parse import urljoin, quote
from app.downloaders.anime_sites.base import BaseAnimeSite
from app.models import AnimeSearchResult, AnimeMetadata
from app.metadata_enrichment import get_metadata_enricher
logger = logging.getLogger(__name__)
class GenericScraper(BaseAnimeSite):
"""A scraper that uses external configuration for its logic"""
def __init__(self, config_path: str):
with open(config_path, 'r', encoding='utf-8') as f:
self.config = yaml.safe_load(f)
self.id = self.config['id']
self.name = self.config['name']
self.base_url = self.config['base_url']
self.mirrors = self.config.get('mirrors', [])
# Current active base URL (can change if mirror found)
self.active_url = self.base_url
self.client = httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
)
async def search(self, query: str) -> List[AnimeSearchResult]:
"""Search using configured selectors"""
search_config = self.config.get('search')
if not search_config:
logger.warning(f"No search config for {self.name}")
return []
search_path = search_config['path'].format(query=quote(query))
url = urljoin(self.active_url, search_path)
try:
response = await self.client.get(url)
soup = BeautifulSoup(response.text, 'lxml')
results = []
container = search_config.get('container_selector')
items = soup.select(container) if container else [soup]
for item in items:
try:
title_node = item.select_one(search_config['title_selector'])
url_node = item.select_one(search_config['url_selector'])
if not title_node or not url_node:
continue
title = title_node.get_text(strip=True)
href = url_node.get('href')
anime_url = urljoin(self.active_url, href)
img_node = item.select_one(search_config.get('image_selector', 'img'))
cover_image = img_node.get('src') if img_node else None
if cover_image:
cover_image = urljoin(self.active_url, cover_image)
# Initial metadata from scraper
meta_dict = {
"poster_image": cover_image,
"status": "Unknown"
}
# Enrich with Kitsu via global service
enricher = await get_metadata_enricher()
metadata = await enricher.enrich_metadata(meta_dict, title, anime_url)
results.append(AnimeSearchResult(
title=title,
url=anime_url,
cover_image=metadata.poster_image or cover_image,
type="search_result",
metadata=metadata
))
except Exception as e:
logger.error(f"Error parsing search result item: {e}")
return results
except Exception as e:
logger.error(f"Search failed for {self.name}: {e}")
return []
async def get_episodes(self, anime_url: str) -> List[Dict[str, Any]]:
"""Get episodes list (to be specialized if site logic is complex)"""
# Default implementation for simple sites
# For complex sites like Anime-Sama, we might still need a specialized subclass
# but driven by the YAML config for base parameters.
return []
async def check_health(self) -> bool:
"""Check if the site is up and selectors still work"""
try:
# Try a test search for a very common anime
results = await self.search("One Piece")
is_healthy = len(results) > 0
if not is_healthy:
logger.warning(f"Health check failed for {self.name}: No results found")
return is_healthy
except Exception as e:
logger.error(f"Health check failed for {self.name} with error: {e}")
return False
async def close(self):
await self.client.aclose()
@@ -0,0 +1,24 @@
name: "Anime-Sama"
id: "animesama"
base_url: "https://anime-sama.fr"
mirrors:
- "https://anime-sama.si"
- "https://anime-sama.co"
search:
path: "/search?q={query}"
container_selector: ".result-item"
title_selector: "h3"
url_selector: "a"
image_selector: "img"
episodes:
container_selector: "#episodes-list"
item_selector: ".episode-item"
# Logic for Anime-Sama can be complex, we'll handle custom logic in GenericScraper
# but keep common selectors here.
player_iframe_selector: "iframe#player"
metadata:
synopsis_selector: ".synopsis"
genres_selector: ".genres .genre"
+84
View File
@@ -0,0 +1,84 @@
"""Manages scraper providers and their health status"""
import os
import logging
import asyncio
from typing import Dict, List, Optional
from pathlib import Path
from datetime import datetime
from app.downloaders.generic_scraper import GenericScraper
logger = logging.getLogger(__name__)
class ProvidersManager:
"""Registry and health manager for scraping providers"""
def __init__(self, config_dir: str = "app/downloaders/providers_config"):
self.config_dir = Path(config_dir)
self.providers: Dict[str, GenericScraper] = {}
self.health_status: Dict[str, Dict] = {}
self._load_providers()
def _load_providers(self):
"""Load all providers from YAML configs"""
if not self.config_dir.exists():
logger.warning(f"Providers config directory not found: {self.config_dir}")
return
for config_file in self.config_dir.glob("*.yaml"):
try:
scraper = GenericScraper(str(config_file))
self.providers[scraper.id] = scraper
self.health_status[scraper.id] = {
"status": "unknown",
"last_check": None,
"error": None
}
logger.info(f"Loaded provider: {scraper.name} ({scraper.id})")
except Exception as e:
logger.error(f"Failed to load provider from {config_file}: {e}")
async def check_all_health(self):
"""Check health of all registered providers"""
logger.info("Checking health of all providers...")
tasks = []
for provider_id, scraper in self.providers.items():
tasks.append(self._check_single_health(provider_id, scraper))
await asyncio.gather(*tasks)
logger.info("Provider health check complete")
async def _check_single_health(self, provider_id: str, scraper: GenericScraper):
"""Check health of a single provider and update status"""
try:
is_healthy = await scraper.check_health()
self.health_status[provider_id] = {
"status": "up" if is_healthy else "down",
"last_check": datetime.now().isoformat(),
"error": None if is_healthy else "No search results returned"
}
except Exception as e:
self.health_status[provider_id] = {
"status": "down",
"last_check": datetime.now().isoformat(),
"error": str(e)
}
logger.error(f"Health check failed for {provider_id}: {e}")
def get_provider(self, provider_id: str) -> Optional[GenericScraper]:
return self.providers.get(provider_id)
def get_active_providers(self) -> List[GenericScraper]:
"""Return only providers that are UP or UNKNOWN"""
return [
self.providers[pid] for pid, status in self.health_status.items()
if status["status"] != "down"
]
def get_all_status(self) -> Dict[str, Dict]:
return self.health_status
# Global instance
providers_manager = ProvidersManager()
+122 -265
View File
@@ -2,15 +2,14 @@
Anime and series search routes for Ohm Stream Downloader API.
Endpoints:
- GET /api/anime/search - Search across all anime providers
- GET /api/anime/search - Search across all anime providers (Modernized with Kitsu)
- GET /api/series/search - Search across all TV series providers
- GET /api/anime/metadata - Get detailed metadata for a specific anime
- GET /api/anime/episodes - Get list of episodes for an anime
- GET /api/anime/providers - Get list of anime providers
- GET /api/anime-sama/search - Search for anime on anime-sama (legacy)
- GET /api/providers/health - Get provider health status
- POST /api/providers/health/check - Trigger health check
- POST /api/anime/download - Download an anime episode
- GET /api/anime/frieren/episodes - Get Frieren episodes from local database
- POST /api/anime/frieren/download - Download Frieren episode from local database
- POST /api/anime/download-season - Download all episodes of a season
- GET /api/anime/seasons - Get list of seasons for an anime
- GET /api/anime/mal/search - Search for anime on MyAnimeList
@@ -21,6 +20,8 @@ Endpoints:
import json
import re
import time
import logging
import asyncio
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
@@ -34,14 +35,30 @@ from app.downloaders import (
)
from app.models import DownloadRequest
from app.providers import get_anime_providers, get_series_providers
from app.providers_manager import providers_manager
from app.metadata_enrichment import get_metadata_enricher
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api", tags=["anime"])
@router.get("/providers/health")
async def get_providers_health():
"""Get the current health status of all providers"""
return providers_manager.get_all_status()
@router.post("/providers/health/check")
async def trigger_providers_health_check(background_tasks: BackgroundTasks):
"""Trigger a manual health check of all providers in the background"""
from app.auto_download_scheduler import auto_download_scheduler
background_tasks.add_task(auto_download_scheduler.trigger_health_check_now)
return {"status": "Health check triggered in background"}
def get_download_manager() -> DownloadManager:
"""Get the download manager instance from main app"""
from main import download_manager
return download_manager
@@ -55,125 +72,114 @@ async def search_anime_unified(
include_metadata: bool = False,
):
"""
Search across all anime providers
Args:
q: Search query
lang: Language preference (vostfr, vf)
include_metadata: Whether to fetch full metadata (slower but more detailed)
Search across all anime providers using MetadataEnricher and health checks.
Results are grouped by provider for legacy UI compatibility.
"""
import asyncio
print(
f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})"
)
print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}")
start_time = time.time()
results = {}
# 1. Prepare search tasks (Generic + Legacy)
search_tasks = []
task_metadata = []
# Create downloader instances
downloaders = {
"anime-sama": AnimeSamaDownloader(),
# Generic YAML providers
active_generic = providers_manager.get_active_providers()
for provider in active_generic:
print(f"[SEARCH] Queueing generic provider: {provider.name}")
search_tasks.append(provider.search(q))
task_metadata.append({"id": provider.id, "type": "generic"})
# Legacy providers (until migrated to YAML)
legacy_downloaders = {
"anime-ultime": AnimeUltimeDownloader(),
"neko-sama": NekoSamaDownloader(),
"vostfree": VostfreeDownloader(),
}
for pid, dl in legacy_downloaders.items():
print(f"[SEARCH] Queueing legacy provider: {pid}")
search_tasks.append(dl.search_anime(q, lang, include_metadata=False))
task_metadata.append({"id": pid, "type": "legacy"})
# Generate search query variations for better matching
search_queries = [q]
# 2. Run searches in parallel
print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...")
all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True)
# Add fallback queries if original has spaces
if " " in q or "-" in q:
normalized = re.sub(r"[\s\-–—_:]+", "", q)
if normalized != q and len(normalized) >= 4:
search_queries.append(normalized)
# 3. Organize results by provider
seen_urls = set()
enricher = await get_metadata_enricher()
enrichment_tasks = []
# Map task indices to result slots for re-injection after enrichment
enrichment_mapping = [] # List of (provider_id, index_in_provider_results)
first_word = q.split()[0] if q.split() else None
if first_word and len(first_word) >= 4:
search_queries.append(first_word)
for i, raw_result in enumerate(all_raw_results):
provider_info = task_metadata[i]
pid = provider_info["id"]
if isinstance(raw_result, Exception):
logger.error(f"Search failed for {pid}: {raw_result}")
continue
if not raw_result:
continue
if pid not in results:
results[pid] = []
for item in raw_result:
# Normalize to dict
item_dict = item.model_dump() if hasattr(item, "model_dump") else item
url = item_dict.get("url")
if url and url not in seen_urls:
seen_urls.add(url)
# Check relevance simple boost
if q.lower() in (item_dict.get("title") or "").lower():
item_dict["_relevance_boost"] = 1.0
else:
item_dict["_relevance_boost"] = 0.5
results[pid].append(item_dict)
# Prepare enrichment task for top 5 results per provider
if len(results[pid]) <= 5:
enrichment_tasks.append(
enricher.enrich_metadata(
item_dict.get("metadata", {}),
item_dict.get("title", ""),
url
)
)
enrichment_mapping.append((pid, len(results[pid]) - 1))
else:
if "metadata" not in item_dict:
item_dict["metadata"] = {}
print(f"[SEARCH] Query variations: {search_queries}")
# 4. Perform parallel enrichment
if enrichment_tasks:
print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...")
enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True)
# Re-inject enriched metadata
for idx, (pid, pos) in enumerate(enrichment_mapping):
if idx < len(enriched_metas):
meta = enriched_metas[idx]
if not isinstance(meta, Exception) and meta:
results[pid][pos]["metadata"] = meta.model_dump()
# Search with fallback queries
all_search_tasks = []
all_provider_ids = []
for search_query in search_queries:
print(f"[SEARCH] Trying query variant: '{search_query}'")
for provider_id, provider in get_anime_providers().items():
if provider_id in downloaders:
downloader = downloaders[provider_id]
print(
f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..."
)
all_search_tasks.append(
{
"query": search_query,
"provider_id": provider_id,
"task": downloader.search_anime(
search_query, lang, include_metadata=include_metadata
),
}
)
all_provider_ids.append(provider_id)
print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...")
search_results = await asyncio.gather(
*[t["task"] for t in all_search_tasks], return_exceptions=True
)
# Process results
seen_urls = {}
for task_info, result in zip(all_search_tasks, search_results):
provider_id = task_info["provider_id"]
search_query = task_info["query"]
if isinstance(result, Exception):
print(
f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}"
)
elif result:
print(
f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results"
)
if provider_id not in results:
results[provider_id] = []
provider_results = results[provider_id]
for item in result:
url = item.get("url", "")
if url and url not in seen_urls:
seen_urls[url] = True
if search_query.lower() == q.lower():
item["_relevance_boost"] = 1.0
else:
item["_relevance_boost"] = 0.5
provider_results.append(item)
else:
print(f"[SEARCH] {provider_id} (query: '{search_query}') no results")
# Sort results by relevance
for provider_id in results:
results[provider_id].sort(
key=lambda x: (
-x.get("_relevance_boost", 0),
(x.get("title") or "").lower().find(q.lower()),
)
)
for item in results[provider_id]:
# 5. Sort results by relevance per provider
for pid in results:
results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0))
for item in results[pid]:
item.pop("_relevance_boost", None)
# Remove providers with empty results
results = {k: v for k, v in results.items() if v}
elapsed = time.time() - start_time
print(
f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
)
total_found = sum(len(r) for r in results.values())
print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.")
return {
"query": q,
"lang": lang,
@@ -197,9 +203,7 @@ async def search_series_unified(
start_time = time.time()
results = {}
series_downloaders = {"fs7": FS7Downloader()}
search_tasks = []
provider_ids = []
@@ -219,13 +223,9 @@ async def search_series_unified(
elif result:
print(f"[SERIES SEARCH] {provider_id} found {len(result)} results")
results[provider_id] = result
else:
print(f"[SERIES SEARCH] {provider_id} no results")
elapsed = time.time() - start_time
print(
f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
)
print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n")
return {"query": q, "lang": lang, "results": results}
@@ -235,7 +235,6 @@ async def get_anime_metadata(url: str):
"""Get detailed metadata for a specific anime"""
try:
downloader = get_downloader(url)
if hasattr(downloader, "get_anime_metadata"):
metadata = await downloader.get_anime_metadata(url)
return {"url": url, "metadata": metadata}
@@ -244,7 +243,6 @@ async def get_anime_metadata(url: str):
status_code=400,
detail=f"Downloader for {url} does not support metadata extraction",
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@@ -257,7 +255,6 @@ async def get_anime_episodes(
"""Get list of episodes for an anime"""
downloader = get_downloader(url)
episodes = await downloader.get_episodes(url, lang)
return {"url": url, "lang": lang, "episodes": episodes}
@@ -267,15 +264,12 @@ async def get_anime_providers_list():
return {"providers": get_anime_providers()}
# ==================== ANIME-SAMA SPECIFIC ====================
@router.get("/anime-sama/search")
async def search_anime_sama(
q: str,
lang: str = "vostfr",
):
"""Search for anime on anime-sama"""
"""Search for anime on anime-sama (legacy)"""
downloader = AnimeSamaDownloader()
results = await downloader.search_anime(q, lang)
return {"query": q, "lang": lang, "results": results}
@@ -298,65 +292,6 @@ async def download_anime_episode(
return {"task_id": task.id, "task": task}
# ==================== FRIEREN LEGACY ENDPOINTS ====================
@router.get("/anime/frieren/episodes")
async def get_frieren_episodes():
"""Get Frieren episodes from local database"""
try:
with open("app/frieren_episodes.json", "r") as f:
data = json.load(f)
return data
except Exception as e:
raise HTTPException(status_code=404, detail=f"Episodes not found: {e}")
@router.post("/anime/frieren/download")
async def download_frieren_episode(
season: int,
episode: str,
background_tasks: BackgroundTasks,
download_manager: DownloadManager = Depends(get_download_manager),
):
"""Download Frieren episode from local database"""
try:
with open("app/frieren_episodes.json", "r") as f:
data = json.load(f)
season_key = str(season)
if season_key not in data["seasons"]:
raise HTTPException(status_code=404, detail=f"Season {season} not found")
season_data = data["seasons"][season_key]
ep_data = next(
(ep for ep in season_data["episodes"] if ep["episode"] == episode), None
)
if not ep_data:
raise HTTPException(
status_code=404,
detail=f"Episode {episode} not found in season {season}",
)
url = ep_data["sibnet_url"]
filename = f"Frieren - S{season} - Episode {episode}.mp4"
request = DownloadRequest(url=url, filename=filename)
task = download_manager.create_task(request)
background_tasks.add_task(download_manager.start_download, task.id)
return {"task_id": task.id, "task": task}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
# ==================== DOWNLOAD SEASON ====================
@router.post("/anime/download-season")
async def download_anime_season(
url: str,
@@ -385,29 +320,14 @@ async def download_anime_season(
}
# ==================== SEASONS ====================
@router.get("/anime/seasons")
async def get_anime_seasons(url: str):
"""Get list of seasons for an anime"""
downloader = get_downloader(url)
if hasattr(downloader, "get_seasons"):
seasons = await downloader.get_seasons(url)
if not seasons:
return {"seasons": [], "message": "No seasons found"}
return {"seasons": seasons}
else:
return {
"seasons": [],
"message": "Season information not available for this provider",
}
# ==================== MYANIMELIST INTEGRATION ====================
return {"seasons": seasons or []}
return {"seasons": [], "message": "Season info not available for this provider"}
@router.get("/anime/mal/search")
@@ -417,103 +337,40 @@ async def search_anime_mal_details(
):
"""Search for anime on MyAnimeList and get full details"""
from app.recommendations import AnimeReleasesFetcher
fetcher = AnimeReleasesFetcher()
try:
search_results = await fetcher.search_anime(q, limit=limit)
if not search_results:
return {"anime": None, "message": "No anime found"}
main_anime = search_results[0]
anime_details = await fetcher.get_anime_details(main_anime["mal_id"])
alternatives = search_results[1:] if len(search_results) > 1 else []
return {
"anime": anime_details,
"alternatives": alternatives,
"alternatives": search_results[1:],
"total_results": len(search_results),
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
await fetcher.close()
@router.get("/anime/mal/{mal_id}")
async def get_anime_by_id(mal_id: int):
"""Get full details of an anime by its MyAnimeList ID"""
from app.recommendations import AnimeReleasesFetcher
fetcher = AnimeReleasesFetcher()
try:
anime_details = await fetcher.get_anime_details(mal_id)
if not anime_details:
raise HTTPException(status_code=404, detail="Anime not found")
return anime_details
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
await fetcher.close()
# ==================== TRANSLATION ====================
@router.post("/translate")
async def translate_text(request: Request):
"""Translate text from English to French using Google Translate"""
import httpx
from logging import getLogger
logger = getLogger(__name__)
try:
body = await request.json()
text = body.get("text", "")
if not text:
raise HTTPException(status_code=400, detail="Text is required")
text = text[:5000]
async with httpx.AsyncClient(timeout=30.0) as client:
url = "https://translate.googleapis.com/translate_a/single"
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text}
logger.info(f"Translation request for text length: {len(text)}")
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
response = await client.get(url, params=params)
logger.info(f"Translation API response status: {response.status_code}")
if response.status_code == 200:
data = response.json()
if data and len(data) > 0 and data[0]:
translated_text = "".join([item[0] for item in data[0] if item[0]])
if translated_text:
logger.info(
f"Translation successful, length: {len(translated_text)}"
)
return {"translatedText": translated_text, "status": "success"}
logger.warning(
f"Unexpected Google Translate response structure: {data}"
)
if data and data[0]:
translated = "".join([item[0] for item in data[0] if item[0]])
return {"translatedText": translated, "status": "success"}
raise HTTPException(status_code=500, detail="Translation failed")
except HTTPException:
raise
except Exception as e:
logger.error(f"Translation error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
+1
View File
@@ -11,6 +11,7 @@ beautifulsoup4==4.12.3
lxml==5.3.0
jieba==0.42.1
sqlmodel==0.0.22
PyYAML==6.0.1
# Testing dependencies
pytest==8.3.4
-2
View File
@@ -72,7 +72,6 @@ def mock_kitsu_api_raw():
}
@pytest.mark.skip(reason="New tests for non-implemented feature")
class TestMetadataEnricher:
"""Test MetadataEnricher functionality."""
@@ -389,7 +388,6 @@ class TestMetadataEnricher:
assert result.rating is None
@pytest.mark.skip(reason="New tests for non-implemented feature")
class TestMetadataEnrichmentIntegration:
"""Integration tests for metadata enrichment."""
+153
View File
@@ -0,0 +1,153 @@
"""
Tests for Phase 2: Robust Scraping (DSL, Health Checks, Unified Search)
"""
import pytest
import yaml
import os
from unittest.mock import AsyncMock, MagicMock, patch
from pathlib import Path
from app.downloaders.generic_scraper import GenericScraper
from app.providers_manager import ProvidersManager
from app.models import AnimeSearchResult, AnimeMetadata
@pytest.fixture
def mock_config_path(tmp_path):
"""Create a temporary YAML config file for testing"""
config = {
"name": "Test Site",
"id": "testsite",
"base_url": "https://test.com",
"search": {
"path": "/search?q={query}",
"container_selector": ".item",
"title_selector": "h3",
"url_selector": "a",
"image_selector": "img"
}
}
config_file = tmp_path / "testsite.yaml"
with open(config_file, 'w', encoding='utf-8') as f:
yaml.dump(config, f)
return str(config_file)
class TestGenericScraper:
"""Tests for GenericScraper driven by YAML"""
def test_init_loads_config(self, mock_config_path):
scraper = GenericScraper(mock_config_path)
assert scraper.name == "Test Site"
assert scraper.id == "testsite"
assert scraper.base_url == "https://test.com"
@pytest.mark.asyncio
async def test_search_logic(self, mock_config_path):
scraper = GenericScraper(mock_config_path)
# Mock HTTP response
mock_html = """
<div class="item">
<h3>Naruto</h3>
<a href="/naruto-page">Link</a>
<img src="/cover.jpg">
</div>
"""
with patch.object(scraper.client, 'get', return_value=MagicMock(text=mock_html)) as mock_get:
# Mock metadata enrichment to avoid real API calls
with patch('app.downloaders.generic_scraper.get_metadata_enricher') as mock_get_enricher:
mock_enricher = AsyncMock()
mock_enricher.enrich_metadata.return_value = AnimeMetadata(title="Naruto", poster_image="https://test.com/cover.jpg")
mock_get_enricher.return_value = mock_enricher
results = await scraper.search("Naruto")
assert len(results) == 1
assert results[0].title == "Naruto"
assert "test.com/naruto-page" in results[0].url
assert results[0].cover_image == "https://test.com/cover.jpg"
@pytest.mark.asyncio
async def test_check_health_success(self, mock_config_path):
scraper = GenericScraper(mock_config_path)
with patch.object(scraper, 'search', return_value=[MagicMock()]) as mock_search:
is_healthy = await scraper.check_health()
assert is_healthy is True
mock_search.assert_called_once_with("One Piece")
@pytest.mark.asyncio
async def test_check_health_failure(self, mock_config_path):
scraper = GenericScraper(mock_config_path)
with patch.object(scraper, 'search', return_value=[]) as mock_search:
is_healthy = await scraper.check_health()
assert is_healthy is False
class TestProvidersManager:
"""Tests for ProvidersManager"""
def test_load_providers(self, tmp_path):
# Create a temp providers config dir
config_dir = tmp_path / "config"
config_dir.mkdir()
# Create two mock configs
for i in range(2):
config = {"name": f"Site {i}", "id": f"site{i}", "base_url": "http://test.com"}
with open(config_dir / f"site{i}.yaml", 'w') as f:
yaml.dump(config, f)
manager = ProvidersManager(str(config_dir))
assert len(manager.providers) == 2
assert "site0" in manager.providers
assert "site1" in manager.providers
@pytest.mark.asyncio
async def test_check_all_health(self, tmp_path):
config_dir = tmp_path / "config"
config_dir.mkdir()
config = {"name": "Site", "id": "site", "base_url": "http://test.com"}
with open(config_dir / "site.yaml", 'w') as f:
yaml.dump(config, f)
manager = ProvidersManager(str(config_dir))
# Mock the health check of the scraper
with patch.object(manager.providers["site"], 'check_health', return_value=True) as mock_check:
await manager.check_all_health()
assert manager.health_status["site"]["status"] == "up"
assert manager.health_status["site"]["last_check"] is not None
@pytest.mark.asyncio
async def test_router_search_unified_modern(mock_config_path):
"""Test the modernized unified search route in the router"""
from app.routers.router_anime import search_anime_unified
from app.providers_manager import providers_manager
# Mock providers manager to return our test scraper
test_scraper = GenericScraper(mock_config_path)
mock_results = [
AnimeSearchResult(title="Naruto", url="https://test.com/n", cover_image="", type="direct")
]
test_scraper.search = AsyncMock(return_value=mock_results)
with patch.object(providers_manager, 'get_active_providers', return_value=[test_scraper]):
# Patch legacy downloaders to return nothing
with patch('app.routers.router_anime.AnimeUltimeDownloader') as mock_dl:
mock_dl.return_value.search_anime = AsyncMock(return_value=[])
# Patch metadata enricher
with patch('app.routers.router_anime.get_metadata_enricher') as mock_get_enricher:
mock_enricher = AsyncMock()
mock_enricher.enrich_metadata = AsyncMock(return_value=AnimeMetadata(title="Naruto"))
mock_enricher.enrich_search_results = AsyncMock(side_effect=lambda x: x)
mock_get_enricher.return_value = mock_enricher
response = await search_anime_unified("Naruto")
assert "results" in response
assert "testsite" in response["results"]
assert response["results"]["testsite"][0]["title"] == "Naruto"