feat: robust scraping DSL and health monitoring (Phase 2)
- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
This commit is contained in:
@@ -9,6 +9,7 @@ from apscheduler.triggers.interval import IntervalTrigger
|
||||
|
||||
from app.watchlist import watchlist_manager, WatchlistManager
|
||||
from app.episode_checker import EpisodeChecker, episode_checker
|
||||
from app.providers_manager import providers_manager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -23,6 +24,7 @@ class AutoDownloadScheduler:
|
||||
):
|
||||
self.wlm = wlm or watchlist_manager
|
||||
self.checker = checker or episode_checker
|
||||
self.providers_mgr = providers_manager
|
||||
self.scheduler: Optional[AsyncIOScheduler] = None
|
||||
self._running = False
|
||||
|
||||
@@ -46,6 +48,14 @@ class AutoDownloadScheduler:
|
||||
except Exception as e:
|
||||
logger.error(f"Error in scheduled check job: {e}", exc_info=True)
|
||||
|
||||
async def _health_check_job(self):
    """Run one scheduled provider health check.

    Awaits ``self.providers_mgr.check_all_health()``. Any exception is
    logged and swallowed here, so a failing check does not propagate to
    whatever invoked this job.
    """
    try:
        logger.info("Running scheduled provider health check...")
        await self.providers_mgr.check_all_health()
    except Exception as e:
        # Log the traceback too, as the episode-check job does; the
        # message alone is rarely enough to diagnose a failure.
        logger.error(f"Error in health check job: {e}", exc_info=True)
|
||||
|
||||
def start(self):
|
||||
"""Start the scheduler"""
|
||||
if self._running:
|
||||
@@ -59,7 +69,7 @@ class AutoDownloadScheduler:
|
||||
settings = self.wlm.get_settings()
|
||||
interval_hours = settings.check_interval_hours
|
||||
|
||||
# Add the job
|
||||
# Add the job for episode checking
|
||||
self.scheduler.add_job(
|
||||
self._check_job,
|
||||
trigger=IntervalTrigger(hours=interval_hours),
|
||||
@@ -68,6 +78,15 @@ class AutoDownloadScheduler:
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
# Add the job for provider health check (every 6 hours)
|
||||
self.scheduler.add_job(
|
||||
self._health_check_job,
|
||||
trigger=IntervalTrigger(hours=6),
|
||||
id='provider_health',
|
||||
name='Check provider health',
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
# Start the scheduler
|
||||
self.scheduler.start()
|
||||
self._running = True
|
||||
@@ -149,6 +168,15 @@ class AutoDownloadScheduler:
|
||||
logger.error(f"Error in manual check: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def trigger_health_check_now(self):
    """Run a provider health check immediately, outside the schedule.

    Unlike the scheduled job, which logs and swallows failures, this
    awaits the providers manager directly so that an error is logged and
    then re-raised to the caller.

    Raises:
        Exception: Whatever ``self.providers_mgr.check_all_health()``
            raises.
    """
    logger.info("Manually triggering provider health check...")
    try:
        # Going through _health_check_job() would make the handler below
        # unreachable, because that job catches every exception itself.
        await self.providers_mgr.check_all_health()
    except Exception as e:
        logger.error(f"Error in manual health check: {e}", exc_info=True)
        raise
|
||||
|
||||
|
||||
# Global scheduler instance
|
||||
auto_download_scheduler = AutoDownloadScheduler()
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
"""Generic scraper driven by YAML configuration"""
|
||||
import yaml
|
||||
import logging
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Dict, Optional, Any
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, quote
|
||||
|
||||
from app.downloaders.anime_sites.base import BaseAnimeSite
|
||||
from app.models import AnimeSearchResult, AnimeMetadata
|
||||
from app.metadata_enrichment import get_metadata_enricher
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GenericScraper(BaseAnimeSite):
    """Anime site scraper whose behaviour is described by a YAML file.

    The configuration supplies the site identity (``id``, ``name``,
    ``base_url``, optional ``mirrors``) and an optional ``search`` section
    holding the request path template and the CSS selectors used to pull
    results out of the returned HTML.
    """

    def __init__(self, config_path: str):
        """Load the YAML configuration and create the HTTP client.

        Args:
            config_path: Path of the provider's YAML configuration file.

        Raises:
            OSError: If the file cannot be opened.
            yaml.YAMLError: If the file is not valid YAML.
            ValueError: If the document is not a mapping.
            KeyError: If ``id``, ``name`` or ``base_url`` is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # An empty or list-shaped file would otherwise fail on the key
        # lookups below with an unhelpful TypeError.
        if not isinstance(config, dict):
            raise ValueError(
                f"Provider config {config_path} must be a YAML mapping"
            )

        self.config = config
        self.id = self.config['id']
        self.name = self.config['name']
        self.base_url = self.config['base_url']
        self.mirrors = self.config.get('mirrors', [])

        # Base URL every request is built against. It starts as base_url;
        # nothing in this class switches it to one of the mirrors.
        self.active_url = self.base_url

        self.client = httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )

    async def search(self, query: str) -> List[AnimeSearchResult]:
        """Search the site using the configured path and selectors.

        Each parsed item is passed through the metadata enricher before
        being wrapped in an ``AnimeSearchResult``.

        Args:
            query: Free-text search terms; URL-quoted before being
                substituted into the configured path template.

        Returns:
            The parsed results. An empty list is returned when the config
            has no ``search`` section or when the request or page parsing
            fails (the failure is logged). An item that fails to parse is
            logged and skipped without aborting the others.

        Raises:
            KeyError: If the ``search`` section has no ``path`` entry.
        """
        search_config = self.config.get('search')
        if not search_config:
            logger.warning(f"No search config for {self.name}")
            return []

        search_path = search_config['path'].format(query=quote(query))
        url = urljoin(self.active_url, search_path)

        try:
            response = await self.client.get(url)
            # Treat 4xx/5xx as a failed search instead of parsing the
            # error page and silently reporting "no results".
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

            container = search_config.get('container_selector')
            # Without a container selector the whole page is one item.
            items = soup.select(container) if container else [soup]

            results = []
            # Fetched on the first usable item and then reused, rather
            # than awaited again for every result.
            enricher = None

            for item in items:
                try:
                    title_node = item.select_one(search_config['title_selector'])
                    url_node = item.select_one(search_config['url_selector'])

                    if not title_node or not url_node:
                        continue

                    title = title_node.get_text(strip=True)
                    href = url_node.get('href')
                    if not href:
                        # urljoin() with an empty href returns the base
                        # URL, which would produce a result pointing at
                        # the site root.
                        continue
                    anime_url = urljoin(self.active_url, href)

                    img_node = item.select_one(search_config.get('image_selector', 'img'))
                    cover_image = img_node.get('src') if img_node else None
                    if cover_image:
                        cover_image = urljoin(self.active_url, cover_image)

                    # Initial metadata from scraper
                    meta_dict = {
                        "poster_image": cover_image,
                        "status": "Unknown"
                    }

                    if enricher is None:
                        enricher = await get_metadata_enricher()
                    metadata = await enricher.enrich_metadata(meta_dict, title, anime_url)

                    results.append(AnimeSearchResult(
                        title=title,
                        url=anime_url,
                        # Prefer the enriched poster, fall back to the
                        # image scraped from the page.
                        cover_image=metadata.poster_image or cover_image,
                        type="search_result",
                        metadata=metadata
                    ))
                except Exception as e:
                    logger.error(f"Error parsing search result item: {e}")

            return results

        except Exception as e:
            logger.error(f"Search failed for {self.name}: {e}")
            return []

    async def get_episodes(self, anime_url: str) -> List[Dict[str, Any]]:
        """Return the episode list for *anime_url*.

        This default implementation performs no scraping and always
        returns an empty list; sites that need real episode extraction
        are expected to override it.
        """
        return []

    async def check_health(self) -> bool:
        """Report whether a test search still yields results.

        Runs ``search("One Piece")`` and treats at least one result as
        healthy.

        Returns:
            True if the test search returned results, otherwise False
            (including when the search raised).
        """
        try:
            results = await self.search("One Piece")
            is_healthy = len(results) > 0
            if not is_healthy:
                logger.warning(f"Health check failed for {self.name}: No results found")
            return is_healthy
        except Exception as e:
            logger.error(f"Health check failed for {self.name} with error: {e}")
            return False

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
|
||||
@@ -0,0 +1,24 @@
|
||||
name: "Anime-Sama"
|
||||
id: "animesama"
|
||||
base_url: "https://anime-sama.fr"
|
||||
mirrors:
|
||||
- "https://anime-sama.si"
|
||||
- "https://anime-sama.co"
|
||||
|
||||
search:
|
||||
path: "/search?q={query}"
|
||||
container_selector: ".result-item"
|
||||
title_selector: "h3"
|
||||
url_selector: "a"
|
||||
image_selector: "img"
|
||||
|
||||
episodes:
|
||||
container_selector: "#episodes-list"
|
||||
item_selector: ".episode-item"
|
||||
# Logic for Anime-Sama can be complex, we'll handle custom logic in GenericScraper
|
||||
# but keep common selectors here.
|
||||
player_iframe_selector: "iframe#player"
|
||||
|
||||
metadata:
|
||||
synopsis_selector: ".synopsis"
|
||||
genres_selector: ".genres .genre"
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Manages scraper providers and their health status"""
|
||||
import os
|
||||
import logging
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
from app.downloaders.generic_scraper import GenericScraper
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ProvidersManager:
    """Registry of YAML-configured scrapers together with their health state."""

    def __init__(self, config_dir: str = "app/downloaders/providers_config"):
        """Create the registry and load every provider found in *config_dir*.

        Args:
            config_dir: Directory holding one ``*.yaml`` file per provider.
        """
        self.config_dir = Path(config_dir)
        # provider id -> scraper instance
        self.providers: Dict[str, GenericScraper] = {}
        # provider id -> {"status": "unknown" | "up" | "down",
        #                 "last_check": ISO timestamp or None,
        #                 "error": message or None}
        self.health_status: Dict[str, Dict] = {}
        self._load_providers()

    def _load_providers(self):
        """Load all providers from the YAML files in the config directory.

        A missing directory is logged and leaves the registry empty. A
        file that fails to load is logged and skipped.
        """
        if not self.config_dir.exists():
            logger.warning(f"Providers config directory not found: {self.config_dir}")
            return

        # Sorted so that the load order, and which file wins when two
        # declare the same id, does not depend on directory enumeration.
        for config_file in sorted(self.config_dir.glob("*.yaml")):
            try:
                scraper = GenericScraper(str(config_file))
                if scraper.id in self.providers:
                    logger.warning(
                        f"Duplicate provider id '{scraper.id}' in {config_file}; "
                        "replacing the previously loaded provider"
                    )
                self.providers[scraper.id] = scraper
                self.health_status[scraper.id] = {
                    "status": "unknown",
                    "last_check": None,
                    "error": None
                }
                logger.info(f"Loaded provider: {scraper.name} ({scraper.id})")
            except Exception as e:
                logger.error(
                    f"Failed to load provider from {config_file}: {e}",
                    exc_info=True,
                )

    async def check_all_health(self):
        """Check the health of all registered providers concurrently."""
        logger.info("Checking health of all providers...")
        # Snapshot the registry so the set of checks is fixed up front.
        checks = [
            self._check_single_health(provider_id, scraper)
            for provider_id, scraper in list(self.providers.items())
        ]
        await asyncio.gather(*checks)
        logger.info("Provider health check complete")

    async def _check_single_health(self, provider_id: str, scraper: GenericScraper):
        """Check one provider and record the outcome in ``health_status``.

        An exception from the scraper is recorded as ``"down"`` with the
        exception text as the error; it is not re-raised.
        """
        try:
            is_healthy = await scraper.check_health()
            self.health_status[provider_id] = {
                "status": "up" if is_healthy else "down",
                "last_check": datetime.now().isoformat(),
                "error": None if is_healthy else "No search results returned"
            }
        except Exception as e:
            self.health_status[provider_id] = {
                "status": "down",
                "last_check": datetime.now().isoformat(),
                "error": str(e)
            }
            logger.error(f"Health check failed for {provider_id}: {e}", exc_info=True)

    def get_provider(self, provider_id: str) -> Optional[GenericScraper]:
        """Return the scraper registered under *provider_id*, or None."""
        return self.providers.get(provider_id)

    def get_active_providers(self) -> List[GenericScraper]:
        """Return the providers whose status is not "down" (up or unknown)."""
        return [
            self.providers[pid] for pid, status in self.health_status.items()
            if status["status"] != "down"
        ]

    def get_all_status(self) -> Dict[str, Dict]:
        """Return a copy of the health entry of every provider.

        The mapping and each entry are copied so that callers cannot
        modify the manager's own records through the returned value.
        """
        return {pid: dict(entry) for pid, entry in self.health_status.items()}
|
||||
|
||||
|
||||
# Global instance
|
||||
providers_manager = ProvidersManager()
|
||||
+122
-265
@@ -2,15 +2,14 @@
|
||||
Anime and series search routes for Ohm Stream Downloader API.
|
||||
|
||||
Endpoints:
|
||||
- GET /api/anime/search - Search across all anime providers
|
||||
- GET /api/anime/search - Search across all anime providers (Modernized with Kitsu)
|
||||
- GET /api/series/search - Search across all TV series providers
|
||||
- GET /api/anime/metadata - Get detailed metadata for a specific anime
|
||||
- GET /api/anime/episodes - Get list of episodes for an anime
|
||||
- GET /api/anime/providers - Get list of anime providers
|
||||
- GET /api/anime-sama/search - Search for anime on anime-sama (legacy)
|
||||
- GET /api/providers/health - Get provider health status
|
||||
- POST /api/providers/health/check - Trigger health check
|
||||
- POST /api/anime/download - Download an anime episode
|
||||
- GET /api/anime/frieren/episodes - Get Frieren episodes from local database
|
||||
- POST /api/anime/frieren/download - Download Frieren episode from local database
|
||||
- POST /api/anime/download-season - Download all episodes of a season
|
||||
- GET /api/anime/seasons - Get list of seasons for an anime
|
||||
- GET /api/anime/mal/search - Search for anime on MyAnimeList
|
||||
@@ -21,6 +20,8 @@ Endpoints:
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import asyncio
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
|
||||
|
||||
@@ -34,14 +35,30 @@ from app.downloaders import (
|
||||
)
|
||||
from app.models import DownloadRequest
|
||||
from app.providers import get_anime_providers, get_series_providers
|
||||
from app.providers_manager import providers_manager
|
||||
from app.metadata_enrichment import get_metadata_enricher
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api", tags=["anime"])
|
||||
|
||||
|
||||
@router.get("/providers/health")
async def get_providers_health():
    """Report the health status the providers manager holds for every provider."""
    status_by_provider = providers_manager.get_all_status()
    return status_by_provider
|
||||
|
||||
|
||||
@router.post("/providers/health/check")
async def trigger_providers_health_check(background_tasks: BackgroundTasks):
    """Queue a health check of all providers and acknowledge the request.

    The check is registered as a background task; the response only
    confirms that it was queued and carries no health results.
    """
    # Imported inside the handler rather than at module level.
    from app.auto_download_scheduler import auto_download_scheduler as scheduler

    run_check = scheduler.trigger_health_check_now
    background_tasks.add_task(run_check)
    return {"status": "Health check triggered in background"}
|
||||
|
||||
|
||||
def get_download_manager() -> DownloadManager:
    """Return the ``download_manager`` object defined in the ``main`` module.

    The import happens on each call, inside the function, rather than
    when this module is imported.
    """
    from main import download_manager as manager

    return manager
|
||||
|
||||
|
||||
@@ -55,125 +72,114 @@ async def search_anime_unified(
|
||||
include_metadata: bool = False,
|
||||
):
|
||||
"""
|
||||
Search across all anime providers
|
||||
|
||||
Args:
|
||||
q: Search query
|
||||
lang: Language preference (vostfr, vf)
|
||||
include_metadata: Whether to fetch full metadata (slower but more detailed)
|
||||
Search across all anime providers using MetadataEnricher and health checks.
|
||||
Results are grouped by provider for legacy UI compatibility.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
print(
|
||||
f"\n[SEARCH] Starting search for '{q}' in {lang} (metadata={include_metadata})"
|
||||
)
|
||||
print(f"\n[SEARCH] Starting modern unified search for '{q}' in {lang}")
|
||||
start_time = time.time()
|
||||
|
||||
results = {}
|
||||
|
||||
# 1. Prepare search tasks (Generic + Legacy)
|
||||
search_tasks = []
|
||||
task_metadata = []
|
||||
|
||||
# Create downloader instances
|
||||
downloaders = {
|
||||
"anime-sama": AnimeSamaDownloader(),
|
||||
# Generic YAML providers
|
||||
active_generic = providers_manager.get_active_providers()
|
||||
for provider in active_generic:
|
||||
print(f"[SEARCH] Queueing generic provider: {provider.name}")
|
||||
search_tasks.append(provider.search(q))
|
||||
task_metadata.append({"id": provider.id, "type": "generic"})
|
||||
|
||||
# Legacy providers (until migrated to YAML)
|
||||
legacy_downloaders = {
|
||||
"anime-ultime": AnimeUltimeDownloader(),
|
||||
"neko-sama": NekoSamaDownloader(),
|
||||
"vostfree": VostfreeDownloader(),
|
||||
}
|
||||
for pid, dl in legacy_downloaders.items():
|
||||
print(f"[SEARCH] Queueing legacy provider: {pid}")
|
||||
search_tasks.append(dl.search_anime(q, lang, include_metadata=False))
|
||||
task_metadata.append({"id": pid, "type": "legacy"})
|
||||
|
||||
# Generate search query variations for better matching
|
||||
search_queries = [q]
|
||||
# 2. Run searches in parallel
|
||||
print(f"[SEARCH] Waiting for {len(search_tasks)} provider results...")
|
||||
all_raw_results = await asyncio.gather(*search_tasks, return_exceptions=True)
|
||||
|
||||
# Add fallback queries if original has spaces
|
||||
if " " in q or "-" in q:
|
||||
normalized = re.sub(r"[\s\-–—_:]+", "", q)
|
||||
if normalized != q and len(normalized) >= 4:
|
||||
search_queries.append(normalized)
|
||||
# 3. Organize results by provider
|
||||
seen_urls = set()
|
||||
enricher = await get_metadata_enricher()
|
||||
enrichment_tasks = []
|
||||
|
||||
# Map task indices to result slots for re-injection after enrichment
|
||||
enrichment_mapping = [] # List of (provider_id, index_in_provider_results)
|
||||
|
||||
first_word = q.split()[0] if q.split() else None
|
||||
if first_word and len(first_word) >= 4:
|
||||
search_queries.append(first_word)
|
||||
for i, raw_result in enumerate(all_raw_results):
|
||||
provider_info = task_metadata[i]
|
||||
pid = provider_info["id"]
|
||||
|
||||
if isinstance(raw_result, Exception):
|
||||
logger.error(f"Search failed for {pid}: {raw_result}")
|
||||
continue
|
||||
|
||||
if not raw_result:
|
||||
continue
|
||||
|
||||
if pid not in results:
|
||||
results[pid] = []
|
||||
|
||||
for item in raw_result:
|
||||
# Normalize to dict
|
||||
item_dict = item.model_dump() if hasattr(item, "model_dump") else item
|
||||
url = item_dict.get("url")
|
||||
|
||||
if url and url not in seen_urls:
|
||||
seen_urls.add(url)
|
||||
|
||||
# Check relevance simple boost
|
||||
if q.lower() in (item_dict.get("title") or "").lower():
|
||||
item_dict["_relevance_boost"] = 1.0
|
||||
else:
|
||||
item_dict["_relevance_boost"] = 0.5
|
||||
|
||||
results[pid].append(item_dict)
|
||||
|
||||
# Prepare enrichment task for top 5 results per provider
|
||||
if len(results[pid]) <= 5:
|
||||
enrichment_tasks.append(
|
||||
enricher.enrich_metadata(
|
||||
item_dict.get("metadata", {}),
|
||||
item_dict.get("title", ""),
|
||||
url
|
||||
)
|
||||
)
|
||||
enrichment_mapping.append((pid, len(results[pid]) - 1))
|
||||
else:
|
||||
if "metadata" not in item_dict:
|
||||
item_dict["metadata"] = {}
|
||||
|
||||
print(f"[SEARCH] Query variations: {search_queries}")
|
||||
# 4. Perform parallel enrichment
|
||||
if enrichment_tasks:
|
||||
print(f"[SEARCH] Enriching {len(enrichment_tasks)} top results via Kitsu...")
|
||||
enriched_metas = await asyncio.gather(*enrichment_tasks, return_exceptions=True)
|
||||
|
||||
# Re-inject enriched metadata
|
||||
for idx, (pid, pos) in enumerate(enrichment_mapping):
|
||||
if idx < len(enriched_metas):
|
||||
meta = enriched_metas[idx]
|
||||
if not isinstance(meta, Exception) and meta:
|
||||
results[pid][pos]["metadata"] = meta.model_dump()
|
||||
|
||||
# Search with fallback queries
|
||||
all_search_tasks = []
|
||||
all_provider_ids = []
|
||||
|
||||
for search_query in search_queries:
|
||||
print(f"[SEARCH] Trying query variant: '{search_query}'")
|
||||
|
||||
for provider_id, provider in get_anime_providers().items():
|
||||
if provider_id in downloaders:
|
||||
downloader = downloaders[provider_id]
|
||||
print(
|
||||
f"[SEARCH] Queueing search on {provider_id} for '{search_query}'..."
|
||||
)
|
||||
all_search_tasks.append(
|
||||
{
|
||||
"query": search_query,
|
||||
"provider_id": provider_id,
|
||||
"task": downloader.search_anime(
|
||||
search_query, lang, include_metadata=include_metadata
|
||||
),
|
||||
}
|
||||
)
|
||||
all_provider_ids.append(provider_id)
|
||||
|
||||
print(f"[SEARCH] Waiting for {len(all_search_tasks)} searches...")
|
||||
search_results = await asyncio.gather(
|
||||
*[t["task"] for t in all_search_tasks], return_exceptions=True
|
||||
)
|
||||
|
||||
# Process results
|
||||
seen_urls = {}
|
||||
|
||||
for task_info, result in zip(all_search_tasks, search_results):
|
||||
provider_id = task_info["provider_id"]
|
||||
search_query = task_info["query"]
|
||||
|
||||
if isinstance(result, Exception):
|
||||
print(
|
||||
f"[SEARCH] {provider_id} (query: '{search_query}') error: {str(result)}"
|
||||
)
|
||||
elif result:
|
||||
print(
|
||||
f"[SEARCH] {provider_id} (query: '{search_query}') found {len(result)} results"
|
||||
)
|
||||
|
||||
if provider_id not in results:
|
||||
results[provider_id] = []
|
||||
|
||||
provider_results = results[provider_id]
|
||||
for item in result:
|
||||
url = item.get("url", "")
|
||||
if url and url not in seen_urls:
|
||||
seen_urls[url] = True
|
||||
if search_query.lower() == q.lower():
|
||||
item["_relevance_boost"] = 1.0
|
||||
else:
|
||||
item["_relevance_boost"] = 0.5
|
||||
provider_results.append(item)
|
||||
else:
|
||||
print(f"[SEARCH] {provider_id} (query: '{search_query}') no results")
|
||||
|
||||
# Sort results by relevance
|
||||
for provider_id in results:
|
||||
results[provider_id].sort(
|
||||
key=lambda x: (
|
||||
-x.get("_relevance_boost", 0),
|
||||
(x.get("title") or "").lower().find(q.lower()),
|
||||
)
|
||||
)
|
||||
for item in results[provider_id]:
|
||||
# 5. Sort results by relevance per provider
|
||||
for pid in results:
|
||||
results[pid].sort(key=lambda x: -x.get("_relevance_boost", 0))
|
||||
for item in results[pid]:
|
||||
item.pop("_relevance_boost", None)
|
||||
|
||||
# Remove providers with empty results
|
||||
results = {k: v for k, v in results.items() if v}
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print(
|
||||
f"[SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
|
||||
)
|
||||
|
||||
total_found = sum(len(r) for r in results.values())
|
||||
print(f"[SEARCH] Finished in {elapsed:.2f}s. Found {total_found} unique results across {len(results)} providers.")
|
||||
|
||||
return {
|
||||
"query": q,
|
||||
"lang": lang,
|
||||
@@ -197,9 +203,7 @@ async def search_series_unified(
|
||||
start_time = time.time()
|
||||
|
||||
results = {}
|
||||
|
||||
series_downloaders = {"fs7": FS7Downloader()}
|
||||
|
||||
search_tasks = []
|
||||
provider_ids = []
|
||||
|
||||
@@ -219,13 +223,9 @@ async def search_series_unified(
|
||||
elif result:
|
||||
print(f"[SERIES SEARCH] {provider_id} found {len(result)} results")
|
||||
results[provider_id] = result
|
||||
else:
|
||||
print(f"[SERIES SEARCH] {provider_id} no results")
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print(
|
||||
f"[SERIES SEARCH] Completed in {elapsed:.2f}s - Total results: {sum(len(r) for r in results.values())}\n"
|
||||
)
|
||||
print(f"[SERIES SEARCH] Completed in {elapsed:.2f}s\n")
|
||||
|
||||
return {"query": q, "lang": lang, "results": results}
|
||||
|
||||
@@ -235,7 +235,6 @@ async def get_anime_metadata(url: str):
|
||||
"""Get detailed metadata for a specific anime"""
|
||||
try:
|
||||
downloader = get_downloader(url)
|
||||
|
||||
if hasattr(downloader, "get_anime_metadata"):
|
||||
metadata = await downloader.get_anime_metadata(url)
|
||||
return {"url": url, "metadata": metadata}
|
||||
@@ -244,7 +243,6 @@ async def get_anime_metadata(url: str):
|
||||
status_code=400,
|
||||
detail=f"Downloader for {url} does not support metadata extraction",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@@ -257,7 +255,6 @@ async def get_anime_episodes(
|
||||
"""Get list of episodes for an anime"""
|
||||
downloader = get_downloader(url)
|
||||
episodes = await downloader.get_episodes(url, lang)
|
||||
|
||||
return {"url": url, "lang": lang, "episodes": episodes}
|
||||
|
||||
|
||||
@@ -267,15 +264,12 @@ async def get_anime_providers_list():
|
||||
return {"providers": get_anime_providers()}
|
||||
|
||||
|
||||
# ==================== ANIME-SAMA SPECIFIC ====================
|
||||
|
||||
|
||||
@router.get("/anime-sama/search")
|
||||
async def search_anime_sama(
|
||||
q: str,
|
||||
lang: str = "vostfr",
|
||||
):
|
||||
"""Search for anime on anime-sama"""
|
||||
"""Search for anime on anime-sama (legacy)"""
|
||||
downloader = AnimeSamaDownloader()
|
||||
results = await downloader.search_anime(q, lang)
|
||||
return {"query": q, "lang": lang, "results": results}
|
||||
@@ -298,65 +292,6 @@ async def download_anime_episode(
|
||||
return {"task_id": task.id, "task": task}
|
||||
|
||||
|
||||
# ==================== FRIEREN LEGACY ENDPOINTS ====================
|
||||
|
||||
|
||||
@router.get("/anime/frieren/episodes")
|
||||
async def get_frieren_episodes():
|
||||
"""Get Frieren episodes from local database"""
|
||||
try:
|
||||
with open("app/frieren_episodes.json", "r") as f:
|
||||
data = json.load(f)
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=404, detail=f"Episodes not found: {e}")
|
||||
|
||||
|
||||
@router.post("/anime/frieren/download")
|
||||
async def download_frieren_episode(
|
||||
season: int,
|
||||
episode: str,
|
||||
background_tasks: BackgroundTasks,
|
||||
download_manager: DownloadManager = Depends(get_download_manager),
|
||||
):
|
||||
"""Download Frieren episode from local database"""
|
||||
try:
|
||||
with open("app/frieren_episodes.json", "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
season_key = str(season)
|
||||
if season_key not in data["seasons"]:
|
||||
raise HTTPException(status_code=404, detail=f"Season {season} not found")
|
||||
|
||||
season_data = data["seasons"][season_key]
|
||||
ep_data = next(
|
||||
(ep for ep in season_data["episodes"] if ep["episode"] == episode), None
|
||||
)
|
||||
|
||||
if not ep_data:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=f"Episode {episode} not found in season {season}",
|
||||
)
|
||||
|
||||
url = ep_data["sibnet_url"]
|
||||
filename = f"Frieren - S{season} - Episode {episode}.mp4"
|
||||
|
||||
request = DownloadRequest(url=url, filename=filename)
|
||||
task = download_manager.create_task(request)
|
||||
background_tasks.add_task(download_manager.start_download, task.id)
|
||||
|
||||
return {"task_id": task.id, "task": task}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
|
||||
|
||||
|
||||
# ==================== DOWNLOAD SEASON ====================
|
||||
|
||||
|
||||
@router.post("/anime/download-season")
|
||||
async def download_anime_season(
|
||||
url: str,
|
||||
@@ -385,29 +320,14 @@ async def download_anime_season(
|
||||
}
|
||||
|
||||
|
||||
# ==================== SEASONS ====================
|
||||
|
||||
|
||||
@router.get("/anime/seasons")
|
||||
async def get_anime_seasons(url: str):
|
||||
"""Get list of seasons for an anime"""
|
||||
downloader = get_downloader(url)
|
||||
|
||||
if hasattr(downloader, "get_seasons"):
|
||||
seasons = await downloader.get_seasons(url)
|
||||
|
||||
if not seasons:
|
||||
return {"seasons": [], "message": "No seasons found"}
|
||||
|
||||
return {"seasons": seasons}
|
||||
else:
|
||||
return {
|
||||
"seasons": [],
|
||||
"message": "Season information not available for this provider",
|
||||
}
|
||||
|
||||
|
||||
# ==================== MYANIMELIST INTEGRATION ====================
|
||||
return {"seasons": seasons or []}
|
||||
return {"seasons": [], "message": "Season info not available for this provider"}
|
||||
|
||||
|
||||
@router.get("/anime/mal/search")
|
||||
@@ -417,103 +337,40 @@ async def search_anime_mal_details(
|
||||
):
|
||||
"""Search for anime on MyAnimeList and get full details"""
|
||||
from app.recommendations import AnimeReleasesFetcher
|
||||
|
||||
fetcher = AnimeReleasesFetcher()
|
||||
|
||||
try:
|
||||
search_results = await fetcher.search_anime(q, limit=limit)
|
||||
|
||||
if not search_results:
|
||||
return {"anime": None, "message": "No anime found"}
|
||||
|
||||
main_anime = search_results[0]
|
||||
anime_details = await fetcher.get_anime_details(main_anime["mal_id"])
|
||||
|
||||
alternatives = search_results[1:] if len(search_results) > 1 else []
|
||||
|
||||
return {
|
||||
"anime": anime_details,
|
||||
"alternatives": alternatives,
|
||||
"alternatives": search_results[1:],
|
||||
"total_results": len(search_results),
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
await fetcher.close()
|
||||
|
||||
|
||||
@router.get("/anime/mal/{mal_id}")
|
||||
async def get_anime_by_id(mal_id: int):
|
||||
"""Get full details of an anime by its MyAnimeList ID"""
|
||||
from app.recommendations import AnimeReleasesFetcher
|
||||
|
||||
fetcher = AnimeReleasesFetcher()
|
||||
|
||||
try:
|
||||
anime_details = await fetcher.get_anime_details(mal_id)
|
||||
|
||||
if not anime_details:
|
||||
raise HTTPException(status_code=404, detail="Anime not found")
|
||||
|
||||
return anime_details
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
await fetcher.close()
|
||||
|
||||
|
||||
# ==================== TRANSLATION ====================
|
||||
|
||||
|
||||
@router.post("/translate")
|
||||
async def translate_text(request: Request):
|
||||
"""Translate text from English to French using Google Translate"""
|
||||
import httpx
|
||||
from logging import getLogger
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
try:
|
||||
body = await request.json()
|
||||
text = body.get("text", "")
|
||||
|
||||
if not text:
|
||||
raise HTTPException(status_code=400, detail="Text is required")
|
||||
|
||||
text = text[:5000]
|
||||
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
url = "https://translate.googleapis.com/translate_a/single"
|
||||
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text}
|
||||
|
||||
logger.info(f"Translation request for text length: {len(text)}")
|
||||
|
||||
params = {"client": "gtx", "sl": "en", "tl": "fr", "dt": "t", "q": text[:5000]}
|
||||
response = await client.get(url, params=params)
|
||||
|
||||
logger.info(f"Translation API response status: {response.status_code}")
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
|
||||
if data and len(data) > 0 and data[0]:
|
||||
translated_text = "".join([item[0] for item in data[0] if item[0]])
|
||||
|
||||
if translated_text:
|
||||
logger.info(
|
||||
f"Translation successful, length: {len(translated_text)}"
|
||||
)
|
||||
return {"translatedText": translated_text, "status": "success"}
|
||||
|
||||
logger.warning(
|
||||
f"Unexpected Google Translate response structure: {data}"
|
||||
)
|
||||
|
||||
if data and data[0]:
|
||||
translated = "".join([item[0] for item in data[0] if item[0]])
|
||||
return {"translatedText": translated, "status": "success"}
|
||||
raise HTTPException(status_code=500, detail="Translation failed")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Translation error: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
|
||||
|
||||
@@ -11,6 +11,7 @@ beautifulsoup4==4.12.3
|
||||
lxml==5.3.0
|
||||
jieba==0.42.1
|
||||
sqlmodel==0.0.22
|
||||
PyYAML==6.0.1
|
||||
|
||||
# Testing dependencies
|
||||
pytest==8.3.4
|
||||
|
||||
@@ -72,7 +72,6 @@ def mock_kitsu_api_raw():
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="New tests for non-implemented feature")
|
||||
class TestMetadataEnricher:
|
||||
"""Test MetadataEnricher functionality."""
|
||||
|
||||
@@ -389,7 +388,6 @@ class TestMetadataEnricher:
|
||||
assert result.rating is None
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="New tests for non-implemented feature")
|
||||
class TestMetadataEnrichmentIntegration:
|
||||
"""Integration tests for metadata enrichment."""
|
||||
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
"""
|
||||
Tests for Phase 2: Robust Scraping (DSL, Health Checks, Unified Search)
|
||||
"""
|
||||
import pytest
|
||||
import yaml
|
||||
import os
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from pathlib import Path
|
||||
|
||||
from app.downloaders.generic_scraper import GenericScraper
|
||||
from app.providers_manager import ProvidersManager
|
||||
from app.models import AnimeSearchResult, AnimeMetadata
|
||||
|
||||
|
||||
@pytest.fixture
def mock_config_path(tmp_path):
    """Write a minimal provider definition to disk and return its path.

    The YAML file describes a fictitious site ("testsite") together with the
    search selectors used by the scraper tests in this module. It is created
    under pytest's ``tmp_path`` and the path is returned as a plain string.
    """
    search_section = {
        "path": "/search?q={query}",
        "container_selector": ".item",
        "title_selector": "h3",
        "url_selector": "a",
        "image_selector": "img",
    }
    provider_definition = {
        "name": "Test Site",
        "id": "testsite",
        "base_url": "https://test.com",
        "search": search_section,
    }

    target = tmp_path / "testsite.yaml"
    # yaml.dump without a stream returns the serialized document as a string.
    target.write_text(yaml.dump(provider_definition), encoding="utf-8")
    return str(target)
|
||||
|
||||
|
||||
class TestGenericScraper:
    """Exercise the YAML-configured GenericScraper."""

    def test_init_loads_config(self, mock_config_path):
        """The constructor exposes name, id and base_url from the YAML file."""
        loaded = GenericScraper(mock_config_path)

        assert loaded.name == "Test Site"
        assert loaded.id == "testsite"
        assert loaded.base_url == "https://test.com"

    @pytest.mark.asyncio
    async def test_search_logic(self, mock_config_path):
        """One matching container on the page yields one populated result."""
        scraper = GenericScraper(mock_config_path)

        # Canned search page holding exactly one ".item" container.
        mock_html = """
        <div class="item">
            <h3>Naruto</h3>
            <a href="/naruto-page">Link</a>
            <img src="/cover.jpg">
        </div>
        """
        canned_response = MagicMock(text=mock_html)

        # Stand-in enricher so no real metadata API is contacted.
        fake_enricher = AsyncMock()
        fake_enricher.enrich_metadata.return_value = AnimeMetadata(
            title="Naruto", poster_image="https://test.com/cover.jpg"
        )

        with patch.object(scraper.client, 'get', return_value=canned_response), \
                patch('app.downloaders.generic_scraper.get_metadata_enricher',
                      return_value=fake_enricher):
            results = await scraper.search("Naruto")

        assert len(results) == 1
        first_hit = results[0]
        assert first_hit.title == "Naruto"
        assert "test.com/naruto-page" in first_hit.url
        assert first_hit.cover_image == "https://test.com/cover.jpg"

    @pytest.mark.asyncio
    async def test_check_health_success(self, mock_config_path):
        """A non-empty probe search is reported as healthy."""
        scraper = GenericScraper(mock_config_path)
        non_empty = [MagicMock()]

        with patch.object(scraper, 'search', return_value=non_empty) as probe:
            is_healthy = await scraper.check_health()

        assert is_healthy is True
        # The health probe is expected to search for this fixed title.
        probe.assert_called_once_with("One Piece")

    @pytest.mark.asyncio
    async def test_check_health_failure(self, mock_config_path):
        """An empty probe search is reported as unhealthy."""
        scraper = GenericScraper(mock_config_path)

        with patch.object(scraper, 'search', return_value=[]):
            is_healthy = await scraper.check_health()

        assert is_healthy is False
|
||||
|
||||
|
||||
class TestProvidersManager:
    """Tests for ProvidersManager: provider discovery and health bookkeeping."""

    @staticmethod
    def _write_provider_config(config_dir, provider_id, name):
        """Write a minimal provider YAML file named ``<provider_id>.yaml``.

        The file is opened with an explicit UTF-8 encoding so the fixture data
        does not depend on the platform's default locale encoding, matching
        how the ``mock_config_path`` fixture writes its file.
        """
        config = {"name": name, "id": provider_id, "base_url": "http://test.com"}
        with open(config_dir / f"{provider_id}.yaml", 'w', encoding='utf-8') as f:
            yaml.dump(config, f)

    def test_load_providers(self, tmp_path):
        """Every YAML file in the config directory becomes a provider entry."""
        # Create a temp providers config dir
        config_dir = tmp_path / "config"
        config_dir.mkdir()

        # Create two mock configs
        for i in range(2):
            self._write_provider_config(config_dir, f"site{i}", f"Site {i}")

        manager = ProvidersManager(str(config_dir))

        assert len(manager.providers) == 2
        assert "site0" in manager.providers
        assert "site1" in manager.providers

    @pytest.mark.asyncio
    async def test_check_all_health(self, tmp_path):
        """A provider whose health check passes is recorded as "up"."""
        config_dir = tmp_path / "config"
        config_dir.mkdir()
        self._write_provider_config(config_dir, "site", "Site")

        manager = ProvidersManager(str(config_dir))

        # Mock the health check of the scraper
        with patch.object(manager.providers["site"], 'check_health', return_value=True):
            await manager.check_all_health()

        assert manager.health_status["site"]["status"] == "up"
        assert manager.health_status["site"]["last_check"] is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_router_search_unified_modern(mock_config_path):
    """Unified search returns results keyed by the provider id.

    The providers manager is patched to expose a single YAML-driven scraper
    whose ``search`` is mocked, the legacy downloader is patched to return
    nothing, and the metadata enricher is replaced by a pass-through mock.
    """
    from app.routers.router_anime import search_anime_unified
    from app.providers_manager import providers_manager

    # Scraper built from the test config, with its search replaced by a mock.
    test_scraper = GenericScraper(mock_config_path)
    canned_hit = AnimeSearchResult(
        title="Naruto", url="https://test.com/n", cover_image="", type="direct"
    )
    test_scraper.search = AsyncMock(return_value=[canned_hit])

    # Legacy downloader instance that finds nothing.
    legacy_downloader = MagicMock()
    legacy_downloader.search_anime = AsyncMock(return_value=[])

    # Enricher that hands search results back unchanged.
    fake_enricher = AsyncMock()
    fake_enricher.enrich_metadata = AsyncMock(return_value=AnimeMetadata(title="Naruto"))
    fake_enricher.enrich_search_results = AsyncMock(side_effect=lambda x: x)

    with patch.object(providers_manager, 'get_active_providers',
                      return_value=[test_scraper]), \
            patch('app.routers.router_anime.AnimeUltimeDownloader',
                  return_value=legacy_downloader), \
            patch('app.routers.router_anime.get_metadata_enricher',
                  return_value=fake_enricher):
        response = await search_anime_unified("Naruto")

    assert "results" in response
    per_provider = response["results"]
    assert "testsite" in per_provider
    assert per_provider["testsite"][0]["title"] == "Naruto"
|
||||
Reference in New Issue
Block a user