feat: robust scraping DSL and health monitoring (Phase 2)
- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
"""Generic scraper driven by YAML configuration"""
|
||||
import yaml
|
||||
import logging
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Dict, Optional, Any
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, quote
|
||||
|
||||
from app.downloaders.anime_sites.base import BaseAnimeSite
|
||||
from app.models import AnimeSearchResult, AnimeMetadata
|
||||
from app.metadata_enrichment import get_metadata_enricher
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GenericScraper(BaseAnimeSite):
    """Scraper whose site-specific behaviour comes from a YAML configuration.

    The configuration must provide ``id``, ``name`` and ``base_url``.
    Optional keys read by this class are ``mirrors`` (alternative base URLs)
    and ``search`` (a mapping with ``path``, ``title_selector``,
    ``url_selector`` and, optionally, ``container_selector`` and
    ``image_selector``).

    NOTE: ``mirrors`` is only stored; nothing in this class switches
    ``active_url`` to a mirror.
    """

    def __init__(self, config_path: str):
        """Load the YAML configuration and create the shared HTTP client.

        Args:
            config_path: Path to the YAML file describing the site.

        Raises:
            OSError: If the configuration file cannot be opened.
            yaml.YAMLError: If the file is not valid YAML.
            ValueError: If the file does not contain a YAML mapping
                (for example, an empty file).
            KeyError: If ``id``, ``name`` or ``base_url`` is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # An empty file loads as None; fail with a clear message instead of
        # an opaque "NoneType is not subscriptable" further down.
        if not isinstance(config, dict):
            raise ValueError(
                f"Scraper config {config_path!r} must contain a YAML mapping"
            )
        self.config = config

        self.id = config['id']
        self.name = config['name']
        self.base_url = config['base_url']
        # ``mirrors:`` with no value loads as None; normalise to a list.
        self.mirrors = config.get('mirrors') or []

        # Current active base URL (can change if mirror found)
        self.active_url = self.base_url

        self.client = httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
        )

    async def search(self, query: str) -> List[AnimeSearchResult]:
        """Search the site using the selectors from the ``search`` config.

        Args:
            query: Free-text search term; it is percent-encoded before being
                substituted into the configured ``path`` template.

        Returns:
            One ``AnimeSearchResult`` per result item that has both a title
            and a link. Returns an empty list when no ``search`` section is
            configured, or when the request or page parsing fails (the
            failure is logged, not raised).

        Raises:
            KeyError: If the ``search`` section has no ``path`` entry.
        """
        search_config = self.config.get('search')
        if not search_config:
            logger.warning("No search config for %s", self.name)
            return []

        # safe='' also escapes '/', so a title such as "Fate/Zero" cannot
        # change the URL structure when the template puts it in the path.
        search_path = search_config['path'].format(query=quote(query, safe=''))
        url = urljoin(self.active_url, search_path)

        try:
            response = await self.client.get(url)
            # Do not parse 4xx/5xx error pages as if they were result pages.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

            container = search_config.get('container_selector')
            # Without a container selector the whole page is the single item.
            items = soup.select(container) if container else [soup]
            image_selector = search_config.get('image_selector', 'img')

            results = []
            enricher = None  # fetched once, on the first usable item
            for item in items:
                try:
                    title_node = item.select_one(search_config['title_selector'])
                    url_node = item.select_one(search_config['url_selector'])
                    if title_node is None or url_node is None:
                        continue

                    href = url_node.get('href')
                    if not href:
                        # urljoin(base, None) returns the base URL, which
                        # would yield a result pointing at the site root.
                        continue

                    title = title_node.get_text(strip=True)
                    anime_url = urljoin(self.active_url, href)

                    img_node = item.select_one(image_selector)
                    cover_image = img_node.get('src') if img_node else None
                    if cover_image:
                        cover_image = urljoin(self.active_url, cover_image)

                    # Initial metadata from scraper
                    meta_dict = {
                        "poster_image": cover_image,
                        "status": "Unknown",
                    }

                    # Enrich with Kitsu via global service
                    if enricher is None:
                        enricher = await get_metadata_enricher()
                    metadata = await enricher.enrich_metadata(
                        meta_dict, title, anime_url
                    )

                    results.append(AnimeSearchResult(
                        title=title,
                        url=anime_url,
                        cover_image=metadata.poster_image or cover_image,
                        type="search_result",
                        metadata=metadata,
                    ))
                except Exception as e:
                    # Best effort: one malformed item must not drop the rest.
                    logger.error("Error parsing search result item: %s", e)

            return results

        except Exception as e:
            logger.error("Search failed for %s: %s", self.name, e)
            return []

    async def get_episodes(self, anime_url: str) -> List[Dict[str, Any]]:
        """Get episodes list (to be specialized if site logic is complex).

        Args:
            anime_url: URL of the anime page (unused by this default).

        Returns:
            Always an empty list in this default implementation.
        """
        # Default implementation for simple sites.
        # For complex sites like Anime-Sama, we might still need a specialized
        # subclass, but driven by the YAML config for base parameters.
        return []

    async def check_health(self) -> bool:
        """Check if the site is up and the search selectors still work.

        Runs a test search for a very common title and treats any non-empty
        result list as healthy.

        Returns:
            ``True`` if the test search returned at least one result,
            ``False`` if it returned none or raised.
        """
        try:
            # Try a test search for a very common anime
            results = await self.search("One Piece")
        except Exception as e:
            logger.error("Health check failed for %s with error: %s", self.name, e)
            return False

        if not results:
            logger.warning("Health check failed for %s: No results found", self.name)
            return False
        return True

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
|
||||
@@ -0,0 +1,24 @@
|
||||
# Provider definition for Anime-Sama.
# GenericScraper reads `id`, `name`, `base_url`, `mirrors` and the `search`
# section; the selectors must stay nested under their section key.
name: "Anime-Sama"
id: "animesama"
base_url: "https://anime-sama.fr"
mirrors:
  - "https://anime-sama.si"
  - "https://anime-sama.co"

search:
  # `{query}` is replaced with the percent-encoded search term.
  path: "/search?q={query}"
  container_selector: ".result-item"
  title_selector: "h3"
  url_selector: "a"
  image_selector: "img"

episodes:
  container_selector: "#episodes-list"
  item_selector: ".episode-item"
  # Episode logic for Anime-Sama can be complex: site-specific handling is
  # expected to live in scraper code, and only the common selectors are kept
  # here.
  player_iframe_selector: "iframe#player"

metadata:
  synopsis_selector: ".synopsis"
  genres_selector: ".genres .genre"
|
||||
Reference in New Issue
Block a user