"""Generic scraper driven by YAML configuration""" import yaml import logging import httpx from bs4 import BeautifulSoup from typing import List, Dict, Optional, Any from pathlib import Path from urllib.parse import urljoin, quote from app.downloaders.anime_sites.base import BaseAnimeSite from app.models import AnimeSearchResult, AnimeMetadata from app.metadata_enrichment import get_metadata_enricher logger = logging.getLogger(__name__) class GenericScraper(BaseAnimeSite): """A scraper that uses external configuration for its logic""" def __init__(self, config_path: str): with open(config_path, 'r', encoding='utf-8') as f: self.config = yaml.safe_load(f) self.id = self.config['id'] self.name = self.config['name'] self.base_url = self.config['base_url'] self.mirrors = self.config.get('mirrors', []) # Current active base URL (can change if mirror found) self.active_url = self.base_url self.client = httpx.AsyncClient( timeout=20.0, follow_redirects=True, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } ) async def search(self, query: str) -> List[AnimeSearchResult]: """Search using configured selectors""" search_config = self.config.get('search') if not search_config: logger.warning(f"No search config for {self.name}") return [] search_path = search_config['path'].format(query=quote(query)) url = urljoin(self.active_url, search_path) try: response = await self.client.get(url) soup = BeautifulSoup(response.text, 'lxml') results = [] container = search_config.get('container_selector') items = soup.select(container) if container else [soup] for item in items: try: title_node = item.select_one(search_config['title_selector']) url_node = item.select_one(search_config['url_selector']) if not title_node or not url_node: continue title = title_node.get_text(strip=True) href = url_node.get('href') anime_url = urljoin(self.active_url, href) img_node = item.select_one(search_config.get('image_selector', 'img')) cover_image = img_node.get('src') if img_node else None if cover_image: cover_image = urljoin(self.active_url, cover_image) # Initial metadata from scraper meta_dict = { "poster_image": cover_image, "status": "Unknown" } # Enrich with Kitsu via global service enricher = await get_metadata_enricher() metadata = await enricher.enrich_metadata(meta_dict, title, anime_url) results.append(AnimeSearchResult( title=title, url=anime_url, cover_image=metadata.poster_image or cover_image, type="search_result", metadata=metadata )) except Exception as e: logger.error(f"Error parsing search result item: {e}") return results except Exception as e: logger.error(f"Search failed for {self.name}: {e}") return [] async def get_episodes(self, anime_url: str) -> List[Dict[str, Any]]: """Get episodes list (to be specialized if site logic is complex)""" # Default implementation for simple sites # For complex sites like Anime-Sama, we might still need a specialized subclass # but driven by the YAML config for base parameters. return [] async def check_health(self) -> bool: """Check if the site is up and selectors still work""" try: # Try a test search for a very common anime results = await self.search("One Piece") is_healthy = len(results) > 0 if not is_healthy: logger.warning(f"Health check failed for {self.name}: No results found") return is_healthy except Exception as e: logger.error(f"Health check failed for {self.name} with error: {e}") return False async def close(self): await self.client.aclose()