Files
ohm_streaming/app/providers_manager.py
T
root 2b4cc617cb
CI / Test (Python 3.11) (push) Has been cancelled
CI / Test (Python 3.12) (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Type Check (push) Has been cancelled
CI / Summary (push) Has been cancelled
feat: robust scraping DSL and health monitoring (Phase 2)
- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
2026-03-24 10:57:19 +00:00

85 lines
3.1 KiB
Python

"""Manages scraper providers and their health status"""
import os
import logging
import asyncio
from typing import Dict, List, Optional
from pathlib import Path
from datetime import datetime
from app.downloaders.generic_scraper import GenericScraper
# Module-level logger, named after this module via __name__; used by
# ProvidersManager for load and health-check messages.
logger = logging.getLogger(__name__)
class ProvidersManager:
    """Registry and health manager for scraping providers.

    Two dicts, both keyed by provider id, hold the state:

    * ``providers`` maps the id to its ``GenericScraper`` instance.
    * ``health_status`` maps the id to a dict with the keys ``"status"``
      (``"unknown"``, ``"up"`` or ``"down"``), ``"last_check"`` (ISO-8601
      string, or ``None`` before the first check) and ``"error"`` (message
      string, or ``None``).
    """

    def __init__(self, config_dir: str = "app/downloaders/providers_config"):
        """Create the manager and immediately load every provider config.

        Args:
            config_dir: Directory scanned for ``*.yaml`` provider configs.
                A relative path is resolved against the current working
                directory.
        """
        self.config_dir = Path(config_dir)
        self.providers: Dict[str, GenericScraper] = {}
        self.health_status: Dict[str, Dict] = {}
        self._load_providers()

    @staticmethod
    def _status_entry(status: str, last_check: Optional[str], error: Optional[str]) -> Dict:
        """Build one ``health_status`` record with the standard three keys."""
        return {"status": status, "last_check": last_check, "error": error}

    def _load_providers(self):
        """Load all providers from the YAML configs in ``config_dir``.

        A missing directory is logged as a warning and leaves the registry
        empty. A config that fails to load is logged and skipped so one bad
        file cannot prevent the remaining providers from being registered.
        """
        if not self.config_dir.exists():
            logger.warning("Providers config directory not found: %s", self.config_dir)
            return

        # glob() yields entries in arbitrary filesystem order; sorting makes the
        # load order (and the winner of a duplicate id) reproducible.
        for config_file in sorted(self.config_dir.glob("*.yaml")):
            try:
                scraper = GenericScraper(str(config_file))
                if scraper.id in self.providers:
                    logger.warning(
                        "Duplicate provider id %s in %s; replacing the earlier definition",
                        scraper.id,
                        config_file,
                    )
                self.providers[scraper.id] = scraper
                self.health_status[scraper.id] = self._status_entry("unknown", None, None)
                logger.info("Loaded provider: %s (%s)", scraper.name, scraper.id)
            except Exception as e:
                # Deliberately broad: loading is best-effort per config file.
                logger.error("Failed to load provider from %s: %s", config_file, e)

    async def check_all_health(self):
        """Check the health of all registered providers concurrently.

        Each provider's outcome is written to ``health_status``; failures of
        individual checks are recorded there rather than raised.
        """
        logger.info("Checking health of all providers...")
        await asyncio.gather(
            *(
                self._check_single_health(provider_id, scraper)
                for provider_id, scraper in self.providers.items()
            )
        )
        logger.info("Provider health check complete")

    async def _check_single_health(self, provider_id: str, scraper: GenericScraper):
        """Check one provider and overwrite its ``health_status`` entry.

        Args:
            provider_id: Key under which the result is stored.
            scraper: Provider whose ``check_health()`` coroutine is awaited.

        A falsy result marks the provider ``"down"`` with a fixed message; an
        exception marks it ``"down"`` with the exception text and is logged,
        not re-raised.
        """
        try:
            is_healthy = await scraper.check_health()
        except Exception as e:
            # Timestamp is naive local time, matching the success path below.
            self.health_status[provider_id] = self._status_entry(
                "down", datetime.now().isoformat(), str(e)
            )
            logger.error("Health check failed for %s: %s", provider_id, e)
            return

        self.health_status[provider_id] = self._status_entry(
            "up" if is_healthy else "down",
            datetime.now().isoformat(),
            None if is_healthy else "No search results returned",
        )

    def get_provider(self, provider_id: str) -> Optional[GenericScraper]:
        """Return the provider registered under ``provider_id``, or ``None``."""
        return self.providers.get(provider_id)

    def get_active_providers(self) -> List[GenericScraper]:
        """Return only providers that are UP or UNKNOWN.

        The registry itself is iterated, so a status entry without a matching
        provider is ignored instead of raising ``KeyError``, and a provider
        without any status entry is treated as UNKNOWN (and therefore active).
        """
        return [
            scraper
            for provider_id, scraper in self.providers.items()
            if self.health_status.get(provider_id, {}).get("status") != "down"
        ]

    def get_all_status(self) -> Dict[str, Dict]:
        """Return the health mapping for every provider.

        This is the manager's live dict, not a copy: later health checks are
        visible through it, and mutating it changes the manager's state.
        """
        return self.health_status
# Global instance, created when this module is first imported.
# Construction runs ProvidersManager._load_providers() against the default
# config directory ("app/downloaders/providers_config"); that path is relative,
# so it is resolved against the process's current working directory.
providers_manager = ProvidersManager()