feat: Add AGENTS.md and new downloaders with metadata enrichment

- Add AGENTS.md for agentic coding guidelines
- Add Oneupload and Smoothpre video player downloaders
- Add MetadataEnrichment service with Kitsu API fallback
- Add tests for metadata enrichment and provider detection
- Update .gitignore to ignore runtime config files
This commit is contained in:
root
2026-02-24 20:14:31 +00:00
parent da5403a307
commit 2482a1fe58
7 changed files with 2119 additions and 0 deletions
+294
View File
@@ -0,0 +1,294 @@
from .base import BaseVideoPlayer
from bs4 import BeautifulSoup
import re
import asyncio
from typing import Optional
class OneuploadDownloader(BaseVideoPlayer):
    """Downloader for oneupload.to video player.

    Extraction is attempted with a headless Chromium session driven by
    Playwright first (the player loads its sources dynamically) and falls
    back to scraping the static HTML over plain HTTP.
    """

    # User agent presented by the headless browser.
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    )

    # Candidate play controls, tried in order; only button-like ones are clicked.
    _PLAY_SELECTORS = (
        'button[aria-label="Play"]',
        '.play-button',
        'button[class*="play"]',
        '.jw-icon-display',
        'video',
        '.video-wrapper video',
    )

    # Patterns applied to inline <script> contents (HTTP path). Each has
    # exactly one capture group, so re.findall() yields plain strings.
    _SCRIPT_PATTERNS = (
        r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
        r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
        r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
        r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
        r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
        r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
    )

    # Patterns applied to the fully rendered page (Playwright path).
    _HTML_PATTERNS = _SCRIPT_PATTERNS + (
        r"url\s*[:=]\s*['\"]([^'\"]+\.m3u8[^'\"]*)['\"]",
        r"url\s*[:=]\s*['\"]([^'\"]+\.mp4[^'\"]*)['\"]",
    )

    # In-page probe: JWPlayer playlist, then <video>/<source> elements, then
    # a handful of well-known window globals. Returns a URL string or null.
    _JS_FIND_SOURCE = r"""
    () => {
        // Check for JWPlayer setup
        if (window.jwplayer) {
            try {
                const playlist = window.jwplayer().getPlaylist();
                if (playlist && playlist[0] && playlist[0].sources) {
                    for (let source of playlist[0].sources) {
                        if (source.file && (source.file.includes('.m3u8') || source.file.includes('.mp4'))) {
                            return source.file;
                        }
                    }
                }
            } catch(e) {}
        }
        // Check all video elements
        const videos = document.querySelectorAll('video');
        for (let v of videos) {
            if (v.src && (v.src.includes('.m3u8') || v.src.includes('.mp4'))) {
                return v.src;
            }
            const sources = v.querySelectorAll('source');
            for (let s of sources) {
                if (s.src && (s.src.includes('.m3u8') || s.src.includes('.mp4'))) {
                    return s.src;
                }
            }
        }
        // Check window object for video URLs
        const searchKeys = ['player', 'video', 'source', 'file', 'url'];
        for (let key of searchKeys) {
            if (window[key] && typeof window[key] === 'object') {
                try {
                    const json = JSON.stringify(window[key]);
                    const match = json.match(/(https?:\/\/[^\s"\'<>]+\.(m3u8|mp4))/);
                    if (match) return match[1];
                } catch(e) {}
            }
        }
        return null;
    }
    """

    def can_handle(self, url: str) -> bool:
        """Return True when *url* points at oneupload.to (case-insensitive)."""
        return 'oneupload.to' in url.lower()

    async def get_download_link(self, url: str, target_filename: Optional[str] = None) -> tuple[str, str]:
        """
        Extract download link from Oneupload video page

        Oneupload uses a custom video player with dynamic loading

        Args:
            url: The Oneupload video page URL
            target_filename: Optional filename override

        Returns:
            Tuple of (direct_video_url, filename)

        Raises:
            Exception: if neither extraction strategy finds a video URL, or
                any step fails; the original error is chained as the cause.
        """
        try:
            print(f"[ONEUPLOAD] Extracting link from: {url}")

            # Try using Playwright first (more reliable for dynamic content)
            video_url = await self._extract_with_playwright(url)
            if not video_url:
                # Fallback to HTTP extraction
                video_url = await self._extract_with_http(url)
            if not video_url:
                raise Exception("Could not find video URL in Oneupload page")

            print(f"[ONEUPLOAD] Found video URL: {video_url[:80]}...")

            # Generate filename
            from app.utils import sanitize_filename
            if target_filename:
                filename = sanitize_filename(target_filename)
            else:
                # No override supplied: use a fixed default name.
                filename = "oneupload_video.mp4"
            return video_url, filename
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error extracting Oneupload link: {str(e)}") from e

    async def _extract_with_playwright(self, url: str) -> str | None:
        """Extract video URL using Playwright with network interception.

        Candidates are collected from intercepted responses, an in-page
        JavaScript probe and a regex scan of the rendered HTML. The first
        ``.m3u8`` candidate wins; otherwise the first candidate seen.

        Returns:
            The selected URL, or None when Playwright is not installed,
            fails, or no candidate is found.
        """
        try:
            from playwright.async_api import async_playwright

            print("[ONEUPLOAD] Launching browser with network interception...")
            video_urls = []

            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
                )
                try:
                    context = await browser.new_context(user_agent=self._USER_AGENT)
                    page = await context.new_page()

                    # Set up response interception
                    async def handle_response(response):
                        try:
                            resp_url = response.url
                            lowered = resp_url.lower()
                            content_type = response.headers.get('content-type', '').lower()
                            # Look for video files in responses.
                            # NOTE(review): this is a substring test on the whole
                            # URL, so '.ts' also matches hosts/paths that merely
                            # contain ".ts" - confirm this is intended.
                            if any(ext in lowered for ext in ('.m3u8', '.mp4', '.mkv', '.ts')):
                                if 'oneupload' not in lowered and 'google' not in lowered:
                                    print(f"[ONEUPLOAD] 🎥 Captured video URL: {resp_url[:100]}...")
                                    video_urls.append(resp_url)
                            # Also check by content-type
                            elif any(ct in content_type for ct in ('video/', 'application/x-mpegurl')):
                                if 'oneupload' not in lowered:
                                    print(f"[ONEUPLOAD] 🎥 Captured video response: {resp_url[:100]}...")
                                    video_urls.append(resp_url)
                        except Exception:
                            # Best effort: a single unreadable response must
                            # not abort the capture.
                            pass

                    page.on('response', handle_response)

                    print("[ONEUPLOAD] Navigating to page...")
                    try:
                        await page.goto(url, wait_until='networkidle', timeout=30000)
                    except Exception as e:
                        # A timeout here is tolerated; requests seen so far
                        # have already been captured.
                        print(f"[ONEUPLOAD] Navigation warning: {e}")

                    # Wait for page to load
                    print("[ONEUPLOAD] Waiting for video player to load...")
                    await asyncio.sleep(3)

                    # Try to find and click play button
                    for selector in self._PLAY_SELECTORS:
                        try:
                            element = await page.query_selector(selector)
                            if element:
                                print(f"[ONEUPLOAD] Found element: {selector}")
                                if 'button' in selector or 'jw' in selector:
                                    await element.click()
                                    await asyncio.sleep(2)
                                break
                        except Exception as e:
                            # Was a bare ``except:``; that also swallowed task
                            # cancellation. Report and try the next selector.
                            print(f"[ONEUPLOAD] Play button interaction: {e}")

                    # Wait more for network requests
                    await asyncio.sleep(4)

                    # Try JavaScript extraction
                    try:
                        js_result = await page.evaluate(self._JS_FIND_SOURCE)
                        if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
                            print(f"[ONEUPLOAD] ✅ Found video URL via JavaScript: {js_result[:100]}...")
                            video_urls.append(js_result)
                    except Exception as e:
                        print(f"[ONEUPLOAD] JS extraction error: {e}")

                    # Parse page HTML for video URLs
                    try:
                        content = await page.content()
                        for pattern in self._HTML_PATTERNS:
                            for match in re.findall(pattern, content, re.IGNORECASE):
                                # Clean up the URL (undo JSON escaping)
                                candidate = match.replace('\\/', '/').replace('\\', '')
                                lowered = candidate.lower()
                                # Case-insensitive, matching the response filter above.
                                if 'http' in lowered and 'oneupload' not in lowered and 'google' not in lowered:
                                    print(f"[ONEUPLOAD] Found in HTML: {candidate[:100]}...")
                                    video_urls.append(candidate)
                    except Exception as e:
                        print(f"[ONEUPLOAD] HTML parsing error: {e}")
                finally:
                    # Always release the browser, including on errors and
                    # task cancellation.
                    await browser.close()

            # Order-preserving de-duplication.
            unique_urls = list(dict.fromkeys(video_urls))
            if unique_urls:
                # Prefer .m3u8 (source quality); min() keeps the first such
                # candidate, exactly like a stable sort would.
                selected = min(unique_urls, key=lambda u: 0 if '.m3u8' in u.lower() else 1)
                print(f"[ONEUPLOAD] ✅ Found {len(unique_urls)} video URL(s)")
                print(f"[ONEUPLOAD] Selected: {selected[:100]}...")
                return selected

            print("[ONEUPLOAD] ❌ No video URLs found")
            return None
        except ImportError:
            print("[ONEUPLOAD] ⚠️ Playwright not installed - using HTTP extraction only")
            return None
        except Exception as e:
            print(f"[ONEUPLOAD] Playwright error: {e}")
            import traceback
            traceback.print_exc()
            return None

    async def _extract_with_http(self, url: str) -> str | None:
        """Extract video URL using simple HTTP requests.

        Looks at ``<video>``/``<source>`` tags first, then scans inline
        scripts with ``_SCRIPT_PATTERNS``. Returns the first hit, or None
        when nothing is found or the request/parsing fails.
        """
        try:
            print(f"[ONEUPLOAD] Trying HTTP extraction from: {url}")
            response = await self.client.get(url, follow_redirects=True)
            soup = BeautifulSoup(response.text, 'lxml')

            # Method 1: Look for video/source tags
            for video in soup.find_all('video'):
                src = video.get('src') or video.get('data-src')
                if src and any(ext in src for ext in ('.m3u8', '.mp4')):
                    print(f"[ONEUPLOAD] ✅ Found video in video tag: {src[:100]}...")
                    return src
                for source in video.find_all('source'):
                    src = source.get('src')
                    if src and any(ext in src for ext in ('.m3u8', '.mp4')):
                        print(f"[ONEUPLOAD] ✅ Found video in source tag: {src[:100]}...")
                        return src

            # Method 2: Look in script tags for video URLs
            for script in soup.find_all('script'):
                if not script.string:
                    continue
                for pattern in self._SCRIPT_PATTERNS:
                    for match in re.findall(pattern, script.string, re.IGNORECASE):
                        candidate = match.replace('\\/', '/')
                        lowered = candidate.lower()
                        if 'http' in lowered and 'oneupload' not in lowered:
                            print(f"[ONEUPLOAD] ✅ Found video in script: {candidate[:100]}...")
                            return candidate

            print("[ONEUPLOAD] ❌ HTTP extraction failed - no video URLs found")
            return None
        except Exception as e:
            print(f"[ONEUPLOAD] HTTP extraction error: {e}")
            return None
+290
View File
@@ -0,0 +1,290 @@
from .base import BaseVideoPlayer
from bs4 import BeautifulSoup
import re
import asyncio
from typing import Optional
class SmoothpreDownloader(BaseVideoPlayer):
    """Downloader for smoothpre.com video player (JWPlayer-based).

    Extraction drives a headless Chromium session through Playwright; when
    Playwright is missing or raises, static HTML scraping over HTTP is used
    instead.
    """

    # User agent presented by the headless browser.
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    )

    # Candidate play controls, tried in order; only button-like ones are clicked.
    _PLAY_SELECTORS = (
        'button[aria-label="Play"]',
        '.play-button',
        'button[class*="play"]',
        '.jw-icon-display',
        'video',
    )

    # Patterns applied to inline <script> contents (HTTP path). Each has
    # exactly one capture group, so re.findall() yields plain strings.
    _SCRIPT_PATTERNS = (
        r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
        r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
        r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
        r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
        r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
        r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
    )

    # Patterns applied to the fully rendered page (Playwright path).
    _HTML_PATTERNS = _SCRIPT_PATTERNS + (
        r"url\s*[:=]\s*['\"]([^'\"]+\.m3u8[^'\"]*)['\"]",
        r"url\s*[:=]\s*['\"]([^'\"]+\.mp4[^'\"]*)['\"]",
    )

    # In-page probe: JWPlayer playlist, then <video>/<source> elements, then
    # a handful of well-known window globals. Returns a URL string or null.
    _JS_FIND_SOURCE = r"""
    () => {
        // Check for JWPlayer setup (primary method for Smoothpre)
        if (window.jwplayer) {
            try {
                const playlist = window.jwplayer().getPlaylist();
                if (playlist && playlist[0] && playlist[0].sources) {
                    for (let source of playlist[0].sources) {
                        if (source.file && (source.file.includes('.m3u8') || source.file.includes('.mp4'))) {
                            return source.file;
                        }
                    }
                }
            } catch(e) {}
        }
        // Check all video elements
        const videos = document.querySelectorAll('video');
        for (let v of videos) {
            if (v.src && (v.src.includes('.m3u8') || v.src.includes('.mp4'))) {
                return v.src;
            }
            const sources = v.querySelectorAll('source');
            for (let s of sources) {
                if (s.src && (s.src.includes('.m3u8') || s.src.includes('.mp4'))) {
                    return s.src;
                }
            }
        }
        // Check window object for video URLs
        const searchKeys = ['player', 'video', 'source', 'file', 'url', 'jw'];
        for (let key of searchKeys) {
            if (window[key] && typeof window[key] === 'object') {
                try {
                    const json = JSON.stringify(window[key]);
                    const match = json.match(/(https?:\/\/[^\s"\'<>]+\.(m3u8|mp4))/);
                    if (match) return match[1];
                } catch(e) {}
            }
        }
        return null;
    }
    """

    def can_handle(self, url: str) -> bool:
        """Return True when *url* points at smoothpre.com (case-insensitive)."""
        return 'smoothpre.com' in url.lower()

    async def get_download_link(self, url: str, target_filename: Optional[str] = None) -> tuple[str, str]:
        """
        Extract download link from Smoothpre video page

        Smoothpre uses JWPlayer with dynamic JavaScript - requires Playwright

        Args:
            url: The Smoothpre video page URL
            target_filename: Optional filename override

        Returns:
            Tuple of (direct_video_url, filename)

        Raises:
            Exception: if no video URL can be found or any step fails; the
                original error is chained as the cause.
        """
        try:
            print(f"[SMOOTHPRE] Extracting link from: {url}")

            # Try using Playwright to extract video URL (it falls back to
            # HTTP scraping internally when Playwright is unavailable/fails).
            video_url = await self._extract_with_playwright(url)
            if not video_url:
                raise Exception("Could not find video URL in Smoothpre page")

            print(f"[SMOOTHPRE] Found video URL: {video_url[:80]}...")

            # Generate filename
            from app.utils import sanitize_filename
            if target_filename:
                filename = sanitize_filename(target_filename)
            else:
                filename = "smoothpre_video.mp4"
            return video_url, filename
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error extracting Smoothpre link: {str(e)}") from e

    async def _extract_with_playwright(self, url: str) -> str | None:
        """Extract video URL using Playwright with network interception.

        Candidates are collected from intercepted responses, an in-page
        JavaScript probe and a regex scan of the rendered HTML. The first
        ``.m3u8`` candidate wins; otherwise the first candidate seen.

        Returns:
            The selected URL; None when the browser ran but found nothing;
            or the result of ``_extract_with_http`` when Playwright is not
            installed or raised.
        """
        try:
            from playwright.async_api import async_playwright

            print("[SMOOTHPRE] Launching browser with network interception...")
            video_urls = []

            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
                )
                try:
                    context = await browser.new_context(user_agent=self._USER_AGENT)
                    page = await context.new_page()

                    # Set up response interception
                    async def handle_response(response):
                        try:
                            resp_url = response.url
                            lowered = resp_url.lower()
                            content_type = response.headers.get('content-type', '').lower()
                            # Look for video files in responses.
                            # NOTE(review): this is a substring test on the whole
                            # URL, so '.ts' also matches hosts/paths that merely
                            # contain ".ts" - confirm this is intended.
                            if any(ext in lowered for ext in ('.m3u8', '.mp4', '.mkv', '.ts')):
                                if 'smoothpre' not in lowered and 'google' not in lowered:
                                    print(f"[SMOOTHPRE] 🎥 Captured video URL: {resp_url[:100]}...")
                                    video_urls.append(resp_url)
                            # Also check by content-type
                            elif any(ct in content_type for ct in ('video/', 'application/x-mpegurl')):
                                if 'smoothpre' not in lowered:
                                    print(f"[SMOOTHPRE] 🎥 Captured video response: {resp_url[:100]}...")
                                    video_urls.append(resp_url)
                        except Exception:
                            # Best effort: a single unreadable response must
                            # not abort the capture.
                            pass

                    page.on('response', handle_response)

                    print("[SMOOTHPRE] Navigating to page...")
                    try:
                        await page.goto(url, wait_until='networkidle', timeout=30000)
                    except Exception as e:
                        # A timeout here is tolerated; requests seen so far
                        # have already been captured.
                        print(f"[SMOOTHPRE] Navigation warning: {e}")

                    # Wait for page to load
                    print("[SMOOTHPRE] Waiting for video player to load...")
                    await asyncio.sleep(3)

                    # Try to find and click play button
                    for selector in self._PLAY_SELECTORS:
                        try:
                            element = await page.query_selector(selector)
                            if element:
                                print(f"[SMOOTHPRE] Found element: {selector}")
                                if 'button' in selector or 'jw' in selector:
                                    await element.click()
                                    await asyncio.sleep(2)
                                break
                        except Exception as e:
                            # Was a bare ``except:``; that also swallowed task
                            # cancellation. Report and try the next selector.
                            print(f"[SMOOTHPRE] Play button interaction: {e}")

                    # Wait more for network requests
                    await asyncio.sleep(4)

                    # Try JavaScript extraction - JWPlayer specific
                    try:
                        js_result = await page.evaluate(self._JS_FIND_SOURCE)
                        if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
                            print(f"[SMOOTHPRE] ✅ Found video URL via JavaScript: {js_result[:100]}...")
                            video_urls.append(js_result)
                    except Exception as e:
                        print(f"[SMOOTHPRE] JS extraction error: {e}")

                    # Parse page HTML for video URLs - enhanced patterns
                    try:
                        content = await page.content()
                        for pattern in self._HTML_PATTERNS:
                            for match in re.findall(pattern, content, re.IGNORECASE):
                                # Clean up the URL (undo JSON escaping)
                                candidate = match.replace('\\/', '/').replace('\\', '')
                                lowered = candidate.lower()
                                # Case-insensitive, matching the response filter above.
                                if 'http' in lowered and 'smoothpre' not in lowered and 'google' not in lowered:
                                    print(f"[SMOOTHPRE] Found in HTML: {candidate[:100]}...")
                                    video_urls.append(candidate)
                    except Exception as e:
                        print(f"[SMOOTHPRE] HTML parsing error: {e}")
                finally:
                    # Always release the browser, including on errors and
                    # task cancellation.
                    await browser.close()

            # Order-preserving de-duplication.
            unique_urls = list(dict.fromkeys(video_urls))
            if unique_urls:
                # Prefer .m3u8 over .mp4 as it's usually the source; min()
                # keeps the first such candidate, like a stable sort would.
                selected = min(unique_urls, key=lambda u: 0 if '.m3u8' in u.lower() else 1)
                print(f"[SMOOTHPRE] ✅ Found {len(unique_urls)} video URL(s)")
                print(f"[SMOOTHPRE] Selected: {selected[:100]}...")
                return selected

            print("[SMOOTHPRE] ❌ No video URLs found")
            return None
        except ImportError:
            print("[SMOOTHPRE] ⚠️ Playwright not installed - falling back to HTTP extraction")
            return await self._extract_with_http(url)
        except Exception as e:
            print(f"[SMOOTHPRE] Playwright error: {e}")
            import traceback
            traceback.print_exc()
            # Fallback to HTTP extraction
            return await self._extract_with_http(url)

    async def _extract_with_http(self, url: str) -> str | None:
        """Extract video URL using simple HTTP requests (fallback when Playwright fails).

        Looks at ``<video>``/``<source>`` tags first, then scans inline
        scripts with ``_SCRIPT_PATTERNS``. Returns the first hit, or None
        when nothing is found or the request/parsing fails.
        """
        try:
            print(f"[SMOOTHPRE] Trying HTTP extraction from: {url}")
            response = await self.client.get(url, follow_redirects=True)
            soup = BeautifulSoup(response.text, 'lxml')

            # Method 1: Look for video/source tags
            for video in soup.find_all('video'):
                src = video.get('src') or video.get('data-src')
                if src and any(ext in src for ext in ('.m3u8', '.mp4')):
                    print(f"[SMOOTHPRE] ✅ Found video in video tag: {src[:100]}...")
                    return src
                for source in video.find_all('source'):
                    src = source.get('src')
                    if src and any(ext in src for ext in ('.m3u8', '.mp4')):
                        print(f"[SMOOTHPRE] ✅ Found video in source tag: {src[:100]}...")
                        return src

            # Method 2: Look in script tags for JWPlayer configuration
            for script in soup.find_all('script'):
                if not script.string:
                    continue
                for pattern in self._SCRIPT_PATTERNS:
                    for match in re.findall(pattern, script.string, re.IGNORECASE):
                        candidate = match.replace('\\/', '/')
                        lowered = candidate.lower()
                        if 'http' in lowered and 'smoothpre' not in lowered:
                            print(f"[SMOOTHPRE] ✅ Found video in script: {candidate[:100]}...")
                            return candidate

            print("[SMOOTHPRE] ❌ HTTP extraction failed - no video URLs found")
            return None
        except Exception as e:
            print(f"[SMOOTHPRE] HTTP extraction error: {e}")
            return None
+423
View File
@@ -0,0 +1,423 @@
"""
Metadata enrichment service with Kitsu API fallback.
This module provides intelligent metadata enrichment by:
1. Merging provider metadata with Kitsu API data
2. Filling missing fields from Kitsu
3. Normalizing data formats across providers
4. Caching enriched metadata to reduce API calls
"""
import asyncio
import logging
from typing import Dict, Optional, List, Set
from datetime import datetime, timedelta
from pathlib import Path
import json
import hashlib
from app.kitsu_api import KitsuAPI
from app.models import AnimeMetadata
logger = logging.getLogger(__name__)
class MetadataEnricher:
    """
    Enriches anime metadata by combining provider data with Kitsu API fallback.
    Caches results to minimize API calls.

    The cache is an in-memory dict mirrored to ``<cache_dir>/metadata_cache.json``;
    each entry is ``{'metadata': <dict>, 'cached_at': <ISO timestamp>}``.
    """

    # Fields that Kitsu can provide as fallback
    # Note: studio is not included as Kitsu API requires separate calls
    KITSU_FIELDS = {
        'synopsis', 'genres', 'rating', 'release_year',
        'poster_image', 'banner_image', 'total_episodes', 'status',
        'alternative_titles'
    }

    # Cache duration in hours
    CACHE_DURATION_HOURS = 24

    # A synopsis must be longer than this to count towards the quality score.
    MIN_SYNOPSIS_LENGTH = 10

    def __init__(self, cache_dir: str = "config"):
        """Create the enricher and load any existing on-disk cache.

        Args:
            cache_dir: Directory that holds ``metadata_cache.json``.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / "metadata_cache.json"
        self.kitsu_api = KitsuAPI()
        self._cache: Dict[str, Dict] = {}
        self._cache_dirty = False
        # Load cache on initialization
        self._load_cache()

    def _is_entry_fresh(self, entry, now: Optional[datetime] = None) -> bool:
        """Return True when *entry* carries a valid, unexpired ``cached_at``.

        Malformed entries (not a dict, missing or unparsable timestamp, or a
        timestamp that cannot be compared with *now*) are reported as stale
        instead of raising.
        """
        if not isinstance(entry, dict):
            return False
        reference = now if now is not None else datetime.now()
        try:
            cached_at = datetime.fromisoformat(entry.get('cached_at', ''))
            return cached_at > reference - timedelta(hours=self.CACHE_DURATION_HOURS)
        except (TypeError, ValueError):
            return False

    def _load_cache(self):
        """Load metadata cache from disk, dropping expired or malformed entries."""
        try:
            if self.cache_file.exists():
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Filter per entry so one bad record cannot discard the rest.
                now = datetime.now()
                self._cache = {
                    key: entry for key, entry in data.items()
                    if self._is_entry_fresh(entry, now)
                }
                logger.info(f"Loaded {len(self._cache)} cached metadata entries")
        except Exception as e:
            logger.warning(f"Failed to load metadata cache: {e}")
            self._cache = {}

    def _save_cache(self):
        """Save metadata cache to disk (no-op when nothing changed)."""
        if not self._cache_dirty:
            return
        try:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(self._cache, f, ensure_ascii=False, indent=2)
            self._cache_dirty = False
            logger.debug("Saved metadata cache")
        except Exception as e:
            logger.error(f"Failed to save metadata cache: {e}")

    def _get_cache_key(self, title: str, url: Optional[str] = None) -> str:
        """Generate cache key (MD5 hex digest) from title and URL."""
        # Use both title and URL for more precise caching
        key_data = f"{title}|{url or ''}"
        return hashlib.md5(key_data.encode()).hexdigest()

    def _get_cached_metadata(self, cache_key: str) -> Optional[Dict]:
        """Get cached metadata if available and not expired.

        Expired or malformed entries are evicted and the cache is marked
        dirty; None is returned in that case.
        """
        if cache_key not in self._cache:
            return None
        entry = self._cache[cache_key]
        if self._is_entry_fresh(entry):
            logger.debug(f"Cache hit for key: {cache_key}")
            return entry.get('metadata')
        # Remove expired/invalid entry
        del self._cache[cache_key]
        self._cache_dirty = True
        return None

    def _set_cached_metadata(self, cache_key: str, metadata: Dict):
        """Cache enriched metadata, stamped with the current local time."""
        self._cache[cache_key] = {
            'metadata': metadata,
            'cached_at': datetime.now().isoformat()
        }
        self._cache_dirty = True

    async def enrich_metadata(
        self,
        provider_metadata: Dict,
        title: str,
        url: Optional[str] = None,
        use_kitsu_fallback: bool = True
    ) -> "AnimeMetadata":
        """
        Enrich provider metadata with Kitsu API fallback.

        Args:
            provider_metadata: Metadata dict from anime provider
            title: Anime title (for Kitsu search)
            url: Optional anime URL (for cache key)
            use_kitsu_fallback: Whether to use Kitsu API for missing fields

        Returns:
            Enriched AnimeMetadata object
        """
        # Check cache first
        cache_key = self._get_cache_key(title, url)
        cached = self._get_cached_metadata(cache_key)
        if cached:
            try:
                return AnimeMetadata(**cached)
            except (TypeError, ValueError) as e:
                # A stale payload (e.g. written by an older schema) must not
                # break enrichment; drop it and rebuild below.
                # NOTE(review): assumes AnimeMetadata validation errors derive
                # from ValueError (true for pydantic) - confirm.
                logger.warning(f"Discarding unusable cache entry for '{title}': {e}")
                self._cache.pop(cache_key, None)
                self._cache_dirty = True

        # Start with provider metadata
        enriched = provider_metadata.copy()

        # Check which fields are missing
        missing_fields = self._get_missing_fields(enriched)
        if missing_fields and use_kitsu_fallback:
            logger.info(f"Missing fields for '{title}': {missing_fields} - fetching from Kitsu")
            try:
                # Fetch from Kitsu
                kitsu_metadata = await self._fetch_from_kitsu(title)
                if kitsu_metadata:
                    # Merge Kitsu data
                    enriched = self._merge_metadata(enriched, kitsu_metadata)
                    enriched['_kitsu_enriched'] = True
                    enriched['_enriched_fields'] = list(missing_fields)
            except Exception as e:
                logger.warning(f"Failed to fetch Kitsu metadata for '{title}': {e}")

        # Calculate quality score (internal; stripped with the other
        # underscore-prefixed keys before the model is built)
        enriched['_quality_score'] = self._calculate_quality_score(enriched)

        # Convert to AnimeMetadata
        result = AnimeMetadata(**{
            k: v for k, v in enriched.items()
            if not k.startswith('_')  # Exclude internal fields
        })

        # Cache the result
        self._set_cached_metadata(cache_key, result.model_dump())

        # Periodically save cache
        if self._cache_dirty and len(self._cache) % 10 == 0:
            self._save_cache()
        return result

    def _get_missing_fields(self, metadata: Dict) -> Set[str]:
        """Identify which Kitsu-backed fields are None, ``[]`` or ``''``."""
        missing = set()
        for field in self.KITSU_FIELDS:
            value = metadata.get(field)
            if value is None or value == [] or value == '':
                missing.add(field)
        return missing

    async def _fetch_from_kitsu(self, title: str) -> Optional[Dict]:
        """Fetch metadata from Kitsu API; None when nothing is found or on error."""
        try:
            # Search for anime
            results = await self.kitsu_api.search_anime(title, limit=1)
            if not results:
                logger.debug(f"No Kitsu results for '{title}'")
                return None
            return self._convert_kitsu_to_metadata(results[0])
        except Exception as e:
            logger.error(f"Error fetching from Kitsu for '{title}': {e}")
            return None

    def _convert_kitsu_to_metadata(self, kitsu_data: Dict) -> Dict:
        """Convert Kitsu API response to metadata format.

        Only keys with usable values are emitted. Null image containers and
        non-numeric scores are tolerated rather than raising, so one odd
        field does not discard the whole record.
        """
        metadata = {}

        # Synopsis
        if kitsu_data.get('synopsis'):
            metadata['synopsis'] = kitsu_data['synopsis']

        # Genres
        if kitsu_data.get('genres'):
            metadata['genres'] = kitsu_data['genres']

        # Rating (score out of 10, converted to string)
        try:
            score = float(kitsu_data.get('score') or 0)
        except (TypeError, ValueError):
            score = 0.0
        if score > 0:
            metadata['rating'] = f"{score:.1f}/10"

        # Release year
        if kitsu_data.get('year'):
            metadata['release_year'] = kitsu_data['year']

        # ``or {}`` also covers keys that are present but null.
        images = kitsu_data.get('images') or {}
        jpg_images = images.get('jpg') or {}
        webp_images = images.get('webp') or {}

        # Poster image (large variant preferred)
        poster = jpg_images.get('large_image_url') or jpg_images.get('image_url')
        if poster:
            metadata['poster_image'] = poster

        # Banner image: no dedicated field is read here; the large webp
        # variant is used when available.
        banner = webp_images.get('large_image_url')
        if banner:
            metadata['banner_image'] = banner

        # Total episodes
        if kitsu_data.get('episodes'):
            metadata['total_episodes'] = kitsu_data['episodes']

        # Status
        if kitsu_data.get('status'):
            # Translate status to our format; unknown values pass through
            status_map = {
                'Airing': 'Ongoing',
                'Finished Airing': 'Completed',
                'To Be Aired': 'Upcoming'
            }
            metadata['status'] = status_map.get(
                kitsu_data['status'],
                kitsu_data['status']
            )

        # Alternative titles
        alt_titles = [
            kitsu_data[key]
            for key in ('title_japanese', 'title_english')
            if kitsu_data.get(key)
        ]
        if alt_titles:
            metadata['alternative_titles'] = alt_titles
        return metadata

    def _merge_metadata(
        self,
        provider_metadata: Dict,
        kitsu_metadata: Dict
    ) -> Dict:
        """
        Merge provider and Kitsu metadata, preferring provider data.
        Provider data takes priority except for missing (falsy) fields.
        """
        merged = provider_metadata.copy()
        for field, value in kitsu_metadata.items():
            # Only use Kitsu data if provider doesn't have it
            if not merged.get(field):
                merged[field] = value
        return merged

    def _calculate_quality_score(self, metadata: Dict) -> float:
        """
        Calculate metadata quality score (0-1).
        Based on completeness of critical fields.

        A field counts when its value is truthy; strings must also be
        non-blank. Only ``synopsis`` has a minimum length, so short but
        valid values such as a status or rating string still count.
        """
        weights = {
            'synopsis': 0.2,
            'genres': 0.15,
            'rating': 0.1,
            'release_year': 0.1,
            'studio': 0.1,
            'poster_image': 0.15,
            'banner_image': 0.05,
            'total_episodes': 0.05,
            'status': 0.05,
            'alternative_titles': 0.05
        }
        total_weight = sum(weights.values())
        score = 0.0
        for field, weight in weights.items():
            value = metadata.get(field)
            if not value:
                continue
            if isinstance(value, str):
                text = value.strip()
                if not text:
                    continue
                # Minimum meaningful length applies to free text only.
                if field == 'synopsis' and len(text) <= self.MIN_SYNOPSIS_LENGTH:
                    continue
            score += weight
        return round(score / total_weight, 2) if total_weight > 0 else 0.0

    async def enrich_search_results(
        self,
        results: List[Dict],
        use_kitsu_fallback: bool = True
    ) -> List[Dict]:
        """
        Enrich metadata for a list of search results.

        Args:
            results: List of search result dicts with optional 'metadata' field
            use_kitsu_fallback: Whether to use Kitsu API

        Returns:
            List of results (shallow copies, original order) with enriched
            metadata; results without 'metadata', or whose enrichment
            failed, are returned unchanged.
        """
        enriched_results = [result.copy() for result in results]
        # Positions of the results that actually carry metadata.
        targets = [i for i, result in enumerate(results) if 'metadata' in result]
        if not targets:
            return enriched_results

        # Process results in parallel for better performance
        outcomes = await asyncio.gather(
            *(
                self.enrich_metadata(
                    provider_metadata=results[i]['metadata'],
                    title=results[i].get('title', ''),
                    url=results[i].get('url'),
                    use_kitsu_fallback=use_kitsu_fallback
                )
                for i in targets
            ),
            return_exceptions=True
        )

        for i, outcome in zip(targets, outcomes):
            # BaseException: gather() may also hand back CancelledError.
            if isinstance(outcome, BaseException):
                title = results[i].get('title')
                logger.warning(
                    f"Failed to enrich metadata for '{title}': {outcome}"
                )
                continue  # Keep original metadata
            enriched_results[i]['metadata'] = outcome.model_dump()
        return enriched_results

    async def close(self):
        """Close resources and save cache."""
        try:
            await self.kitsu_api.close()
        finally:
            # Persist the cache even when closing the API client fails.
            self._save_cache()
        logger.info("MetadataEnricher closed")
# Global instance
# Process-wide MetadataEnricher singleton; None until get_metadata_enricher()
# creates it, and reset to None by close_metadata_enricher().
_enricher_instance: Optional[MetadataEnricher] = None
# Held by get_metadata_enricher() while it creates the singleton.
_enricher_lock = asyncio.Lock()
async def get_metadata_enricher() -> MetadataEnricher:
    """Return the shared MetadataEnricher, creating it on first use.

    The instance is re-checked after ``_enricher_lock`` is acquired so that
    coroutines racing on the first call end up with a single instance.
    """
    global _enricher_instance

    # Fast path: already created, no need to take the lock.
    if _enricher_instance is not None:
        return _enricher_instance

    async with _enricher_lock:
        # Another coroutine may have created it while we waited.
        if _enricher_instance is None:
            _enricher_instance = MetadataEnricher()
            logger.info("Created global MetadataEnricher instance")

    return _enricher_instance
async def close_metadata_enricher():
    """Close the global MetadataEnricher instance, if one exists.

    The global is cleared before the instance is closed, so callers of
    get_metadata_enricher() that run while ``close()`` is awaited get a
    fresh instance instead of one that is shutting down, and a failing
    ``close()`` does not leave the stale instance installed.

    Raises:
        Whatever ``MetadataEnricher.close()`` raises.
    """
    global _enricher_instance

    instance = _enricher_instance
    if instance is None:
        return

    # Detach first: there is an await below, during which other coroutines
    # can observe the global.
    _enricher_instance = None
    await instance.close()
    logger.info("Closed global MetadataEnricher instance")