feat: Add AGENTS.md and new downloaders with metadata enrichment

- Add AGENTS.md for agentic coding guidelines
- Add Oneupload and Smoothpre video player downloaders
- Add MetadataEnrichment service with Kitsu API fallback
- Add tests for metadata enrichment and provider detection
- Update .gitignore to ignore runtime config files
2026-02-24 20:14:31 +00:00
parent da5403a307
commit 2482a1fe58
7 changed files with 2119 additions and 0 deletions
@@ -0,0 +1,294 @@
+from .base import BaseVideoPlayer
+from bs4 import BeautifulSoup
+import re
+import asyncio
+from typing import Optional
+
+
+class OneuploadDownloader(BaseVideoPlayer):
+    """Downloader for oneupload.to video player"""
+
+    def can_handle(self, url: str) -> bool:
+        return 'oneupload.to' in url.lower()
+
+    async def get_download_link(self, url: str, target_filename: Optional[str] = None) -> tuple[str, str]:
+        """
+        Extract download link from Oneupload video page
+        Oneupload uses a custom video player with dynamic loading
+
+        Args:
+            url: The Oneupload video page URL
+            target_filename: Optional filename override
+
+        Returns:
+            Tuple of (direct_video_url, filename)
+        """
+        try:
+            print(f"[ONEUPLOAD] Extracting link from: {url}")
+
+            # Try using Playwright first (more reliable for dynamic content)
+            video_url = await self._extract_with_playwright(url)
+
+            if not video_url:
+                # Fallback to HTTP extraction
+                video_url = await self._extract_with_http(url)
+
+            if not video_url:
+                raise Exception("Could not find video URL in Oneupload page")
+
+            print(f"[ONEUPLOAD] Found video URL: {video_url[:80]}...")
+
+            # Generate filename
+            from app.utils import sanitize_filename
+            if target_filename:
+                filename = sanitize_filename(target_filename)
+            else:
+                # Try to extract filename from URL
+                filename = "oneupload_video.mp4"
+
+            return video_url, filename
+
+        except Exception as e:
+            raise Exception(f"Error extracting Oneupload link: {str(e)}")
+
+    async def _extract_with_playwright(self, url: str) -> str | None:
+        """Extract video URL using Playwright with network interception"""
+        try:
+            from playwright.async_api import async_playwright
+
+            print("[ONEUPLOAD] Launching browser with network interception...")
+
+            video_urls = []
+
+            async with async_playwright() as p:
+                browser = await p.chromium.launch(
+                    headless=True,
+                    args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
+                )
+
+                context = await browser.new_context(
+                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
+                )
+
+                page = await context.new_page()
+
+                # Set up response interception
+                async def handle_response(response):
+                    try:
+                        resp_url = response.url
+                        content_type = response.headers.get('content-type', '')
+
+                        # Look for video files in responses
+                        if any(ext in resp_url.lower() for ext in ['.m3u8', '.mp4', '.mkv', '.ts']):
+                            if 'oneupload' not in resp_url.lower() and 'google' not in resp_url.lower():
+                                print(f"[ONEUPLOAD] 🎥 Captured video URL: {resp_url[:100]}...")
+                                video_urls.append(resp_url)
+                        # Also check by content-type
+                        elif any(ct in content_type.lower() for ct in ['video/', 'application/x-mpegurl']):
+                            if 'oneupload' not in resp_url.lower():
+                                print(f"[ONEUPLOAD] 🎥 Captured video response: {resp_url[:100]}...")
+                                video_urls.append(resp_url)
+                    except Exception as e:
+                        pass  # Ignore interception errors
+
+                page.on('response', handle_response)
+
+                print("[ONEUPLOAD] Navigating to page...")
+
+                try:
+                    await page.goto(url, wait_until='networkidle', timeout=30000)
+                except Exception as e:
+                    print(f"[ONEUPLOAD] Navigation warning: {e}")
+
+                # Wait for page to load
+                print("[ONEUPLOAD] Waiting for video player to load...")
+                await asyncio.sleep(3)
+
+                # Try to find and click play button
+                try:
+                    play_selectors = [
+                        'button[aria-label="Play"]',
+                        '.play-button',
+                        'button[class*="play"]',
+                        '.jw-icon-display',
+                        'video',
+                        '.video-wrapper video',
+                    ]
+
+                    for selector in play_selectors:
+                        try:
+                            element = await page.query_selector(selector)
+                            if element:
+                                print(f"[ONEUPLOAD] Found element: {selector}")
+                                if 'button' in selector or 'jw' in selector:
+                                    await element.click()
+                                    await asyncio.sleep(2)
+                                break
+                        except:
+                            continue
+                except Exception as e:
+                    print(f"[ONEUPLOAD] Play button interaction: {e}")
+
+                # Wait more for network requests
+                await asyncio.sleep(4)
+
+                # Try JavaScript extraction
+                try:
+                    js_code = r"""
+                        () => {
+                            // Check for JWPlayer setup
+                            if (window.jwplayer) {
+                                try {
+                                    const playlist = window.jwplayer().getPlaylist();
+                                    if (playlist && playlist[0] && playlist[0].sources) {
+                                        for (let source of playlist[0].sources) {
+                                            if (source.file && (source.file.includes('.m3u8') || source.file.includes('.mp4'))) {
+                                                return source.file;
+                                            }
+                                        }
+                                    }
+                                } catch(e) {}
+                            }
+
+                            // Check all video elements
+                            const videos = document.querySelectorAll('video');
+                            for (let v of videos) {
+                                if (v.src && (v.src.includes('.m3u8') || v.src.includes('.mp4'))) {
+                                    return v.src;
+                                }
+                                const sources = v.querySelectorAll('source');
+                                for (let s of sources) {
+                                    if (s.src && (s.src.includes('.m3u8') || s.src.includes('.mp4'))) {
+                                        return s.src;
+                                    }
+                                }
+                            }
+
+                            // Check window object for video URLs
+                            const searchKeys = ['player', 'video', 'source', 'file', 'url'];
+                            for (let key of searchKeys) {
+                                if (window[key] && typeof window[key] === 'object') {
+                                    try {
+                                        const json = JSON.stringify(window[key]);
+                                        const match = json.match(/(https?:\/\/[^\s"\'<>]+\.(m3u8|mp4))/);
+                                        if (match) return match[1];
+                                    } catch(e) {}
+                                }
+                            }
+
+                            return null;
+                        }
+                    """
+                    js_result = await page.evaluate(js_code)
+
+                    if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
+                        print(f"[ONEUPLOAD] ✅ Found video URL via JavaScript: {js_result[:100]}...")
+                        video_urls.append(js_result)
+                except Exception as e:
+                    print(f"[ONEUPLOAD] JS extraction error: {e}")
+
+                # Parse page HTML for video URLs
+                try:
+                    content = await page.content()
+                    patterns = [
+                        r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
+                        r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
+                        r"url\s*[:=]\s*['\"]([^'\"]+\.m3u8[^'\"]*)['\"]",
+                        r"url\s*[:=]\s*['\"]([^'\"]+\.mp4[^'\"]*)['\"]",
+                    ]
+
+                    for pattern in patterns:
+                        matches = re.findall(pattern, content, re.IGNORECASE)
+                        for match in matches:
+                            # Clean up the URL
+                            match = match.replace('\\/', '/').replace('\\', '')
+                            if 'http' in match and 'oneupload' not in match and 'google' not in match:
+                                print(f"[ONEUPLOAD] Found in HTML: {match[:100]}...")
+                                video_urls.append(match)
+                except Exception as e:
+                    print(f"[ONEUPLOAD] HTML parsing error: {e}")
+
+                await browser.close()
+
+                # Return first valid video URL (prefer .m3u8 over .mp4)
+                if video_urls:
+                    seen = set()
+                    unique_urls = []
+                    for vid_url in video_urls:
+                        if vid_url not in seen:
+                            seen.add(vid_url)
+                            unique_urls.append(vid_url)
+
+                    if unique_urls:
+                        # Sort to prefer .m3u8 (source quality)
+                        unique_urls.sort(key=lambda x: 0 if '.m3u8' in x else 1)
+                        print(f"[ONEUPLOAD] ✅ Found {len(unique_urls)} video URL(s)")
+                        print(f"[ONEUPLOAD] Selected: {unique_urls[0][:100]}...")
+                        return unique_urls[0]
+
+                print("[ONEUPLOAD] ❌ No video URLs found")
+                return None
+
+        except ImportError:
+            print("[ONEUPLOAD] ⚠️ Playwright not installed - using HTTP extraction only")
+            return None
+        except Exception as e:
+            print(f"[ONEUPLOAD] Playwright error: {e}")
+            import traceback
+            traceback.print_exc()
+            return None
+
+    async def _extract_with_http(self, url: str) -> str | None:
+        """Extract video URL using simple HTTP requests"""
+        try:
+            print(f"[ONEUPLOAD] Trying HTTP extraction from: {url}")
+
+            response = await self.client.get(url, follow_redirects=True)
+            soup = BeautifulSoup(response.text, 'lxml')
+
+            # Method 1: Look for video/source tags
+            videos = soup.find_all('video')
+            for video in videos:
+                src = video.get('src') or video.get('data-src')
+                if src and any(ext in src for ext in ['.m3u8', '.mp4']):
+                    print(f"[ONEUPLOAD] ✅ Found video in video tag: {src[:100]}...")
+                    return src
+
+                sources = video.find_all('source')
+                for source in sources:
+                    src = source.get('src')
+                    if src and any(ext in src for ext in ['.m3u8', '.mp4']):
+                        print(f"[ONEUPLOAD] ✅ Found video in source tag: {src[:100]}...")
+                        return src
+
+            # Method 2: Look in script tags for video URLs
+            scripts = soup.find_all('script')
+            for script in scripts:
+                if script.string:
+                    patterns = [
+                        r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
+                        r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
+                    ]
+
+                    for pattern in patterns:
+                        matches = re.findall(pattern, script.string, re.IGNORECASE)
+                        for match in matches:
+                            match = match.replace('\\/', '/')
+                            if 'http' in match and 'oneupload' not in match.lower():
+                                print(f"[ONEUPLOAD] ✅ Found video in script: {match[:100]}...")
+                                return match
+
+            print("[ONEUPLOAD] ❌ HTTP extraction failed - no video URLs found")
+            return None
+
+        except Exception as e:
+            print(f"[ONEUPLOAD] HTTP extraction error: {e}")
+            return None
@@ -0,0 +1,290 @@
+from .base import BaseVideoPlayer
+from bs4 import BeautifulSoup
+import re
+import asyncio
+from typing import Optional
+
+
+class SmoothpreDownloader(BaseVideoPlayer):
+    """Downloader for smoothpre.com video player (JWPlayer-based)"""
+
+    def can_handle(self, url: str) -> bool:
+        return 'smoothpre.com' in url.lower()
+
+    async def get_download_link(self, url: str, target_filename: Optional[str] = None) -> tuple[str, str]:
+        """
+        Extract download link from Smoothpre video page
+        Smoothpre uses JWPlayer with dynamic JavaScript - requires Playwright
+
+        Args:
+            url: The Smoothpre video page URL
+            target_filename: Optional filename override
+
+        Returns:
+            Tuple of (direct_video_url, filename)
+        """
+        try:
+            print(f"[SMOOTHPRE] Extracting link from: {url}")
+
+            # Try using Playwright to extract video URL
+            video_url = await self._extract_with_playwright(url)
+
+            if not video_url:
+                raise Exception("Could not find video URL in Smoothpre page")
+
+            print(f"[SMOOTHPRE] Found video URL: {video_url[:80]}...")
+
+            # Generate filename
+            from app.utils import sanitize_filename
+            if target_filename:
+                filename = sanitize_filename(target_filename)
+            else:
+                filename = "smoothpre_video.mp4"
+
+            return video_url, filename
+
+        except Exception as e:
+            raise Exception(f"Error extracting Smoothpre link: {str(e)}")
+
+    async def _extract_with_playwright(self, url: str) -> str | None:
+        """Extract video URL using Playwright with network interception"""
+        try:
+            from playwright.async_api import async_playwright
+
+            print("[SMOOTHPRE] Launching browser with network interception...")
+
+            video_urls = []
+
+            async with async_playwright() as p:
+                browser = await p.chromium.launch(
+                    headless=True,
+                    args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
+                )
+
+                context = await browser.new_context(
+                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
+                )
+
+                page = await context.new_page()
+
+                # Set up response interception
+                async def handle_response(response):
+                    try:
+                        resp_url = response.url
+                        content_type = response.headers.get('content-type', '')
+
+                        # Look for video files in responses
+                        if any(ext in resp_url.lower() for ext in ['.m3u8', '.mp4', '.mkv', '.ts']):
+                            if 'smoothpre' not in resp_url.lower() and 'google' not in resp_url.lower():
+                                print(f"[SMOOTHPRE] 🎥 Captured video URL: {resp_url[:100]}...")
+                                video_urls.append(resp_url)
+                        # Also check by content-type
+                        elif any(ct in content_type.lower() for ct in ['video/', 'application/x-mpegurl']):
+                            if 'smoothpre' not in resp_url.lower():
+                                print(f"[SMOOTHPRE] 🎥 Captured video response: {resp_url[:100]}...")
+                                video_urls.append(resp_url)
+                    except Exception as e:
+                        pass  # Ignore interception errors
+
+                page.on('response', handle_response)
+
+                print("[SMOOTHPRE] Navigating to page...")
+
+                try:
+                    await page.goto(url, wait_until='networkidle', timeout=30000)
+                except Exception as e:
+                    print(f"[SMOOTHPRE] Navigation warning: {e}")
+
+                # Wait for page to load
+                print("[SMOOTHPRE] Waiting for video player to load...")
+                await asyncio.sleep(3)
+
+                # Try to find and click play button
+                try:
+                    play_selectors = [
+                        'button[aria-label="Play"]',
+                        '.play-button',
+                        'button[class*="play"]',
+                        '.jw-icon-display',
+                        'video',
+                    ]
+
+                    for selector in play_selectors:
+                        try:
+                            element = await page.query_selector(selector)
+                            if element:
+                                print(f"[SMOOTHPRE] Found element: {selector}")
+                                if 'button' in selector or 'jw' in selector:
+                                    await element.click()
+                                    await asyncio.sleep(2)
+                                break
+                        except:
+                            continue
+                except Exception as e:
+                    print(f"[SMOOTHPRE] Play button interaction: {e}")
+
+                # Wait more for network requests
+                await asyncio.sleep(4)
+
+                # Try JavaScript extraction - JWPlayer specific
+                try:
+                    js_code = r"""
+                        () => {
+                            // Check for JWPlayer setup (primary method for Smoothpre)
+                            if (window.jwplayer) {
+                                try {
+                                    const playlist = window.jwplayer().getPlaylist();
+                                    if (playlist && playlist[0] && playlist[0].sources) {
+                                        for (let source of playlist[0].sources) {
+                                            if (source.file && (source.file.includes('.m3u8') || source.file.includes('.mp4'))) {
+                                                return source.file;
+                                            }
+                                        }
+                                    }
+                                } catch(e) {}
+                            }
+
+                            // Check all video elements
+                            const videos = document.querySelectorAll('video');
+                            for (let v of videos) {
+                                if (v.src && (v.src.includes('.m3u8') || v.src.includes('.mp4'))) {
+                                    return v.src;
+                                }
+                                const sources = v.querySelectorAll('source');
+                                for (let s of sources) {
+                                    if (s.src && (s.src.includes('.m3u8') || s.src.includes('.mp4'))) {
+                                        return s.src;
+                                    }
+                                }
+                            }
+
+                            // Check window object for video URLs
+                            const searchKeys = ['player', 'video', 'source', 'file', 'url', 'jw'];
+                            for (let key of searchKeys) {
+                                if (window[key] && typeof window[key] === 'object') {
+                                    try {
+                                        const json = JSON.stringify(window[key]);
+                                        const match = json.match(/(https?:\/\/[^\s"\'<>]+\.(m3u8|mp4))/);
+                                        if (match) return match[1];
+                                    } catch(e) {}
+                                }
+                            }
+
+                            return null;
+                        }
+                    """
+                    js_result = await page.evaluate(js_code)
+
+                    if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
+                        print(f"[SMOOTHPRE] ✅ Found video URL via JavaScript: {js_result[:100]}...")
+                        video_urls.append(js_result)
+                except Exception as e:
+                    print(f"[SMOOTHPRE] JS extraction error: {e}")
+
+                # Parse page HTML for video URLs - enhanced patterns
+                try:
+                    content = await page.content()
+                    patterns = [
+                        r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
+                        r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
+                        r"url\s*[:=]\s*['\"]([^'\"]+\.m3u8[^'\"]*)['\"]",
+                        r"url\s*[:=]\s*['\"]([^'\"]+\.mp4[^'\"]*)['\"]",
+                    ]
+
+                    for pattern in patterns:
+                        matches = re.findall(pattern, content, re.IGNORECASE)
+                        for match in matches:
+                            # Clean up the URL
+                            match = match.replace('\\/', '/').replace('\\', '')
+                            if 'http' in match and 'smoothpre' not in match and 'google' not in match:
+                                print(f"[SMOOTHPRE] Found in HTML: {match[:100]}...")
+                                video_urls.append(match)
+                except Exception as e:
+                    print(f"[SMOOTHPRE] HTML parsing error: {e}")
+
+                await browser.close()
+
+                # Return first valid video URL (prefer .m3u8 over .mp4 as it's usually the source)
+                if video_urls:
+                    seen = set()
+                    unique_urls = []
+                    for vid_url in video_urls:
+                        if vid_url not in seen:
+                            seen.add(vid_url)
+                            unique_urls.append(vid_url)
+
+                    if unique_urls:
+                        # Sort to prefer .m3u8 (source quality)
+                        unique_urls.sort(key=lambda x: 0 if '.m3u8' in x else 1)
+                        print(f"[SMOOTHPRE] ✅ Found {len(unique_urls)} video URL(s)")
+                        print(f"[SMOOTHPRE] Selected: {unique_urls[0][:100]}...")
+                        return unique_urls[0]
+
+                print("[SMOOTHPRE] ❌ No video URLs found")
+                return None
+
+        except ImportError:
+            print("[SMOOTHPRE] ⚠️ Playwright not installed - falling back to HTTP extraction")
+            return await self._extract_with_http(url)
+        except Exception as e:
+            print(f"[SMOOTHPRE] Playwright error: {e}")
+            import traceback
+            traceback.print_exc()
+            # Fallback to HTTP extraction
+            return await self._extract_with_http(url)
+
+    async def _extract_with_http(self, url: str) -> str | None:
+        """Extract video URL using simple HTTP requests (fallback when Playwright fails)"""
+        try:
+            print(f"[SMOOTHPRE] Trying HTTP extraction from: {url}")
+
+            response = await self.client.get(url, follow_redirects=True)
+            soup = BeautifulSoup(response.text, 'lxml')
+
+            # Method 1: Look for video/source tags
+            videos = soup.find_all('video')
+            for video in videos:
+                src = video.get('src') or video.get('data-src')
+                if src and any(ext in src for ext in ['.m3u8', '.mp4']):
+                    print(f"[SMOOTHPRE] ✅ Found video in video tag: {src[:100]}...")
+                    return src
+
+                sources = video.find_all('source')
+                for source in sources:
+                    src = source.get('src')
+                    if src and any(ext in src for ext in ['.m3u8', '.mp4']):
+                        print(f"[SMOOTHPRE] ✅ Found video in source tag: {src[:100]}...")
+                        return src
+
+            # Method 2: Look in script tags for JWPlayer configuration
+            scripts = soup.find_all('script')
+            for script in scripts:
+                if script.string:
+                    # JWPlayer patterns
+                    patterns = [
+                        r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
+                        r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
+                        r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
+                        r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
+                    ]
+
+                    for pattern in patterns:
+                        matches = re.findall(pattern, script.string, re.IGNORECASE)
+                        for match in matches:
+                            match = match.replace('\\/', '/')
+                            if 'http' in match and 'smoothpre' not in match.lower():
+                                print(f"[SMOOTHPRE] ✅ Found video in script: {match[:100]}...")
+                                return match
+
+            print("[SMOOTHPRE] ❌ HTTP extraction failed - no video URLs found")
+            return None
+
+        except Exception as e:
+            print(f"[SMOOTHPRE] HTTP extraction error: {e}")
+            return None
@@ -0,0 +1,423 @@
+"""
+Metadata enrichment service with Kitsu API fallback.
+
+This module provides intelligent metadata enrichment by:
+1. Merging provider metadata with Kitsu API data
+2. Filling missing fields from Kitsu
+3. Normalizing data formats across providers
+4. Caching enriched metadata to reduce API calls
+"""
+import asyncio
+import logging
+from typing import Dict, Optional, List, Set
+from datetime import datetime, timedelta
+from pathlib import Path
+import json
+import hashlib
+
+from app.kitsu_api import KitsuAPI
+from app.models import AnimeMetadata
+
+logger = logging.getLogger(__name__)
+
+
class MetadataEnricher:
    """
    Enrich anime metadata by combining provider data with a Kitsu API fallback.

    Provider values always take priority; Kitsu is only queried when at least
    one field listed in ``KITSU_FIELDS`` is empty. Enriched results are kept
    in an in-memory cache that is persisted as JSON to
    ``<cache_dir>/metadata_cache.json`` and expires after
    ``CACHE_DURATION_HOURS``.
    """

    # Fields that Kitsu can provide as fallback
    # Note: studio is not included as Kitsu API requires separate calls
    KITSU_FIELDS = {
        'synopsis', 'genres', 'rating', 'release_year',
        'poster_image', 'banner_image', 'total_episodes', 'status',
        'alternative_titles'
    }

    # Cache duration in hours
    CACHE_DURATION_HOURS = 24

    # A synopsis must be longer than this many characters to count as
    # meaningful when scoring metadata quality.
    MIN_SYNOPSIS_LENGTH = 10

    # Translation of upstream status labels to the labels used by this app.
    # Unknown labels are passed through unchanged.
    _KITSU_STATUS_MAP = {
        'Airing': 'Ongoing',
        'Finished Airing': 'Completed',
        'To Be Aired': 'Upcoming'
    }

    # Relative importance of each field for the quality score.
    _QUALITY_WEIGHTS = {
        'synopsis': 0.2,
        'genres': 0.15,
        'rating': 0.1,
        'release_year': 0.1,
        'studio': 0.1,
        'poster_image': 0.15,
        'banner_image': 0.05,
        'total_episodes': 0.05,
        'status': 0.05,
        'alternative_titles': 0.05
    }

    def __init__(self, cache_dir: str = "config"):
        """
        Create the enricher and load any previously persisted cache.

        Args:
            cache_dir: Directory that holds ``metadata_cache.json``.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / "metadata_cache.json"
        self.kitsu_api = KitsuAPI()
        self._cache: Dict[str, Dict] = {}
        self._cache_dirty = False

        # Load cache on initialization
        self._load_cache()

    def _is_fresh(self, entry: object, now: Optional[datetime] = None) -> bool:
        """
        Return True if ``entry`` is a cache record younger than the cache TTL.

        Anything that is not a dict, lacks ``cached_at``, or carries a
        timestamp that cannot be parsed or compared is reported as not fresh
        instead of raising.

        Args:
            entry: Candidate cache record.
            now: Reference time; defaults to ``datetime.now()``.
        """
        if not isinstance(entry, dict):
            return False

        reference = datetime.now() if now is None else now
        try:
            cached_at = datetime.fromisoformat(entry.get('cached_at', ''))
            return cached_at > reference - timedelta(hours=self.CACHE_DURATION_HOURS)
        except (TypeError, ValueError):
            # ValueError: malformed/empty timestamp. TypeError: non-string
            # value, or a timezone-aware timestamp compared with a naive one.
            return False

    def _load_cache(self):
        """
        Load the metadata cache from disk, keeping only fresh entries.

        Loading is best-effort: a missing file is silently ignored, an
        unreadable or malformed file results in an empty cache and a warning,
        and individual invalid or expired entries are skipped without
        affecting the remaining ones.
        """
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return
        except Exception as e:
            # Deliberately broad: a broken cache must never prevent startup.
            logger.warning("Failed to load metadata cache: %s", e)
            self._cache = {}
            return

        if not isinstance(data, dict):
            logger.warning(
                "Failed to load metadata cache: expected a JSON object, got %s",
                type(data).__name__
            )
            self._cache = {}
            return

        now = datetime.now()
        self._cache = {
            key: entry for key, entry in data.items()
            if self._is_fresh(entry, now)
        }
        logger.info("Loaded %d cached metadata entries", len(self._cache))

    def _save_cache(self):
        """
        Persist the cache to disk if it changed since the last save.

        The JSON is written to a sibling temporary file which then replaces
        the real cache file, so a failure while serializing or writing leaves
        the previous cache file untouched. Failures are logged, not raised.
        """
        if not self._cache_dirty:
            return

        tmp_file = self.cache_file.with_name(self.cache_file.name + '.tmp')
        try:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(self._cache, f, ensure_ascii=False, indent=2)
            tmp_file.replace(self.cache_file)
            self._cache_dirty = False
            logger.debug("Saved metadata cache")
        except (OSError, TypeError, ValueError) as e:
            # TypeError/ValueError come from json.dump on unserializable data.
            logger.error("Failed to save metadata cache: %s", e)
            try:
                tmp_file.unlink()
            except OSError:
                # Best-effort cleanup; the temp file may never have existed.
                pass

    def _get_cache_key(self, title: str, url: Optional[str] = None) -> str:
        """
        Build the cache key for a title/URL pair.

        The key is the MD5 hex digest of ``"<title>|<url>"`` (empty string
        when no URL is given); MD5 is used only as a compact identifier.
        """
        key_data = f"{title}|{url or ''}"
        return hashlib.md5(key_data.encode()).hexdigest()

    def _get_cached_metadata(self, cache_key: str) -> Optional[Dict]:
        """
        Return the cached metadata dict for ``cache_key`` if it is still fresh.

        Expired or malformed entries are evicted (marking the cache dirty)
        and ``None`` is returned.
        """
        if cache_key not in self._cache:
            return None

        entry = self._cache[cache_key]
        if self._is_fresh(entry):
            logger.debug("Cache hit for key: %s", cache_key)
            return entry.get('metadata')

        # Remove expired or unreadable entry
        del self._cache[cache_key]
        self._cache_dirty = True
        return None

    def _set_cached_metadata(self, cache_key: str, metadata: Dict):
        """Store ``metadata`` under ``cache_key`` stamped with the current time."""
        self._cache[cache_key] = {
            'metadata': metadata,
            'cached_at': datetime.now().isoformat()
        }
        self._cache_dirty = True

    async def enrich_metadata(
        self,
        provider_metadata: Dict,
        title: str,
        url: Optional[str] = None,
        use_kitsu_fallback: bool = True
    ) -> AnimeMetadata:
        """
        Enrich provider metadata with Kitsu API fallback.

        Args:
            provider_metadata: Metadata dict from anime provider
            title: Anime title (for Kitsu search and cache key)
            url: Optional anime URL (for cache key)
            use_kitsu_fallback: Whether to use Kitsu API for missing fields

        Returns:
            Enriched AnimeMetadata object. Keys starting with ``_`` in the
            provider metadata are treated as internal and are not passed on.

        Raises:
            Whatever ``AnimeMetadata(**fields)`` raises for invalid fields.
        """
        # Check cache first
        cache_key = self._get_cache_key(title, url)
        cached = self._get_cached_metadata(cache_key)
        if cached:
            return AnimeMetadata(**cached)

        # Start with a copy so the caller's dict is never mutated
        enriched = provider_metadata.copy()
        missing_fields = self._get_missing_fields(enriched)

        if missing_fields and use_kitsu_fallback:
            logger.info(
                "Missing fields for '%s': %s - fetching from Kitsu",
                title, missing_fields
            )
            try:
                kitsu_metadata = await self._fetch_from_kitsu(title)
                if kitsu_metadata:
                    enriched = self._merge_metadata(enriched, kitsu_metadata)
                    filled = missing_fields - self._get_missing_fields(enriched)
                    logger.debug(
                        "Kitsu filled fields for '%s': %s", title, sorted(filled)
                    )
            except Exception as e:
                # Enrichment is optional; fall back to provider data alone.
                logger.warning(
                    "Failed to fetch Kitsu metadata for '%s': %s", title, e
                )

        # The score is diagnostic only; it is not part of AnimeMetadata.
        logger.debug(
            "Metadata quality score for '%s': %.2f",
            title, self._calculate_quality_score(enriched)
        )

        # Convert to AnimeMetadata, excluding internal (underscore) fields
        public_fields = {
            k: v for k, v in enriched.items()
            if not k.startswith('_')
        }
        result = AnimeMetadata(**public_fields)

        # Cache the result
        self._set_cached_metadata(cache_key, result.model_dump())

        # Flush to disk only every 10th entry to limit file writes
        if self._cache_dirty and len(self._cache) % 10 == 0:
            self._save_cache()

        return result

    def _get_missing_fields(self, metadata: Dict) -> Set[str]:
        """
        Return the ``KITSU_FIELDS`` that are absent or empty in ``metadata``.

        A field counts as missing when its value is ``None``, ``[]`` or ``''``.
        """
        return {
            field for field in self.KITSU_FIELDS
            if metadata.get(field) is None
            or metadata.get(field) == []
            or metadata.get(field) == ''
        }

    async def _fetch_from_kitsu(self, title: str) -> Optional[Dict]:
        """
        Search Kitsu for ``title`` and convert the first hit.

        Returns:
            Metadata dict in the app's format, or ``None`` when there are no
            results or the lookup fails (failures are logged, not raised).
        """
        try:
            results = await self.kitsu_api.search_anime(title, limit=1)
            if not results:
                logger.debug("No Kitsu results for '%s'", title)
                return None
            return self._convert_kitsu_to_metadata(results[0])
        except Exception as e:
            logger.error("Error fetching from Kitsu for '%s': %s", title, e)
            return None

    @staticmethod
    def _as_dict(value: object) -> Dict:
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _coerce_score(raw: object) -> Optional[float]:
        """Convert a numeric or numeric-string score to float, else ``None``."""
        try:
            return float(raw)
        except (TypeError, ValueError):
            return None

    def _convert_kitsu_to_metadata(self, kitsu_data: Dict) -> Dict:
        """
        Convert one search result from ``KitsuAPI`` to the app metadata format.

        Only fields with a usable value are included in the returned dict.
        Unexpected shapes (``images`` being ``None``, a score delivered as a
        string, ...) are tolerated so one odd field does not discard the rest.

        NOTE(review): the keys read here (``score``, ``year``, ``episodes``,
        ``images.jpg``/``images.webp``, ``title_japanese``, ...) are assumed
        to match what ``KitsuAPI.search_anime`` returns - confirm against
        that wrapper.
        """
        metadata: Dict = {}

        # Synopsis
        if kitsu_data.get('synopsis'):
            metadata['synopsis'] = kitsu_data['synopsis']

        # Genres
        if kitsu_data.get('genres'):
            metadata['genres'] = kitsu_data['genres']

        # Rating, rendered as "<score>/10" with one decimal
        raw_score = kitsu_data.get('score')
        if raw_score:
            score = self._coerce_score(raw_score)
            if score is not None and score > 0:
                metadata['rating'] = f"{score:.1f}/10"

        # Release year
        if kitsu_data.get('year'):
            metadata['release_year'] = kitsu_data['year']

        images = self._as_dict(kitsu_data.get('images'))
        jpg_images = self._as_dict(images.get('jpg'))
        webp_images = self._as_dict(images.get('webp'))

        # Poster image: prefer the large JPG, fall back to the regular one
        poster = jpg_images.get('large_image_url') or jpg_images.get('image_url')
        if poster:
            metadata['poster_image'] = poster

        # Banner image: there is no dedicated banner in the data read here,
        # so the large WebP variant is used when available.
        banner = webp_images.get('large_image_url')
        if banner:
            metadata['banner_image'] = banner

        # Total episodes
        if kitsu_data.get('episodes'):
            metadata['total_episodes'] = kitsu_data['episodes']

        # Status, translated to the app's labels where a mapping exists
        status = kitsu_data.get('status')
        if status:
            metadata['status'] = self._KITSU_STATUS_MAP.get(status, status)

        # Alternative titles (Japanese first, then English)
        alt_titles = [
            kitsu_data[key]
            for key in ('title_japanese', 'title_english')
            if kitsu_data.get(key)
        ]
        if alt_titles:
            metadata['alternative_titles'] = alt_titles

        return metadata

    def _merge_metadata(
        self,
        provider_metadata: Dict,
        kitsu_metadata: Dict
    ) -> Dict:
        """
        Merge provider and Kitsu metadata, preferring provider data.

        A Kitsu value is used only when the provider has no value for that
        field or its value is falsy. Neither input dict is modified.
        """
        merged = provider_metadata.copy()

        for field, value in kitsu_metadata.items():
            if not merged.get(field):
                merged[field] = value

        return merged

    def _has_meaningful_value(self, field: str, value: object) -> bool:
        """
        Decide whether ``value`` counts as present for quality scoring.

        Any truthy value counts. The synopsis additionally has to be longer
        than ``MIN_SYNOPSIS_LENGTH`` characters; other strings such as a
        ``"8.5/10"`` rating or a one-word status are short by nature and
        count as soon as they are non-empty.
        """
        if not value:
            return False
        if field == 'synopsis' and isinstance(value, str):
            return len(value) > self.MIN_SYNOPSIS_LENGTH
        return True

    def _calculate_quality_score(self, metadata: Dict) -> float:
        """
        Calculate metadata quality score (0-1).

        The score is the weighted share of fields in ``_QUALITY_WEIGHTS``
        that carry a meaningful value, rounded to two decimals.
        """
        weights = self._QUALITY_WEIGHTS
        total_weight = sum(weights.values())
        score = 0.0

        for field, weight in weights.items():
            if self._has_meaningful_value(field, metadata.get(field)):
                score += weight

        return round(score / total_weight, 2) if total_weight > 0 else 0.0

    async def enrich_search_results(
        self,
        results: List[Dict],
        use_kitsu_fallback: bool = True
    ) -> List[Dict]:
        """
        Enrich metadata for a list of search results.

        Results carrying a ``'metadata'`` key are enriched concurrently.
        Results without one, and results whose enrichment failed, are
        returned with their original content.

        Args:
            results: List of search result dicts with optional 'metadata' field
            use_kitsu_fallback: Whether to use Kitsu API

        Returns:
            A new list of shallow copies, same length and order as
            ``results``; the input dicts are not modified.
        """
        # Always start from a full copy so entries without metadata are kept
        # even when nothing needs enriching.
        enriched_results = [result.copy() for result in results]

        pending = [
            idx for idx, result in enumerate(results)
            if 'metadata' in result
        ]
        if not pending:
            return enriched_results

        # Process results in parallel for better performance
        outcomes = await asyncio.gather(
            *(
                self.enrich_metadata(
                    provider_metadata=results[idx]['metadata'],
                    title=results[idx].get('title', ''),
                    url=results[idx].get('url'),
                    use_kitsu_fallback=use_kitsu_fallback
                )
                for idx in pending
            ),
            return_exceptions=True
        )

        for idx, outcome in zip(pending, outcomes):
            # BaseException also covers a CancelledError returned by gather.
            if isinstance(outcome, BaseException):
                logger.warning(
                    "Failed to enrich metadata for '%s': %s",
                    results[idx].get('title'), outcome
                )
                continue  # keep original metadata
            enriched_results[idx]['metadata'] = outcome.model_dump()

        return enriched_results

    async def close(self):
        """
        Close the Kitsu client and flush the cache to disk.

        The cache is saved even if closing the Kitsu client raises; that
        exception is then propagated to the caller.
        """
        try:
            await self.kitsu_api.close()
        finally:
            self._save_cache()
        logger.info("MetadataEnricher closed")
+
+
# Module-wide singleton state. `_enricher_instance` holds the shared
# MetadataEnricher (None until get_metadata_enricher() creates it, and reset
# to None again by close_metadata_enricher()).
_enricher_instance: Optional[MetadataEnricher] = None
# Held by get_metadata_enricher() while it creates the shared instance.
_enricher_lock = asyncio.Lock()
+
+
async def get_metadata_enricher() -> MetadataEnricher:
    """
    Return the shared MetadataEnricher, creating it on first use.

    The instance is stored in the module-level ``_enricher_instance``;
    creation happens while holding ``_enricher_lock`` and re-checks the
    global after the lock is acquired.
    """
    global _enricher_instance

    # Fast path: already created, no need to touch the lock.
    if _enricher_instance is not None:
        return _enricher_instance

    async with _enricher_lock:
        # Another caller may have created it while we waited for the lock.
        if _enricher_instance is None:
            _enricher_instance = MetadataEnricher()
            logger.info("Created global MetadataEnricher instance")

    return _enricher_instance
+
+
async def close_metadata_enricher():
    """
    Close the global MetadataEnricher instance, if one exists.

    The global reference is cleared *before* awaiting ``close()``. While the
    close is in progress other coroutines can run; with the reference already
    cleared they get a fresh instance from ``get_metadata_enricher()`` rather
    than one that is shutting down, and a second concurrent call to this
    function becomes a no-op. If ``close()`` raises, the exception propagates
    and the failed instance is not left installed as the global.
    """
    global _enricher_instance

    instance = _enricher_instance
    if instance is None:
        return

    # Detach first: `await` below is a suspension point.
    _enricher_instance = None
    await instance.close()
    logger.info("Closed global MetadataEnricher instance")