2b4cc617cb
- Implemented YAML-driven GenericScraper for resilient scraping - Added ProvidersManager to manage scraper health and active providers - Modernized unified search with systematic Kitsu metadata enrichment - Integrated automated health checks in the scheduler - Added comprehensive tests for scraping DSL and provider health
443 lines
16 KiB
Python
443 lines
16 KiB
Python
"""
|
|
Tests for metadata enrichment with Kitsu API fallback.
|
|
"""
|
|
import pytest
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
from datetime import datetime, timedelta
|
|
|
|
from app.metadata_enrichment import MetadataEnricher
|
|
from app.models import AnimeMetadata
|
|
|
|
|
|
@pytest.fixture
async def enricher(temp_dir):
    """Yield a MetadataEnricher whose cache lives in ``temp_dir``.

    The instance is closed once the requesting test has finished.
    """
    instance = MetadataEnricher(cache_dir=temp_dir)
    yield instance
    # Teardown: runs after the test body completes.
    await instance.close()
|
|
|
|
|
|
@pytest.fixture
def mock_kitsu_api():
    """Return a sample Kitsu anime payload.

    The payload carries titles, synopsis, genres, score, year, episode
    count, status and nested poster/banner image URLs; it feeds the
    conversion and merge tests.
    """
    return {
        'title': 'Naruto',
        'title_japanese': 'ナルト',
        'title_english': 'Naruto',
        'synopsis': 'A test synopsis from Kitsu',
        'genres': ['Action', 'Adventure'],
        'score': 8.5,
        'year': 2002,
        'episodes': 220,
        'status': 'Finished Airing',
        'images': {
            'jpg': {
                'large_image_url': 'https://kitsu.io/naruto-poster.jpg',
                'image_url': 'https://kitsu.io/naruto-poster-small.jpg',
            },
            'webp': {
                'large_image_url': 'https://kitsu.io/naruto-banner.jpg',
            },
        },
    }
|
|
|
|
|
|
@pytest.fixture
def mock_kitsu_api_raw():
    """Return a sample anime record used as the patched ``search_anime`` result."""
    image_variants = {
        'jpg': {
            'image_url': 'https://kitsu.io/naruto-poster-small.jpg',
            'large_image_url': 'https://kitsu.io/naruto-poster.jpg',
        },
        'webp': {
            'image_url': 'https://kitsu.io/naruto-poster-small.webp',
            'large_image_url': 'https://kitsu.io/naruto-banner.jpg',
        },
    }
    record = {
        'mal_id': 123,
        'title': 'Naruto',
        'title_japanese': 'ナルト',
        'title_english': 'Naruto',
        'episodes': 220,
        'status': 'Finished Airing',
        'score': 8.5,
        'synopsis': 'A test synopsis from Kitsu',
        'genres': ['Action', 'Adventure'],
        'images': image_variants,
        'url': 'https://kitsu.io/anime/123',
        'subtype': 'TV',
        'year': 2002,
    }
    return record
|
|
|
|
|
|
class TestMetadataEnricher:
    """Unit tests for MetadataEnricher helpers, merging, caching and enrichment."""

    def test_init_creates_cache_dir(self, enricher, temp_dir):
        """The enricher exposes the cache directory it was given and its cache file path."""
        assert enricher.cache_dir == temp_dir
        assert enricher.cache_file == temp_dir / "metadata_cache.json"

    def test_get_cache_key(self, enricher):
        """Cache keys are stable for equal inputs and differ when the URL differs."""
        title = "Naruto"
        naruto_url = "https://example.com/naruto"

        first_key = enricher._get_cache_key(title, naruto_url)
        repeated_key = enricher._get_cache_key(title, naruto_url)
        other_url_key = enricher._get_cache_key(title, "https://example.com/sasuke")

        # Identical title/URL pairs map to the same key.
        assert first_key == repeated_key

        # Changing only the URL changes the key.
        assert first_key != other_url_key

    def test_get_missing_fields(self, enricher):
        """Missing-field detection reports nothing for full metadata and flags gaps."""
        # Every field populated: nothing should be reported.
        full_metadata = {
            'synopsis': 'Test synopsis',
            'genres': ['Action'],
            'rating': '8.5/10',
            'release_year': 2020,
            'studio': 'Studio Pierrot',
            'poster_image': 'https://example.com/poster.jpg',
            'banner_image': 'https://example.com/banner.jpg',
            'total_episodes': 12,
            'status': 'Completed',
            'alternative_titles': ['Japanese Title'],  # Now required for completeness
        }
        assert len(enricher._get_missing_fields(full_metadata)) == 0

        # Sparse metadata: an empty genres list counts as missing too.
        sparse_metadata = {
            'synopsis': 'Test synopsis',
            'genres': [],
        }
        missing = enricher._get_missing_fields(sparse_metadata)

        # Note: studio is not in KITSU_FIELDS, so it is not expected here.
        for field_name in ('rating', 'release_year', 'status', 'genres'):
            assert field_name in missing
        assert len(missing) >= 4

    def test_convert_kitsu_to_metadata(self, enricher, mock_kitsu_api):
        """A Kitsu payload is converted into the internal metadata layout."""
        converted = enricher._convert_kitsu_to_metadata(mock_kitsu_api)

        expected_values = {
            'synopsis': 'A test synopsis from Kitsu',
            'genres': ['Action', 'Adventure'],
            'rating': '8.5/10',
            'release_year': 2002,
            'poster_image': 'https://kitsu.io/naruto-poster.jpg',
            'banner_image': 'https://kitsu.io/naruto-banner.jpg',
            'total_episodes': 220,
            'status': 'Completed',
        }
        for field_name, expected in expected_values.items():
            assert converted[field_name] == expected

        # Both the Japanese and the English title end up as alternatives.
        for alt_title in ('ナルト', 'Naruto'):
            assert alt_title in converted['alternative_titles']

    def test_convert_kitsu_status_translation(self, enricher):
        """Kitsu status strings are translated to the internal status vocabulary."""
        translations = {
            'Airing': 'Ongoing',
            'Finished Airing': 'Completed',
            'To Be Aired': 'Upcoming',
        }

        for kitsu_status, expected_status in translations.items():
            converted = enricher._convert_kitsu_to_metadata({'status': kitsu_status})
            assert converted['status'] == expected_status

    def test_merge_metadata_prefer_provider(self, enricher, mock_kitsu_api):
        """Provider values win on conflict; Kitsu values only fill the gaps."""
        from_provider = {
            'synopsis': 'Provider synopsis (better)',
            'genres': ['Action'],
            'rating': '9.0/10',  # Different from Kitsu
            'release_year': 2002,
            'studio': 'Studio Pierrot',  # Not in Kitsu
        }
        from_kitsu = enricher._convert_kitsu_to_metadata(mock_kitsu_api)

        merged = enricher._merge_metadata(from_provider, from_kitsu)

        # Fields supplied by the provider are kept as-is.
        assert merged['synopsis'] == 'Provider synopsis (better)'
        assert merged['rating'] == '9.0/10'
        assert merged['studio'] == 'Studio Pierrot'

        # Fields the provider lacked come from Kitsu.
        assert merged['total_episodes'] == 220
        assert merged['status'] == 'Completed'

    def test_calculate_quality_score(self, enricher):
        """Fully populated metadata scores above 0.8, minimal metadata below 0.5."""
        rich_metadata = {
            'synopsis': 'A detailed synopsis of the anime with lots of information',
            'genres': ['Action', 'Adventure', 'Fantasy'],
            'rating': '8.5/10',
            'release_year': 2020,
            'studio': 'Studio Pierrot',
            'poster_image': 'https://example.com/poster.jpg',
            'banner_image': 'https://example.com/banner.jpg',
            'total_episodes': 12,
            'status': 'Completed',
            'alternative_titles': ['Japanese Title'],
        }
        rich_score = enricher._calculate_quality_score(rich_metadata)
        assert rich_score > 0.8  # Should be high quality

        sparse_metadata = {
            'synopsis': 'Short',
            'genres': ['Action'],
        }
        sparse_score = enricher._calculate_quality_score(sparse_metadata)
        assert sparse_score < 0.5  # Should be low quality

    @pytest.mark.asyncio
    async def test_enrich_metadata_with_kitsu_fallback(self, enricher, mock_kitsu_api_raw):
        """With the fallback enabled, Kitsu fills fields the provider left out."""
        # Provider lacks rating, release_year, poster_image, etc.
        from_provider = {
            'synopsis': 'Provider synopsis',
            'genres': ['Action'],
        }

        # The Kitsu search is patched to hand back the sample record.
        kitsu_search = patch.object(
            enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]
        )
        with kitsu_search:
            enriched = await enricher.enrich_metadata(
                provider_metadata=from_provider,
                title='Naruto',
                url='https://example.com/naruto',
                use_kitsu_fallback=True,
            )

        # Gaps are filled from the Kitsu record.
        assert enriched.rating == '8.5/10'
        assert enriched.release_year == 2002
        assert enriched.poster_image is not None
        assert enriched.total_episodes == 220
        assert enriched.status == 'Completed'

        # The provider's own synopsis is untouched.
        assert enriched.synopsis == 'Provider synopsis'

    @pytest.mark.asyncio
    async def test_enrich_metadata_without_kitsu_fallback(self, enricher):
        """With the fallback disabled, only provider-supplied fields are present."""
        from_provider = {
            'synopsis': 'Provider synopsis',
            'genres': ['Action'],
        }

        enriched = await enricher.enrich_metadata(
            provider_metadata=from_provider,
            title='Naruto',
            url='https://example.com/naruto',
            use_kitsu_fallback=False,
        )

        # Provider fields pass through ...
        assert enriched.synopsis == 'Provider synopsis'
        assert enriched.genres == ['Action']
        # ... and nothing else is filled in without the fallback.
        assert enriched.rating is None
        assert enriched.release_year is None

    @pytest.mark.asyncio
    async def test_enrich_metadata_caching(self, enricher, mock_kitsu_api_raw):
        """A repeated enrichment of the same title/URL does not search Kitsu again."""
        request = {
            'provider_metadata': {
                'synopsis': 'Provider synopsis',
                'genres': ['Action'],
            },
            'title': 'Naruto',
            'url': 'https://example.com/naruto',
            'use_kitsu_fallback': True,
        }

        with patch.object(
            enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]
        ) as mock_search:
            # The first request has to go to Kitsu.
            first_result = await enricher.enrich_metadata(**request)
            assert mock_search.call_count == 1

            # The second, identical request must not trigger another search.
            second_result = await enricher.enrich_metadata(**request)
            assert mock_search.call_count == 1  # No additional call

            # Both requests yield the same metadata.
            assert first_result.model_dump() == second_result.model_dump()

    @pytest.mark.asyncio
    async def test_enrich_search_results(self, enricher, mock_kitsu_api_raw):
        """Search results carrying metadata are enriched; those without are left bare."""
        naruto_hit = {
            'title': 'Naruto',
            'url': 'https://example.com/naruto',
            'metadata': {
                'synopsis': 'Brief synopsis',
                'genres': ['Action'],
            },
        }
        one_piece_hit = {
            'title': 'One Piece',
            'url': 'https://example.com/onepiece',
            'metadata': {
                'synopsis': 'Another synopsis',
                'genres': ['Adventure'],
            },
        }
        bare_hit = {
            'title': 'No Metadata',
            'url': 'https://example.com/nometa',
        }
        search_results = [naruto_hit, one_piece_hit, bare_hit]

        with patch.object(
            enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]
        ):
            enriched = await enricher.enrich_search_results(
                results=search_results,
                use_kitsu_fallback=True,
            )

        # Every input result is still present.
        assert len(enriched) == 3

        first, second, third = enriched[0], enriched[1], enriched[2]

        # The first hit picked up Kitsu values.
        assert first['metadata']['rating'] == '8.5/10'
        assert first['metadata']['release_year'] == 2002

        # So did the second one.
        assert second['metadata']['rating'] == '8.5/10'

        # The third hit had no metadata and must not have gained any.
        assert third.get('metadata') is None

    @pytest.mark.asyncio
    async def test_cache_expiry(self, enricher, mock_kitsu_api_raw):
        """A cache entry stamped 25 hours ago is ignored and Kitsu is queried again."""
        from_provider = {'synopsis': 'Test'}
        title = 'Test'
        url = 'https://example.com/test'

        # Seed the cache with an entry that is already stale.
        stale_timestamp = datetime.now() - timedelta(hours=25)
        stale_key = enricher._get_cache_key(title, url)
        enricher._cache[stale_key] = {
            'metadata': from_provider,
            'cached_at': stale_timestamp.isoformat(),  # Expired
        }
        enricher._cache_dirty = True

        with patch.object(
            enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]
        ) as mock_search:
            # The stale entry must not be served, so Kitsu is consulted.
            enriched = await enricher.enrich_metadata(
                provider_metadata=from_provider,
                title=title,
                url=url,
                use_kitsu_fallback=True,
            )

        assert mock_search.call_count == 1
        assert enriched.rating == '8.5/10'

    @pytest.mark.asyncio
    async def test_close_saves_cache(self, enricher):
        """Closing an enricher with a dirty cache leaves a cache file on disk."""
        # Put one entry in the cache and flag it as unsaved.
        entry = {
            'metadata': {'test': 'data'},
            'cached_at': datetime.now().isoformat(),
        }
        enricher._cache['test_key'] = entry
        enricher._cache_dirty = True

        await enricher.close()

        # The cache must have been written out.
        assert enricher.cache_file.exists()

    @pytest.mark.asyncio
    async def test_fetch_from_kitsu_error_handling(self, enricher):
        """A failing Kitsu fetch falls back to the provider metadata alone."""
        from_provider = {'synopsis': 'Test'}

        failing_fetch = patch.object(
            enricher, '_fetch_from_kitsu', side_effect=Exception("API Error")
        )
        with failing_fetch:
            enriched = await enricher.enrich_metadata(
                provider_metadata=from_provider,
                title='NonExistent Anime',
                url='https://example.com/nonexistent',
                use_kitsu_fallback=True,
            )

        # The error is absorbed: provider data survives, nothing is added.
        assert enriched.synopsis == 'Test'
        assert enriched.rating is None
|
|
|
|
|
|
class TestMetadataEnrichmentIntegration:
    """Slow integration tests that exercise MetadataEnricher without mocks."""

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_kitsu_api_integration(self):
        """Search Kitsu for a well-known title and check the result shape (slow)."""
        live_enricher = MetadataEnricher()

        try:
            hits = await live_enricher.kitsu_api.search_anime('Naruto', limit=1)

            assert len(hits) > 0
            top_hit = hits[0]
            assert 'title' in top_hit
            assert 'synopsis' in top_hit or 'genres' in top_hit
        finally:
            # Always release the enricher, even when an assertion fails.
            await live_enricher.close()

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_full_enrichment_flow(self):
        """Run a complete enrichment for provider metadata that has gaps (slow)."""
        live_enricher = MetadataEnricher()

        try:
            # Provider metadata that leaves many fields unset.
            from_provider = {
                'synopsis': 'Naruto Uzumaki wants to be the best ninja.',
                'genres': ['Action'],
            }

            enriched = await live_enricher.enrich_metadata(
                provider_metadata=from_provider,
                title='Naruto',
                url='https://test.com/naruto',
                use_kitsu_fallback=True,
            )

            # The provider-supplied fields must still be populated.
            assert enriched.synopsis is not None
            assert len(enriched.genres) > 0

            # Kitsu may have filled other gaps, but its responses vary, so no
            # specific field is asserted here.
            # NOTE(review): with a default of 0 this check also passes when
            # '_quality_score' is absent from model_dump() - confirm the key
            # is actually exposed there.
            quality_score = enriched.model_dump().get('_quality_score', 0)
            assert quality_score >= 0
        finally:
            # Always release the enricher, even when an assertion fails.
            await live_enricher.close()
|