Files
ohm_streaming/tests/test_metadata_enrichment.py
T
root 2b4cc617cb
CI / Test (Python 3.11) (push) Has been cancelled
CI / Test (Python 3.12) (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Type Check (push) Has been cancelled
CI / Summary (push) Has been cancelled
feat: robust scraping DSL and health monitoring (Phase 2)
- Implemented YAML-driven GenericScraper for resilient scraping
- Added ProvidersManager to manage scraper health and active providers
- Modernized unified search with systematic Kitsu metadata enrichment
- Integrated automated health checks in the scheduler
- Added comprehensive tests for scraping DSL and provider health
2026-03-24 10:57:19 +00:00

443 lines
16 KiB
Python

"""
Tests for metadata enrichment with Kitsu API fallback.
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from datetime import datetime, timedelta
from app.metadata_enrichment import MetadataEnricher
from app.models import AnimeMetadata
@pytest.fixture
async def enricher(temp_dir):
    """Provide a MetadataEnricher whose cache directory is ``temp_dir``.

    The instance is handed to the requesting test and closed afterwards.
    """
    instance = MetadataEnricher(cache_dir=temp_dir)
    yield instance
    # Teardown: runs once the requesting test has finished.
    await instance.close()
@pytest.fixture
def mock_kitsu_api():
    """Return a canned anime record used as input for the Kitsu converter tests.

    The record carries titles, synopsis, genres, score, year, episode count,
    status and a nested ``images`` table with ``jpg`` and ``webp`` variants.
    """
    # Image variants are kept separate so the main record stays readable.
    image_table = {
        'jpg': {
            'large_image_url': 'https://kitsu.io/naruto-poster.jpg',
            'image_url': 'https://kitsu.io/naruto-poster-small.jpg',
        },
        'webp': {
            'large_image_url': 'https://kitsu.io/naruto-banner.jpg',
        },
    }
    return {
        'title': 'Naruto',
        'title_japanese': 'ナルト',
        'title_english': 'Naruto',
        'synopsis': 'A test synopsis from Kitsu',
        'genres': ['Action', 'Adventure'],
        'score': 8.5,
        'year': 2002,
        'episodes': 220,
        'status': 'Finished Airing',
        'images': image_table,
    }
@pytest.fixture
def mock_kitsu_api_raw():
    """Return a canned search hit used as the mocked ``search_anime`` result.

    Compared with ``mock_kitsu_api`` it additionally carries ``mal_id``,
    ``url`` and ``subtype``, and both image variants include a small
    ``image_url`` entry.
    """
    jpg_images = {
        'image_url': 'https://kitsu.io/naruto-poster-small.jpg',
        'large_image_url': 'https://kitsu.io/naruto-poster.jpg',
    }
    webp_images = {
        'image_url': 'https://kitsu.io/naruto-poster-small.webp',
        'large_image_url': 'https://kitsu.io/naruto-banner.jpg',
    }
    record = {
        'mal_id': 123,
        'title': 'Naruto',
        'title_japanese': 'ナルト',
        'title_english': 'Naruto',
        'episodes': 220,
        'status': 'Finished Airing',
        'score': 8.5,
        'synopsis': 'A test synopsis from Kitsu',
        'genres': ['Action', 'Adventure'],
        'images': {
            'jpg': jpg_images,
            'webp': webp_images,
        },
        'url': 'https://kitsu.io/anime/123',
        'subtype': 'TV',
        'year': 2002,
    }
    return record
class TestMetadataEnricher:
    """Unit tests for MetadataEnricher helpers and the enrichment flow."""

    def test_init_creates_cache_dir(self, enricher, temp_dir):
        """The enricher exposes the cache dir it was given and its cache file path."""
        expected_file = temp_dir / "metadata_cache.json"
        assert enricher.cache_dir == temp_dir
        assert enricher.cache_file == expected_file

    def test_get_cache_key(self, enricher):
        """Cache keys are stable for equal inputs and differ when the URL differs."""
        title = "Naruto"
        first = enricher._get_cache_key(title, "https://example.com/naruto")
        repeat = enricher._get_cache_key(title, "https://example.com/naruto")
        other_url = enricher._get_cache_key(title, "https://example.com/sasuke")

        # Identical (title, url) pairs map to the same key.
        assert first == repeat
        # Changing only the URL yields another key.
        assert first != other_url

    def test_get_missing_fields(self, enricher):
        """Missing-field detection reports absent fields and empty lists."""
        # Every field populated, including alternative titles.
        fully_populated = {
            'synopsis': 'Test synopsis',
            'genres': ['Action'],
            'rating': '8.5/10',
            'release_year': 2020,
            'studio': 'Studio Pierrot',
            'poster_image': 'https://example.com/poster.jpg',
            'banner_image': 'https://example.com/banner.jpg',
            'total_episodes': 12,
            'status': 'Completed',
            'alternative_titles': ['Japanese Title'],
        }
        no_gaps = enricher._get_missing_fields(fully_populated)
        assert len(no_gaps) == 0

        # Only a synopsis plus an empty genre list.
        sparse = {
            'synopsis': 'Test synopsis',
            'genres': [],
        }
        gaps = enricher._get_missing_fields(sparse)
        # 'studio' is deliberately not checked here (per the original note it
        # is not among the Kitsu-backed fields); the empty 'genres' list is
        # expected to count as missing.
        for field in ('rating', 'release_year', 'status', 'genres'):
            assert field in gaps
        assert len(gaps) >= 4

    def test_convert_kitsu_to_metadata(self, enricher, mock_kitsu_api):
        """A Kitsu record is converted into the internal metadata layout."""
        converted = enricher._convert_kitsu_to_metadata(mock_kitsu_api)

        expected_fields = {
            'synopsis': 'A test synopsis from Kitsu',
            'genres': ['Action', 'Adventure'],
            'rating': '8.5/10',
            'release_year': 2002,
            'poster_image': 'https://kitsu.io/naruto-poster.jpg',
            'banner_image': 'https://kitsu.io/naruto-banner.jpg',
            'total_episodes': 220,
            'status': 'Completed',
        }
        for field, value in expected_fields.items():
            assert converted[field] == value

        # Both the Japanese and the English title end up as alternatives.
        for alt_title in ('ナルト', 'Naruto'):
            assert alt_title in converted['alternative_titles']

    def test_convert_kitsu_status_translation(self, enricher):
        """Kitsu status strings are translated to the internal status names."""
        translations = {
            'Airing': 'Ongoing',
            'Finished Airing': 'Completed',
            'To Be Aired': 'Upcoming',
        }
        for source_status, internal_status in translations.items():
            converted = enricher._convert_kitsu_to_metadata(
                {'status': source_status}
            )
            assert converted['status'] == internal_status

    def test_merge_metadata_prefer_provider(self, enricher, mock_kitsu_api):
        """Provider values win on conflict; Kitsu only fills the gaps."""
        from_provider = {
            'synopsis': 'Provider synopsis (better)',
            'genres': ['Action'],
            'rating': '9.0/10',  # conflicts with the Kitsu score
            'release_year': 2002,
            'studio': 'Studio Pierrot',  # absent from the Kitsu record
        }
        from_kitsu = enricher._convert_kitsu_to_metadata(mock_kitsu_api)

        combined = enricher._merge_metadata(from_provider, from_kitsu)

        # Values supplied by the provider survive the merge.
        assert combined['synopsis'] == 'Provider synopsis (better)'
        assert combined['rating'] == '9.0/10'
        assert combined['studio'] == 'Studio Pierrot'
        # Fields the provider lacked come from Kitsu.
        assert combined['total_episodes'] == 220
        assert combined['status'] == 'Completed'

    def test_calculate_quality_score(self, enricher):
        """Rich metadata scores above 0.8, sparse metadata below 0.5."""
        rich = {
            'synopsis': 'A detailed synopsis of the anime with lots of information',
            'genres': ['Action', 'Adventure', 'Fantasy'],
            'rating': '8.5/10',
            'release_year': 2020,
            'studio': 'Studio Pierrot',
            'poster_image': 'https://example.com/poster.jpg',
            'banner_image': 'https://example.com/banner.jpg',
            'total_episodes': 12,
            'status': 'Completed',
            'alternative_titles': ['Japanese Title'],
        }
        rich_score = enricher._calculate_quality_score(rich)
        assert rich_score > 0.8

        sparse = {
            'synopsis': 'Short',
            'genres': ['Action'],
        }
        sparse_score = enricher._calculate_quality_score(sparse)
        assert sparse_score < 0.5

    @pytest.mark.asyncio
    async def test_enrich_metadata_with_kitsu_fallback(self, enricher, mock_kitsu_api_raw):
        """With the fallback enabled, Kitsu fills fields the provider lacks."""
        # Provider supplies only synopsis and genres; everything else is absent.
        call_args = {
            'provider_metadata': {
                'synopsis': 'Provider synopsis',
                'genres': ['Action'],
            },
            'title': 'Naruto',
            'url': 'https://example.com/naruto',
            'use_kitsu_fallback': True,
        }
        # The Kitsu search is mocked to return the canned record.
        with patch.object(enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]):
            enriched = await enricher.enrich_metadata(**call_args)

        # Gaps filled from Kitsu.
        assert enriched.rating == '8.5/10'
        assert enriched.release_year == 2002
        assert enriched.poster_image is not None
        assert enriched.total_episodes == 220
        assert enriched.status == 'Completed'
        # Provider value kept.
        assert enriched.synopsis == 'Provider synopsis'

    @pytest.mark.asyncio
    async def test_enrich_metadata_without_kitsu_fallback(self, enricher):
        """With the fallback disabled, only provider values are present."""
        call_args = {
            'provider_metadata': {
                'synopsis': 'Provider synopsis',
                'genres': ['Action'],
            },
            'title': 'Naruto',
            'url': 'https://example.com/naruto',
            'use_kitsu_fallback': False,
        }
        enriched = await enricher.enrich_metadata(**call_args)

        assert enriched.synopsis == 'Provider synopsis'
        assert enriched.genres == ['Action']
        # Nothing was fetched, so these stay unset.
        assert enriched.rating is None
        assert enriched.release_year is None

    @pytest.mark.asyncio
    async def test_enrich_metadata_caching(self, enricher, mock_kitsu_api_raw):
        """A repeated request is answered without a second Kitsu search."""
        call_args = {
            'provider_metadata': {
                'synopsis': 'Provider synopsis',
                'genres': ['Action'],
            },
            'title': 'Naruto',
            'url': 'https://example.com/naruto',
            'use_kitsu_fallback': True,
        }
        with patch.object(enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]) as search_mock:
            # First request triggers exactly one search.
            initial = await enricher.enrich_metadata(**call_args)
            assert search_mock.call_count == 1

            # Second identical request must not search again.
            repeated = await enricher.enrich_metadata(**call_args)
            assert search_mock.call_count == 1

            # Both requests yield the same serialized metadata.
            assert initial.model_dump() == repeated.model_dump()

    @pytest.mark.asyncio
    async def test_enrich_search_results(self, enricher, mock_kitsu_api_raw):
        """Batch enrichment handles entries with and without metadata."""
        with_meta_first = {
            'title': 'Naruto',
            'url': 'https://example.com/naruto',
            'metadata': {
                'synopsis': 'Brief synopsis',
                'genres': ['Action'],
            },
        }
        with_meta_second = {
            'title': 'One Piece',
            'url': 'https://example.com/onepiece',
            'metadata': {
                'synopsis': 'Another synopsis',
                'genres': ['Adventure'],
            },
        }
        without_meta = {
            'title': 'No Metadata',
            'url': 'https://example.com/nometa',
        }
        batch = [with_meta_first, with_meta_second, without_meta]

        with patch.object(enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]):
            enriched = await enricher.enrich_search_results(
                results=batch,
                use_kitsu_fallback=True,
            )

        # Every input entry has a corresponding output entry.
        assert len(enriched) == 3
        # Entries that carried metadata gain Kitsu values.
        assert enriched[0]['metadata']['rating'] == '8.5/10'
        assert enriched[0]['metadata']['release_year'] == 2002
        assert enriched[1]['metadata']['rating'] == '8.5/10'
        # The entry without metadata still has none.
        assert 'metadata' not in enriched[2] or enriched[2].get('metadata') is None

    @pytest.mark.asyncio
    async def test_cache_expiry(self, enricher, mock_kitsu_api_raw):
        """A cache entry stamped 25 hours ago does not prevent a fresh Kitsu lookup."""
        provider_only = {'synopsis': 'Test'}

        # Seed the in-memory cache with an entry stamped 25 hours in the past.
        stale_stamp = datetime.now() - timedelta(hours=25)
        stale_key = enricher._get_cache_key('Test', 'https://example.com/test')
        enricher._cache[stale_key] = {
            'metadata': provider_only,
            'cached_at': stale_stamp.isoformat(),
        }
        enricher._cache_dirty = True

        call_args = {
            'provider_metadata': provider_only,
            'title': 'Test',
            'url': 'https://example.com/test',
            'use_kitsu_fallback': True,
        }
        with patch.object(enricher.kitsu_api, 'search_anime', return_value=[mock_kitsu_api_raw]) as search_mock:
            refreshed = await enricher.enrich_metadata(**call_args)
            # The stale entry was not served, so Kitsu was queried once.
            assert search_mock.call_count == 1
            assert refreshed.rating == '8.5/10'

    @pytest.mark.asyncio
    async def test_close_saves_cache(self, enricher):
        """Closing an enricher with a dirty cache leaves a cache file on disk."""
        # Put one entry into the in-memory cache and flag it as unsaved.
        entry = {
            'metadata': {'test': 'data'},
            'cached_at': datetime.now().isoformat(),
        }
        enricher._cache['test_key'] = entry
        enricher._cache_dirty = True

        await enricher.close()

        assert enricher.cache_file.exists()

    @pytest.mark.asyncio
    async def test_fetch_from_kitsu_error_handling(self, enricher):
        """A failing Kitsu fetch degrades to provider-only metadata."""
        call_args = {
            'provider_metadata': {'synopsis': 'Test'},
            'title': 'NonExistent Anime',
            'url': 'https://example.com/nonexistent',
            'use_kitsu_fallback': True,
        }
        # Make the internal fetch raise to simulate an API outage.
        with patch.object(enricher, '_fetch_from_kitsu', side_effect=Exception("API Error")):
            degraded = await enricher.enrich_metadata(**call_args)

        # The provider value is returned and nothing was enriched.
        assert degraded.synopsis == 'Test'
        assert degraded.rating is None
class TestMetadataEnrichmentIntegration:
    """Integration tests that talk to the real Kitsu API (marked ``slow``).

    Both tests request the module-level ``enricher`` fixture instead of
    building ``MetadataEnricher()`` themselves. The fixture points the cache
    at a temporary directory and closes the instance on teardown, so these
    tests neither write to the default on-disk cache nor get answered from a
    stale cached entry left by an earlier run.
    """

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_kitsu_api_integration(self, enricher):
        """Search the live Kitsu API for a well-known title and check the result shape."""
        results = await enricher.kitsu_api.search_anime('Naruto', limit=1)

        assert len(results) > 0
        top_hit = results[0]
        assert 'title' in top_hit
        assert 'synopsis' in top_hit or 'genres' in top_hit

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_full_enrichment_flow(self, enricher):
        """Run the complete enrichment flow against live data."""
        # Provider metadata with only synopsis and genres; the rest is absent.
        provider_metadata = {
            'synopsis': 'Naruto Uzumaki wants to be the best ninja.',
            'genres': ['Action'],
        }

        result = await enricher.enrich_metadata(
            provider_metadata=provider_metadata,
            title='Naruto',
            url='https://test.com/naruto',
            use_kitsu_fallback=True,
        )

        # Provider values must still be present after enrichment.
        assert result.synopsis is not None
        assert len(result.genres) > 0

        # Live Kitsu responses vary, so no specific enriched field is asserted.
        # NOTE(review): this only checks that the score, when exposed by
        # model_dump(), is non-negative; with the 0 default it cannot fail
        # when the key is absent — confirm whether a stronger check is wanted.
        quality_score = result.model_dump().get('_quality_score', 0)
        assert quality_score >= 0