fix: detect Format A by domain differences, remove duplicate detection code
This commit is contained in:
@@ -778,33 +778,42 @@ class AnimeSamaDownloader(BaseAnimeSite):
|
|||||||
|
|
||||||
if eps_matches:
|
if eps_matches:
|
||||||
# Determine the format by looking at the data
|
# Determine the format by looking at the data
|
||||||
# Format A: each epsX array is one source with all episodes (few arrays, many URLs each)
|
# Format A: each epsX array is one SOURCE with all episodes (different domains per array)
|
||||||
# Format B: each epsX array is one episode with multiple sources (many arrays or similar counts)
|
# Format B: each epsX array is one EPISODE with multiple sources (same domains across arrays)
|
||||||
|
|
||||||
eps1_urls = re.findall(r"'(https?://[^']+)'", eps_matches[0][1])
|
eps1_urls = re.findall(r"'(https?://[^']+)'", eps_matches[0][1])
|
||||||
num_episode_arrays = len(eps_matches)
|
num_episode_arrays = len(eps_matches)
|
||||||
|
|
||||||
is_format_a = True # Default
|
is_format_a = True # Default
|
||||||
|
|
||||||
if num_episode_arrays > 5:
|
if num_episode_arrays >= 2:
|
||||||
# Many arrays = Format B (each array = one episode)
|
# Extract domains from first URLs of each array
|
||||||
is_format_a = False
|
def get_domain(url):
|
||||||
elif num_episode_arrays >= 2:
|
return url.split('/')[2] if '/' in url else url
|
||||||
# Check URL counts - if similar, it's Format B
|
|
||||||
url_counts = []
|
domains_per_array = []
|
||||||
for eps_num, urls_text in eps_matches:
|
for eps_num, urls_text in eps_matches:
|
||||||
urls = re.findall(r"'(https?://[^']+)'", urls_text)
|
urls = re.findall(r"'(https?://[^']+)'", urls_text)
|
||||||
url_counts.append(len(urls))
|
if urls:
|
||||||
|
domains = set(get_domain(u) for u in urls[:3]) # Sample first 3
|
||||||
|
domains_per_array.append(domains)
|
||||||
|
|
||||||
if url_counts and max(url_counts) > 0:
|
# Check if domains are different across arrays
|
||||||
avg_count = sum(url_counts) / len(url_counts)
|
# If each array has completely different domains → Format A (each = source)
|
||||||
variance = max(url_counts) / avg_count if avg_count > 0 else 1
|
# If arrays share domains → Format B (each = episode with multiple sources)
|
||||||
|
all_domains = set()
|
||||||
|
for domains in domains_per_array:
|
||||||
|
all_domains.update(domains)
|
||||||
|
|
||||||
# Similar counts (< 1.5x variance) = Format B
|
# If total unique domains ≈ sum of domains per array → Format A
|
||||||
if variance < 1.5:
|
# If total unique domains << sum of domains per array → Format B (shared)
|
||||||
|
total_domain_count = sum(len(d) for d in domains_per_array)
|
||||||
|
if len(all_domains) < total_domain_count * 0.7:
|
||||||
|
# Domains are shared across arrays → Format B
|
||||||
is_format_a = False
|
is_format_a = False
|
||||||
|
|
||||||
logger.debug(f"Detected format {'A (source-based)' if is_format_a else 'B (episode-based)'} - {num_episode_arrays} arrays")
|
|
||||||
|
# No more host preference!
|
||||||
|
|
||||||
# No more host preference! Just collect all available URLs for each episode
|
# No more host preference! Just collect all available URLs for each episode
|
||||||
# The download system will automatically detect and use the appropriate downloader
|
# The download system will automatically detect and use the appropriate downloader
|
||||||
|
|||||||
Reference in New Issue
Block a user