feat: Add AGENTS.md and new downloaders with metadata enrichment
- Add AGENTS.md for agentic coding guidelines
- Add Oneupload and Smoothpre video player downloaders
- Add MetadataEnrichment service with Kitsu API fallback
- Add tests for metadata enrichment and provider detection
- Update .gitignore to ignore runtime config files
This commit is contained in:
@@ -0,0 +1,294 @@
|
||||
from .base import BaseVideoPlayer
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class OneuploadDownloader(BaseVideoPlayer):
    """Downloader for the oneupload.to video player.

    The page is first driven in headless Chromium through Playwright
    (network sniffing, player JavaScript probing, rendered-HTML scan).
    When that yields nothing, ``get_download_link`` falls back to a plain
    HTTP fetch parsed with BeautifulSoup.
    """

    # Substrings / content types that mark a response as media.
    _MEDIA_EXTENSIONS = ('.m3u8', '.mp4', '.mkv', '.ts')
    _MEDIA_CONTENT_TYPES = ('video/', 'application/x-mpegurl')

    _CHROMIUM_ARGS = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'

    # Tried in order. Only selectors containing 'button' or 'jw' are clicked;
    # the <video> selectors merely confirm that a player is present.
    _PLAY_SELECTORS = (
        'button[aria-label="Play"]',
        '.play-button',
        'button[class*="play"]',
        '.jw-icon-display',
        'video',
        '.video-wrapper video',
    )

    # Patterns applied to inline <script> text by the HTTP fallback.
    # Compiled once instead of being rebuilt for every script tag.
    _SCRIPT_PATTERNS = tuple(
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
            r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
            r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
            r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
            r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
            r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
        )
    )

    # Patterns applied to the rendered page: the script patterns plus
    # ``url: '...'`` / ``url = "..."`` style assignments.
    _PAGE_PATTERNS = _SCRIPT_PATTERNS + tuple(
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r"url\s*[:=]\s*['\"]([^'\"]+\.m3u8[^'\"]*)['\"]",
            r"url\s*[:=]\s*['\"]([^'\"]+\.mp4[^'\"]*)['\"]",
        )
    )

    # Evaluated inside the page: asks JWPlayer for its playlist, then looks at
    # <video>/<source> elements, then at a few well-known window globals.
    _JS_PROBE = r"""
        () => {
            // Check for JWPlayer setup
            if (window.jwplayer) {
                try {
                    const playlist = window.jwplayer().getPlaylist();
                    if (playlist && playlist[0] && playlist[0].sources) {
                        for (let source of playlist[0].sources) {
                            if (source.file && (source.file.includes('.m3u8') || source.file.includes('.mp4'))) {
                                return source.file;
                            }
                        }
                    }
                } catch(e) {}
            }

            // Check all video elements
            const videos = document.querySelectorAll('video');
            for (let v of videos) {
                if (v.src && (v.src.includes('.m3u8') || v.src.includes('.mp4'))) {
                    return v.src;
                }
                const sources = v.querySelectorAll('source');
                for (let s of sources) {
                    if (s.src && (s.src.includes('.m3u8') || s.src.includes('.mp4'))) {
                        return s.src;
                    }
                }
            }

            // Check window object for video URLs
            const searchKeys = ['player', 'video', 'source', 'file', 'url'];
            for (let key of searchKeys) {
                if (window[key] && typeof window[key] === 'object') {
                    try {
                        const json = JSON.stringify(window[key]);
                        const match = json.match(/(https?:\/\/[^\s"\'<>]+\.(m3u8|mp4))/);
                        if (match) return match[1];
                    } catch(e) {}
                }
            }

            return null;
        }
    """

    def can_handle(self, url: str) -> bool:
        """Return True when *url* contains ``oneupload.to`` (case-insensitive)."""
        return 'oneupload.to' in url.lower()

    async def get_download_link(self, url: str, target_filename: Optional[str] = None) -> tuple[str, str]:
        """Extract the download link from a Oneupload video page.

        Args:
            url: The Oneupload video page URL.
            target_filename: Optional filename override; when given it is
                passed through ``app.utils.sanitize_filename``.

        Returns:
            Tuple of (direct_video_url, filename). Without an override the
            filename is the fixed default ``oneupload_video.mp4``.

        Raises:
            Exception: If no video URL could be found or extraction failed.
                The original error is attached as ``__cause__``.
        """
        try:
            print(f"[ONEUPLOAD] Extracting link from: {url}")

            # Playwright first (copes with dynamically loaded players),
            # plain HTTP only as a fallback.
            video_url = await self._extract_with_playwright(url)
            if not video_url:
                video_url = await self._extract_with_http(url)
            if not video_url:
                raise Exception("Could not find video URL in Oneupload page")

            print(f"[ONEUPLOAD] Found video URL: {video_url[:80]}...")

            if target_filename:
                # Imported lazily: only the override path needs the helper.
                from app.utils import sanitize_filename
                filename = sanitize_filename(target_filename)
            else:
                filename = "oneupload_video.mp4"

            return video_url, filename

        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise Exception(f"Error extracting Oneupload link: {str(e)}") from e

    async def _extract_with_playwright(self, url: str) -> str | None:
        """Extract the video URL using Playwright with network interception.

        Candidates are collected from three sources, in this order: sniffed
        network responses, the in-page JavaScript probe, and a regex scan of
        the rendered HTML.

        Returns:
            The selected URL, or None when Playwright is not installed, an
            error occurred, or nothing was found.
        """
        try:
            from playwright.async_api import async_playwright
        except ImportError:
            print("[ONEUPLOAD] ⚠️ Playwright not installed - using HTTP extraction only")
            return None

        try:
            print("[ONEUPLOAD] Launching browser with network interception...")

            video_urls = []

            async def handle_response(response):
                # Best-effort sniffing: a single odd response must never
                # abort the extraction, so errors are dropped on purpose.
                try:
                    resp_url = response.url
                    lowered = resp_url.lower()
                    content_type = response.headers.get('content-type', '').lower()

                    if any(ext in lowered for ext in self._MEDIA_EXTENSIONS):
                        # Skip the player's own assets and Google resources.
                        if 'oneupload' not in lowered and 'google' not in lowered:
                            print(f"[ONEUPLOAD] 🎥 Captured video URL: {resp_url[:100]}...")
                            video_urls.append(resp_url)
                    elif any(ct in content_type for ct in self._MEDIA_CONTENT_TYPES):
                        if 'oneupload' not in lowered:
                            print(f"[ONEUPLOAD] 🎥 Captured video response: {resp_url[:100]}...")
                            video_urls.append(resp_url)
                except Exception:
                    pass

            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True, args=self._CHROMIUM_ARGS)
                try:
                    context = await browser.new_context(user_agent=self._USER_AGENT)
                    page = await context.new_page()
                    page.on('response', handle_response)

                    print("[ONEUPLOAD] Navigating to page...")
                    try:
                        await page.goto(url, wait_until='networkidle', timeout=30000)
                    except Exception as e:
                        # A navigation timeout is not fatal: the media
                        # request may already have been captured.
                        print(f"[ONEUPLOAD] Navigation warning: {e}")

                    print("[ONEUPLOAD] Waiting for video player to load...")
                    await asyncio.sleep(3)

                    await self._click_play(page)

                    # Give the player time to issue its media requests.
                    await asyncio.sleep(4)

                    js_url = await self._probe_player_js(page)
                    if js_url:
                        video_urls.append(js_url)

                    video_urls.extend(await self._scan_page_html(page))
                finally:
                    # Always shut Chromium down, even when a step above fails
                    # or the task is cancelled.
                    await browser.close()

            return self._select_best(video_urls)

        except Exception as e:
            print(f"[ONEUPLOAD] Playwright error: {e}")
            import traceback
            traceback.print_exc()
            return None

    async def _click_play(self, page) -> None:
        """Locate the first known player element and click it if it is a button."""
        for selector in self._PLAY_SELECTORS:
            try:
                element = await page.query_selector(selector)
                if not element:
                    continue
                print(f"[ONEUPLOAD] Found element: {selector}")
                if 'button' in selector or 'jw' in selector:
                    await element.click()
                    await asyncio.sleep(2)
                break
            except Exception as e:
                # Not a bare ``except``: cancellation and KeyboardInterrupt
                # must propagate instead of being swallowed here.
                print(f"[ONEUPLOAD] Play button interaction: {e}")
                continue

    async def _probe_player_js(self, page) -> str | None:
        """Run the in-page probe and return its URL if it names an .m3u8/.mp4."""
        try:
            js_result = await page.evaluate(self._JS_PROBE)
            if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
                print(f"[ONEUPLOAD] ✅ Found video URL via JavaScript: {js_result[:100]}...")
                return js_result
        except Exception as e:
            print(f"[ONEUPLOAD] JS extraction error: {e}")
        return None

    async def _scan_page_html(self, page) -> list[str]:
        """Regex-scan the rendered HTML and return foreign-host media URLs."""
        found = []
        try:
            content = await page.content()
            for pattern in self._PAGE_PATTERNS:
                for match in pattern.findall(content):
                    # Undo JSON escaping (``\/``) and drop stray backslashes.
                    candidate = match.replace('\\/', '/').replace('\\', '')
                    # Compare case-insensitively: the patterns themselves use
                    # IGNORECASE, as does the network-response filter.
                    lowered = candidate.lower()
                    if 'http' in lowered and 'oneupload' not in lowered and 'google' not in lowered:
                        print(f"[ONEUPLOAD] Found in HTML: {candidate[:100]}...")
                        found.append(candidate)
        except Exception as e:
            print(f"[ONEUPLOAD] HTML parsing error: {e}")
        return found

    def _select_best(self, candidates: list[str]) -> str | None:
        """De-duplicate *candidates* (keeping order) and prefer .m3u8 URLs."""
        unique_urls = list(dict.fromkeys(candidates))
        if not unique_urls:
            print("[ONEUPLOAD] ❌ No video URLs found")
            return None

        # Stable sort: .m3u8 entries first, everything else in capture order.
        unique_urls.sort(key=lambda x: 0 if '.m3u8' in x else 1)
        print(f"[ONEUPLOAD] ✅ Found {len(unique_urls)} video URL(s)")
        print(f"[ONEUPLOAD] Selected: {unique_urls[0][:100]}...")
        return unique_urls[0]

    async def _extract_with_http(self, url: str) -> str | None:
        """Extract the video URL using a simple HTTP request.

        Looks at ``<video>``/``<source>`` tags first, then at inline
        ``<script>`` text. Tag ``src`` values are resolved against the
        response URL so relative and protocol-relative links come back
        absolute.

        Returns:
            The first matching URL, or None if nothing matched or the
            request/parsing failed.
        """
        from urllib.parse import urljoin

        try:
            print(f"[ONEUPLOAD] Trying HTTP extraction from: {url}")

            response = await self.client.get(url, follow_redirects=True)
            soup = BeautifulSoup(response.text, 'lxml')
            # NOTE(review): assumes the client response exposes the final
            # (post-redirect) address as ``.url``; falls back to the request URL.
            base_url = str(getattr(response, 'url', '') or url)

            # Method 1: Look for video/source tags
            for video in soup.find_all('video'):
                src = video.get('src') or video.get('data-src')
                if src and any(ext in src for ext in ['.m3u8', '.mp4']):
                    src = urljoin(base_url, src)
                    print(f"[ONEUPLOAD] ✅ Found video in video tag: {src[:100]}...")
                    return src

                for source in video.find_all('source'):
                    src = source.get('src')
                    if src and any(ext in src for ext in ['.m3u8', '.mp4']):
                        src = urljoin(base_url, src)
                        print(f"[ONEUPLOAD] ✅ Found video in source tag: {src[:100]}...")
                        return src

            # Method 2: Look in script tags for video URLs
            for script in soup.find_all('script'):
                script_text = script.string
                if not script_text:
                    continue
                for pattern in self._SCRIPT_PATTERNS:
                    for match in pattern.findall(script_text):
                        candidate = match.replace('\\/', '/')
                        if 'http' in candidate and 'oneupload' not in candidate.lower():
                            print(f"[ONEUPLOAD] ✅ Found video in script: {candidate[:100]}...")
                            return candidate

            print("[ONEUPLOAD] ❌ HTTP extraction failed - no video URLs found")
            return None

        except Exception as e:
            print(f"[ONEUPLOAD] HTTP extraction error: {e}")
            return None
|
||||
@@ -0,0 +1,290 @@
|
||||
from .base import BaseVideoPlayer
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class SmoothpreDownloader(BaseVideoPlayer):
    """Downloader for the smoothpre.com video player (JWPlayer-based).

    The page is driven in headless Chromium through Playwright (network
    sniffing, JWPlayer JavaScript probing, rendered-HTML scan). A plain HTTP
    fetch parsed with BeautifulSoup is used only when Playwright is missing
    or raises.
    """

    # Substrings / content types that mark a response as media.
    _MEDIA_EXTENSIONS = ('.m3u8', '.mp4', '.mkv', '.ts')
    _MEDIA_CONTENT_TYPES = ('video/', 'application/x-mpegurl')

    _CHROMIUM_ARGS = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'

    # Tried in order. Only selectors containing 'button' or 'jw' are clicked;
    # the <video> selector merely confirms that a player is present.
    _PLAY_SELECTORS = (
        'button[aria-label="Play"]',
        '.play-button',
        'button[class*="play"]',
        '.jw-icon-display',
        'video',
    )

    # JWPlayer-style patterns applied to inline <script> text by the HTTP
    # fallback. Compiled once instead of being rebuilt for every script tag.
    _SCRIPT_PATTERNS = tuple(
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r'"file"\s*:\s*"([^"]+\.m3u8[^"]*)"',
            r'"file"\s*:\s*"([^"]+\.mp4[^"]*)"',
            r'"source"\s*:\s*"([^"]+\.m3u8[^"]*)"',
            r'"source"\s*:\s*"([^"]+\.mp4[^"]*)"',
            r'(https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*)',
            r'(https?://[^\s"\'<>]+\.mp4[^\s"\'<>]*)',
        )
    )

    # Patterns applied to the rendered page: the script patterns plus
    # ``url: '...'`` / ``url = "..."`` style assignments.
    _PAGE_PATTERNS = _SCRIPT_PATTERNS + tuple(
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r"url\s*[:=]\s*['\"]([^'\"]+\.m3u8[^'\"]*)['\"]",
            r"url\s*[:=]\s*['\"]([^'\"]+\.mp4[^'\"]*)['\"]",
        )
    )

    # Evaluated inside the page: asks JWPlayer for its playlist, then looks at
    # <video>/<source> elements, then at a few well-known window globals.
    _JS_PROBE = r"""
        () => {
            // Check for JWPlayer setup (primary method for Smoothpre)
            if (window.jwplayer) {
                try {
                    const playlist = window.jwplayer().getPlaylist();
                    if (playlist && playlist[0] && playlist[0].sources) {
                        for (let source of playlist[0].sources) {
                            if (source.file && (source.file.includes('.m3u8') || source.file.includes('.mp4'))) {
                                return source.file;
                            }
                        }
                    }
                } catch(e) {}
            }

            // Check all video elements
            const videos = document.querySelectorAll('video');
            for (let v of videos) {
                if (v.src && (v.src.includes('.m3u8') || v.src.includes('.mp4'))) {
                    return v.src;
                }
                const sources = v.querySelectorAll('source');
                for (let s of sources) {
                    if (s.src && (s.src.includes('.m3u8') || s.src.includes('.mp4'))) {
                        return s.src;
                    }
                }
            }

            // Check window object for video URLs
            const searchKeys = ['player', 'video', 'source', 'file', 'url', 'jw'];
            for (let key of searchKeys) {
                if (window[key] && typeof window[key] === 'object') {
                    try {
                        const json = JSON.stringify(window[key]);
                        const match = json.match(/(https?:\/\/[^\s"\'<>]+\.(m3u8|mp4))/);
                        if (match) return match[1];
                    } catch(e) {}
                }
            }

            return null;
        }
    """

    def can_handle(self, url: str) -> bool:
        """Return True when *url* contains ``smoothpre.com`` (case-insensitive)."""
        return 'smoothpre.com' in url.lower()

    async def get_download_link(self, url: str, target_filename: Optional[str] = None) -> tuple[str, str]:
        """Extract the download link from a Smoothpre video page.

        Args:
            url: The Smoothpre video page URL.
            target_filename: Optional filename override; when given it is
                passed through ``app.utils.sanitize_filename``.

        Returns:
            Tuple of (direct_video_url, filename). Without an override the
            filename is the fixed default ``smoothpre_video.mp4``.

        Raises:
            Exception: If no video URL could be found or extraction failed.
                The original error is attached as ``__cause__``.
        """
        try:
            print(f"[SMOOTHPRE] Extracting link from: {url}")

            # The Playwright extractor performs its own HTTP fallback when
            # the browser is unavailable or raises.
            video_url = await self._extract_with_playwright(url)
            if not video_url:
                raise Exception("Could not find video URL in Smoothpre page")

            print(f"[SMOOTHPRE] Found video URL: {video_url[:80]}...")

            if target_filename:
                # Imported lazily: only the override path needs the helper.
                from app.utils import sanitize_filename
                filename = sanitize_filename(target_filename)
            else:
                filename = "smoothpre_video.mp4"

            return video_url, filename

        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise Exception(f"Error extracting Smoothpre link: {str(e)}") from e

    async def _extract_with_playwright(self, url: str) -> str | None:
        """Extract the video URL using Playwright with network interception.

        Candidates are collected from three sources, in this order: sniffed
        network responses, the in-page JavaScript probe, and a regex scan of
        the rendered HTML.

        Returns:
            The selected URL, or None when the browser ran but found nothing.
            When Playwright is not installed or raises, the result of
            ``_extract_with_http`` is returned instead.
        """
        try:
            from playwright.async_api import async_playwright
        except ImportError:
            print("[SMOOTHPRE] ⚠️ Playwright not installed - falling back to HTTP extraction")
            return await self._extract_with_http(url)

        try:
            print("[SMOOTHPRE] Launching browser with network interception...")

            video_urls = []

            async def handle_response(response):
                # Best-effort sniffing: a single odd response must never
                # abort the extraction, so errors are dropped on purpose.
                try:
                    resp_url = response.url
                    lowered = resp_url.lower()
                    content_type = response.headers.get('content-type', '').lower()

                    if any(ext in lowered for ext in self._MEDIA_EXTENSIONS):
                        # Skip the player's own assets and Google resources.
                        if 'smoothpre' not in lowered and 'google' not in lowered:
                            print(f"[SMOOTHPRE] 🎥 Captured video URL: {resp_url[:100]}...")
                            video_urls.append(resp_url)
                    elif any(ct in content_type for ct in self._MEDIA_CONTENT_TYPES):
                        if 'smoothpre' not in lowered:
                            print(f"[SMOOTHPRE] 🎥 Captured video response: {resp_url[:100]}...")
                            video_urls.append(resp_url)
                except Exception:
                    pass

            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True, args=self._CHROMIUM_ARGS)
                try:
                    context = await browser.new_context(user_agent=self._USER_AGENT)
                    page = await context.new_page()
                    page.on('response', handle_response)

                    print("[SMOOTHPRE] Navigating to page...")
                    try:
                        await page.goto(url, wait_until='networkidle', timeout=30000)
                    except Exception as e:
                        # A navigation timeout is not fatal: the media
                        # request may already have been captured.
                        print(f"[SMOOTHPRE] Navigation warning: {e}")

                    print("[SMOOTHPRE] Waiting for video player to load...")
                    await asyncio.sleep(3)

                    await self._click_play(page)

                    # Give the player time to issue its media requests.
                    await asyncio.sleep(4)

                    js_url = await self._probe_player_js(page)
                    if js_url:
                        video_urls.append(js_url)

                    video_urls.extend(await self._scan_page_html(page))
                finally:
                    # Always shut Chromium down, even when a step above fails
                    # or the task is cancelled.
                    await browser.close()

            return self._select_best(video_urls)

        except Exception as e:
            print(f"[SMOOTHPRE] Playwright error: {e}")
            import traceback
            traceback.print_exc()
            # Fallback to HTTP extraction
            return await self._extract_with_http(url)

    async def _click_play(self, page) -> None:
        """Locate the first known player element and click it if it is a button."""
        for selector in self._PLAY_SELECTORS:
            try:
                element = await page.query_selector(selector)
                if not element:
                    continue
                print(f"[SMOOTHPRE] Found element: {selector}")
                if 'button' in selector or 'jw' in selector:
                    await element.click()
                    await asyncio.sleep(2)
                break
            except Exception as e:
                # Not a bare ``except``: cancellation and KeyboardInterrupt
                # must propagate instead of being swallowed here.
                print(f"[SMOOTHPRE] Play button interaction: {e}")
                continue

    async def _probe_player_js(self, page) -> str | None:
        """Run the in-page probe and return its URL if it names an .m3u8/.mp4."""
        try:
            js_result = await page.evaluate(self._JS_PROBE)
            if js_result and ('.m3u8' in js_result or '.mp4' in js_result):
                print(f"[SMOOTHPRE] ✅ Found video URL via JavaScript: {js_result[:100]}...")
                return js_result
        except Exception as e:
            print(f"[SMOOTHPRE] JS extraction error: {e}")
        return None

    async def _scan_page_html(self, page) -> list[str]:
        """Regex-scan the rendered HTML and return foreign-host media URLs."""
        found = []
        try:
            content = await page.content()
            for pattern in self._PAGE_PATTERNS:
                for match in pattern.findall(content):
                    # Undo JSON escaping (``\/``) and drop stray backslashes.
                    candidate = match.replace('\\/', '/').replace('\\', '')
                    # Compare case-insensitively: the patterns themselves use
                    # IGNORECASE, as does the network-response filter.
                    lowered = candidate.lower()
                    if 'http' in lowered and 'smoothpre' not in lowered and 'google' not in lowered:
                        print(f"[SMOOTHPRE] Found in HTML: {candidate[:100]}...")
                        found.append(candidate)
        except Exception as e:
            print(f"[SMOOTHPRE] HTML parsing error: {e}")
        return found

    def _select_best(self, candidates: list[str]) -> str | None:
        """De-duplicate *candidates* (keeping order) and prefer .m3u8 URLs."""
        unique_urls = list(dict.fromkeys(candidates))
        if not unique_urls:
            print("[SMOOTHPRE] ❌ No video URLs found")
            return None

        # Stable sort: .m3u8 entries first, everything else in capture order.
        unique_urls.sort(key=lambda x: 0 if '.m3u8' in x else 1)
        print(f"[SMOOTHPRE] ✅ Found {len(unique_urls)} video URL(s)")
        print(f"[SMOOTHPRE] Selected: {unique_urls[0][:100]}...")
        return unique_urls[0]

    async def _extract_with_http(self, url: str) -> str | None:
        """Extract the video URL using a simple HTTP request.

        Used as the fallback when Playwright is unavailable or fails. Looks
        at ``<video>``/``<source>`` tags first, then at inline ``<script>``
        text. Tag ``src`` values are resolved against the response URL so
        relative and protocol-relative links come back absolute.

        Returns:
            The first matching URL, or None if nothing matched or the
            request/parsing failed.
        """
        from urllib.parse import urljoin

        try:
            print(f"[SMOOTHPRE] Trying HTTP extraction from: {url}")

            response = await self.client.get(url, follow_redirects=True)
            soup = BeautifulSoup(response.text, 'lxml')
            # NOTE(review): assumes the client response exposes the final
            # (post-redirect) address as ``.url``; falls back to the request URL.
            base_url = str(getattr(response, 'url', '') or url)

            # Method 1: Look for video/source tags
            for video in soup.find_all('video'):
                src = video.get('src') or video.get('data-src')
                if src and any(ext in src for ext in ['.m3u8', '.mp4']):
                    src = urljoin(base_url, src)
                    print(f"[SMOOTHPRE] ✅ Found video in video tag: {src[:100]}...")
                    return src

                for source in video.find_all('source'):
                    src = source.get('src')
                    if src and any(ext in src for ext in ['.m3u8', '.mp4']):
                        src = urljoin(base_url, src)
                        print(f"[SMOOTHPRE] ✅ Found video in source tag: {src[:100]}...")
                        return src

            # Method 2: Look in script tags for JWPlayer configuration
            for script in soup.find_all('script'):
                script_text = script.string
                if not script_text:
                    continue
                for pattern in self._SCRIPT_PATTERNS:
                    for match in pattern.findall(script_text):
                        candidate = match.replace('\\/', '/')
                        if 'http' in candidate and 'smoothpre' not in candidate.lower():
                            print(f"[SMOOTHPRE] ✅ Found video in script: {candidate[:100]}...")
                            return candidate

            print("[SMOOTHPRE] ❌ HTTP extraction failed - no video URLs found")
            return None

        except Exception as e:
            print(f"[SMOOTHPRE] HTTP extraction error: {e}")
            return None
|
||||
Reference in New Issue
Block a user