"""Extract the podcast audio file from a Xiaoyuzhou episode web page.

Automates the web-scraping process:

- Command-line input: a Xiaoyuzhou podcast episode link, optionally followed
  by a language code::

      https://www.xiaoyuzhoufm.com/episode/xxxx zh
      https://www.xiaoyuzhoufm.com/episode/xxxx en

- Extract the audio file URL (supports .mp3, .mp4a, .m4a, .wav, .aac, .ogg).
- Download the file and save it to the ~/Downloads folder.
"""

import os
import re
import sys
from typing import Optional
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup  # noqa: F401  (import kept from the original script; not used below)

# Headers sent when fetching the episode page.
PAGE_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Seconds to wait for the server before giving up; requests has no default
# timeout, so without this a stalled connection would hang forever.
DEFAULT_TIMEOUT = 30

# Supported audio extensions, in lookup priority order (earlier entries win
# when a page contains URLs of several formats).
AUDIO_EXTENSIONS = ('mp4a', 'mp3', 'm4a', 'wav', 'aac', 'ogg')

# One compiled pattern per extension. The URL body excludes whitespace,
# quotes and angle brackets so a match cannot run across surrounding markup.
_AUDIO_URL_PATTERNS = tuple(
    re.compile(r'https://media\.xyzcdn\.net/[^\s"\'<>]+\.' + re.escape(ext))
    for ext in AUDIO_EXTENSIONS
)

# Bytes per chunk written to disk while streaming the download.
_CHUNK_SIZE = 64 * 1024

# Name used when the audio URL has no usable final path segment.
_FALLBACK_FILENAME = "podcast_audio"


def fetch_audio_url(podcast_url: str, timeout: float = DEFAULT_TIMEOUT) -> str:
    """Return the first supported audio URL found in the episode page.

    The page is fetched with a browser-like User-Agent and its raw text is
    searched for ``https://media.xyzcdn.net/...`` links, one extension at a
    time in the order given by ``AUDIO_EXTENSIONS``.

    Args:
        podcast_url: URL of the episode web page.
        timeout: Seconds to wait for the server before aborting the request.

    Returns:
        The matched audio URL.

    Raises:
        Exception: If the page does not answer with HTTP 200, or if no
            supported audio URL is present in the page text.
        requests.RequestException: Network-level failures raised by
            ``requests`` propagate unchanged.
    """
    response = requests.get(podcast_url, headers=PAGE_HEADERS, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Failed to fetch podcast page.")

    page_text = response.text
    for pattern in _AUDIO_URL_PATTERNS:
        match = pattern.search(page_text)
        if match:
            return match.group(0)

    raise Exception("Audio URL not found. No supported audio format detected.")


def _filename_from_url(audio_url: str) -> str:
    """Derive a file name from the URL path, ignoring query and fragment."""
    name = os.path.basename(urlsplit(audio_url).path)
    return name or _FALLBACK_FILENAME


def download_audio(
    audio_url: str,
    filename: Optional[str] = None,
    download_dir: Optional[str] = None,
    timeout: float = DEFAULT_TIMEOUT,
) -> str:
    """Stream the audio file to disk and return the saved path.

    Args:
        audio_url: Direct URL of the audio file.
        filename: Name to save the file under. When empty or ``None`` the
            last path segment of ``audio_url`` is used.
        download_dir: Target directory. Defaults to ``~/Downloads``; it is
            created if it does not exist.
        timeout: Seconds to wait for the server before aborting the request.

    Returns:
        Path of the downloaded file.

    Raises:
        Exception: If the server does not answer with HTTP 200.
        requests.RequestException: Network-level failures raised by
            ``requests`` propagate unchanged.
        OSError: If the file cannot be written.
    """
    if not filename:
        filename = _filename_from_url(audio_url)
    if download_dir is None:
        download_dir = os.path.expanduser("~/Downloads")

    os.makedirs(download_dir, exist_ok=True)
    filepath = os.path.join(download_dir, filename)
    # Write to a temporary name first so an interrupted download never leaves
    # a truncated file under the final name.
    partial_path = filepath + ".part"

    # The context manager releases the connection even if streaming fails.
    with requests.get(audio_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception("Failed to download audio.")
        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(_CHUNK_SIZE):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
            os.replace(partial_path, filepath)
        except BaseException:
            # Clean up the partial file on any failure (including Ctrl-C),
            # then let the original error continue to the caller.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            raise

    print(f"Downloaded: {filepath}")
    return filepath


def main(argv: Optional[list] = None) -> int:
    """Run the command-line workflow and return the process exit code.

    Args:
        argv: Arguments without the program name; defaults to
            ``sys.argv[1:]``. Expected form: ``<podcast_web_url> [lang]``.
            The optional language code (``zh`` / ``en``) matches the command
            form shown in the project notes; it is accepted but not used by
            the download itself.

    Returns:
        ``0`` on success, ``1`` on a usage error or any failure.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) not in (1, 2):
        print("Usage: python3 download_podcast.py <podcast_web_url> [lang]", file=sys.stderr)
        print(
            "Example: python3 download_podcast.py https://www.xiaoyuzhoufm.com/episode/xxxxxx",
            file=sys.stderr,
        )
        return 1

    podcast_web_url = args[0]

    # Top-level boundary: report any failure to the user and exit non-zero.
    try:
        audio_url = fetch_audio_url(podcast_web_url)
        print("Audio URL found:", audio_url)
        filepath = download_audio(audio_url)
        print(f"File saved to: {filepath}")
    except Exception as e:
        print("Error:", e, file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())