# Project - Extract podcast audio file from Xiaoyuzhou web
## Automate the web-scraping process
- Cmd input: xiaoyuzhou podcast episode link + lang:
```https://www.xiaoyuzhoufm.com/episode/xxxx zh```
```https://www.xiaoyuzhoufm.com/episode/xxxx en```
- Extract the audio file URL from the episode page (supports .mp3, .mp4a, .m4a, .wav, .aac, .ogg)
- Download and save to the `~/Downloads` folder (created if it does not exist)
```python=
import requests
from bs4 import BeautifulSoup
import re
import os
import sys
def _find_audio_url(page_text):
    """Return the first supported audio URL found in ``page_text``, or None.

    Extensions are tried in a fixed priority order (mp4a, mp3, m4a, wav,
    aac, ogg); the first extension with any match in the text wins, even if
    a URL with a lower-priority extension appears earlier in the page.
    """
    for extension in ("mp4a", "mp3", "m4a", "wav", "aac", "ogg"):
        # [^"]+ keeps the match inside a single double-quoted value.
        match = re.search(
            r'https://media\.xyzcdn\.net/[^"]+\.' + extension, page_text
        )
        if match:
            return match.group(0)
    return None


def fetch_audio_url(podcast_url):
    """Fetch a podcast episode page and return the audio file URL it embeds.

    Args:
        podcast_url: URL of the episode web page.

    Returns:
        The first ``https://media.xyzcdn.net/...`` URL in the page source
        that ends in a supported audio extension.

    Raises:
        Exception: If the page does not answer with HTTP 200, or if no
            supported audio URL is present in the page source.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # NOTE(review): a browser-like User-Agent is sent, presumably because the
    # site rejects the default client UA -- confirm before changing it.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Bound the wait so an unresponsive server cannot hang the script.
    response = requests.get(podcast_url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception("Failed to fetch podcast page.")

    # The raw page text is searched directly; no HTML parsing is needed.
    audio_url = _find_audio_url(response.text)
    if audio_url is None:
        raise Exception("Audio URL not found. No supported audio format detected.")
    return audio_url
def download_audio(audio_url, filename=None):
    """Download ``audio_url`` into ``~/Downloads`` and return the saved path.

    Args:
        audio_url: Direct URL of the audio file.
        filename: Name to save the file under. When omitted, the last
            segment of the URL path is used (query string and fragment
            are not part of the name).

    Returns:
        The full path of the file that was written.

    Raises:
        ValueError: If no filename is given and none can be derived from
            the URL (for example, the URL path ends with ``/``).
        Exception: If the server does not answer with HTTP 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Local import: only this function needs URL parsing.
    from urllib.parse import urlsplit

    if not filename:
        # Use only the path component so a "?token=..." or "#..." suffix
        # does not end up in the file name.
        filename = os.path.basename(urlsplit(audio_url).path)
        if not filename:
            raise ValueError(
                f"Cannot derive a file name from audio URL: {audio_url}"
            )

    # Save under ~/Downloads, creating the directory if it doesn't exist.
    download_dir = os.path.expanduser("~/Downloads")
    os.makedirs(download_dir, exist_ok=True)
    filepath = os.path.join(download_dir, filename)

    # The context manager closes the streamed response (releasing the
    # connection) on success and on every error path.
    with requests.get(audio_url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            raise Exception("Failed to download audio.")
        with open(filepath, 'wb') as f:
            # 64 KiB chunks: far fewer write calls than 1 KiB, still small
            # enough that the whole file is never held in memory.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    print(f"Downloaded: {filepath}")
    return filepath
if __name__ == "__main__":
    # Script entry point: expects exactly one positional argument, the
    # episode page URL. Anything else prints the usage text and exits 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        for usage_line in (
            "Usage: python3 download_podcast.py <podcast_web_url>",
            "Example: python3 download_podcast.py https://www.xiaoyuzhoufm.com/episode/xxxxxx",
        ):
            print(usage_line)
        sys.exit(1)

    (podcast_web_url,) = cli_args

    # Resolve the audio URL, then download it. Any failure in either step is
    # reported on one line and turned into a non-zero exit status.
    try:
        audio_url = fetch_audio_url(podcast_web_url)
        print("Audio URL found:", audio_url)
        filepath = download_audio(audio_url)
        print(f"File saved to: {filepath}")
    except Exception as e:
        print("Error:", e)
        sys.exit(1)
```