# MarkdownLoader
HackMD and other editors may leave linked media in a markdown document. While this is convenient, links break and media is generally not available when offline.
The `download_md_media.py` script below:
- accepts a list of files, e.g. `file1.md file2.md` (dragging and dropping files onto the script should also work)
- creates a `data/` subdirectory next to each markdown file
- finds and attempts to download linked media
- subsequently replaces links to successfully downloaded media with relative paths
## download_md_media.py
```python
#!python3
# Auto-install missing modules
import sys
import subprocess
# Only third-party packages belong in this list. argparse and urllib are part
# of the Python standard library: importing them cannot fail on a working
# interpreter, and their names must never be handed to pip.
required_modules = ["requests"]
for module in required_modules:
    try:
        __import__(module)
    except ImportError:
        print(f"Module '{module}' not found. Attempting to install...")
        try:
            # Use the running interpreter's own pip so the package ends up in
            # the environment this script is executing in.
            subprocess.check_call([sys.executable, "-m", "pip", "install", module])
        except (subprocess.CalledProcessError, OSError):
            # CalledProcessError: pip ran but reported a failure.
            # OSError: the interpreter/pip process could not be started.
            # Anything else (e.g. Ctrl-C) is deliberately left to propagate.
            print(f"Required module '{module}' is missing and cannot be installed automatically.")
            sys.exit(1)
# Application code
import argparse
import os
import re
import requests
from urllib.parse import urlparse
def download_file(url, dest_folder):
    """Download ``url`` into ``dest_folder`` and return the local file path.

    The local file name is the last component of the URL's path. The data is
    first streamed into a ``<name>.part`` file and only moved to its final
    name once the transfer has completed, so a failed download never leaves a
    truncated file behind and never overwrites an earlier good copy.

    Args:
        url: Absolute URL of the file to fetch.
        dest_folder: Directory to store the file in; created if missing.

    Returns:
        The path of the downloaded file, or ``None`` if the URL is malformed,
        has no usable file name, or the download failed. The reason is
        printed in every failure case.
    """
    try:
        local_filename = os.path.basename(urlparse(url).path)
    except ValueError as e:
        # urlparse rejects some malformed URLs (e.g. unbalanced "[" in the host).
        print(f"Failed to download {url}: {e}")
        return None

    # URLs such as "https://host/" have no file component; joining an empty
    # name (or "." / "..") with dest_folder would point at a directory.
    if local_filename in ("", ".", ".."):
        print(f"Failed to download {url}: URL path does not end in a file name")
        return None

    os.makedirs(dest_folder, exist_ok=True)
    local_path = os.path.join(dest_folder, local_filename)
    temp_path = local_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file under its final name only after a complete transfer.
        os.replace(temp_path, local_path)
    except Exception as e:
        # Deliberate best-effort boundary: one broken link must not abort the
        # processing of the remaining links, so report it and carry on.
        print(f"Failed to download {url}: {e}")
        try:
            os.remove(temp_path)
        except OSError:
            pass  # nothing was written, or the partial file is already gone
        return None

    print(f"Downloaded: {url} -> {local_path}")
    return local_path
def find_media_links(markdown_text):
    """Collect every absolute http(s) link target found in ``markdown_text``.

    Both image links (``![alt](url)``) and ordinary links (``[alt](url)``) are
    reported. Each hit is returned as a ``(label, url)`` tuple, where the label
    is the bracketed part exactly as written (including a leading ``!`` for
    images). Results are in document order and are not de-duplicated.
    """
    link_re = re.compile(r'(!?\[.*?\])\((https?://[^\s)]+)\)')
    return [(hit.group(1), hit.group(2)) for hit in link_re.finditer(markdown_text)]
def update_markdown(markdown_text, url_map, base_dir=None):
    """Replace remote link targets with their local copies.

    Args:
        markdown_text: The Markdown document as a string.
        url_map: Mapping of remote URL -> local path to write into the
            document (normally a path relative to the Markdown file).
        base_dir: Directory the paths in ``url_map`` are relative to. It is
            used only to verify that the local copy exists. When omitted or
            empty, the paths are checked against the current working
            directory.

    Returns:
        The document text in which every link whose URL has an existing local
        copy points at that copy; all other links are left untouched.
    """
    def replacer(match):
        alt_text, url = match.groups()
        new_url = url_map.get(url)
        # Only replace if the local copy really exists. The mapped path is
        # relative to the Markdown file, so it has to be resolved against
        # base_dir rather than against wherever the script was started from.
        if new_url and os.path.exists(os.path.join(base_dir or "", new_url)):
            return f"{alt_text}({new_url})"
        return f"{alt_text}({url})"

    pattern = re.compile(r'(!?\[.*?\])\((https?://[^\s)]+)\)')
    return pattern.sub(replacer, markdown_text)


def main():
    """Download linked media for each Markdown file given on the command line.

    For every file, the linked URLs are downloaded into a ``data`` directory
    next to the file and the links that were fetched successfully are
    rewritten to relative paths. The file is written back only if at least
    one link actually changed.
    """
    parser = argparse.ArgumentParser(description="Download media from one or more markdown files.")
    parser.add_argument("md_file", nargs='+', help="Paths to one or more Markdown (.md) files")
    args = parser.parse_args()

    for md_path in args.md_file:
        if not os.path.isfile(md_path):
            print(f"File not found: {md_path}")
            continue

        print(f"\nProcessing: {md_path}")
        # newline='' disables newline translation so the document keeps its
        # original line endings when it is written back.
        with open(md_path, 'r', encoding='utf-8', newline='') as f:
            markdown_text = f.read()

        media_links = find_media_links(markdown_text)
        if not media_links:
            print(" No media links found.")
            continue

        md_dir = os.path.dirname(md_path)
        data_dir = os.path.join(md_dir, "data")
        url_map = {}
        attempted = set()
        for _alt_text, url in media_links:
            # The same URL may be linked several times; fetch it only once.
            if url in attempted:
                continue
            attempted.add(url)
            print(f"downloading {url} ...")
            local_path = download_file(url, data_dir)
            if local_path:
                rel_path = os.path.relpath(local_path, md_dir or os.curdir)
                url_map[url] = rel_path.replace("\\", "/")  # Normalize for Markdown

        updated_text = update_markdown(markdown_text, url_map, base_dir=md_dir)
        if updated_text == markdown_text:
            # Nothing was downloaded successfully; do not touch the file.
            print(" No links were replaced; Markdown file left unchanged.")
            continue

        with open(md_path, 'w', encoding='utf-8', newline='') as f:
            f.write(updated_text)
        print(" Markdown file updated with local media paths.")


if __name__ == "__main__":
    main()
```