MarkdownLoader

# MarkdownLoader

HackMD and other editors may leave linked media in a markdown document. While this is convenient, links break and media is generally not available when offline.

The `download_md_media.py` script below

- accepts a list of files `file1.md file2.md` (should also work per drag&drop)
- creates a `data/` subdirectory
- finds and attempts to download linked media
- subsequently replaces links to successfully downloaded media with relative paths

## download_md_media.py

```python
#!/usr/bin/env python3
"""Download media linked from Markdown files and point the links at local copies.

For every Markdown file given on the command line, linked ``http(s)`` resources
are downloaded into a ``data/`` directory next to that file, and each link whose
download succeeded is rewritten to the relative local path.
"""

# Auto-install missing modules
import importlib
import subprocess
import sys

# Only third-party packages belong in this list. argparse and urllib are part
# of the standard library and must never be handed to pip.
required_modules = ["requests"]
for module in required_modules:
    try:
        __import__(module)
    except ImportError:
        print(f"Module '{module}' not found. Attempting to install...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", module])
            # Make the freshly installed package visible to the import system.
            importlib.invalidate_caches()
        except (subprocess.CalledProcessError, OSError):
            print(f"Required module '{module}' is missing and cannot be installed automatically.")
            sys.exit(1)

# Application code
import argparse
import os
import re
from urllib.parse import urlparse

import requests

# Matches ![alt](url) and [alt](url) where url is an absolute http(s) URL.
# Group 1 is the bracketed label (including a leading "!"), group 2 the URL.
MEDIA_LINK_PATTERN = re.compile(r'(!?\[.*?\])\((https?://[^\s)]+)\)')


def _unique_path(path):
    """Return *path*, or a numbered variant of it if *path* already exists."""
    root, ext = os.path.splitext(path)
    candidate = path
    counter = 1
    while os.path.exists(candidate):
        candidate = f"{root}_{counter}{ext}"
        counter += 1
    return candidate


def download_file(url, dest_folder):
    """Download *url* into *dest_folder*.

    The local file name is the last path component of the URL. If a file of
    that name already exists, a numbered name is used instead so that
    different URLs sharing a base name never overwrite each other.

    Returns the path of the downloaded file, or ``None`` if the URL has no
    file name or the download failed.
    """
    local_filename = os.path.basename(urlparse(url).path)
    if not local_filename:
        # e.g. "https://example.com/" - there is nothing sensible to save as.
        print(f"Failed to download {url}: URL does not contain a file name")
        return None

    os.makedirs(dest_folder, exist_ok=True)
    local_path = _unique_path(os.path.join(dest_folder, local_filename))
    try:
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()
            with open(local_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except Exception as e:
        # Deliberately broad: one bad link must not abort the whole run.
        # local_path did not exist before this call, so anything found there
        # now is a truncated download of ours and is safe to remove.
        try:
            os.remove(local_path)
        except OSError:
            pass
        print(f"Failed to download {url}: {e}")
        return None

    print(f"Downloaded: {url} -> {local_path}")
    return local_path


def find_media_links(markdown_text):
    """Return a list of ``(label, url)`` tuples for all http(s) links in the text.

    This is a basic pattern for ``![alt](url)`` and ``[alt](url)``; it matches
    ordinary links as well as images.
    """
    return MEDIA_LINK_PATTERN.findall(markdown_text)


def update_markdown(markdown_text, url_map, base_dir=""):
    """Replace remote links with their local copies.

    *url_map* maps remote URLs to local paths. Those paths are interpreted
    relative to *base_dir* (the directory of the Markdown file); the default
    of ``""`` resolves them against the current working directory. A link is
    only rewritten if its local copy actually exists.
    """
    def replacer(match):
        alt_text, url = match.groups()
        new_url = url_map.get(url)
        # Only replace if local copy was downloaded successfully
        if new_url and os.path.exists(os.path.join(base_dir, new_url)):
            return f"{alt_text}({new_url})"
        return match.group(0)

    return MEDIA_LINK_PATTERN.sub(replacer, markdown_text)


def main():
    """Parse the command line and process every given Markdown file."""
    parser = argparse.ArgumentParser(description="Download media from one or more markdown files.")
    parser.add_argument("md_file", nargs='+', help="Paths to one or more Markdown (.md) files")
    args = parser.parse_args()

    # (data directory, url) -> local path or None; avoids fetching the same
    # resource again when it is linked more than once.
    downloaded = {}

    for md_path in args.md_file:
        if not os.path.isfile(md_path):
            print(f"File not found: {md_path}")
            continue

        print(f"\nProcessing: {md_path}")
        # newline='' keeps the file's own line endings through the round trip.
        with open(md_path, 'r', encoding='utf-8', newline='') as f:
            markdown_text = f.read()

        media_links = find_media_links(markdown_text)
        if not media_links:
            print(" No media links found.")
            continue

        md_dir = os.path.dirname(md_path)
        data_dir = os.path.join(md_dir, "data")

        url_map = {}
        # dict.fromkeys drops repeated URLs while keeping document order.
        for url in dict.fromkeys(url for _alt_text, url in media_links):
            key = (os.path.abspath(data_dir), url)
            if key not in downloaded:
                print(f"downloading {url} ...")
                downloaded[key] = download_file(url, data_dir)
            local_path = downloaded[key]
            if local_path:
                rel_path = os.path.relpath(local_path, md_dir or os.curdir)
                url_map[url] = rel_path.replace("\\", "/")  # Normalize for Markdown

        if not url_map:
            print(" No media could be downloaded; file left unchanged.")
            continue

        # The mapped paths are relative to the Markdown file, not to the
        # current working directory, so pass md_dir for the existence check.
        updated_text = update_markdown(markdown_text, url_map, md_dir)
        with open(md_path, 'w', encoding='utf-8', newline='') as f:
            f.write(updated_text)
        print(" Markdown file updated with local media paths.")


if __name__ == "__main__":
    main()
```