# Creating Wikipedia Search Embeddings with a Gaia Node

This tutorial shows how to create embeddings from Wikipedia articles using a Gaia node. We'll download articles about the 2022 Winter Olympics, split them into sections, and create embeddings using Gaia's OpenAI-compatible API.

## Overview

1. Collect Wikipedia articles about the 2022 Winter Olympics
2. Split the documents into searchable chunks
3. Create embeddings using your own Gaia node
4. Store the results in a CSV file

## Prerequisites

Install the required packages:

```bash
pip install mwclient mwparserfromhell pandas tiktoken openai
```

## Code

Here's the complete code that implements the workflow:

```python
import mwclient
import mwparserfromhell
import pandas as pd
import re
import time
import tiktoken
from openai import OpenAI

# Configure constants
GAIA_NODE_URL = "https://llama8b.gaia.domains/v1/"
CATEGORY_TITLE = "Category:2022 Winter Olympics"
WIKI_SITE = "en.wikipedia.org"
MAX_TOKENS = 1600
BATCH_SIZE = 50

# Initialize the OpenAI client with the Gaia endpoint
client = OpenAI(base_url=GAIA_NODE_URL, api_key="not-needed")

SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]


def titles_from_category(category: mwclient.listing.Category, max_depth: int) -> set[str]:
    """Get all page titles from a Wikipedia category and its subcategories."""
    titles = set()
    for cm in category.members():
        if type(cm) == mwclient.page.Page:
            titles.add(cm.name)
        elif isinstance(cm, mwclient.listing.Category) and max_depth > 0:
            deeper_titles = titles_from_category(cm, max_depth=max_depth - 1)
            titles.update(deeper_titles)
    return titles


def all_subsections_from_section(
    section: mwparserfromhell.wikicode.Wikicode,
    parent_titles: list[str],
    sections_to_ignore: set[str],
) -> list[tuple[list[str], str]]:
    """Extract all subsections from a Wikipedia section."""
    headings = [str(h) for h in section.filter_headings()]
    title = headings[0]
    if title.strip("=" + " ") in sections_to_ignore:
        return []
    titles = parent_titles + [title]
    full_text = str(section)
    section_text = full_text.split(title)[1]
    if len(headings) == 1:
        return [(titles, section_text)]
    else:
        first_subtitle = headings[1]
        section_text = section_text.split(first_subtitle)[0]
        results = [(titles, section_text)]
        for subsection in section.get_sections(levels=[len(titles) + 1]):
            results.extend(all_subsections_from_section(subsection, titles, sections_to_ignore))
        return results


def all_subsections_from_title(
    title: str,
    sections_to_ignore: set[str] = SECTIONS_TO_IGNORE,
    site_name: str = WIKI_SITE,
) -> list[tuple[list[str], str]]:
    """Get all subsections from a Wikipedia page title."""
    site = mwclient.Site(site_name)
    page = site.pages[title]
    text = page.text()
    parsed_text = mwparserfromhell.parse(text)
    headings = [str(h) for h in parsed_text.filter_headings()]
    if headings:
        summary_text = str(parsed_text).split(headings[0])[0]
    else:
        summary_text = str(parsed_text)
    results = [([title], summary_text)]
    for subsection in parsed_text.get_sections(levels=[2]):
        results.extend(all_subsections_from_section(subsection, [title], sections_to_ignore))
    return results


def clean_section(section: tuple[list[str], str]) -> tuple[list[str], str]:
    """Clean up a Wikipedia section by removing references and whitespace."""
    titles, text = section
    text = re.sub(r"<ref.*?</ref>", "", text)
    text = text.strip()
    return (titles, text)


def keep_section(section: tuple[list[str], str]) -> bool:
    """Determine if a section should be kept based on length."""
    titles, text = section
    return len(text) >= 16


def num_tokens(text: str) -> int:
    """Count tokens in a text (gpt-4's tokenizer is used as an approximation for chunk sizing)."""
    encoding = tiktoken.encoding_for_model("gpt-4")
    return len(encoding.encode(text))


def halved_by_delimiter(string: str, delimiter: str = "\n") -> list[str]:
    """Split a string into two parts at a delimiter, balancing tokens on each side."""
    chunks = string.split(delimiter)
    if len(chunks) == 1:
        return [string, ""]  # no delimiter found
    elif len(chunks) == 2:
        return chunks  # no need to search for a halfway point
    else:
        total_tokens = num_tokens(string)
        halfway = total_tokens // 2
        best_diff = halfway
        for i, chunk in enumerate(chunks):
            left = delimiter.join(chunks[: i + 1])
            left_tokens = num_tokens(left)
            diff = abs(halfway - left_tokens)
            if diff >= best_diff:
                break
            else:
                best_diff = diff
        left = delimiter.join(chunks[:i])
        right = delimiter.join(chunks[i:])
        return [left, right]


def split_strings_from_subsection(
    subsection: tuple[list[str], str],
    max_tokens: int = 1000,
    max_recursion: int = 5,
) -> list[str]:
    """Split a subsection into smaller pieces that each fit within max_tokens."""
    titles, text = subsection
    string = "\n\n".join(titles + [text])
    if num_tokens(string) <= max_tokens:
        return [string]
    elif max_recursion == 0:
        return [string[:max_tokens]]
    else:
        for delimiter in ["\n\n", "\n", ". "]:
            left, right = halved_by_delimiter(text, delimiter=delimiter)
            if left == "" or right == "":
                continue  # try a more fine-grained delimiter
            else:
                results = []
                for half in [left, right]:
                    half_subsection = (titles, half)
                    half_strings = split_strings_from_subsection(
                        half_subsection,
                        max_tokens=max_tokens,
                        max_recursion=max_recursion - 1,
                    )
                    results.extend(half_strings)
                return results
        # no split was possible; truncate as a last resort
        return [string[:max_tokens]]


def main():
    # 1. Collect Wikipedia articles
    site = mwclient.Site(WIKI_SITE)
    category_page = site.pages[CATEGORY_TITLE]
    titles = titles_from_category(category_page, max_depth=1)
    print(f"Found {len(titles)} articles")

    # 2. Extract and clean sections
    wikipedia_sections = []
    for title in titles:
        wikipedia_sections.extend(all_subsections_from_title(title))
    print(f"Found {len(wikipedia_sections)} sections")

    wikipedia_sections = [clean_section(ws) for ws in wikipedia_sections]
    wikipedia_sections = [ws for ws in wikipedia_sections if keep_section(ws)]

    # 3. Split into chunks
    wikipedia_strings = []
    for section in wikipedia_sections:
        wikipedia_strings.extend(split_strings_from_subsection(section, max_tokens=MAX_TOKENS))
    print(f"Split into {len(wikipedia_strings)} chunks")

    # 4. Get embeddings with retries
    embeddings = []
    for batch_start in range(0, len(wikipedia_strings), BATCH_SIZE):
        batch_end = batch_start + BATCH_SIZE
        batch = wikipedia_strings[batch_start:batch_end]
        print(f"Processing batch {batch_start} to {batch_end - 1}")

        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = client.embeddings.create(
                    model="nomic-embed",
                    input=batch,
                )
                batch_embeddings = [e.embedding for e in response.data]
                embeddings.extend(batch_embeddings)
                break
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {str(e)}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(5 * (attempt + 1))  # wait a little longer before each retry

    # 5. Save to CSV
    df = pd.DataFrame({"text": wikipedia_strings, "embedding": embeddings})
    df.to_csv("winter_olympics_2022.csv", index=False)
    print("Saved embeddings to winter_olympics_2022.csv")


if __name__ == "__main__":
    main()
```
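Before running the full pipeline, it can help to confirm that the node's embedding endpoint responds and to see what vector size it returns. The optional sketch below reuses the client configuration from the script above; the `nomic-embed` model name is taken from that script, and the model listing assumes the node exposes the standard OpenAI-compatible `/v1/models` route.

```python
from openai import OpenAI

client = OpenAI(base_url="https://llama8b.gaia.domains/v1/", api_key="not-needed")

# List the models the node advertises (assumes the standard /v1/models route is available)
for model in client.models.list().data:
    print(model.id)

# Embed a single test string and inspect the vector dimension
response = client.embeddings.create(model="nomic-embed", input=["Beijing 2022 Winter Olympics"])
print(f"Embedding dimension: {len(response.data[0].embedding)}")
```

If this check fails or the advertised model name differs, adjust `GAIA_NODE_URL` and the embedding model name in the main script accordingly.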
## How It Works

1. **Document Collection**: Downloads Wikipedia articles from the "2022 Winter Olympics" category.
2. **Text Processing**:
   - Splits articles into sections
   - Removes references and cleans the text
   - Excludes short sections and irrelevant parts like "References"
   - Splits long sections to fit token limits
3. **Embedding Generation**:
   - Uses Gaia's OpenAI-compatible endpoint
   - Processes text in small batches (50 chunks)
   - Includes retry logic for reliability
   - Uses the nomic-embed model
4. **Storage**: Saves text chunks and their embeddings to a CSV file.

## Usage

Save the code as `wikipedia_embeddings.py` and run:

```bash
python wikipedia_embeddings.py
```

The script will create a CSV file containing the text chunks and their embeddings, which can be used for semantic search or other applications.
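To show one way the resulting CSV can be used, here is a minimal semantic-search sketch (a follow-up script, not part of the tutorial code): it reloads `winter_olympics_2022.csv`, embeds a query against the same Gaia endpoint and `nomic-embed` model assumed above, and ranks chunks by cosine similarity. It uses `numpy` (installed as a pandas dependency) for the vector math, and parses the embedding column with `ast.literal_eval` because `to_csv` stores each vector as a string.

```python
import ast

import numpy as np
import pandas as pd
from openai import OpenAI

GAIA_NODE_URL = "https://llama8b.gaia.domains/v1/"
client = OpenAI(base_url=GAIA_NODE_URL, api_key="not-needed")

# Load the chunks and convert the stringified embedding column back into vectors
df = pd.read_csv("winter_olympics_2022.csv")
df["embedding"] = df["embedding"].apply(ast.literal_eval)
matrix = np.array(df["embedding"].tolist())


def search(query: str, top_n: int = 5) -> pd.DataFrame:
    """Return the top_n chunks most similar to the query by cosine similarity."""
    response = client.embeddings.create(model="nomic-embed", input=[query])
    query_vec = np.array(response.data[0].embedding)
    scores = matrix @ query_vec / (
        np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
    )
    top_idx = np.argsort(scores)[::-1][:top_n]
    return df.iloc[top_idx].assign(score=scores[top_idx])[["score", "text"]]


print(search("Which country won the most gold medals?"))
```

For larger collections you would typically load the vectors into a vector database rather than a CSV, but the ranking step stays the same.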
## Key Features

- OpenAI-compatible API usage
- Robust error handling with retries
- Efficient text chunking
- Clean Wikipedia text processing
- Token-aware splitting

## Notes

- The script uses public Gaia nodes - no API key is needed
- Adjust `BATCH_SIZE` if you encounter timeouts
- The embeddings CSV can be large - make sure you have sufficient disk space
- The process may take several minutes, depending on the article count

You can find ready-to-use Gaia nodes at [Public Nodes from Gaia](https://docs.gaianet.ai/user-guide/public-gaianet-nodes).

### Credits

Inspired by [OpenAI's Cookbook](https://cookbook.openai.com/examples/embedding_wikipedia_articles_for_search).

### Results

![image](https://hackmd.io/_uploads/SyWqs9b81x.png)