# KG: Knowledge Graphs for RAG
### Part 1: Neo4j Environment Setup
step 1: 建立 sandbox 資料庫
連線到 https://neo4j.com/sandbox, 點選左下方的 "Launch the Free Sandbox", 建立 "movie" 專案
step 2: 預備連線資料 (將下列內容存成 `Neo4j-Sandbox-0619.txt`, 後續程式會以 dotenv 載入此檔)
```
NEO4J_URI=bolt://your.neo4j.instance.ip:7687
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your-sandbox-password
```
step 3: 確定資料庫連線
setup.py
```python=
# Smoke test for the Neo4j sandbox connection.
#
# Loads the connection settings from "Neo4j-Sandbox-0619.txt", then prints up
# to 20 distinct titles of movies that share an actor with "The Matrix".
import dotenv
import os
from neo4j import GraphDatabase, basic_auth

load_status = dotenv.load_dotenv("Neo4j-Sandbox-0619.txt")
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

cypher_query = '''
MATCH (movie:Movie {title:$favorite})<-[:ACTED_IN]-(actor)-[:ACTED_IN]->(rec:Movie)
RETURN distinct rec.title as title LIMIT 20
'''


def _fetch_recommendations(tx):
    """Run the recommendation query in *tx* and return its rows as dicts."""
    # .data() materializes the rows before the transaction ends.
    return tx.run(cypher_query, favorite="The Matrix").data()


# The context managers close the session and the driver even if the query
# raises, replacing the manual driver.close() call.
with GraphDatabase.driver(url, auth=basic_auth(username, password)) as driver:
    with driver.session(database="neo4j") as session:
        # execute_read supersedes the deprecated Session.read_transaction.
        results = session.execute_read(_fetch_recommendations)

    for record in results:
        print(record['title'])
```
### Part 2: Connect Neo4j Via langchain
example1.py
```python=
# Connect to the sandbox through LangChain's Neo4jGraph wrapper, then print
# the result of a node-count query followed by the graph schema.
import dotenv
import os
from langchain_community.graphs import Neo4jGraph

# Bring the sandbox connection settings into the process environment.
env_loaded = dotenv.load_dotenv("Neo4j-Sandbox-0619.txt")

connection_settings = {
    "url": os.getenv("NEO4J_URI"),
    "username": os.getenv("NEO4J_USERNAME"),
    "password": os.getenv("NEO4J_PASSWORD"),
    "database": "neo4j",
}
kg = Neo4jGraph(**connection_settings)

# Counts every node in the database, regardless of label.
NODE_COUNT_QUERY = """
MATCH (n)
RETURN count(n)
"""

print(kg.query(NODE_COUNT_QUERY))
print(kg.schema)
```
example2.py
```python=
# Ask a natural-language question about the graph: GraphCypherQAChain is
# built from a chat model plus the graph and invoked with the question.
import dotenv
import os
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

# Bring the sandbox connection settings into the process environment.
env_loaded = dotenv.load_dotenv("Neo4j-Sandbox-0619.txt")

connection_settings = {
    "url": os.getenv("NEO4J_URI"),
    "username": os.getenv("NEO4J_USERNAME"),
    "password": os.getenv("NEO4J_PASSWORD"),
    "database": "neo4j",
}
kg = Neo4jGraph(**connection_settings)

llm = ChatOpenAI(temperature=0)

# top_k is left at its default here (10, per the original note). To limit it
# to 2, pass top_k=2 as an extra keyword argument to from_llm(...).
chain = GraphCypherQAChain.from_llm(llm, graph=kg, verbose=True)

question = {"query": "Who played in Top Gun?"}
chain.invoke(question)
```
example3.py
```python=
# Question answering over the graph with a custom Cypher-generation prompt.
#
# The prompt carries a few-shot example so the model follows the sandbox
# schema; each question is then run through GraphCypherQAChain.
import dotenv
import os
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate

load_status = dotenv.load_dotenv("Neo4j-Sandbox-0619.txt")
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

kg = Neo4jGraph(
    url=url, username=username, password=password, database="neo4j"
)

# {schema} and {question} are filled in by the chain. The doubled braces in
# the example are literal braces in the rendered prompt.
# The example filters on `title` to match the other Movie queries in these
# notes; the previous `name` pointed the model at a different property.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:
# How many people played in Top Gun?
MATCH (m:Movie {{title:"Top Gun"}})<-[:ACTED_IN]-()
RETURN count(*) AS numberOfActors
The question is:
{question}"""

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)

# Print each answer explicitly instead of discarding invoke()'s return value.
for question in (
    "How many people played in Top Gun?",
    "Who played in Top Gun?",
):
    response = chain.invoke({"query": question})
    print(response["result"])
```
### Part 3: Preparing Text Data for RAG
example4.py
```python=
# Prepare text data for RAG: create a vector index over movie taglines,
# store an OpenAI embedding on each Movie node, and run a similarity search.
import dotenv
import os
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate

load_status = dotenv.load_dotenv("Neo4j-Sandbox-0619_ex4.txt")
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

# The course environment routes OpenAI calls through OPENAI_BASE_URL. When
# that variable is not set, fall back to the public OpenAI API base URL
# instead of failing on `None + str`.
OPENAI_BASE_URL = os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
# rstrip avoids a double slash when the base URL ends with "/".
OPENAI_ENDPOINT = OPENAI_BASE_URL.rstrip('/') + '/embeddings'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # read from the environment

kg = Neo4jGraph(
    url=url, username=username, password=password, database="neo4j"
)

# Create a vector index (no-op if it already exists).
kg.query("""
CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
FOR (m:Movie) ON (m.taglineEmbedding)
OPTIONS { indexConfig: {
`vector.dimensions`: 1536,
`vector.similarity_function`: 'cosine'
}}"""
)

# List the vector indexes to confirm the index is present.
r_vi = kg.query("""
SHOW VECTOR INDEXES
"""
)
for index in r_vi:
    print(index)

# Populate the vector index
# - Calculate vector representation for each movie tagline using OpenAI
# - Add vector to the Movie node as taglineEmbedding property
kg.query("""
MATCH (movie:Movie) WHERE movie.tagline IS NOT NULL
WITH movie, genai.vector.encode(
movie.tagline,
"OpenAI",
{
token: $openAiApiKey,
endpoint: $openAiEndpoint
}) AS vector
CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
""",
    params={"openAiApiKey": OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT},
)

# Spot-check one movie: its tagline and the first values of its embedding.
result = kg.query("""
MATCH (m:Movie)
WHERE m.tagline IS NOT NULL
RETURN m.tagline, m.taglineEmbedding
LIMIT 1
"""
)
if result:  # empty when no movie has a tagline
    print(result[0]['m.tagline'])
    print(result[0]['m.taglineEmbedding'][:10])

# Similarity search
# - Calculate embedding for question
# - Identify matching movies based on similarity of question and taglineEmbedding vectors
question = "What movies are about love?"
result2 = kg.query("""
WITH genai.vector.encode(
$question,
"OpenAI",
{
token: $openAiApiKey,
endpoint: $openAiEndpoint
}) AS question_embedding
CALL db.index.vector.queryNodes(
'movie_tagline_embeddings',
$top_k,
question_embedding
) YIELD node AS movie, score
RETURN movie.title, movie.tagline, score
""",
    params={
        "openAiApiKey": OPENAI_API_KEY,
        "openAiEndpoint": OPENAI_ENDPOINT,
        "question": question,
        "top_k": 5,
    },
)

# Show each match with its tagline, title and similarity score.
print(f"There are {len(result2)} movies")
for rank, row in enumerate(result2, start=1):
    print(f"===== Movie # {rank} =====")
    print(row['movie.tagline'])
    print(row['movie.title'])
    print(row['score'])
```
---
參考資料
1. DeepLearning.AI: Knowledge Graphs for RAG
https://learn.deeplearning.ai/courses/knowledge-graphs-rag/lesson/1/introduction
2. LangChain Documentation
https://python.langchain.com/v0.2/docs/integrations/graphs/neo4j_cypher/