Retrieval-Augmented Generation (RAG) is the de facto technique for letting LLMs interact with any document or dataset, regardless of its size. Follow along as I cover how to parse and manipulate documents, explore how embeddings can describe abstract concepts, implement a simple yet powerful way to surface the parts of a document most relevant to a given query, and ultimately build a script you can use to have a locally hosted LLM engage with your own documents.
# set up environment
ollama pull nomic-embed-text
ollama pull mistral  # the chat model used at the end of the script
python -m venv .venv
source .venv/bin/activate
python -m pip install ollama numpy
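Before writing any real code, it's worth a quick smoke test that the Ollama server is running and the embedding model answers. This snippet is my own sanity check, not part of the final script, and the prompt text is arbitrary:

```python
import ollama

# ask Ollama for a single embedding; fails loudly if the server isn't up
emb = ollama.embeddings(model="nomic-embed-text", prompt="hello world")["embedding"]
print(len(emb))  # the vector length; nomic-embed-text produces 768 dimensions
```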
import json
import os

import numpy as np
import ollama
from numpy.linalg import norm
# open a file and return a list of paragraphs
def parse_file(filename):
    with open(filename, encoding="utf-8-sig") as f:
        paragraphs = []
        buffer = []
        for line in f.readlines():
            line = line.strip()
            if line:
                buffer.append(line)
            elif len(buffer):
                # a blank line ends the current paragraph
                paragraphs.append(" ".join(buffer))
                buffer = []
        # keep a final paragraph that has no trailing blank line
        if len(buffer):
            paragraphs.append(" ".join(buffer))
        return paragraphs
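The splitter treats any blank line as a paragraph boundary and joins wrapped lines with spaces, which works well for plain-text books. A quick illustration (the sample file here is made up):

```python
with open("sample.txt", "w", encoding="utf-8") as f:
    f.write("All children,\nexcept one, grow up.\n\nThey soon know it.\n")

print(parse_file("sample.txt"))
# ['All children, except one, grow up.', 'They soon know it.']
```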
def save_embeddings(filename, embeddings):
    # create dir if it doesn't exist
    if not os.path.exists("embeddings"):
        os.makedirs("embeddings")
    # dump embeddings to json
    with open(f"embeddings/{filename}.json", "w") as f:
        json.dump(embeddings, f)

def load_embeddings(filename):
    # check if file exists
    if not os.path.exists(f"embeddings/{filename}.json"):
        return False
    # load embeddings from json
    with open(f"embeddings/{filename}.json", "r") as f:
        return json.load(f)
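Note that this cache is keyed by filename alone, so if the underlying document changes, stale embeddings will keep being served until you delete the JSON file by hand. One possible refinement (my own addition, not part of the script) is to fold a hash of the file's contents into the cache key:

```python
import hashlib

def cache_key(filename):
    # hypothetical helper: filename plus a content hash, so editing
    # the document automatically invalidates its cached embeddings
    with open(filename, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()[:12]
    return f"{filename}-{digest}"
```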
def get_embeddings(filename, modelname, chunks):
    # check if embeddings are already saved
    if (embeddings := load_embeddings(filename)) is not False:
        return embeddings
    # get embeddings from ollama
    embeddings = [
        ollama.embeddings(model=modelname, prompt=chunk)["embedding"]
        for chunk in chunks
    ]
    # save embeddings
    save_embeddings(filename, embeddings)
    return embeddings
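To see the cache at work, call get_embeddings twice: the first call embeds every chunk over the wire, while the second reads straight from the JSON file. This timing snippet is illustrative only, and the "demo" name is made up:

```python
import time

chunks = ["All children, except one, grow up.", "Wendy knew that she must grow up."]

start = time.perf_counter()
get_embeddings("demo", "nomic-embed-text", chunks)  # hits the Ollama API
print(f"cold: {time.perf_counter() - start:.2f}s")

start = time.perf_counter()
get_embeddings("demo", "nomic-embed-text", chunks)  # served from embeddings/demo.json
print(f"warm: {time.perf_counter() - start:.2f}s")
```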
# find cosine similarity of every chunk to a given embedding
def find_most_similar(needle, haystack):
    needle_norm = norm(needle)
    similarity_scores = [
        np.dot(needle, item) / (needle_norm * norm(item)) for item in haystack
    ]
    # pair each score with its chunk index, highest score first
    return sorted(zip(similarity_scores, range(len(haystack))), reverse=True)
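Cosine similarity is just the dot product of two vectors divided by the product of their lengths, so it scores 1.0 when two embeddings point in exactly the same direction and shrinks as they diverge, regardless of their magnitudes. For large documents you may want to collapse the Python loop into a single matrix-vector product; here's a sketch of that optimization, computing the same scores under the assumption that the whole haystack fits in memory as one 2-D array:

```python
def find_most_similar_vectorized(needle, haystack):
    # identical math to find_most_similar, done in one shot with numpy
    matrix = np.asarray(haystack)
    scores = (matrix @ needle) / (norm(matrix, axis=1) * norm(needle))
    return sorted(zip(scores.tolist(), range(len(haystack))), reverse=True)
```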
def main():
    SYSTEM_PROMPT = """You are a helpful reading assistant who answers questions
based on snippets of text provided in context. Answer only using the context provided,
being as concise as possible. If you're unsure, just say that you don't know.
Context:
"""
    # open file
    filename = "peter-pan.txt"
    paragraphs = parse_file(filename)
    embeddings = get_embeddings(filename, "nomic-embed-text", paragraphs)

    prompt = input("what do you want to know? -> ")
    # strongly recommended that all embeddings are generated by the same model (don't mix and match)
    prompt_embedding = ollama.embeddings(model="nomic-embed-text", prompt=prompt)["embedding"]

    # find the five chunks most similar to the prompt
    most_similar_chunks = find_most_similar(prompt_embedding, embeddings)[:5]

    response = ollama.chat(
        model="mistral",
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT
                + "\n".join(paragraphs[item[1]] for item in most_similar_chunks),
            },
            {"role": "user", "content": prompt},
        ],
    )
    print("\n\n")
    print(response["message"]["content"])

if __name__ == "__main__":
    main()
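Save the script next to your document and run it with `python rag.py` (the filename is up to you). The first run takes a while because every paragraph has to be embedded; after that, the JSON cache makes startup nearly instant, and Mistral answers using only the five paragraphs closest to your question.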