
Commit 34fada1

Finish refining rag code.
1 parent 31a1a4f commit 34fada1

File tree

6 files changed (+38, -32 lines)

4_rag/3_rag_text_splitting_deep_dive.py

Lines changed: 8 additions & 9 deletions
@@ -1,8 +1,5 @@
 import os
 
-# Set environment variable to suppress tokenizer parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 from langchain.text_splitter import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
@@ -16,7 +13,7 @@
 
 # Define the directory containing the text file
 current_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(current_dir, "books/odyssey.txt")
+file_path = os.path.join(current_dir, "books", "romeo_and_juliet.txt")
 db_dir = os.path.join(current_dir, "db")
 
 # Check if the text file exists
@@ -31,7 +28,7 @@
 
 # Define the embedding model
 embeddings = OpenAIEmbeddings(
-    model="text-embedding-ada-002"
+    model="text-embedding-3-small"
 )  # Update to a valid embedding model if needed
 
 
@@ -45,7 +42,8 @@ def create_vector_store(docs, store_name):
         )
         print(f"--- Finished creating vector store {store_name} ---")
     else:
-        print(f"Vector store {store_name} already exists. No need to initialize.")
+        print(
+            f"Vector store {store_name} already exists. No need to initialize.")
 
 
 # 1. Character-based Splitting
@@ -76,7 +74,8 @@ def create_vector_store(docs, store_name):
 # Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
 # Balances between maintaining coherence and adhering to character limits.
 print("\n--- Using Recursive Character-based Splitting ---")
-rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+rec_char_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000, chunk_overlap=100)
 rec_char_docs = rec_char_splitter.split_documents(documents)
 create_vector_store(rec_char_docs, "chroma_db_rec_char")
 
@@ -107,7 +106,7 @@ def query_vector_store(store_name, query):
     )
     retriever = db.as_retriever(
         search_type="similarity_score_threshold",
-        search_kwargs={"k": 1, "score_threshold": 0.75},
+        search_kwargs={"k": 1, "score_threshold": 0.1},
    )
    relevant_docs = retriever.invoke(query)
    # Display the relevant results with metadata
@@ -121,7 +120,7 @@ def query_vector_store(store_name, query):
 
 
 # Define the user's question
-query = "Who is Odysseus' wife?"
+query = "How did Juliet die?"
 
 # Query each vector store
 query_vector_store("chroma_db_char", query)

4_rag/4_rag_embedding_deep_dive.py

Lines changed: 7 additions & 10 deletions
@@ -1,8 +1,5 @@
 import os
 
-# Set environment variable to suppress tokenizer parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.document_loaders import TextLoader
@@ -11,7 +8,7 @@
 
 # Define the directory containing the text file and the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(current_dir, "books/odyssey.txt")
+file_path = os.path.join(current_dir, "books", "odyssey.txt")
 db_dir = os.path.join(current_dir, "db")
 
 # Check if the text file exists
@@ -39,10 +36,12 @@ def create_vector_store(docs, embeddings, store_name):
     persistent_directory = os.path.join(db_dir, store_name)
     if not os.path.exists(persistent_directory):
         print(f"\n--- Creating vector store {store_name} ---")
-        Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
+        Chroma.from_documents(
+            docs, embeddings, persist_directory=persistent_directory)
         print(f"--- Finished creating vector store {store_name} ---")
     else:
-        print(f"Vector store {store_name} already exists. No need to initialize.")
+        print(
+            f"Vector store {store_name} already exists. No need to initialize.")
 
 
 # 1. OpenAI Embeddings
@@ -78,10 +77,8 @@ def query_vector_store(store_name, query, embedding_function):
         embedding_function=embedding_function,
     )
     retriever = db.as_retriever(
-        search_type="similarity",
-        search_kwargs={
-            "k": 3,
-        },
+        search_type="similarity_score_threshold",
+        search_kwargs={"k": 3, "score_threshold": 0.1},
     )
     relevant_docs = retriever.invoke(query)
     # Display the relevant results with metadata
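As an aside (not part of this commit), a minimal sketch comparing the two embedding providers this file exercises; the model names are only examples and can be swapped for whatever models you use.

# Sketch: embed the same query with both providers and compare vector dimensions.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

text = "Who is Odysseus' wife?"

openai_emb = OpenAIEmbeddings(model="text-embedding-3-small")  # requires OPENAI_API_KEY
hf_emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # runs locally

print(len(openai_emb.embed_query(text)))  # 1536 dimensions for text-embedding-3-small
print(len(hf_emb.embed_query(text)))      # 384 dimensions for all-MiniLM-L6-v2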

4_rag/5_rag_retriever_deep_dive.py

Lines changed: 9 additions & 4 deletions
@@ -1,8 +1,11 @@
 import os
 
+from dotenv import load_dotenv
 from langchain_community.vectorstores import Chroma
 from langchain_openai import OpenAIEmbeddings
 
+load_dotenv()
+
 # Define the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
 db_dir = os.path.join(current_dir, "db")
@@ -12,7 +15,8 @@
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 # Load the existing vector store with the embedding function
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 
 # Function to query a vector store with different search types and parameters
@@ -41,7 +45,7 @@ def query_vector_store(
 
 
 # Define the user's question
-query = "Who is the main character in Moby Dick?"
+query = "How did Juliet die?"
 
 # Showcase different retrieval methods
 
@@ -50,7 +54,8 @@ def query_vector_store(
 # It finds the most similar documents to the query vector based on cosine similarity.
 # Use this when you want to retrieve the top k most similar documents.
 print("\n--- Using Similarity Search ---")
-query_vector_store("chroma_db_with_metadata", query, embeddings, "similarity", {"k": 3})
+query_vector_store("chroma_db_with_metadata", query,
+                   embeddings, "similarity", {"k": 3})
 
 # 2. Max Marginal Relevance (MMR)
 # This method balances between selecting documents that are relevant to the query and diverse among themselves.
@@ -79,7 +84,7 @@ def query_vector_store(
     query,
     embeddings,
     "similarity_score_threshold",
-    {"k": 3, "score_threshold": 0.7},
+    {"k": 3, "score_threshold": 0.1},
 )
 
 print("Querying demonstrations with different search types completed.")

4_rag/6_rag_one_off_question.py

Lines changed: 7 additions & 5 deletions
@@ -10,16 +10,18 @@
 
 # Define the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
-persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")
+persistent_directory = os.path.join(
+    current_dir, "db", "chroma_db_with_metadata")
 
 # Define the embedding model
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 # Load the existing vector store with the embedding function
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 # Define the user's question
-query = "Who is tom brady?"
+query = "How can I learn more about LangChain?"
 
 # Retrieve relevant documents based on the query
 retriever = db.as_retriever(
@@ -56,7 +58,7 @@
 
 # Display the full result and content only
 print("\n--- Generated Response ---")
-print("Full result:")
-print(result)
+# print("Full result:")
+# print(result)
 print("Content only:")
 print(result.content)
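For context (not shown in this diff), a minimal sketch of the one-off question flow this script implements: retrieve documents, stuff them into a single prompt, and invoke the chat model once. The chat model name is an assumption; retriever and query refer to the objects defined above.

# Sketch: combine retrieved docs with the question and ask the model once.
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

relevant_docs = retriever.invoke(query)
combined_input = (
    f"Question: {query}\n\nRelevant documents:\n"
    + "\n\n".join(doc.page_content for doc in relevant_docs)
    + "\n\nAnswer using only the documents above; say so if the answer is not there."
)

model = ChatOpenAI(model="gpt-4o-mini")  # assumed model name
result = model.invoke([
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content=combined_input),
])
print(result.content)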

4_rag/8_rag_web_scrape_firecrawl.py

Lines changed: 7 additions & 4 deletions
@@ -24,7 +24,8 @@ def create_vector_store():
 
     # Step 1: Crawl the website using FireCrawlLoader
     print("Begin crawling the website...")
-    loader = FireCrawlLoader(api_key=api_key, url="https://apple.com", mode="scrape")
+    loader = FireCrawlLoader(
+        api_key=api_key, url="https://apple.com", mode="scrape")
     docs = loader.load()
     print("Finished crawling the website.")
 
@@ -58,11 +59,13 @@ def create_vector_store():
 if not os.path.exists(persistent_directory):
     create_vector_store()
 else:
-    print(f"Vector store {persistent_directory} already exists. No need to initialize.")
+    print(
+        f"Vector store {persistent_directory} already exists. No need to initialize.")
 
 # Load the vector store with the embeddings
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 
 # Step 5: Query the vector store
@@ -86,7 +89,7 @@ def query_vector_store(query):
 
 
 # Define the user's question
-query = "WWDC24?"
+query = "Apple Intelligence?"
 
 # Query the vector store with the user's question
 query_vector_store(query)
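As a recap (not part of this commit), a minimal sketch of the crawl, split, and store pipeline this script wraps; the environment variable name, URL, and persist directory below are assumptions for illustration only.

# Sketch: crawl one page with FireCrawl, split it, and persist to Chroma.
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import FireCrawlLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

loader = FireCrawlLoader(
    api_key=os.environ["FIRECRAWL_API_KEY"],  # assumed env var name
    url="https://apple.com", mode="scrape")
docs = loader.load()

splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
split_docs = splitter.split_documents(docs)

db = Chroma.from_documents(
    split_docs,
    OpenAIEmbeddings(model="text-embedding-3-small"),
    persist_directory="db/chroma_db_firecrawl",  # assumed path
)
print(f"Stored {len(split_docs)} chunks from the crawl.")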
