Skip to content

Commit 9e0d126

Browse files
authored
Merge pull request #8 from neo4j-contrib/sourceprompt
prompt engineering sources
2 parents 57a22b8 + cdaa78c commit 9e0d126

File tree

1 file changed

+56
-25
lines changed

1 file changed

+56
-25
lines changed

bot.py

Lines changed: 56 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
from langchain.embeddings.openai import OpenAIEmbeddings
99
from langchain.embeddings import OllamaEmbeddings, SentenceTransformerEmbeddings
1010
from langchain.chat_models import ChatOpenAI, ChatOllama
11-
from langchain.chains import ConversationalRetrievalChain
11+
from langchain.chains import RetrievalQAWithSourcesChain
12+
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
1213
from langchain.prompts.chat import (
1314
ChatPromptTemplate,
1415
SystemMessagePromptTemplate,
@@ -34,7 +35,8 @@
3435

3536
neo4j_graph = Neo4jGraph(url=url, username=username, password=password)
3637

37-
def create_vector_index(dimension):
38+
39+
def create_vector_index(dimension: int) -> None:
3840
index_query = "CALL db.index.vector.createNodeIndex('stackoverflow', 'Question', 'embedding', $dimension, 'cosine')"
3941
try:
4042
neo4j_graph.query(index_query, {"dimension": dimension})
@@ -46,6 +48,7 @@ def create_vector_index(dimension):
4648
except: # Already exists
4749
pass
4850

51+
4952
class StreamHandler(BaseCallbackHandler):
5053
def __init__(self, container, initial_text=""):
5154
self.container = container
@@ -55,6 +58,7 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
5558
self.text += token
5659
self.container.markdown(self.text)
5760

61+
5862
if embedding_model_name == "ollama":
5963
embeddings = OllamaEmbeddings(base_url=ollama_base_url, model="llama2")
6064
dimension = 4096
@@ -64,7 +68,9 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
6468
dimension = 1536
6569
logger.info("Embedding: Using OpenAI")
6670
else:
67-
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", cache_folder="/embedding_model")
71+
embeddings = SentenceTransformerEmbeddings(
72+
model_name="all-MiniLM-L6-v2", cache_folder="/embedding_model"
73+
)
6874
dimension = 384
6975
logger.info("Embedding: Using SentenceTransformer")
7076

@@ -75,7 +81,13 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
7581
logger.info("LLM: Using GPT-4")
7682
elif llm_name == "ollama":
7783
llm = ChatOllama(
78-
temperature=0, base_url=ollama_base_url, model="llama2", streaming=True
84+
temperature=0,
85+
base_url=ollama_base_url,
86+
model="llama2",
87+
streaming=True,
88+
top_k=10, # A higher value (100) will give more diverse answers, while a lower value (10) will be more conservative.
89+
top_p=0.3, # Higher value (0.95) will lead to more diverse text, while a lower value (0.5) will generate more focused text.
90+
num_ctx=3072, # Sets the size of the context window used to generate the next token.
7991
)
8092
logger.info("LLM: Using Ollama (llama2)")
8193
else:
@@ -102,10 +114,10 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
102114
).to_messages(),
103115
callbacks=callbacks,
104116
).content
105-
return answer
117+
return {'answer':answer}
106118

107119

108-
# Rag response
120+
# Vector response
109121
neo4j_db = Neo4jVector.from_existing_index(
110122
embedding=embeddings,
111123
url=url,
@@ -116,18 +128,28 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
116128
text_node_property="body", # text by default
117129
retrieval_query="""
118130
OPTIONAL MATCH (node)-[:ANSWERS]->(question)
119-
RETURN question.title + '\n' + question.body + '\n' + coalesce(node.body,"") AS text, score, {source:question.link} AS metadata
131+
RETURN 'Question: ' + question.title + '\n' + question.body + '\nAnswer: ' +
132+
coalesce(node.body,"") AS text, score, {source:question.link} AS metadata
133+
ORDER BY score ASC // so that best answers are the last
120134
""",
121135
)
122136

123137
general_system_template = """
124138
Use the following pieces of context to answer the question at the end.
139+
The context contains question-answer pairs and their links from Stackoverflow.
140+
You should prefer information from accepted or more upvoted answers.
141+
Make sure to rely on information from the answers and not on questions to provide accurate responses.
142+
When you find particular answer in the context useful, make sure to cite it in the answer using the link.
125143
If you don't know the answer, just say that you don't know, don't try to make up an answer.
126-
Each document in the context contains the source information.
127-
For every document that you use information from, return their source link at the end of the answer.
128144
----
129-
{context}
145+
{summaries}
130146
----
147+
Each answer you generate should contain a section at the end with links to
148+
Stackoverflow questions and answers you found useful, which are described under Source value.
149+
You can only use links to StackOverflow questions that are present in the context and always
150+
add links to the end of the answer in the style of citations.
151+
Generate concise answers with a references section of links to
152+
relevant StackOverflow questions only at the end of the answer.
131153
"""
132154
general_user_template = "Question:```{question}```"
133155
messages = [
@@ -136,13 +158,19 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
136158
]
137159
qa_prompt = ChatPromptTemplate.from_messages(messages)
138160

139-
qa = ConversationalRetrievalChain.from_llm(
161+
qa_chain = load_qa_with_sources_chain(
140162
llm,
163+
chain_type="stuff",
164+
prompt=qa_prompt,
165+
)
166+
qa = RetrievalQAWithSourcesChain(
167+
combine_documents_chain=qa_chain,
141168
retriever=neo4j_db.as_retriever(search_kwargs={"k": 2}),
142-
combine_docs_chain_kwargs={"prompt": qa_prompt},
169+
reduce_k_below_max_tokens=True,
170+
max_tokens_limit=3375,
143171
)
144172

145-
# Rag + Knowledge Graph response
173+
# Vector + Knowledge Graph response
146174
kg = Neo4jVector.from_existing_index(
147175
embedding=embeddings,
148176
url=url,
@@ -156,20 +184,24 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
156184
MATCH (node)<-[:ANSWERS]-(a)
157185
WITH a
158186
ORDER BY a.is_accepted DESC, a.score DESC
159-
WITH collect(a.body)[..2] as answers
160-
RETURN reduce(str='', text IN answers | str + text + '\n') as answerTexts
187+
WITH collect(a)[..2] as answers
188+
RETURN reduce(str='', a IN answers | str +
189+
'\n### Answer (Accepted: '+ a.is_accepted +' Score: ' + a.score+ '): '+ a.body + '\n') as answerTexts
161190
}
162-
RETURN node.title + '\n' + node.body + '\n' + answerTexts AS text, score, {source:node.link} AS metadata
191+
RETURN '##Question: ' + node.title + '\n' + node.body + '\n'
192+
+ answerTexts AS text, score, {source: node.link} AS metadata
193+
ORDER BY score ASC // so that best answers are the last
163194
""",
164195
)
165196

166-
kg_qa = ConversationalRetrievalChain.from_llm(
167-
llm,
197+
kg_qa = RetrievalQAWithSourcesChain(
198+
combine_documents_chain=qa_chain,
168199
retriever=kg.as_retriever(search_kwargs={"k": 2}),
169-
combine_docs_chain_kwargs={"prompt": qa_prompt},
200+
reduce_k_below_max_tokens=True,
201+
max_tokens_limit=3375,
170202
)
171203

172-
# Streamlit stuff
204+
# Streamlit UI
173205
styl = f"""
174206
<style>
175207
/* not great support for :has yet (hello FireFox), but using it for now */
@@ -195,8 +227,8 @@ def chat_input():
195227
stream_handler = StreamHandler(st.empty())
196228
result = output_function(
197229
{"question": user_input, "chat_history": []}, callbacks=[stream_handler]
198-
)
199-
output = result # ["answer"] + "\n" + result["sources"]
230+
)['answer']
231+
output = result
200232
st.session_state[f"user_input"].append(user_input)
201233
st.session_state[f"generated"].append(output)
202234
st.session_state[f"rag_mode"].append(name)
@@ -216,7 +248,6 @@ def display_chat():
216248
if st.session_state[f"generated"]:
217249
size = len(st.session_state[f"generated"])
218250
# Display only the last three exchanges
219-
# Excluding the latest since it's streamed
220251
for i in range(max(size - 3, 0), size):
221252
with st.chat_message("user"):
222253
st.write(st.session_state[f"user_input"][i])
@@ -235,9 +266,9 @@ def mode_select() -> str:
235266
if name == "LLM only":
236267
output_function = generate_llm_output
237268
elif name == "Vector":
238-
output_function = qa.run
269+
output_function = qa
239270
elif name == "Vector + Graph":
240-
output_function = kg_qa.run
271+
output_function = kg_qa
241272

242273
display_chat()
243274
chat_input()

0 commit comments

Comments
 (0)