Skip to content

Commit 17cefcc

Browse files
committed
Update RAG notebook
1 parent e2961b9 commit 17cefcc

File tree

5 files changed

+364
-3
lines changed

5 files changed

+364
-3
lines changed

docs/module_04_llm_ops/01_retrieval_augmented_llm_app.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@
105105
"name": "stdout",
106106
"output_type": "stream",
107107
"text": [
108-
"Overwriting app.py\n"
108+
"Writing app.py\n"
109109
]
110110
}
111111
],
@@ -198,14 +198,14 @@
198198
},
199199
{
200200
"cell_type": "code",
201-
"execution_count": 3,
201+
"execution_count": null,
202202
"metadata": {
203203
"id": "mlGSHYN0bQSm"
204204
},
205205
"outputs": [],
206206
"source": [
207207
"# check the log file for localhost port\n",
208-
"# !streamlit run app.py &>logs.txt & "
208+
"!streamlit run app.py &>logs.txt & "
209209
]
210210
},
211211
{

docs/module_04_llm_ops/app.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
## import required components

import pandas as pd
from utils import (
    get_lines,
    load_data,
    get_relevant_documents,
    get_answer,
    create_db,
    sidebar,
)
import streamlit as st

# Vector DB client and collection; chromadb.Client() is in-memory, so the
# index lives only for the current streamlit session.
chroma_client, db = create_db()

## Setup Page Header and Sidebar
st.set_page_config(page_title="PersonalGPT", page_icon="📖", layout="wide")
lm_model = sidebar()
# BUG FIX: was an f-string with no placeholders
st.header("📖PersonalGPT")
st.markdown(f">:zap: Responses Powered by **{lm_model}**")

# Track whether a document has been ingested across streamlit reruns
if 'is_doc_uploaded' not in st.session_state:
    st.session_state['is_doc_uploaded'] = False


## Add Uploader Component
uploaded_file = st.file_uploader(
    "Upload a txt file",
    type=["txt"],
    help="Text files with each sentence acting as a document",
)

if not st.session_state['is_doc_uploaded']:
    ## Check if upload is complete
    if not uploaded_file:
        st.stop()

    ## Read uploaded file
    try:
        file_data = get_lines(uploaded_file)
        ## Verbose Status update
        st.markdown(f"> Uploaded file has {len(file_data)} lines of text")
        st.session_state['is_doc_uploaded'] = True
    except Exception as e:
        st.markdown(f"Could not upload/read file={e}")
        st.session_state['is_doc_uploaded'] = False
        # BUG FIX: stop the script here — the indexing step below would
        # otherwise raise a NameError because ``file_data`` was never assigned.
        st.stop()

    ## Index Uploaded text file
    # NOTE(review): indexing is scoped to the fresh-upload branch so reruns
    # with an already-ingested document do not reference ``file_data`` again.
    with st.spinner("Indexing document... This may take a while⏳"):
        db_status_msg = load_data(db, documents=file_data)

    ## status update
    st.markdown(f"> Database indexed {db.count()} documents")
    if db.count() == 0:
        st.markdown(db_status_msg)
        st.session_state['is_doc_uploaded'] = False

## Get User Input
with st.form(key="qa_form"):
    query = st.text_area("Enter Your Query:",
                         placeholder="Examples: \nwhat is tf-idf?\nwhich module covers RLHF\nhow many moons does Jupiter have?")
    submit = st.form_submit_button("Submit")

## Provide additional Options for citing source
with st.expander("Advanced Options"):
    show_source = st.checkbox("Show Source")

## Generate Output upon button click
if submit:
    # Get relevant documents from DB
    context = get_relevant_documents(query, db)

    # get answer from LLM
    answer, score, error = get_answer(query, context, lm_model)

    # Showcase response on screen
    st.markdown(f"**Answer:** _{answer}_")
    st.markdown(f"> **Relevance Score**:{score}")
    st.markdown("---")

    # Add more details if advanced option is chosen
    if show_source:
        st.markdown("**Source(s):**")
        st.markdown(f"- <i>{context[:100]}...</i>", unsafe_allow_html=True)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#####################
## Set Constants
#####################
# API credentials — replace the placeholders with real tokens before running.
HF_TOKEN = '<YOUR KEY>'
OPENAI_TOKEN = '<YOUR KEY>'
# Auth header used for raw HuggingFace inference API requests (utils.py)
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

# Constants for embedding model
EMB_MODEL_ID = 'pinecone/mpnet-retriever-discourse'
EMB_API_URL = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMB_MODEL_ID}"

# Constants for QA model
QA_MODEL_ID = 'deepset/roberta-base-squad2'

# List of Different Endpoints (tags dispatched on in utils.get_answer)
HF_QA_ENDPOINT = 'HF-QA'
HF_LM_ENDPOINT = 'HF-LM'
OPENAI_ENDPOINT = 'OPENAI-LM'
LOCAL_OLLAMA_ENDPOINT = 'OLLAMA'
# Parallel lists consumed as a DataFrame in utils.py: models[i] is served
# via endpoints[i] — keep the two lists the same length when editing.
AVAILABLE_LMs = {
    'models':
        [
            'deepset/roberta-base-squad2',
            'Intel/dynamic_tinybert',
            #'google/gemma-2-2b-it', # this is timing out mostly
            'Local-LLAMA-3.1:8b',
            'OpenAI-GPT4o-mini'
        ],
    'endpoints':
        [
            HF_QA_ENDPOINT,
            HF_QA_ENDPOINT,
            #HF_LM_ENDPOINT, #this is timing out mostly
            LOCAL_OLLAMA_ENDPOINT,
            OPENAI_ENDPOINT,
        ]
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Adapted From: https://gist.github.com/psychemedia/925e190e2afd15b050f32334ceff9ef6
2+
import os
3+
import nbformat
4+
5+
class NB_Markdown_Scraper:
    """Collect the markdown-cell text of every notebook under given paths.

    After ``scrape_markdowns()``, ``notebook_md_dict`` maps a derived
    notebook name to the concatenated source of its markdown cells.
    """

    def __init__(self, input_paths=None):
        # notebook name -> concatenated markdown text
        self.notebook_md_dict = dict()
        # iterable of directory paths to scan; may be None until set
        self.input_paths = input_paths

    def nbpathwalk(self, path):
        ''' Walk down a directory path looking for ipynb notebook files... '''
        valid_notebook_files = []
        for path, _, files in os.walk(path):
            # skip jupyter checkpoint dirs and any 'solutions' folders
            if '.ipynb_checkpoints' in path or 'solutions' in path:
                continue
            for f in [i for i in files if i.endswith('.ipynb') and not i.startswith('dontcommit')]:
                valid_notebook_files.append(os.path.join(path, f))
        return valid_notebook_files

    def get_cell_contents(self, nb_fn, c_md=None, cell_typ=None):
        ''' Extract the content of Jupyter notebook cells.

        Each extracted cell is tagged with a running string index "ix" and
        the source notebook filename "title"; returns ``c_md`` extended with
        the matching cells of ``nb_fn``.
        '''
        if cell_typ is None:
            cell_typ = ['markdown']
        if c_md is None:
            c_md = []
        nb = nbformat.read(nb_fn, nbformat.NO_CONVERT)
        _c_md = [i for i in nb.cells if i['cell_type'] in cell_typ]
        ix = len(c_md)
        for c in _c_md:
            c.update({"ix": str(ix)})
            c.update({"title": nb_fn})
            ix = ix + 1
        c_md = c_md + _c_md
        return c_md

    # scraper
    def scrape_markdowns(self):
        for directory in self.input_paths:
            directory_notebooks = self.nbpathwalk(directory)
            for notebook in directory_notebooks:
                notebook_cells = self.get_cell_contents(notebook, cell_typ=['markdown'])
                # BUG FIX: split on os.sep (not '/') so the name derivation
                # also works on Windows paths.
                notebook_name = '_'.join(notebook.split(os.sep)[1:]).split('.')[0]
                # BUG FIX: "ix" is stored as a string; sorting on it directly
                # is lexicographic ("10" < "2") and scrambles cell order for
                # notebooks with 10+ markdown cells — sort numerically.
                self.notebook_md_dict[notebook_name] = ' '.join(
                    [cell['source'] for cell in sorted(notebook_cells,
                                                       key=lambda d: int(d['ix']))])
45+

docs/module_04_llm_ops/utils.py

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#####################
2+
## imports
3+
#####################
4+
import pandas as pd
5+
import json
6+
import requests
7+
from retry import retry
8+
import streamlit as st
9+
import chromadb.utils.embedding_functions as embedding_functions
10+
from huggingface_hub import InferenceClient
11+
from openai import OpenAI
12+
import ollama
13+
from constants import (
14+
HF_TOKEN,
15+
OPENAI_TOKEN,
16+
HEADERS,
17+
EMB_MODEL_ID,
18+
EMB_API_URL,
19+
QA_MODEL_ID,
20+
HF_QA_ENDPOINT,
21+
HF_LM_ENDPOINT,
22+
OPENAI_ENDPOINT,
23+
LOCAL_OLLAMA_ENDPOINT,
24+
AVAILABLE_LMs)
25+
26+
27+
import chromadb
28+
29+
30+
# Tabular view of AVAILABLE_LMs ('models' / 'endpoints' columns), used by
# get_answer to map a selected model name to its endpoint type and by
# sidebar to populate the model selector.
lm_df = pd.DataFrame.from_dict(AVAILABLE_LMs)
31+
32+
#####################
33+
## Utility Functions
34+
#####################
35+
36+
def get_lines(uploaded_file):
    """
    Utility to read raw text file in binary.

    Iterates the uploaded file object line by line and decodes each raw
    byte line as UTF-8; returns the decoded lines as a list of strings.
    """
    return [raw_line.decode("utf-8") for raw_line in uploaded_file]
44+
45+
def create_db():
    """
    Utility to instantiate vector db client and collection.

    Creates an (in-memory) chroma client and fetches — or creates — the
    ``nlp_llm_workshop`` collection, returning both as a tuple.
    """
    client = chromadb.Client()
    # HuggingFace embedding function kept for reference; the collection
    # currently relies on chroma's default embedding function instead.
    # huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    #     api_key=HF_TOKEN,
    #     model_name=EMB_MODEL_ID
    # )
    collection = client.get_or_create_collection(name="nlp_llm_workshop")
    return client, collection
57+
58+
def load_data(db, documents):
    """
    Utility to add/index data into vector db.

    Parameters
    ----------
    db : chroma collection to index into
    documents : list of str, one document per entry (ids are their positions)

    Returns
    -------
    str : empty string on success, otherwise a display-ready error message.
    """
    try:
        db.add(
            documents=documents,
            ids=[str(i) for i in range(len(documents))]
        )
        # BUG FIX: previously fell through returning None on success while
        # returning a 3-tuple on failure; the caller (app.py) renders this
        # value with st.markdown, so always return a plain string.
        return ""
    except Exception as ex:
        return f"Apologies but I could not ingest document: {ex}"
69+
70+
def get_relevant_documents(query, db):
    """
    Utility to retrieve relevant documents from vector DB.

    Queries the collection for the single closest document to ``query``.

    Returns
    -------
    str : the best-matching document, or an apology message on failure
          (callers treat the result as a plain context string).
    """
    try:
        relevant_doc = db.query(query_texts=[query], n_results=1)['documents'][0][0]
        return relevant_doc
    except Exception as ex:
        # BUG FIX: previously returned a 3-tuple here; callers slice and
        # display the result as a string, so return only the message.
        return f"Apologies but I could not process your query: {ex}"
79+
80+
def get_hf_qa_answer(payload, lm_model):
    """
    Query the hosted HuggingFace extractive-QA endpoint for ``lm_model``.

    ``payload`` carries ``question`` and ``context`` keys; returns an
    (answer, score, error) triple where error is "" on success.
    """
    request_body = json.dumps(payload)
    try:
        endpoint = f"https://api-inference.huggingface.co/models/{lm_model}"
        api_response = requests.post(endpoint, headers=HEADERS, data=request_body)
        parsed = json.loads(api_response.content.decode("utf-8"))
        return parsed['answer'], parsed['score'], ""
    except Exception as ex:
        return "Apologies but I could not find any relevant answer", 0.0, ex
89+
90+
# this is mostly timing out
91+
# this is mostly timing out
def get_hf_llm_answer(payload, lm_model):
    """
    Answer via a hosted HuggingFace chat LLM (hard-coded to gemma-2-2b-it).

    NOTE(review): ``lm_model`` is accepted for signature parity with the
    other backends but unused — the model id is fixed below.

    Returns
    -------
    tuple : (answer:str, score:float, error) — error is "" on success.
    """
    try:
        client = InferenceClient(
            "google/gemma-2-2b-it",
            token=HF_TOKEN,)

        content = f"Given the context, answer the question. \ncontext:{payload['context']}\nquestion:{payload['question']}"
        response = client.chat_completion(
            messages=[{"role": "user", "content": content}],
            max_tokens=500,
            stream=False,
        )

        # BUG FIX: previously read ``message.choices[0].delta.content`` —
        # ``message`` was undefined and ``delta`` only exists on streamed
        # chunks; a non-streamed completion carries ``choices[0].message``.
        # The prompt requests plain text, so no json.loads; and return a
        # 3-tuple like every other backend (was a 2-tuple).
        return response.choices[0].message.content, 0.0, ""
    except Exception as ex:
        return "Apologies but I could not find any relevant answer", 0.0, ex
107+
108+
def get_local_llama_answer(payload, lm_model):
    """
    Answer via a locally served Ollama llama3.1:8b model.

    The prompt instructs the model to emit a JSON object with ``answer``
    and ``score`` keys, which is parsed into an (answer, score, "") triple.
    """
    prompt = f"Given the context, perform the following tasks:1.Respond with a summarized answer to the question factually in few words only if the provided context contains the answer\n2.Check if your answer is really in the provided context, otherwise respond with 'Sorry I could not find the answer'.\n 3.Generate a relevance score between 0 and 1.\n4. Format the output as a json with answer and score as keys.\n5.Do not add makrdown syntax only respond with json.\nBe careful and Think step by step.\ncontext:{payload['context']}\nquestion:{payload['question']}"
    try:
        llm_reply = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                },
            ]
        )
        parsed = json.loads(llm_reply['message']['content'])
        return parsed['answer'], parsed['score'], ""
    except Exception as ex:
        # Surface the raw failure in the UI to aid local debugging.
        st.markdown(ex)
        return "Apologies but I could not find any relevant answer", 0.0, ex
123+
124+
def get_opeai_answer(payload, lm_model):
    """
    Answer via the OpenAI chat completions API (gpt-4o-mini).

    The prompt asks the model for a JSON object with ``answer`` and
    ``score`` keys, which is parsed into an (answer, score, "") triple.
    NOTE(review): function name keeps the historical "opeai" spelling —
    get_answer dispatches to it by this name.
    """
    prompt = f"Given the context, perform the following tasks:1.Respond with a summarized answer to the question factually in few words only if the provided context contains the answer\n 2.Generate a relevance score.\n3. Format the output as a json with answer and score as keys. Do not add makrdown syntax.\nThink step by step.\ncontext:{payload['context']}\nquestion:{payload['question']}"
    try:
        openai_client = OpenAI(
            api_key=OPENAI_TOKEN,
        )
        completion = openai_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-4o-mini",
        )
        parsed = json.loads(completion.choices[0].message.content)
        return parsed['answer'], parsed['score'], ""
    except Exception as ex:
        return "Apologies but I could not find any relevant answer", 0.0, ex
143+
144+
145+
def get_answer(question, context, lm_model):
    """
    Utility to leverage QA model for answering question using given context
    and the mentioned model.

    Dispatches to the backend (HF QA / HF LM / OpenAI / local Ollama)
    registered for ``lm_model`` in constants.AVAILABLE_LMs (via lm_df).

    Returns
    -------
    tuple : (answer:str, score:float, error) — error is "" when no failure.
    """
    payload = {
        "question": question,
        "context": context
    }
    try:
        endpoint_type = lm_df[lm_df['models'] == lm_model]['endpoints'].values[0]
        if endpoint_type == HF_QA_ENDPOINT:
            return get_hf_qa_answer(payload, lm_model)
        elif endpoint_type == HF_LM_ENDPOINT:
            return get_hf_llm_answer(payload, lm_model)
        elif endpoint_type == OPENAI_ENDPOINT:
            return get_opeai_answer(payload, lm_model)
        elif endpoint_type == LOCAL_OLLAMA_ENDPOINT:
            return get_local_llama_answer(payload, lm_model)
        else:
            # BUG FIX: this branch was a bare expression (no ``return``)
            # referencing an undefined ``ex`` — it silently yielded None.
            return "This is not implemented yet", 0.0, ""
    except Exception as ex:
        return "Apologies but I could not find any relevant answer", 0.0, ex
168+
169+
170+
def sidebar():
    """
    Utility to add content to sidebar.

    Renders usage instructions, the model selector, and an about section.

    Returns
    -------
    str : the model name selected by the user.
    """
    with st.sidebar:
        st.markdown(
            "## How to use\n"
            "1. Upload a txt file📄\n"
            # BUG FIX: second step was mis-numbered "3." in the rendered list
            "2. Ask a question about the document💬\n"
        )
        st.markdown("---")
        st.markdown("## Which LM would you like to use?")
        option = st.selectbox(
            "Select a Model",
            lm_df['models'],
            label_visibility='hidden'
        )

        st.markdown("---")
        st.markdown("# About")
        st.markdown(
            "📖PersonalGPT is a demo to showcase retrieval augmented question answering system"
        )
        st.markdown(":heart: Made by [raghav bali](https://raghavbali.github.io)")
        st.markdown("---")

    return option

0 commit comments

Comments
 (0)