77from retry import retry
88import streamlit as st
99import chromadb .utils .embedding_functions as embedding_functions
10- from huggingface_hub import InferenceClient
10+ from huggingface_hub import InferenceClient , login
1111from openai import OpenAI
1212import ollama
1313from constants import (
1616 HEADERS ,
1717 EMB_MODEL_ID ,
1818 EMB_API_URL ,
19- QA_MODEL_ID ,
20- HF_QA_ENDPOINT ,
2119 HF_LM_ENDPOINT ,
2220 OPENAI_ENDPOINT ,
2321 LOCAL_OLLAMA_ENDPOINT ,
2624
2725import chromadb
2826
29-
# Lookup table of the available language models: one row per model with
# the endpoint type (HF / OpenAI / Ollama) it is served from.
lm_df = pd.DataFrame.from_dict(AVAILABLE_LMs)
# Authenticate this session against the Hugging Face Hub; the token is
# also stored via the git credential helper for later CLI/hub use.
login(token=HF_TOKEN, add_to_git_credential=True)
3129
3230#####################
3331## Utility Functions
@@ -77,31 +75,24 @@ def get_relevant_documents(query, db):
7775 except Exception as ex :
7876 return "Apologies but I could not process your query" , 0.0 , ex
7977
80- def get_hf_qa_answer (payload ,lm_model ):
81- data = json .dumps (payload )
82- try :
83- QA_API_URL = f"https://api-inference.huggingface.co/models/{ lm_model } "
84- response = requests .request ("POST" , QA_API_URL , headers = HEADERS , data = data )
85- decoded_response = json .loads (response .content .decode ("utf-8" ))
86- return decoded_response ['answer' ], decoded_response ['score' ], ""
87- except Exception as ex :
88- return "Apologies but I could not find any relevant answer" , 0.0 , ex
89-
# this is mostly timing out
def get_hf_llm_answer(payload, lm_model):
    """Answer a question via a Hugging Face Inference chat model.

    Args:
        payload: dict with 'context' and 'question' keys used to build
            the prompt.
        lm_model: Hub model id passed to the chat-completion call.

    Returns:
        A ``(answer, score, error)`` tuple. On success the raw model
        text is returned with empty score/error placeholders; on any
        failure an apology string, ``0.0`` and the caught exception are
        returned instead.
    """
    try:
        client = InferenceClient(
            provider="hf-inference",
            api_key=HF_TOKEN,
        )
        # BUG FIX: this prompt was previously a bare (unassigned) f-string
        # expression, so `content` was never defined and every call raised
        # NameError and fell into the except branch below.
        content = (
            "Given the context, perform the following tasks:"
            "1.Respond with a summarized answer to the question factually "
            "in few words only if the provided context contains the answer\n"
            "2.Generate a relevance score.\n"
            "3. Format the output as a json with answer and score as keys. "
            "Do not add markdown syntax.\n"
            "Think step by step.\n"
            f"context:{payload['context']}\n"
            f"question:{payload['question']}"
        )
        completion = client.chat.completions.create(
            model=lm_model,
            messages=[
                {
                    "role": "user",
                    "content": content,
                }
            ],
        )
        return completion.choices[0].message.content, "", ""
    except Exception as ex:
        return "Apologies but I could not find any relevant answer", 0.0, ex
10798
@@ -134,10 +125,10 @@ def get_opeai_answer(payload,lm_model):
134125 "content" : content ,
135126 }
136127 ],
137- model = "gpt-4o-mini" ,
128+ max_tokens = 500 ,
129+ model = "gpt-4o-2024-11-20" ,
138130 )
139- json_output = json .loads (chat_completion .choices [0 ].message .content )
140- return json_output ['answer' ], json_output ['score' ], ""
131+ return chat_completion .choices [0 ].message .content ,"" , ""
141132 except Exception as ex :
142133 return "Apologies but I could not find any relevant answer" , 0.0 , ex
143134
@@ -153,9 +144,7 @@ def get_answer(question,context,lm_model):
153144 }
154145 try :
155146 endpoint_type = lm_df [lm_df ['models' ]== lm_model ]['endpoints' ].values [0 ]
156- if endpoint_type == HF_QA_ENDPOINT :
157- return get_hf_qa_answer (payload ,lm_model )
158- elif endpoint_type == HF_LM_ENDPOINT :
147+ if endpoint_type == HF_LM_ENDPOINT :
159148 return get_hf_llm_answer (payload ,lm_model )
160149 elif endpoint_type == OPENAI_ENDPOINT :
161150 return get_opeai_answer (payload ,lm_model )
0 commit comments