From 6d235627133e2084ffc1033c7ad6b32bb42dbd18 Mon Sep 17 00:00:00 2001 From: Gautam Date: Fri, 7 Nov 2025 20:46:11 -0500 Subject: [PATCH 1/8] Initial notebooks for demo of the movies DB. --- .../app/notebooks/movie_db/tmdb_queries.ipynb | 365 ++++++++++++++++++ .../notebooks/movie_db/tmdb_queries_nl.ipynb | 138 +++++++ .../notebooks/movie_db/tmdb_visualize.ipynb | 278 +++++++++++++ 3 files changed, 781 insertions(+) create mode 100644 apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb create mode 100644 apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb create mode 100644 apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb new file mode 100644 index 0000000..f7cf70d --- /dev/null +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb @@ -0,0 +1,365 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Querying [tmdb](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata) movie information from aperturedb.\n", + "\n", + "This notebook will work on an instance of ApertureDB, which can be on the [cloud](https://cloud.aperturedata.io), or running as a [local docker container(s)](https://docs.aperturedata.io/Setup/server/Local)\n", + "\n", + "The dataset is hosted on kaggle, and available via a mlcroissant link.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --quiet mlcroissant pandas dotenv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import all the modules needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from IPython.display import display\n", + "\n", + "\n", + "from aperturedb.CommonLibrary import (\n", + " execute_query,\n", + " create_connector\n", + 
")\n", + "from aperturedb.Utils import Utils\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client=create_connector()\n", + "utils = Utils(client)\n", + "utils.summary()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query time!\n", + "### Find all the movies where Tom Hanks as been a part of " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "q = [\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 1,\n", + " \"with_class\": \"PROFESSIONAL\",\n", + " \"constraints\": {\n", + " \"name\": [\"==\", \"Tom Hanks\"]\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 2,\n", + " \"is_connected_to\": {\n", + " \"ref\": 1\n", + " },\n", + " \"with_class\": \"MOVIE\",\n", + " \"results\": {\n", + " # \"list\": [\"id\", \"title\"]\n", + " \"all_properties\": True\n", + " }\n", + " }\n", + " }\n", + "]\n", + "\n", + "_, response, _ = execute_query(client, q)\n", + "\n", + "display(pd.json_normalize(response[0][\"FindEntity\"][\"entities\"]))\n", + "display(pd.json_normalize(response[1][\"FindEntity\"][\"entities\"]))\n", + "\n", + "movie_ids = [e[\"movie_id\"] for e in response[1][\"FindEntity\"][\"entities\"]]\n", + "display(movie_ids)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get more info.\n", + "\n", + "This response from cast and movies entities still misses the character information, because it's been encoded on the properties on connection between the 2. Let's merge that info in and get more richer details about the movies Tom Hanks has been a part of." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "professional = pd.json_normalize(response[1][\"FindEntity\"][\"entities\"])\n", + "\n", + "professional_details = []\n", + "for p in response[0][\"FindEntity\"][\"entities\"]:\n", + " src = p[\"_uniqueid\"]\n", + " for m in response[1][\"FindEntity\"][\"entities\"]:\n", + " dst = m[\"_uniqueid\"]\n", + " q = [{\n", + " \"FindEntity\": {\n", + " \"_ref\": 1,\n", + " \"with_class\": \"PROFESSIONAL\",\n", + " \"constraints\": {\n", + " \"_uniqueid\": [\"==\", src]\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 2,\n", + " \"is_connected_to\": {\n", + " \"ref\": 1\n", + " },\n", + " \"with_class\": \"MOVIE\",\n", + " \"constraints\": {\n", + " \"_uniqueid\": [\"==\", dst]\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True\n", + " }\n", + " }\n", + " },{\n", + " \"FindConnection\": {\n", + " \"src\": 2,\n", + " \"dst\": 1,\n", + " \"results\": {\n", + " \"all_properties\": True\n", + " }\n", + " }\n", + " }]\n", + " _, responsec, _ = execute_query(client, q)\n", + " # print(f\"{response=}\")\n", + "\n", + " if responsec[2][\"FindConnection\"][\"returned\"] > 0:\n", + " c = responsec[2][\"FindConnection\"][\"connections\"][0]\n", + " # print(f\"{p['name']} has acted in {m['title']} as {c['character']}\")\n", + " # print(f\"{p['name']} has contributed in {m['title']}\")\n", + " if \"character\" in c:\n", + " professional_details.append(f\"as character: {c['character']}\")\n", + " else:\n", + " professional_details.append(f\"as {c['job']} in {c['department']}\")\n", + " # display(pd.json_normalize(response[3][\"FindConnection\"][\"connections\"]))\n", + "display(len(professional_details))\n", + "professional['details'] = professional_details\n", + "\n", + "display(professional)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Find 2 cast poeple. Find the movies in which they both appear (Logical AND)\n", + "\n", + "Here we search for Tom Hanks and Meg Ryan. The All in the Find Connected entity means that find a set which is connected to both the cast people. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from aperturedb.CommonLibrary import execute_query\n", + "\n", + "q = [\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 1,\n", + " \"with_class\": \"PROFESSIONAL\",\n", + " \"constraints\":{\n", + " \"name\": [\"in\", [\"Tom Hanks\"]]\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True\n", + " # \"list\": [\"name\", \"_uniqueid\"]\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 2,\n", + " \"with_class\": \"PROFESSIONAL\",\n", + " \"constraints\":{\n", + " \"name\": [\"in\", [ \"Meg Ryan\"]]\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True\n", + " # \"list\": [\"name\", \"_uniqueid\"]\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"is_connected_to\": {\n", + " \"all\": [\n", + " {\"ref\": 1},\n", + " {\"ref\": 2}\n", + " ]\n", + " },\n", + " \"with_class\": \"MOVIE\",\n", + " \"results\": {\n", + " # \"list\": [\"id\", \"title\"],\n", + " # \"group_by_source\": True\n", + " \"all_properties\": True\n", + " }\n", + " }\n", + " }\n", + "]\n", + "\n", + "_, response, _ = execute_query(client, q)\n", + "\n", + "pd.json_normalize(response[2][\"FindEntity\"][\"entities\"])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## We can write the same queries in SPARQL.\n", + "\n", + "Trying the above examples (whatever is possible), as sparql does not deal with properties on relations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from aperturedb.SPARQL import SPARQL\n", + "import json\n", + "\n", + "\n", + "sparql = SPARQL(client, debug=True)\n", + "print(\"namespaces:\", json.dumps({k: str(v) for k, v in sparql.namespaces.items()}, indent=2))\n", + "\n", + "print(\"properties:\", json.dumps({sparql.graph.qname(k): str(v)\n", + " for k, v in sparql.properties.items()}, indent=2))\n", + "\n", + "print(\"connections:\", json.dumps({sparql.graph.qname(k): str(v)\n", + " for k, v in sparql.connections.items()}, indent=2))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find all the movies where Tom Hanks as been a part of " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "query = \"\"\"\n", + "SELECT ?title ?pop ?budget\n", + "WHERE {\n", + " ?p p:name \"Tom Hanks\" .\n", + " ?m c:HAS_CAST ?p .\n", + " ?m p:title ?title ;\n", + " p:popularity ?pop ;\n", + " p:budget ?budget .\n", + "}\n", + "\"\"\"\n", + "\n", + "results = sparql.query(query)\n", + "df = sparql.to_dataframe(results)\n", + "display(df)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find 2 cast people. Find the movies they have been part of." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "PREFIX wd: \n", + "PREFIX wdt: \n", + "SELECT ?title ?pop ?budget ?wTitle\n", + "WHERE {\n", + " ?m c:HAS_CAST [p:name \"Tom Hanks\"] , [p:name \"Meg Ryan\"] ;\n", + " p:title ?title ;\n", + " p:popularity ?pop ;\n", + " p:budget ?budget .\n", + "}\n", + "\n", + "\n", + "\"\"\"\n", + "\n", + "results = sparql.query(query)\n", + "df = sparql.to_dataframe(results)\n", + "display(df)\n", + "# print(json.dumps(sparql.input_query, indent=2))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "package", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb new file mode 100644 index 0000000..bd3f16b --- /dev/null +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a23d6c2b", + "metadata": {}, + "source": [ + "## This is new section, where we try to define queries in Natural Language.\n", + "\n", + "Aperturedb has a property graph interface in llama_index repo. will try to use that to query the KG." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20954ea0", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-llms-openai\n", + "%pip install --upgrade git+https://github.com/aperture-data/llama_index@kg_fixes#subdirectory=llama-index-integrations/graph_stores/llama-index-graph-stores-ApertureDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c816cbe0", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Literal\n", + "from llama_index.core import PropertyGraphIndex\n", + "\n", + "from llama_index.core import StorageContext\n", + "from llama_index.graph_stores.ApertureDB import ApertureDBGraphStore\n", + "\n", + "from llama_index.core.indices.property_graph import SchemaLLMPathExtractor\n", + "from llama_index.llms.openai import OpenAI\n", + "from llama_index.core import Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ec8bc5", + "metadata": {}, + "outputs": [], + "source": [ + "entities = Literal[\"MOVIE\", \"GENRE\", \"KEYWORD\", \"PROFESSIONAL\"]\n", + "relations = Literal[\"CAST\", \"CREW\", \"HAS_GENRE\", \"HAS_KEYWORD\"]\n", + "\n", + "validation_schema = {\n", + " \"PERSON\": [\"HAS\", \"PART_OF\", \"WORKED_ON\", \"WORKED_WITH\", \"WORKED_AT\"],\n", + " \"PLACE\": [\"HAS\", \"PART_OF\", \"WORKED_AT\"],\n", + " \"ORGANIZATION\": [\"HAS\", \"PART_OF\", \"WORKED_WITH\"],\n", + " \"CONCEPT\": [\"HAS\", \"PART_OF\", \"WORKED_ON\", \"WORKED_WITH\"],\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ce7960c", + "metadata": {}, + "outputs": [], + "source": [ + "kg_extractor = SchemaLLMPathExtractor(\n", + " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.0),\n", + " possible_entities=entities,\n", + " possible_relations=relations,\n", + " possible_entity_props=[\"name\", \"budget\"],\n", + ")\n", + "\n", + "graph_store = ApertureDBGraphStore()\n", + "storage_context = 
StorageContext.from_defaults(graph_store=graph_store)\n", + "\n", + "index = PropertyGraphIndex.from_existing(\n", + " embed_kg_nodes=False,\n", + " kg_extractors=[kg_extractor],\n", + " property_graph_store=graph_store,\n", + " show_progress=True,\n", + ")\n", + "\n", + "query_engine = index.as_query_engine()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3c213a9", + "metadata": {}, + "outputs": [], + "source": [ + "def run_queries(query_engine):\n", + " query_str = [\n", + " (\"Which movies is Tom Hanks associated with? Give me a list of movies.\", True),\n", + " (\"Give me a list of movies starring Tom Hanks and Meg Ryan.\", True),\n", + " (\"What is the genre of the movie 'Forrest Gump'?\", True),\n", + " (\"Which other actors has Tom Hanks worked with?\", True),\n", + " (\"What is the budget of the movie 'Forrest Gump'?\", False),\n", + " (\n", + " \"List all the crew members of the movie 'Forrest Gump', which were not actors.\",\n", + " False,\n", + " ),\n", + " ]\n", + " for qs, is_correct in query_str:\n", + " response = query_engine.query(qs)\n", + " print(f\"{qs=}\")\n", + " print(f\"{response.response=}\")\n", + " print(\"=\" * 50)\n", + "\n", + "\n", + "run_queries(query_engine)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "croissant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb new file mode 100644 index 0000000..c98552c --- /dev/null +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "id": "069b3dfa", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install jaal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32f815ba", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "from aperturedb.CommonLibrary import (\n", + " execute_query,\n", + " create_connector\n", + ")\n", + "from aperturedb.Utils import Utils\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf80210e", + "metadata": {}, + "outputs": [], + "source": [ + "client=create_connector()\n", + "utils = Utils(client)\n", + "utils.summary()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "167f81a7", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the list of movie ids that we will use to visualize the graph\n", + "# They are the movies where Tom Hanks has been involved.\n", + "movie_ids = [\n", + " 64685,\n", + " 9800,\n", + " 10193,\n", + " 5255,\n", + " 140823,\n", + " 5516,\n", + " 13448,\n", + " 20763,\n", + " 591,\n", + " 862,\n", + " 83542,\n", + " 863,\n", + " 8358,\n", + " 9591,\n", + " 59861,\n", + " 4147,\n", + " 594,\n", + " 6538,\n", + " 35,\n", + " 920,\n", + " 857,\n", + " 568,\n", + " 9489,\n", + " 497,\n", + " 13508,\n", + " 2619,\n", + " 13,\n", + " 109424,\n", + " 640,\n", + " 2280,\n", + " 11287,\n", + " 296098]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61ac3abc", + "metadata": {}, + "outputs": [], + "source": [ + "q = [\n", + " {\n", + " \"FindEntity\": {\n", + " \"with_class\": \"MOVIE\",\n", + " \"constraints\": {\n", + " \"movie_id\": [\"in\", movie_ids]\n", + " },\n", + " \"_ref\": 1,\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"with_class\": \"PROFESSIONAL\",\n", + " \"is_connected_to\": {\n", + " \"ref\": 1,\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " 
\"group_by_source\": True,\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"with_class\": \"GENRE\",\n", + " \"is_connected_to\": {\n", + " \"ref\": 1,\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " \"group_by_source\": True,\n", + " }\n", + " }\n", + " }\n", + "]\n", + "\n", + "result, response, b = execute_query(client, q)\n", + "types = [\"PROFESSIONAL\", \"GENRE\"]\n", + "nodes = []\n", + "nodeset = set()\n", + "edges = set()\n", + "for e in response[0]['FindEntity']['entities']:\n", + " nodes.append({\n", + " \"id\": e['_uniqueid'],\n", + " \"label\": e['title'],\n", + " \"budget\": e['budget'],\n", + " \"popularity\": e['popularity'],\n", + " \"type\": \"MOVIE\"\n", + " })\n", + " nodeset.add(e['_uniqueid'])\n", + "for src in response[0]['FindEntity']['entities']:\n", + " src_id = src['_uniqueid']\n", + " for i, sr in enumerate(response[1:]):\n", + " for e in sr['FindEntity']['entities'][src_id]:\n", + " if e['_uniqueid'] not in nodeset:\n", + " nodes.append({\n", + " \"id\": e['_uniqueid'],\n", + " \"label\": e['name'],\n", + " \"gender\": e['gender'] if 'gender' in e else None,\n", + " \"budget\": 10000,\n", + " # \"type\": \"PROFESSIONAL\" if 'job' in e else \"GENRE\"\n", + " \"type\": e['label']\n", + " })\n", + " nodeset.add(e['_uniqueid'])\n", + " edges.add((src_id, e['_uniqueid']))\n", + "\n", + "nodes_df = pd.DataFrame(nodes)\n", + "nodes_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f05e4231", + "metadata": {}, + "outputs": [], + "source": [ + "cq = []\n", + "i = 0\n", + "for src, dst in edges:\n", + " cq += [\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 1 + i,\n", + " \"constraints\": {\n", + " \"_uniqueid\": [\"==\", src],\n", + " },\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"FindEntity\": {\n", + " \"_ref\": 2 + i,\n", + " \"constraints\": {\n", + " \"_uniqueid\": [\"==\", dst],\n", + " 
},\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"FindConnection\": {\n", + " \"src\": 1 + i,\n", + " \"dst\": 2 + i,\n", + " # \"with_class\": \"CAST\",\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " },\n", + " }\n", + " },\n", + " ]\n", + " i += 2\n", + "\n", + "connections = set()\n", + "\n", + "result, response, b = execute_query(client, cq)\n", + "for i in range(0, len(response), 3):\n", + " src = response[i][\"FindEntity\"][\"entities\"][0][\"_uniqueid\"]\n", + " dst = response[i + 1][\"FindEntity\"][\"entities\"][0][\"_uniqueid\"]\n", + " if \"connections\" not in response[i + 2][\"FindConnection\"]:\n", + " continue\n", + " for c in response[i + 2][\"FindConnection\"][\"connections\"]:\n", + " if \"character\" in c:\n", + " connections.add((src, dst, c[\"character\"]))\n", + " break\n", + " else:\n", + " if \"job\" in c:\n", + " id = f'{c[\"job\"]}_{c[\"department\"]}'\n", + " connections.add((src, dst, id))\n", + " break\n", + " else:\n", + " id = f'{c[\"_uniqueid\"]}'\n", + " connections.add((src, dst, id))\n", + " break\n", + "\n", + "\n", + "\n", + "edge_df = pd.json_normalize(\n", + " [\n", + " {\"from\": s, \"to\": d, \"label\": id, \"id\": f\"{s}_{d}_{id}\"}\n", + " for s, d, id in connections\n", + " ]\n", + ")\n", + "edge_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c06fee04", + "metadata": {}, + "outputs": [], + "source": [ + "from jaal import Jaal\n", + "Jaal(edge_df, nodes_df).plot()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "croissant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 
51aafcde0238d30e3e5a8893f37b45ef5faa9221 Mon Sep 17 00:00:00 2001 From: Gautam Saluja Date: Thu, 20 Nov 2025 09:58:34 +0530 Subject: [PATCH 2/8] Notebooks with newer labels --- .../app/notebooks/movie_db/tmdb_queries.ipynb | 96 +++++++++++++++---- .../notebooks/movie_db/tmdb_queries_nl.ipynb | 90 +++++++++++++++-- .../notebooks/movie_db/tmdb_visualize.ipynb | 54 ++++++++++- 3 files changed, 208 insertions(+), 32 deletions(-) diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb index f7cf70d..36e059a 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb @@ -20,6 +20,16 @@ "%pip install --quiet mlcroissant pandas dotenv" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsInBuZmVKdnR5cXVwSDdsZ1k4RE5pOVEzZWhUV3kxYTAybXB0Il0=\"" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -73,7 +83,7 @@ " {\n", " \"FindEntity\": {\n", " \"_ref\": 1,\n", - " \"with_class\": \"PROFESSIONAL\",\n", + " \"with_class\": \"Professional\",\n", " \"constraints\": {\n", " \"name\": [\"==\", \"Tom Hanks\"]\n", " },\n", @@ -88,10 +98,10 @@ " \"is_connected_to\": {\n", " \"ref\": 1\n", " },\n", - " \"with_class\": \"MOVIE\",\n", + " \"with_class\": \"Movie\",\n", " \"results\": {\n", - " # \"list\": [\"id\", \"title\"]\n", - " \"all_properties\": True\n", + " \"list\": [\"_uniqueid\", \"movie_id\", \"title\", \"popularity\", \"budget\"]\n", + " # \"all_properties\": True\n", " }\n", " }\n", " }\n", @@ -131,7 +141,7 @@ " q = [{\n", " \"FindEntity\": {\n", " \"_ref\": 1,\n", - " \"with_class\": \"PROFESSIONAL\",\n", + " \"with_class\": \"Professional\",\n", " \"constraints\": {\n", " \"_uniqueid\": [\"==\", src]\n", " },\n", @@ -146,7 +156,7 @@ " 
\"is_connected_to\": {\n", " \"ref\": 1\n", " },\n", - " \"with_class\": \"MOVIE\",\n", + " \"with_class\": \"Movie\",\n", " \"constraints\": {\n", " \"_uniqueid\": [\"==\", dst]\n", " },\n", @@ -164,17 +174,14 @@ " }\n", " }]\n", " _, responsec, _ = execute_query(client, q)\n", - " # print(f\"{response=}\")\n", "\n", " if responsec[2][\"FindConnection\"][\"returned\"] > 0:\n", " c = responsec[2][\"FindConnection\"][\"connections\"][0]\n", - " # print(f\"{p['name']} has acted in {m['title']} as {c['character']}\")\n", - " # print(f\"{p['name']} has contributed in {m['title']}\")\n", " if \"character\" in c:\n", " professional_details.append(f\"as character: {c['character']}\")\n", " else:\n", " professional_details.append(f\"as {c['job']} in {c['department']}\")\n", - " # display(pd.json_normalize(response[3][\"FindConnection\"][\"connections\"]))\n", + "\n", "display(len(professional_details))\n", "professional['details'] = professional_details\n", "\n", @@ -202,7 +209,7 @@ " {\n", " \"FindEntity\": {\n", " \"_ref\": 1,\n", - " \"with_class\": \"PROFESSIONAL\",\n", + " \"with_class\": \"Professional\",\n", " \"constraints\":{\n", " \"name\": [\"in\", [\"Tom Hanks\"]]\n", " },\n", @@ -215,7 +222,7 @@ " {\n", " \"FindEntity\": {\n", " \"_ref\": 2,\n", - " \"with_class\": \"PROFESSIONAL\",\n", + " \"with_class\": \"Professional\",\n", " \"constraints\":{\n", " \"name\": [\"in\", [ \"Meg Ryan\"]]\n", " },\n", @@ -233,7 +240,7 @@ " {\"ref\": 2}\n", " ]\n", " },\n", - " \"with_class\": \"MOVIE\",\n", + " \"with_class\": \"Movie\",\n", " \"results\": {\n", " # \"list\": [\"id\", \"title\"],\n", " # \"group_by_source\": True\n", @@ -295,7 +302,7 @@ "SELECT ?title ?pop ?budget\n", "WHERE {\n", " ?p p:name \"Tom Hanks\" .\n", - " ?m c:HAS_CAST ?p .\n", + " ?m c:HasCast ?p .\n", " ?m p:title ?title ;\n", " p:popularity ?pop ;\n", " p:budget ?budget .\n", @@ -321,17 +328,13 @@ "outputs": [], "source": [ "query = \"\"\"\n", - "PREFIX wd: \n", - "PREFIX wdt: \n", "SELECT ?title 
?pop ?budget ?wTitle\n", "WHERE {\n", - " ?m c:HAS_CAST [p:name \"Tom Hanks\"] , [p:name \"Meg Ryan\"] ;\n", + " ?m c:HasCast [p:name \"Tom Hanks\"] , [p:name \"Meg Ryan\"] ;\n", " p:title ?title ;\n", " p:popularity ?pop ;\n", " p:budget ?budget .\n", "}\n", - "\n", - "\n", "\"\"\"\n", "\n", "results = sparql.query(query)\n", @@ -339,11 +342,62 @@ "display(df)\n", "# print(json.dumps(sparql.input_query, indent=2))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Augment the SPARQL data with federated query and response." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Merge info from Wikidata\n", + "\n", + "Given the above query, let's write a query that fetches the location of the selected movie above. That data is not present in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "PREFIX wd: \n", + "PREFIX wdt: \n", + "SELECT ?title ?pop ?budget ?wTitle ?wmovie ?wTitle ?genreLabel ?loclabel\n", + "WHERE {\n", + " ?m c:HasCast [p:name \"Tom Hanks\"] , [p:name \"Meg Ryan\"] ;\n", + " p:title ?title ;\n", + " p:popularity ?pop ;\n", + " p:budget ?budget .\n", + " ?m c:HasGenre ?genre.\n", + " ?genre p:name ?genreLabel.\n", + " BIND(strlang(?title, \"en\") AS ?stitle)\n", + " SERVICE {\n", + " ?wmovie wdt:P31 wd:Q11424.\n", + " ?wmovie rdfs:label ?wTitle.\n", + " ?wmovie wdt:P840 ?location.\n", + " ?location rdfs:label ?loclabel.\n", + " ?wmovie rdfs:label ?stitle.\n", + " FILTER(lang(?wTitle) = \"en\")\n", + " FILTER(lang(?loclabel) = \"en\")\n", + " }\n", + "}\n", + "\"\"\"\n", + "\n", + "results = sparql.query(query)\n", + "df = sparql.to_dataframe(results)\n", + "display(df)" + ] } ], "metadata": { "kernelspec": { - "display_name": "package", + "display_name": "croissant", "language": "python", "name": "python3" }, @@ -357,7 +411,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.10.12" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb index bd3f16b..121154f 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb @@ -21,6 +21,68 @@ "%pip install --upgrade git+https://github.com/aperture-data/llama_index@kg_fixes#subdirectory=llama-index-integrations/graph_stores/llama-index-graph-stores-ApertureDB" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4a708e1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsInBuZmVKdnR5cXVwSDdsZ1k4RE5pOVEzZWhUV3kxYTAybXB0Il0=\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b11eb29", + "metadata": {}, + "outputs": [], + "source": [ + "from aperturedb.CommonLibrary import create_connector\n", + "from aperturedb.Utils import Utils\n", + "\n", + "client = create_connector()\n", + "utils = Utils(client)\n", + "utils.get_schema()" + ] + }, + { + "cell_type": "markdown", + "id": "37604f26", + "metadata": {}, + "source": [ + "## Log internal queries\n", + "\n", + "Sometimes to figure out the internal workings of the QA request/response it might be useful to figure out the inner workings. Run the following section to log the internal process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8748dd32", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "import llama_index.core\n", + "\n", + "llama_index.core.set_global_handler(\"simple\")" + ] + }, + { + "cell_type": "markdown", + "id": "38432632", + "metadata": {}, + "source": [ + "## Build and query the ApertureDB Property graph" + ] + }, { "cell_type": "code", "execution_count": null, @@ -39,6 +101,14 @@ "from llama_index.core import Settings" ] }, + { + "cell_type": "markdown", + "id": "2e210506", + "metadata": {}, + "source": [ + "### Define the Schema to guide the LLM" + ] + }, { "cell_type": "code", "execution_count": null, @@ -46,17 +116,25 @@ "metadata": {}, "outputs": [], "source": [ - "entities = Literal[\"MOVIE\", \"GENRE\", \"KEYWORD\", \"PROFESSIONAL\"]\n", - "relations = Literal[\"CAST\", \"CREW\", \"HAS_GENRE\", \"HAS_KEYWORD\"]\n", + "entities = Literal[\"Movie\", \"Genre\", \"Keyword\", \"Professional\"]\n", + "relations = Literal[\"HasCast\", \"HasCrew\", \"HasGenre\", \"HasKeyword\"]\n", "\n", "validation_schema = {\n", - " \"PERSON\": [\"HAS\", \"PART_OF\", \"WORKED_ON\", \"WORKED_WITH\", \"WORKED_AT\"],\n", - " \"PLACE\": [\"HAS\", \"PART_OF\", \"WORKED_AT\"],\n", - " \"ORGANIZATION\": [\"HAS\", \"PART_OF\", \"WORKED_WITH\"],\n", - " \"CONCEPT\": [\"HAS\", \"PART_OF\", \"WORKED_ON\", \"WORKED_WITH\"],\n", + " \"Person\": [\"HasCast\", \"HasCrew\", \"HasGenre\", \"HasKeyword\"],\n", + " \"Movie\": [\"HasCast\", \"HasCrew\", \"HasGenre\", \"HasKeyword\"],\n", + " \"Genre\": [\"HasCast\", \"HasCrew\", \"HasGenre\", \"HasKeyword\"],\n", + " \"Keyword\": [\"HasCast\", \"HasCrew\", \"HasGenre\", \"HasKeyword\"],\n", "}\n" ] }, + { + "cell_type": "markdown", + "id": "2db6d5a1", + "metadata": {}, + "source": [ + "### 
Initialize the Property Graph Index, with an instance of SchemaLLMPathExtractor" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb index c98552c..ec6a618 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb @@ -10,6 +10,14 @@ "%pip install jaal" ] }, + { + "cell_type": "markdown", + "id": "1dd211a8", + "metadata": {}, + "source": [ + "## Import the requisites" + ] + }, { "cell_type": "code", "execution_count": null, @@ -27,6 +35,17 @@ "from aperturedb.Utils import Utils\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "45180309", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsIkk4ZFBLT2tLZXlWTmx1OG5lWllxcTFwaGRMOU5VYmZuN1N4Il0=\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -83,6 +102,16 @@ " 296098]" ] }, + { + "cell_type": "markdown", + "id": "1051f40d", + "metadata": {}, + "source": [ + "## Define the constraints.\n", + "\n", + "The following aperturedb query restricts the Entities to be comprised of the above movie Ids." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -93,7 +122,7 @@ "q = [\n", " {\n", " \"FindEntity\": {\n", - " \"with_class\": \"MOVIE\",\n", + " \"with_class\": \"Movie\",\n", " \"constraints\": {\n", " \"movie_id\": [\"in\", movie_ids]\n", " },\n", @@ -105,7 +134,7 @@ " },\n", " {\n", " \"FindEntity\": {\n", - " \"with_class\": \"PROFESSIONAL\",\n", + " \"with_class\": \"Professional\",\n", " \"is_connected_to\": {\n", " \"ref\": 1,\n", " },\n", @@ -117,7 +146,7 @@ " },\n", " {\n", " \"FindEntity\": {\n", - " \"with_class\": \"GENRE\",\n", + " \"with_class\": \"Genre\",\n", " \"is_connected_to\": {\n", " \"ref\": 1,\n", " },\n", @@ -130,7 +159,7 @@ "]\n", "\n", "result, response, b = execute_query(client, q)\n", - "types = [\"PROFESSIONAL\", \"GENRE\"]\n", + "types = [\"Professional\", \"Genre\"]\n", "nodes = []\n", "nodeset = set()\n", "edges = set()\n", @@ -153,7 +182,6 @@ " \"label\": e['name'],\n", " \"gender\": e['gender'] if 'gender' in e else None,\n", " \"budget\": 10000,\n", - " # \"type\": \"PROFESSIONAL\" if 'job' in e else \"GENRE\"\n", " \"type\": e['label']\n", " })\n", " nodeset.add(e['_uniqueid'])\n", @@ -163,6 +191,14 @@ "nodes_df" ] }, + { + "cell_type": "markdown", + "id": "cd03b13c", + "metadata": {}, + "source": [ + "## Convert the ApertureDB responses into the data format for Jaal" + ] + }, { "cell_type": "code", "execution_count": null, @@ -242,6 +278,14 @@ "edge_df" ] }, + { + "cell_type": "markdown", + "id": "0fe5b122", + "metadata": {}, + "source": [ + "## Run the visualization tool" + ] + }, { "cell_type": "code", "execution_count": null, From d2a99d16ce98b893f970c4137884ba36cae00f76 Mon Sep 17 00:00:00 2001 From: Gautam Saluja Date: Fri, 21 Nov 2025 00:30:14 +0530 Subject: [PATCH 3/8] Embedding search on TMDB data --- .../movie_db/tmdb_vector_search.ipynb | 466 ++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100644 apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb diff --git 
a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb new file mode 100644 index 0000000..0cf72ff --- /dev/null +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb @@ -0,0 +1,466 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "62ad0fb2-b13c-43a4-9332-3c942532d571", + "metadata": {}, + "source": [ + "# TMDB Semantic Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d81d8e0-729a-4eb3-bf1d-f0d9bbd90f83", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "import os\n", + "import clip\n", + "import torch\n", + "from aperturedb.CommonLibrary import (\n", + " create_connector\n", + ")\n", + "from aperturedb.Utils import Utils\n", + "\n", + "\n", + "descriptor_set = \"ViT-B/16\"\n", + "\n", + "# Choose the model to be used.\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "clip.available_models()\n", + "\n", + "# We change the descriptor set here since we are looking for images in the CLIP descriptor set\n", + "model, preprocess = clip.load(descriptor_set, device=device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cc9590e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsInBuZmVKdnR5cXVwSDdsZ1k4RE5pOVEzZWhUV3kxYTAybXB0Il0=\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1f0eb74", + "metadata": {}, + "outputs": [], + "source": [ + "from aperturedb.Constraints import Constraints\n", + "from aperturedb.Connector import Connector\n", + "import json\n", + "from aperturedb.Images import Images\n", + "\n", + "def find_movie_posters(db: Connector,\n", + " display_input:bool = True,\n", + " constraints: Constraints = None,\n", + " search_set_name: str = None,\n", + " embedding: bytes = 
None,\n", + " k_neighbors: int = 10,\n", + " output_limit:int = 10,\n", + " log_raw_output:bool = False) -> Images:\n", + " \"\"\"\n", + " Find similar images to the input embedding.\n", + " \"\"\"\n", + "\n", + " #First implementation for images search.\n", + " find_image_command = {\n", + " # Retrieve the images associated with the results above.\n", + " \"FindImage\": {\n", + " # Find images connected to the descriptors returned above.\n", + " \"is_connected_to\": {\n", + " \"ref\": 1,\n", + " },\n", + " \"group_by_source\": True, # Group the results by the source descriptor.\n", + " \"results\": {\n", + " \"list\": [\"_uniqueid\"],\n", + " \"limit\": k_neighbors\n", + " }\n", + " }\n", + " }\n", + " if constraints is not None:\n", + " print(constraints.constraints)\n", + " find_image_command[\"FindImage\"][\"constraints\"] = constraints.constraints\n", + " find_image_command[\"FindImage\"][\"results\"][\"list\"].extend(list(constraints.constraints.keys()))\n", + "\n", + " # This ApertureDB query finds images that are similary to the query image.\n", + " # Read about the [FindDescriptor](https://docs.aperturedata.io/query_language/Reference/descriptor_commands/desc_commands/FindDescriptor) command.\n", + " q = [{\n", + " # Find descriptors similar to the input descriptor.\n", + " \"FindDescriptor\": {\n", + " \"set\": search_set_name,\n", + " \"k_neighbors\": 100,\n", + " \"_ref\": 1,\n", + " \"distances\": True,\n", + " \"results\": {\n", + " \"all_properties\": True,\n", + " },\n", + " \"constraints\": {\n", + " \"source\": [\"==\", \"image\"]\n", + " }\n", + " }\n", + " }, find_image_command ]\n", + "\n", + " # Run the query.\n", + " # As additional input, include the descriptor data generated from our query image above.\n", + " responses, images = db.query(q, [embedding])\n", + " # assert len(descriptors) == len(images), f\"The number of descriptors and images should be the same {responses}\"\n", + " print(f\"{responses=}\")\n", + "\n", + " ordered_images = 
[]\n", + " # Compose an ordered response of the images\n", + " descriptors = responses[0]['FindDescriptor']['entities']\n", + " images = responses[1]['FindImage']['entities']\n", + " for descriptor in descriptors:\n", + " desc_id = descriptor['_uniqueid']\n", + " if desc_id in images and len(images[desc_id]) > 0:\n", + " ordered_images.append(\n", + " {**images[desc_id][0]}\n", + " )\n", + " if len(ordered_images) >= output_limit:\n", + " break\n", + "\n", + " if log_raw_output:\n", + " print(f\"{json.dumps(responses, indent=2)}\")\n", + "\n", + " imgs = Images(db, response=ordered_images)\n", + " return imgs\n", + "\n", + "def find_movies(db: Connector,\n", + " display_input:bool = True,\n", + " movie_constraints: Constraints = None,\n", + " search_set_name: str = None,\n", + " embedding: bytes = None,\n", + " k_neighbors: int = 10,\n", + " output_limit:int = 10,\n", + " log_raw_output:bool = False) -> Images:\n", + "\n", + "\n", + " \"\"\"\n", + " Find similar images to the input embedding.\n", + " \"\"\"\n", + "\n", + " find_descriptor_image_command = {\n", + " # Find descriptors similar to the input descriptor.\n", + " \"FindDescriptor\": {\n", + " \"set\": search_set_name,\n", + " \"k_neighbors\": 10000,\n", + " \"_ref\": 1,\n", + " \"results\": {\n", + " \"list\": [\"_uniqueid\"],\n", + " },\n", + " \"constraints\": {\n", + " \"source\": [\"==\", \"image\"]\n", + " }\n", + " }\n", + " }\n", + "\n", + " #First implementation for images search.\n", + " find_image_command = {\n", + " # Retrieve the images associated with the results above.\n", + " \"FindImage\": {\n", + " # Find images connected to the descriptors returned above.\n", + " \"is_connected_to\": {\n", + " \"ref\": 1,\n", + " },\n", + " \"group_by_source\": True, # Group the results by the source descriptor.\n", + " \"results\": {\n", + " \"list\": [\"_uniqueid\", \"title\"],\n", + " \"limit\": k_neighbors\n", + " }\n", + " }\n", + " }\n", + "\n", + " find_descriptor_title_command = {\n", + " # Find 
descriptors similar to the input descriptor.\n", + " \"FindDescriptor\": {\n", + " \"set\": search_set_name,\n", + " \"k_neighbors\": 10000,\n", + " \"_ref\": 2,\n", + " \"results\": {\n", + " \"list\": [\"_uniqueid\"],\n", + " },\n", + " \"constraints\": {\n", + " \"source\": [\"==\", \"tagline\"]\n", + " }\n", + " }\n", + " }\n", + "\n", + " find_descriptor_tagline_command = {\n", + " # Find descriptors similar to the input descriptor.\n", + " \"FindDescriptor\": {\n", + " \"set\": search_set_name,\n", + " \"k_neighbors\": 10000,\n", + " \"_ref\": 3,\n", + " \"results\": {\n", + " \"list\": [\"_uniqueid\"],\n", + " },\n", + " \"constraints\": {\n", + " \"source\": [\"==\", \"tagline\"]\n", + " }\n", + " }\n", + " }\n", + "\n", + " #First implementation for images search.\n", + " find_movie_command_title = {\n", + " # Retrieve the images associated with the results above.\n", + " \"FindEntity\": {\n", + " \"with_class\": \"Movie\",\n", + " # Find images connected to the descriptors returned above.\n", + " \"is_connected_to\": {\n", + " \"ref\": 2,\n", + " },\n", + " \"group_by_source\": True, # Group the results by the source descriptor.\n", + " \"results\": {\n", + " \"list\": [\"title\", \"tagline\", \"popularity\", \"vote_average\"],\n", + " \"limit\": k_neighbors\n", + " }\n", + " }\n", + " }\n", + " if movie_constraints is not None:\n", + " print(movie_constraints.constraints)\n", + " find_movie_command_title[\"FindEntity\"][\"constraints\"] = movie_constraints.constraints\n", + "\n", + "\n", + " #First implementation for images search.\n", + " find_movie_command_tagline = {\n", + " # Retrieve the images associated with the results above.\n", + " \"FindEntity\": {\n", + " \"with_class\": \"Movie\",\n", + " # Find images connected to the descriptors returned above.\n", + " \"is_connected_to\": {\n", + " \"ref\": 3,\n", + " },\n", + " \"group_by_source\": True, # Group the results by the source descriptor.\n", + " \"results\": {\n", + " \"list\": [\"title\", 
\"tagline\", \"popularity\", \"vote_average\"],\n", + " \"limit\": k_neighbors\n", + " }\n", + " }\n", + " }\n", + " if movie_constraints is not None:\n", + " print(movie_constraints.constraints)\n", + " find_movie_command_tagline[\"FindEntity\"][\"constraints\"] = movie_constraints.constraints\n", + "\n", + " # This ApertureDB query finds images that are similary to the query image.\n", + " # Read about the [FindDescriptor](https://docs.aperturedata.io/query_language/Reference/descriptor_commands/desc_commands/FindDescriptor) command.\n", + " q = [find_descriptor_image_command, find_image_command,\n", + " find_descriptor_title_command, find_movie_command_title,\n", + " find_descriptor_tagline_command, find_movie_command_tagline]\n", + "\n", + "\n", + " #print(q)\n", + "\n", + " # Run the query.\n", + " # As additional input, include the descriptor data generated from our query image above.\n", + " responses, images = db.query(q, [embedding, embedding, embedding])\n", + " #db.print_last_response()\n", + "\n", + " ordered_images = []\n", + " # Compose an ordered response of the images\n", + " descriptors = responses[0]['FindDescriptor']['entities']\n", + " images = responses[1]['FindImage']['entities']\n", + " for descriptor in descriptors:\n", + " desc_id = descriptor['_uniqueid']\n", + " if desc_id in images and len(images[desc_id]) > 0:\n", + " ordered_images.append(\n", + " {**images[desc_id][0]}\n", + " )\n", + " if len(ordered_images) >= output_limit:\n", + " break\n", + "\n", + " print(\"##### Matches based on titles ####\\n\")\n", + " descriptors = responses[2]['FindDescriptor']['entities']\n", + " movies = responses[3]['FindEntity']['entities']\n", + " prop_list = []\n", + " for descriptor in descriptors:\n", + " desc_id = descriptor['_uniqueid']\n", + " if desc_id in movies and len(movies[desc_id]) > 0:\n", + " prop_list.append(movies[desc_id][0])\n", + " display(pd.json_normalize(prop_list))\n", + "\n", + " if log_raw_output:\n", + " 
print(f\"{json.dumps(responses, indent=2)}\")\n", + "\n", + " print(\"\\n##### Matches based on taglines #### \")\n", + " descriptors = responses[4]['FindDescriptor']['entities']\n", + " movies = responses[5]['FindEntity']['entities']\n", + " prop_list = []\n", + " for descriptor in descriptors:\n", + " desc_id = descriptor['_uniqueid']\n", + " if desc_id in movies and len(movies[desc_id]) > 0:\n", + " prop_list.append(movies[desc_id][0])\n", + "\n", + " display(pd.json_normalize(prop_list))\n", + "\n", + " if log_raw_output:\n", + " print(f\"{json.dumps(responses, indent=2)}\")\n", + "\n", + " print(\"##### Matches based on images #### \")\n", + " imgs = Images(db, response=ordered_images)\n", + " return imgs\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8c70391-5723-48e3-8ae2-bb126687406e", + "metadata": {}, + "outputs": [], + "source": [ + "client=create_connector()\n", + "utils = Utils(client)\n", + "utils.summary()" + ] + }, + { + "cell_type": "markdown", + "id": "84b2bbff-5247-494e-9084-e2da22b510e5", + "metadata": {}, + "source": [ + "## Find movie posters by multimodal search of images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeccc5e1-8e48-4e27-86b5-5f2841af8d69", + "metadata": {}, + "outputs": [], + "source": [ + "# Prompt for free language input\n", + "inp = input(\"Enter a search term as described above: \")\n", + "# action scenes in superhero movies\n", + "# blue aliens\n", + "\n", + "search_tokens = clip.tokenize([f\"a photo of {inp}\"]).to(device)\n", + "search_embeddings = model.encode_text(search_tokens)\n", + "\n", + "if device == \"cuda\":\n", + " search_embeddings = search_embeddings.float()\n", + " blobs = search_embeddings[0].cpu().detach().numpy().tobytes()\n", + "else:\n", + " blobs = search_embeddings[0].detach().numpy().tobytes()\n", + "\n", + "imgs = find_movie_posters(client, embedding=blobs, k_neighbors=1000, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", + "\n", 
+ "slider, table = imgs.inspect()\n", + "display(slider, table)" + ] + }, + { + "cell_type": "markdown", + "id": "f391711e-7a43-41e0-909c-00043d00fbec", + "metadata": {}, + "source": [ + "## Find movies and their posters by search in title, tagline, images (multimodal RAG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6a72b6f-c895-4a6d-90e3-50fdb1356c6a", + "metadata": {}, + "outputs": [], + "source": [ + "# Prompt for free language input\n", + "inp = input(\"Enter a search term as described above\")\n", + "# action scenes in superhero movies\n", + "# blue aliens\n", + "\n", + "search_tokens = clip.tokenize([f\"a photo of {inp}\"]).to(device)\n", + "search_embeddings = model.encode_text(search_tokens)\n", + "\n", + "if device == \"cuda\":\n", + " search_embeddings = search_embeddings.float()\n", + " blobs = search_embeddings[0].cpu().detach().numpy().tobytes()\n", + "else:\n", + " blobs = search_embeddings[0].detach().numpy().tobytes()\n", + "\n", + "imgs = find_movies(client, embedding=blobs, k_neighbors=10, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", + "\n", + "slider, table = imgs.inspect()\n", + "display(slider, table)" + ] + }, + { + "cell_type": "markdown", + "id": "0faa5a22-8376-4744-bac9-2e318be1cdcc", + "metadata": {}, + "source": [ + "## Vector search with additional metadata constraints (vector + graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd3d9e5b-257d-4715-a21a-6cbdac34753f", + "metadata": {}, + "outputs": [], + "source": [ + "from aperturedb.Constraints import Constraints\n", + "\n", + "# Prompt for free language input\n", + "inp = input(\"Enter a search term as described above: \")\n", + "# action scenes in superhero movies\n", + "# blue aliens\n", + "\n", + "const = Constraints()\n", + "\n", + "# We not only want to filter using some metadata key,value property here,\n", + "# but want to retrieve the corresponding image in one query\n", + "const.greater(\"popularity\", 
20)\n", + "\n", + "search_tokens = clip.tokenize([f\"a photo of {inp}\"]).to(device)\n", + "search_embeddings = model.encode_text(search_tokens)\n", + "\n", + "if device == \"cuda\":\n", + " search_embeddings = search_embeddings.float()\n", + " blobs = search_embeddings[0].cpu().detach().numpy().tobytes()\n", + "else:\n", + " blobs = search_embeddings[0].detach().numpy().tobytes()\n", + "\n", + "imgs = helper.find_movies(client, movie_constraints=const, embedding=blobs, k_neighbors=10, output_limit=10, search_set_name=descriptor_set)\n", + "\n", + "slider, table = imgs.inspect()\n", + "display(slider, table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcb35a98-1f16-4363-98e3-e61d36bbed29", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "package", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5400703a3477ac14bf8689be589e8b9faa268a48 Mon Sep 17 00:00:00 2001 From: Gautam Saluja Date: Fri, 21 Nov 2025 00:32:32 +0530 Subject: [PATCH 4/8] Cleanup --- .../app/notebooks/movie_db/tmdb_queries.ipynb | 10 --------- .../notebooks/movie_db/tmdb_queries_nl.ipynb | 11 ---------- .../movie_db/tmdb_vector_search.ipynb | 22 ++----------------- .../notebooks/movie_db/tmdb_visualize.ipynb | 11 ---------- 4 files changed, 2 insertions(+), 52 deletions(-) diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb index 36e059a..f1abba7 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb @@ -20,16 +20,6 @@ "%pip install --quiet 
mlcroissant pandas dotenv" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsInBuZmVKdnR5cXVwSDdsZ1k4RE5pOVEzZWhUV3kxYTAybXB0Il0=\"" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb index 121154f..a8ad2d6 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries_nl.ipynb @@ -21,17 +21,6 @@ "%pip install --upgrade git+https://github.com/aperture-data/llama_index@kg_fixes#subdirectory=llama-index-integrations/graph_stores/llama-index-graph-stores-ApertureDB" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4a708e1", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsInBuZmVKdnR5cXVwSDdsZ1k4RE5pOVEzZWhUV3kxYTAybXB0Il0=\"" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb index 0cf72ff..4773bab 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb @@ -35,17 +35,6 @@ "model, preprocess = clip.load(descriptor_set, device=device)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cc9590e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsInBuZmVKdnR5cXVwSDdsZ1k4RE5pOVEzZWhUV3kxYTAybXB0Il0=\"" - ] - }, { "cell_type": "code", "execution_count": 
null, @@ -57,6 +46,7 @@ "from aperturedb.Connector import Connector\n", "import json\n", "from aperturedb.Images import Images\n", + "import pandas as pd\n", "\n", "def find_movie_posters(db: Connector,\n", " display_input:bool = True,\n", @@ -427,19 +417,11 @@ "else:\n", " blobs = search_embeddings[0].detach().numpy().tobytes()\n", "\n", - "imgs = helper.find_movies(client, movie_constraints=const, embedding=blobs, k_neighbors=10, output_limit=10, search_set_name=descriptor_set)\n", + "imgs = find_movies(client, movie_constraints=const, embedding=blobs, k_neighbors=10, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", "\n", "slider, table = imgs.inspect()\n", "display(slider, table)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fcb35a98-1f16-4363-98e3-e61d36bbed29", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb index ec6a618..8134814 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_visualize.ipynb @@ -35,17 +35,6 @@ "from aperturedb.Utils import Utils\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "45180309", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsIkk4ZFBLT2tLZXlWTmx1OG5lWllxcTFwaGRMOU5VYmZuN1N4Il0=\"" - ] - }, { "cell_type": "code", "execution_count": null, From 3c0eea3642dae500df81609df4d7ef185f3d0cdb Mon Sep 17 00:00:00 2001 From: Gautam Date: Thu, 4 Dec 2025 16:40:05 -0500 Subject: [PATCH 5/8] Fix queries in vector search notebooks. 
--- .../movie_db/tmdb_vector_search.ipynb | 213 ++++++++---------- 1 file changed, 95 insertions(+), 118 deletions(-) diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb index 4773bab..482db55 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb @@ -5,7 +5,30 @@ "id": "62ad0fb2-b13c-43a4-9332-3c942532d571", "metadata": {}, "source": [ - "# TMDB Semantic Search" + "# TMDB Semantic Search\n", + "\n", + "This notebook will work on an instance of ApertureDB, which can be on the [cloud](https://cloud.aperturedata.io), or running as a [local docker container(s)](https://docs.aperturedata.io/Setup/server/Local)\n", + "\n", + "The dataset is hosted on kaggle, and available via a mlcroissant link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c1f4c9c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsIkk4ZFBLT2tLZXlWTmx1OG5lWllxcTFwaGRMOU5VYmZuN1N4Il0=\"" + ] + }, + { + "cell_type": "markdown", + "id": "7186842f", + "metadata": {}, + "source": [ + "## Load CLIP" + ] + }, { "cell_type": "code", "execution_count": null, @@ -35,6 +58,19 @@ "model, preprocess = clip.load(descriptor_set, device=device)" + ] + }, + { + "cell_type": "markdown", + "id": "59e256bb", + "metadata": {}, + "source": [ + "## Define 2 helper functions find_movie_posters, and find_movies.\n", + "\n", + "In the dataset we ingested, from the [dataset ingestion movies workflow](https://github.com/aperture-data/workflows/tree/main/apps/dataset-ingestion-movies), the schema has a movie connected to multiple professionals associated with the cast, and a poster image. 
In addition to these entities, there are embeddings generated on tagline property of the movie and the poster image of the movie.\n", + "\n", + "- *find_movie_posters* : returns images that semantically match the input terms.\n", + "- *find_movies*: returns movies where the tagline semantically matches the input terms." + ] + }, { "cell_type": "code", "execution_count": null, @@ -64,6 +100,7 @@ " find_image_command = {\n", " # Retrieve the images associated with the results above.\n", " \"FindImage\": {\n", + " \"_ref\": 2,\n", " # Find images connected to the descriptors returned above.\n", " \"is_connected_to\": {\n", " \"ref\": 1,\n", @@ -75,10 +112,23 @@ " }\n", " }\n", " }\n", + "\n", + " find_movie_command = {\n", + " \"FindEntity\": {\n", + " \"with_class\": \"Movie\",\n", + " \"group_by_source\": True,\n", + " \"is_connected_to\": {\n", + " \"ref\": 2,\n", + " },\n", + " \"results\": {\n", + " \"list\": [\"_uniqueid\", \"title\", \"tagline\", \"popularity\", \"vote_average\"],\n", + " \"limit\": k_neighbors\n", + " }\n", + " }\n", + " }\n", " if constraints is not None:\n", " print(constraints.constraints)\n", - " find_image_command[\"FindImage\"][\"constraints\"] = constraints.constraints\n", - " find_image_command[\"FindImage\"][\"results\"][\"list\"].extend(list(constraints.constraints.keys()))\n", + " find_movie_command[\"FindEntity\"][\"constraints\"] = constraints.constraints\n", "\n", " # This ApertureDB query finds images that are similary to the query image.\n", " # Read about the [FindDescriptor](https://docs.aperturedata.io/query_language/Reference/descriptor_commands/desc_commands/FindDescriptor) command.\n", @@ -86,7 +136,7 @@ " # Find descriptors similar to the input descriptor.\n", " \"FindDescriptor\": {\n", " \"set\": search_set_name,\n", - " \"k_neighbors\": 100,\n", + " \"k_neighbors\": k_neighbors,\n", " \"_ref\": 1,\n", " \"distances\": True,\n", " \"results\": {\n", @@ -96,27 +146,32 @@ " \"source\": [\"==\", \"image\"]\n", " }\n", " 
}\n", - " }, find_image_command ]\n", + " }, find_image_command, find_movie_command ]\n", "\n", " # Run the query.\n", " # As additional input, include the descriptor data generated from our query image above.\n", " responses, images = db.query(q, [embedding])\n", - " # assert len(descriptors) == len(images), f\"The number of descriptors and images should be the same {responses}\"\n", " print(f\"{responses=}\")\n", "\n", " ordered_images = []\n", - " # Compose an ordered response of the images\n", + " # Compose an ordered response of the images and movies, where a match is found between the image and the movie.\n", " descriptors = responses[0]['FindDescriptor']['entities']\n", " images = responses[1]['FindImage']['entities']\n", + " movies = responses[2]['FindEntity']['entities']\n", " for descriptor in descriptors:\n", " desc_id = descriptor['_uniqueid']\n", - " if desc_id in images and len(images[desc_id]) > 0:\n", + " image_id = images[desc_id][0]['_uniqueid']\n", + " if desc_id in images and len(images[desc_id]) > 0 and len(movies[image_id]) > 0:\n", " ordered_images.append(\n", - " {**images[desc_id][0]}\n", + " # Merge the movie and image properties\n", + " # Imp : The image properties will override the movie properties if they have the same key\n", + " movies[image_id][0] | images[desc_id][0]\n", " )\n", " if len(ordered_images) >= output_limit:\n", " break\n", "\n", + " print(f\"{ordered_images=}\")\n", + "\n", " if log_raw_output:\n", " print(f\"{json.dumps(responses, indent=2)}\")\n", "\n", @@ -137,57 +192,12 @@ " Find similar images to the input embedding.\n", " \"\"\"\n", "\n", - " find_descriptor_image_command = {\n", - " # Find descriptors similar to the input descriptor.\n", - " \"FindDescriptor\": {\n", - " \"set\": search_set_name,\n", - " \"k_neighbors\": 10000,\n", - " \"_ref\": 1,\n", - " \"results\": {\n", - " \"list\": [\"_uniqueid\"],\n", - " },\n", - " \"constraints\": {\n", - " \"source\": [\"==\", \"image\"]\n", - " }\n", - " }\n", - " 
}\n", - "\n", - " #First implementation for images search.\n", - " find_image_command = {\n", - " # Retrieve the images associated with the results above.\n", - " \"FindImage\": {\n", - " # Find images connected to the descriptors returned above.\n", - " \"is_connected_to\": {\n", - " \"ref\": 1,\n", - " },\n", - " \"group_by_source\": True, # Group the results by the source descriptor.\n", - " \"results\": {\n", - " \"list\": [\"_uniqueid\", \"title\"],\n", - " \"limit\": k_neighbors\n", - " }\n", - " }\n", - " }\n", - "\n", - " find_descriptor_title_command = {\n", - " # Find descriptors similar to the input descriptor.\n", - " \"FindDescriptor\": {\n", - " \"set\": search_set_name,\n", - " \"k_neighbors\": 10000,\n", - " \"_ref\": 2,\n", - " \"results\": {\n", - " \"list\": [\"_uniqueid\"],\n", - " },\n", - " \"constraints\": {\n", - " \"source\": [\"==\", \"tagline\"]\n", - " }\n", - " }\n", - " }\n", "\n", " find_descriptor_tagline_command = {\n", " # Find descriptors similar to the input descriptor.\n", " \"FindDescriptor\": {\n", " \"set\": search_set_name,\n", - " \"k_neighbors\": 10000,\n", + " \"k_neighbors\": k_neighbors,\n", " \"_ref\": 3,\n", " \"results\": {\n", " \"list\": [\"_uniqueid\"],\n", @@ -198,28 +208,7 @@ " }\n", " }\n", "\n", - " #First implementation for images search.\n", - " find_movie_command_title = {\n", - " # Retrieve the images associated with the results above.\n", - " \"FindEntity\": {\n", - " \"with_class\": \"Movie\",\n", - " # Find images connected to the descriptors returned above.\n", - " \"is_connected_to\": {\n", - " \"ref\": 2,\n", - " },\n", - " \"group_by_source\": True, # Group the results by the source descriptor.\n", - " \"results\": {\n", - " \"list\": [\"title\", \"tagline\", \"popularity\", \"vote_average\"],\n", - " \"limit\": k_neighbors\n", - " }\n", - " }\n", - " }\n", - " if movie_constraints is not None:\n", - " print(movie_constraints.constraints)\n", - " 
find_movie_command_title[\"FindEntity\"][\"constraints\"] = movie_constraints.constraints\n", "\n", - "\n", - " #First implementation for images search.\n", " find_movie_command_tagline = {\n", " # Retrieve the images associated with the results above.\n", " \"FindEntity\": {\n", @@ -241,61 +230,42 @@ "\n", " # This ApertureDB query finds images that are similary to the query image.\n", " # Read about the [FindDescriptor](https://docs.aperturedata.io/query_language/Reference/descriptor_commands/desc_commands/FindDescriptor) command.\n", - " q = [find_descriptor_image_command, find_image_command,\n", - " find_descriptor_title_command, find_movie_command_title,\n", - " find_descriptor_tagline_command, find_movie_command_tagline]\n", - "\n", - "\n", - " #print(q)\n", + " q = [\n", + " find_descriptor_tagline_command, find_movie_command_tagline\n", + " ]\n", "\n", - " # Run the query.\n", - " # As additional input, include the descriptor data generated from our query image above.\n", - " responses, images = db.query(q, [embedding, embedding, embedding])\n", - " #db.print_last_response()\n", "\n", - " ordered_images = []\n", - " # Compose an ordered response of the images\n", - " descriptors = responses[0]['FindDescriptor']['entities']\n", - " images = responses[1]['FindImage']['entities']\n", - " for descriptor in descriptors:\n", - " desc_id = descriptor['_uniqueid']\n", - " if desc_id in images and len(images[desc_id]) > 0:\n", - " ordered_images.append(\n", - " {**images[desc_id][0]}\n", - " )\n", - " if len(ordered_images) >= output_limit:\n", - " break\n", "\n", - " print(\"##### Matches based on titles ####\\n\")\n", - " descriptors = responses[2]['FindDescriptor']['entities']\n", - " movies = responses[3]['FindEntity']['entities']\n", - " prop_list = []\n", - " for descriptor in descriptors:\n", - " desc_id = descriptor['_uniqueid']\n", - " if desc_id in movies and len(movies[desc_id]) > 0:\n", - " prop_list.append(movies[desc_id][0])\n", - " 
display(pd.json_normalize(prop_list))\n", + " responses, images = db.query(q, [embedding])\n", "\n", " if log_raw_output:\n", " print(f\"{json.dumps(responses, indent=2)}\")\n", "\n", " print(\"\\n##### Matches based on taglines #### \")\n", - " descriptors = responses[4]['FindDescriptor']['entities']\n", - " movies = responses[5]['FindEntity']['entities']\n", + " descriptors = responses[0]['FindDescriptor']['entities']\n", + " movies = responses[1]['FindEntity']['entities']\n", " prop_list = []\n", " for descriptor in descriptors:\n", " desc_id = descriptor['_uniqueid']\n", " if desc_id in movies and len(movies[desc_id]) > 0:\n", " prop_list.append(movies[desc_id][0])\n", - "\n", + " if len(prop_list) >= output_limit:\n", + " break\n", " display(pd.json_normalize(prop_list))\n", "\n", " if log_raw_output:\n", " print(f\"{json.dumps(responses, indent=2)}\")\n", "\n", " print(\"##### Matches based on images #### \")\n", - " imgs = Images(db, response=ordered_images)\n", - " return imgs\n" + "\n", + " # Also return the posters for the same embeddings\n", + " return find_movie_posters(\n", + " db,\n", + " embedding=embedding,\n", + " constraints=movie_constraints,\n", + " k_neighbors=k_neighbors,\n", + " output_limit=output_limit,\n", + " search_set_name=search_set_name)\n" ] }, { @@ -315,7 +285,9 @@ "id": "84b2bbff-5247-494e-9084-e2da22b510e5", "metadata": {}, "source": [ - "## Find movie posters by multimodal search of images" + "## Find movie posters by multimodal search of images\n", + "\n", + "Base case. A simple semantic search for movie posters " ] }, { @@ -350,7 +322,9 @@ "id": "f391711e-7a43-41e0-909c-00043d00fbec", "metadata": {}, "source": [ - "## Find movies and their posters by search in title, tagline, images (multimodal RAG)" + "## Find movies and their posters by search in title, tagline, images (multimodal RAG)\n", + "\n", + "We have additional embeddings based on taglines of movies. Show both the results." 
] }, { @@ -365,8 +339,9 @@ "# action scenes in superhero movies\n", "# blue aliens\n", "\n", - "search_tokens = clip.tokenize([f\"a photo of {inp}\"]).to(device)\n", + "search_tokens = clip.tokenize([f\"a photo of {inp}\", f\"a story of {inp}\"]).to(device)\n", "search_embeddings = model.encode_text(search_tokens)\n", + "print(f\"{len(search_embeddings)=}\")\n", "\n", "if device == \"cuda\":\n", " search_embeddings = search_embeddings.float()\n", @@ -374,7 +349,7 @@ "else:\n", " blobs = search_embeddings[0].detach().numpy().tobytes()\n", "\n", - "imgs = find_movies(client, embedding=blobs, k_neighbors=10, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", + "imgs = find_movies(client, embedding=blobs, k_neighbors=1000, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", "\n", "slider, table = imgs.inspect()\n", "display(slider, table)" @@ -385,7 +360,9 @@ "id": "0faa5a22-8376-4744-bac9-2e318be1cdcc", "metadata": {}, "source": [ - "## Vector search with additional metadata constraints (vector + graph)" + "## Vector search with additional metadata constraints (vector + graph)\n", + "\n", + "Only return the movies that have a popularity rating greater than 20." ] }, { @@ -417,7 +394,7 @@ "else:\n", " blobs = search_embeddings[0].detach().numpy().tobytes()\n", "\n", - "imgs = find_movies(client, movie_constraints=const, embedding=blobs, k_neighbors=10, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", + "imgs = find_movies(client, movie_constraints=const, embedding=blobs, k_neighbors=100, output_limit=10, search_set_name=\"wf_embeddings_clip\")\n", "\n", "slider, table = imgs.inspect()\n", "display(slider, table)" From def6f13f572f2346a55a5d8e49629847d1adf4ea Mon Sep 17 00:00:00 2001 From: Gautam Saluja Date: Tue, 2 Dec 2025 11:15:19 -0500 Subject: [PATCH 6/8] Description of notebooks. 
--- apps/jupyterlab/app/notebooks/hello.ipynb | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/apps/jupyterlab/app/notebooks/hello.ipynb b/apps/jupyterlab/app/notebooks/hello.ipynb index 67b0fc3..5d0f333 100644 --- a/apps/jupyterlab/app/notebooks/hello.ipynb +++ b/apps/jupyterlab/app/notebooks/hello.ipynb @@ -32,6 +32,27 @@ "response, _ = client.query([{\"GetStatus\": {}}])\n", "print(response)\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other notebooks\n", + "\n", + "In addition, the other notebooks provided in here depend on data on your instance.\n", + "| Notebook(s) | Data | Remarks | Related Workflow | \n", + "| --- | --- | --- | --- |\n", + "| boundingbox.ipynb | Images with labels of chair or person | Ingesting COCO dataset would have these images | [Dataset Ingestion (COCO)](https://docs.aperturedata.io/workflows/ingest_datasets) |\n", + "| clip.ipynb | Images | Images present in the DB | [Generate Embeddings](https://docs.aperturedata.io/workflows/embeddings_extraction) | \n", + "| croissant.ipynb | Any dataset represented as croissant URL | Source such datasets from kaggle, HF, etc. | [Ingest From a croissant URL](https://docs.aperturedata.io/workflows/ingest_from_croissant) |\n", + "| facedetection.ipynb | Images which have people in them | Ingesting COCO, celebA datasets would get some of those images | [Dataset Ingestion](https://docs.aperturedata.io/workflows/ingest_datasets) | \n", + "| hello.ipynb | None | This NB ensures that connectivity to a DB is up and working | |\n", + "| mcp.ipynb | Any data | This NB has cells to interact with MCP server. 
| [MCP Server](https://docs.aperturedata.io/workflows/mcp_server) |\n", + "| ocr.ipynb | Documents scanned as images | This NB lets you interact with information extracted from text extracted from images | [OCR](https://docs.aperturedata.io/workflows/ocr_extraction) |\n", + "| rag.ipynb | A crawled website with segmentations and embeddings | | [Website Chatbot Workflow](https://docs.aperturedata.io/workflows/crawl_to_rag) |\n", + "| sql.ipynb | Interacts with a postgres server | Can be used with sql server workflow and Aperturedb | [SQL server](https://docs.aperturedata.io/workflows/sql_server) |\n", + "| movie_db | Notebooks to query a knowledge graph | Needs to have tmdb ingested into the Database | [Movie DB]()" + ] } ], "metadata": { From a4b9b997c79b819c74ef9745f8bcd79c9e515074 Mon Sep 17 00:00:00 2001 From: Gautam Saluja Date: Thu, 4 Dec 2025 16:45:45 -0500 Subject: [PATCH 7/8] remove the adb key. --- .../app/notebooks/movie_db/tmdb_vector_search.ipynb | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb index 482db55..e37d730 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_vector_search.ipynb @@ -12,17 +12,6 @@ "The dataset is hosted on kaggle, and available via a mlcroissant link." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c1f4c9c", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"APERTUREDB_KEY\"] = \"WzEsMSwicmFnLXRlc3Qtcmg4ZWNlZmwuZmFybTAwMDQuY2xvdWQuYXBlcnR1cmVkYXRhLmRldiIsIkk4ZFBLT2tLZXlWTmx1OG5lWllxcTFwaGRMOU5VYmZuN1N4Il0=\"" - ] - }, { "cell_type": "markdown", "id": "7186842f", From 8a270562080a9004f74804f73e8b097eac19bb21 Mon Sep 17 00:00:00 2001 From: Gautam Saluja Date: Mon, 22 Dec 2025 18:38:56 -0500 Subject: [PATCH 8/8] a new query. 
--- .../app/notebooks/movie_db/tmdb_queries.ipynb | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb index f1abba7..3a68fb4 100644 --- a/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb +++ b/apps/jupyterlab/app/notebooks/movie_db/tmdb_queries.ipynb @@ -383,6 +383,39 @@ "df = sparql.to_dataframe(results)\n", "display(df)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Who are the most prolific cast members?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "SELECT ?name (COUNT(?name) as ?count) ?gender\n", + "WHERE {\n", + " ?p p:name ?name .\n", + " ?p p:gender ?gender .\n", + " ?m c:HasCast ?p .\n", + " ?m p:title ?title ;\n", + " p:popularity ?pop ;\n", + " p:budget ?budget .\n", + "}\n", + "GROUP BY ?name\n", + "ORDER BY DESC(?count)\n", + "LIMIT 50\n", + "\"\"\"\n", + "\n", + "results = sparql.query(query)\n", + "df = sparql.to_dataframe(results)\n", + "display(df)" + ] + } ], "metadata": {