aperture-data · gsaluja9 · Dec 10, 2025 · Dec 10, 2025
diff --git a/apps/dataset-ingestion-movies/README.md b/apps/dataset-ingestion-movies/README.md
@@ -17,7 +17,7 @@ dataset-ingestion-movies adds all the records from TMDB dataset to the ApertureD
 
 ```mermaid
 erDiagram
-    MOVIE {
+    Movie {
         string id
         string movie_id
         string title
@@ -30,35 +30,53 @@ erDiagram
         string dataset_name
         label movie
     }
-    PROFESSIONAL {
+    Professional {
         string name
         int gender
         string dataset_name
     }
-    KEYWORD {
-
+    ProductionCompany {
+        string dataset_name
+    }
+    SpokenLanguage {
+        string dataset_name
+    }
+    Genre {
+        string dataset_name
+    }
+    Keyword {
+        string dataset_name
     }
-    MOVIE }o--o{ PROFESSIONAL : HAS_CAST
-    MOVIE }o--o{ PROFESSIONAL : HAS_CREW
-    MOVIE }o--o{ GENRE : HAS_GENRE
-    MOVIE }o--o{ SPOKEN_LANGUAGE : HAS_SPOKEN_LANGUAGE
-    MOVIE }o--o{ KEYWORD : HAS_KEYWORD
-    MOVIE }o--o{ PRODUCTION_COMPANY : HAS_PRODUCTION_COMPANY
-    MOVIE ||--|| TAGLINE_EMBEDDING : HAS_TAGLINE_EMBEDDING
-    MOVIE |o--|| POSTER: HAS_POSTER
+    Descriptor {
+        string source
+    }
+    Image {
+        string dataset_name
+    }
+    Movie }o--o{ Professional : HasCast
+    Movie }o--o{ Professional : HasCrew
+    Movie }o--o{ Genre : HasGenre
+    Movie }o--o{ SpokenLanguage : HasSpokenLanguage
+    Movie }o--o{ Keyword : HasKeyword
+    Movie }o--o{ ProductionCompany : HasProductionCompany
+    Movie ||--|| Descriptor : HasTaglineEmbedding
+    Movie |o--|| Image: HasPoster
+    Image |o--|| Descriptor: HasPosterEmbedding
 
 ```
 
 
 
 After a successful ingestion, the following types of objects are typically added to ApertureDB:
 
-- **MOVIE**
-- **PROFESSIONAL**: Crew and Cast associated with the movie
-- **KEYWORD**: Each data item (e.g., row, record) is stored as an entity.
+- **Movie**
+- **Professional**: Crew and Cast associated with the movie
+- **Keyword**: Each data item (e.g., row, record) is stored as an entity.
 - **Image**: Posters for some of the movies.
-- **SPOKEN_LANGUAGE**
-- **GENRE**
+- **SpokenLanguage**
+- **Genre**
+- **ProductionCompany**
+
 
 
 
@@ -72,7 +90,12 @@ docker run \
            aperturedata/workflows-dataset-ingestion-movies
 ```
 
-How dataset ingestion demos work:
+Parameters:
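+* **`INGEST_POSTERS`**: Add poster images and their embeddings to the database. Defaults to `true`; any other value disables it.
+* **`EMBED_TAGLINE`**: Add embeddings for the tagline text to the database. Defaults to `true`; any other value disables it.
+* **`SAMPLE_COUNT`**: Number of movies to ingest. Defaults to -1 (all); any value of 0 or less ingests all movies.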
+
+How dataset ingestion (movies) works:
 
 1. **Cleanup**: Removes all objects that have a property called dataset_name, and it's value as 'tmdb_5000'.
 2. **Ingestion**: It changes the flat records from the croissant url of the dbs and stores it in property graph.
@@ -83,7 +106,7 @@ How dataset ingestion demos work:
 
 ## Cleaning up
 
-Executing the [query](https://github.com/aperture-data/workflows/blob/main/apps/ingest-croissant/app/delete_dataset_by_url.json) against the instance of ApertureDB will selectively clean the DB of the ingested Croissant dataset, if the constraint is specified in selection of the DatasetModel Entity. Here's an example:
+Executing the following query against the ApertureDB instance selectively deletes all the objects that this workflow added to the database.
 
 ```json
 [
@@ -94,6 +117,13 @@ Executing the [query](https://github.com/aperture-data/workflows/blob/main/apps/
             }
         }
     },
+    {
+        "DeleteImage": {
+            "constraints": {
+                "dataset_name": ["==", "tmdb_5000"]
+            }
+        }
+    },
     {
         "DeleteDescriptorSet": {
             "constraints": {

diff --git a/apps/dataset-ingestion-movies/app/app.sh b/apps/dataset-ingestion-movies/app/app.sh
@@ -1,6 +1,21 @@
 #!/bin/bash
 set -e
 
-python ingest_movies.py --ingest-posters --embed-tagline
+INGEST_POSTERS=${INGEST_POSTERS:-true}
+EMBED_TAGLINE=${EMBED_TAGLINE:-true}
+SAMPLE_COUNT=${SAMPLE_COUNT:--1}
+
+INGEST_POSTERS_COMMAND="--no-ingest-posters"
+INGEST_TAGLINE_COMMAND="--no-embed-tagline"
+
+if [ "$INGEST_POSTERS" = "true" ]; then
+    INGEST_POSTERS_COMMAND="--ingest-posters"
+fi
+
+if [ "$EMBED_TAGLINE" = "true" ]; then
+    INGEST_TAGLINE_COMMAND="--embed-tagline"
+fi
+
+python ingest_movies.py $INGEST_POSTERS_COMMAND $INGEST_TAGLINE_COMMAND --sample-count $SAMPLE_COUNT
 
 adb utils execute summary
diff --git a/apps/dataset-ingestion-movies/app/ingest_movies.py b/apps/dataset-ingestion-movies/app/ingest_movies.py
@@ -42,6 +42,13 @@ def cleanup_movies(db):
             }
         }
     },
+    {
+        "DeleteImage": {
+            "constraints": {
+                "dataset_name": ["==", DATASET_NAME]
+            }
+        }
+    },
     {
         "DeleteDescriptorSet": {
             "constraints": {
@@ -53,7 +60,7 @@ def cleanup_movies(db):
     execute_query(db, query=query)
 
 @app.command()
-def ingest_movies(ingest_posters: bool = False, embed_tagline: bool = False):
+def ingest_movies(ingest_posters: bool = False, embed_tagline: bool = False, sample_count: int = -1):
     """
     Ingest the movies dataset into ApertureDB.
     """
@@ -77,6 +84,9 @@ def ingest_movies(ingest_posters: bool = False, embed_tagline: bool = False):
         right_on="tmdb_5000_movies.csv/id",
         left_on="tmdb_5000_credits.csv/movie_id")
 
+    if sample_count > 0:
+        records = records.head(sample_count)
+
     collection = []
     db = create_connector()
     cleanup_movies(db)
@@ -97,8 +107,6 @@ def ingest_movies(ingest_posters: bool = False, embed_tagline: bool = False):
         movie = make_movie_with_all_connections(j, embedder, ingest_posters, embed_tagline)
         collection.append(movie)
 
-
-
     parser = MovieParser(collection)
 
     utils = Utils(db)

diff --git a/apps/dataset-ingestion-movies/app/movie_record.py b/apps/dataset-ingestion-movies/app/movie_record.py
@@ -26,9 +26,9 @@
 HAS_PRODUCTION_COMPANY_CONNECTION_LABEL = "HasProductionCompany"
 HAS_KEYWORD_CONNECTION_LABEL = "HasKeyword"
 HAS_SPOKEN_LANGUAGE_CONNECTION_LABEL = "HasSpokenLanguage"
-HAS_IMAGE_CONNECTION_LABEL = "HasImage"
+HAS_IMAGE_CONNECTION_LABEL = "HasPoster"
 HAS_TAGLINE_EMBEDDING_CONNECTION_LABEL = "HasTaglineEmbedding"
-HAS_IMAGE_EMBEDDING_CONNECTION_LABEL = "HasImageEmbedding"
+HAS_IMAGE_EMBEDDING_CONNECTION_LABEL = "HasPosterEmbedding"
 
 def make_movie_with_all_connections(j: dict, embedder: Embedder, ingest_posters: bool = False, embed_tagline: bool = False) -> List[dict]:
     """

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -434,6 +434,7 @@ services:
         environment:
             <<: *common-env
             WF_LOG_LEVEL: "${WF_LOG_LEVEL:-DEBUG}"
+            SAMPLE_COUNT: "${SAMPLE_COUNT:--1}"
 
     add-image:
         image: aperturedata/wf-add-image:latest