From 3b071381615c8268e51f754a4f7f41093e119430 Mon Sep 17 00:00:00 2001
From: Zane Shelby <git@zane.io>
Date: Sun, 4 Feb 2024 03:07:33 -0500
Subject: [PATCH 1/2] refactor: Guess schema with DuckDB

---
 data/.gitignore                               |   4 +-
 dvc.yaml                                      |  40 +++----
 scripts/cgpm_hydrate.py                       |  10 +-
 scripts/guess_schema.sql                      |  45 ++++++++
 scripts/ignore.sh                             |   9 ++
 .../inferenceql/structure_learning/csv.clj    |  37 -------
 .../inferenceql/structure_learning/main.clj   |  40 -------
 .../inferenceql/structure_learning/schema.clj | 100 ------------------
 8 files changed, 82 insertions(+), 203 deletions(-)
 create mode 100644 scripts/guess_schema.sql
 create mode 100755 scripts/ignore.sh
 delete mode 100644 src/clojure/inferenceql/structure_learning/schema.clj

diff --git a/data/.gitignore b/data/.gitignore
index 0eb04f4a..b09f2b91 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -12,12 +12,12 @@
 /nullified.csv
 /numericalized.csv
 /subsampled.csv
+/summarized.csv
 /validated.csv
 
 # schema artifacts
-/cgpm-schema.edn
 /mapping-table.edn
-/schema.edn
+/schema.csv
 
 # loom artifacts
 /loom-schema.json
diff --git a/dvc.yaml b/dvc.yaml
index f4f29fc6..6fff6222 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -36,37 +36,34 @@ stages:
     outs:
       - data/nullified.csv
 
-  guess-schema:
+  summarize:
     cmd: >
-      clojure -X inferenceql.structure-learning.main/guess-schema
+      duckdb -csv :memory: "SUMMARIZE SELECT * FROM read_csv_auto('/dev/stdin', header=true)"
       < data/nullified.csv
-      > data/schema.edn
+      > data/summarized.csv
     deps:
       - data/nullified.csv
-    params:
-      - schema
     outs:
-      - data/schema.edn
+      - data/summarized.csv
 
-  cgpm-schema:
+  guess-schema:
     cmd: >
-      clojure -X inferenceql.structure-learning.main/cgpm-schema
-      < data/schema.edn
-      > data/cgpm-schema.edn
+      duckdb -csv :memory: "$(cat scripts/guess_schema.sql)"
+      < data/summarized.csv
+      > data/schema.csv
     deps:
-      - data/schema.edn
+      - data/summarized.csv
     outs:
-      - data/cgpm-schema.edn
+      - data/schema.csv
 
   ignore:
     cmd: >
-      clojure -X inferenceql.structure-learning.main/ignore
-      :schema '"data/schema.edn"'
+      ./scripts/ignore.sh
       < data/nullified.csv
       > data/ignored.csv
     deps:
       - data/nullified.csv
-      - data/schema.edn
+      - data/schema.csv
     outs:
       - data/ignored.csv
 
@@ -86,11 +83,14 @@ stages:
 
   loom-schema:
     cmd: >
-      clojure -X inferenceql.structure-learning.main/loom-schema
-      < data/schema.edn
+      duckdb -json :memory: '
+      SELECT column_name, loom_statistical_type
+      FROM read_csv_auto("data/schema.csv", header=true)
+      WHERE loom_statistical_type IS NOT NULL'
+      | jq 'map({(.column_name): .loom_statistical_type}) | add'
       > data/loom-schema.json
     deps:
-      - data/schema.edn
+      - data/schema.csv
     outs:
       - data/loom-schema.json
 
@@ -153,14 +153,14 @@ stages:
       --metadata {}
       --output data/cgpm/hydrated/{/}
       --data data/numericalized.csv
-      --schema data/cgpm-schema.edn
+      --schema data/schema.csv
       --mapping-table data/mapping-table.edn
       --seed $((${seed} + {#} - 1))'
     params:
       - parallel.flags
       - seed
     deps:
-      - data/cgpm-schema.edn
+      - data/schema.csv
       - data/cgpm/raw
       - data/mapping-table.edn
       - data/numericalized.csv
diff --git a/scripts/cgpm_hydrate.py b/scripts/cgpm_hydrate.py
index 258b2f24..1e89efae 100644
--- a/scripts/cgpm_hydrate.py
+++ b/scripts/cgpm_hydrate.py
@@ -2,6 +2,7 @@
 
 import argparse
 import cgpm.utils.general as general
+import duckdb
 import edn_format
 import json
 import pandas
@@ -25,7 +26,7 @@ def main():
         "--data", type=argparse.FileType("r"), help="Path to numericalized CSV."
     )
     parser.add_argument(
-        "--schema", type=argparse.FileType("r"), help="Path to CGPM schema."
+        "--schema", help="Path to schema."
     )
     parser.add_argument(
         "--mapping-table",
@@ -51,16 +52,17 @@ def main():
         sys.exit(1)
 
     df = pandas.read_csv(args.data)
-    schema = edn_format.loads(args.schema.read(), write_ply_tables=False)
+    schema = duckdb.read_csv(args.schema, header=True)
+    stattypes = dict(duckdb.sql("SELECT column_name, column_cgpm_statistical_type FROM schema").fetchall())
     mapping_table = edn_format.loads(args.mapping_table.read(), write_ply_tables=False)
 
     def n_categories(column):
         return len(mapping_table[column])
 
     def distarg(column):
-        return {"k": n_categories(column)} if schema[column] == "categorical" else None
+        return {"k": n_categories(column)} if stattypes[column] == "categorical" else None
 
-    cctypes = [schema[column] for column in df.columns]
+    cctypes = [stattypes[column] for column in df.columns]
     distargs = [distarg(column) for column in df.columns]
 
     if args.metadata is not None:
diff --git a/scripts/guess_schema.sql b/scripts/guess_schema.sql
new file mode 100644
index 00000000..086ca7a3
--- /dev/null
+++ b/scripts/guess_schema.sql
@@ -0,0 +1,45 @@
+SELECT
+    column_name,
+    column_type,
+    (
+        CASE
+        WHEN column_type = 'VARCHAR' THEN
+        (
+            CASE
+            WHEN approx_unique > 1 AND approx_unique <= 50
+            THEN 'NOMINAL'
+            ELSE 'IGNORE'
+            END
+        )
+        WHEN column_type = 'DOUBLE' THEN
+        (
+            -- Not enough unique values. Better off modeling as nominal.
+            CASE
+            WHEN approx_unique < 20
+            OR (approx_unique / count) < 0.02
+            THEN 'NOMINAL'
+
+            -- Consecutive numbers -- probably an ID?
+            WHEN approx_unique = count
+            AND count = TRY_CAST(min AS INTEGER) - TRY_CAST(max AS INTEGER) + 1
+            THEN 'IGNORE'
+
+            ELSE 'NUMERICAL'
+            END
+        )
+        END
+    ) AS column_statistical_type,
+    (
+        CASE
+        WHEN column_statistical_type = 'IGNORE' THEN 'ignore'
+        WHEN column_statistical_type = 'NOMINAL' THEN 'categorical'
+        WHEN column_statistical_type = 'NUMERICAL' THEN 'normal'
+        END
+    ) AS column_cgpm_statistical_type,
+    (
+        CASE
+        WHEN column_statistical_type = 'NOMINAL' THEN 'dd'
+        WHEN column_statistical_type = 'NUMERICAL' THEN 'nich'
+        END
+    ) AS loom_statistical_type,
+FROM read_csv_auto('/dev/stdin', header=true)
diff --git a/scripts/ignore.sh b/scripts/ignore.sh
new file mode 100755
index 00000000..f0999b5f
--- /dev/null
+++ b/scripts/ignore.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+columns=$(
+duckdb -noheader -list :memory: <<-EOF | paste -s -d , -
+    SELECT column_name
+    FROM read_csv_auto('data/schema.csv', header=true)
+    WHERE column_statistical_type != 'IGNORE'
+EOF
+)
+duckdb -csv :memory: "SELECT $columns FROM read_csv_auto('/dev/stdin', header=true)"
diff --git a/src/clojure/inferenceql/structure_learning/csv.clj b/src/clojure/inferenceql/structure_learning/csv.clj
index f0f2f284..fa3f9696 100644
--- a/src/clojure/inferenceql/structure_learning/csv.clj
+++ b/src/clojure/inferenceql/structure_learning/csv.clj
@@ -116,43 +116,6 @@
        (recur csv (first ks) (next ks))
        csv))))
 
-(defn heuristic-coerce
-  [& coll]
-  (try (into []
-             (map #(if (nil? %)
-                     %
-                     (Long/parseLong %)))
-             coll)
-       (catch java.lang.NumberFormatException _
-         (try (into []
-                    (map #(if (nil? %)
-                            %
-                            (Double/parseDouble %)))
-                    coll)
-              (catch java.lang.NumberFormatException _
-                coll)))))
-
-(defn apply-column
-  [k f coll]
-  (let [new-column (->> coll
-                        (map #(get % k))
-                        (apply f))]
-    (map-indexed (fn [index m]
-                   (let [v (get new-column index)]
-                     (cond-> m
-                       (some? v) (assoc k v))))
-                 coll)))
-
-(defn heuristic-coerce-all
-  [coll]
-  (let [ks (into #{}
-                 (mapcat keys)
-                 coll)]
-    (reduce (fn [acc k]
-              (apply-column k heuristic-coerce acc))
-            coll
-            ks)))
-
 (defn update-by-key
   "For each key k in coll if (f k) returns a function update the value for k in
   coll with that function."
diff --git a/src/clojure/inferenceql/structure_learning/main.clj b/src/clojure/inferenceql/structure_learning/main.clj
index 45077a85..b66823c0 100644
--- a/src/clojure/inferenceql/structure_learning/main.clj
+++ b/src/clojure/inferenceql/structure_learning/main.clj
@@ -9,7 +9,6 @@
             [clojure.string :as string]
             [inferenceql.structure-learning.csv :as iql.csv]
             [inferenceql.structure-learning.dvc :as dvc]
-            [inferenceql.structure-learning.schema :as schema]
             [inferenceql.inference.gpm :as gpm]
             [inferenceql.query.db :as db]
             [inferenceql.query.io :as query.io]
@@ -26,45 +25,6 @@
                headers)
          (csv/write-csv *out*))))
 
-(defn guess-schema
-  [_]
-  (let [params (dvc/yaml)
-        params-schema (:schema params)
-        default-stattype (get params :default-stat-type  :ignore)
-        guessed-schema (->> (csv/read-csv *in*)
-                            (sequence (comp (iql.csv/as-maps)
-                                            (map #(medley/remove-vals (every-pred string? string/blank?) %))
-                                            (map #(medley/remove-keys (set (keys params-schema)) %))))
-                            (iql.csv/heuristic-coerce-all)
-                            (schema/guess default-stattype))
-        schema (merge guessed-schema params-schema)]
-    (assert (not (every? #{:ignore} (vals schema)))
-            "The statistical types of the columns in data.csv can't be guessed confidently.\nAll columns are ignored. Set statistical types manually in params.yaml to fix this")
-    (schema/print-ignored schema)
-    (prn schema)))
-
-(defn loom-schema
-  [_]
-  (-> (edn/read *in*)
-      (schema/loom)
-      (json/generate-stream *out*)))
-
-(defn cgpm-schema
-  [_]
-  (-> (edn/read *in*)
-      (schema/cgpm)
-      (pr)))
-
-(defn ignore
-  [{:keys [schema]}]
-  (let [ignored (into #{}
-                      (comp (filter (comp #{:ignore} val))
-                            (map key))
-                      (edn/read-string (slurp schema)))
-        csv (csv/read-csv *in*)
-        ignored-csv (apply iql.csv/dissoc csv ignored)]
-    (csv/write-csv *out* ignored-csv)))
-
 (defn numericalize
   [{table-path :table schema-path :schema}]
   (let [schema (edn/read-string (slurp (str schema-path)))
diff --git a/src/clojure/inferenceql/structure_learning/schema.clj b/src/clojure/inferenceql/structure_learning/schema.clj
deleted file mode 100644
index ec4c5c3c..00000000
--- a/src/clojure/inferenceql/structure_learning/schema.clj
+++ /dev/null
@@ -1,100 +0,0 @@
-(ns inferenceql.structure-learning.schema
-  (:require [clojure.set]))
-
-(defn consecutive?
-  "Returns true if if the provided column is comprised entirely of consecutive integers."
-  [coll]
-  (let [sorted-coll (sort coll)]
-    (= sorted-coll
-       (take (count coll)
-             (iterate inc (first sorted-coll))))))
-
-(defn column
-  "Given a key k and a collection of associative data structures coll, `column`
-  returns a sequence of the values in for key k. Returns a transducer if only a
-  key is provided."
-  ([k]
-   (map #(get % k)))
-  ([k coll]
-   (sequence (column k) coll)))
-
-(defn guess-stattype
-  "Guess the statistical type of a collection of values."
-  [default-type coll]
-  (let [max-categories 50
-        dv (distinct coll) ; Distinct vals.
-        num-dv (count dv)
-        ratio-dv (/ num-dv (count coll)) ; Ratio of distinct vals to all vals.
-
-        ;; Number of distinct values below which columns whose values can all be parsed as
-        ;; numbers will be considered nominal anyway.
-        numcat-count 20
-        ;; Ratio of distinct values to total values below which columns whose values can
-        ;; all be parsed as numbers will be considered nominal anyway.
-        numcat-ratio 0.02]
-    (cond (= 1 num-dv)
-          :ignore
-
-          (some string? coll)
-          (if (< num-dv max-categories)
-            :nominal
-            :ignore)
-
-          (= num-dv (count coll))
-          (if (every? integer? coll)
-            (if (consecutive? coll)
-              :ignore
-              :numerical)
-            :numerical)
-
-          (every? number? coll)
-          (if (or (< num-dv numcat-count)
-                  (< ratio-dv numcat-ratio))
-            :nominal
-            :numerical)
-
-          :else
-          (keyword default-type))))
-
-(defn guess
-  "Guess a schema for a collection of maps."
-  [default-type coll]
-  (let [columns (into #{} (mapcat keys) coll)]
-    (zipmap columns
-            (map #(guess-stattype default-type (column % coll))
-                 columns))))
-
-(defn loom
-  "Returns the Loom schema for an InferenceQL schema."
-  [schema]
-  (let [replacements {:nominal "dd" ; discrete dirichlet
-                      :numerical "nich"}] ; normal inverse chi squared
-    (into {}
-          (comp (remove (comp #{:ignore} val))
-                (map (juxt key (comp replacements val))))
-          schema)))
-
-(defn cgpm
-  "Returns the CGPM schema for an InferenceQL schema."
-  [schema]
-  (let [replacements {:nominal "categorical" ; discrete dirichlet
-                      :numerical "normal"}]
-    (into {}
-          (comp (remove (comp #{:ignore} val))
-                (map (juxt (comp  name key) (comp replacements val))))
-          schema)))
-
-(defn print-ignored
-  "Tells at the user which columns were ignored."
-  [schema]
-  (let [ignored  (keys (filter (fn [[_ v]] (= v :ignore)) schema))]
-    (when (not-empty ignored)
-      (binding [*out* *err*]
-        (println "")
-        (println "======== CAVEAT - we ignore the following columns ========")
-        (println "")
-        (mapv println ignored)
-        (println "")
-        (println "- this is fine if it was intended.")
-        (println "==========================================================")
-        (println "")))))

From 8767d7779813a940b9c652248a3aef6e89da1624 Mon Sep 17 00:00:00 2001
From: Zane Shelby <git@zane.io>
Date: Fri, 23 Feb 2024 17:24:57 -0500
Subject: [PATCH 2/2] WIP

---
 dvc-cgpm.yaml | 25 ++++++++++++++++---------
 flake.lock    | 27 +++++++++++++++++++++++++++
 flake.nix     | 12 ++++++++++++
 3 files changed, 55 insertions(+), 9 deletions(-)
 create mode 100644 flake.lock
 create mode 100644 flake.nix

diff --git a/dvc-cgpm.yaml b/dvc-cgpm.yaml
index e9a70d56..16461c7b 100644
--- a/dvc-cgpm.yaml
+++ b/dvc-cgpm.yaml
@@ -36,17 +36,25 @@ stages:
     outs:
       - data/nullified.csv
 
-  guess-schema:
+  summarize:
     cmd: >
-      clojure -X inferenceql.structure-learning.main/guess-schema
+      duckdb -csv :memory: "SUMMARIZE SELECT * FROM read_csv_auto('/dev/stdin', header=true)"
       < data/nullified.csv
-      > data/schema.edn
+      > data/summarized.csv
     deps:
       - data/nullified.csv
-    params:
-      - schema
     outs:
-      - data/schema.edn
+      - data/summarized.csv
+
+  guess-schema:
+    cmd: >
+      duckdb -csv :memory: "$(cat scripts/guess_schema.sql)"
+      < data/summarized.csv
+      > data/schema.csv
+    deps:
+      - data/summarized.csv
+    outs:
+      - data/schema.csv
 
   cgpm-schema:
     cmd: >
@@ -60,13 +68,12 @@ stages:
 
   ignore:
     cmd: >
-      clojure -X inferenceql.structure-learning.main/ignore
-      :schema '"data/schema.edn"'
+      ./scripts/ignore.sh
       < data/nullified.csv
       > data/ignored.csv
     deps:
       - data/nullified.csv
-      - data/schema.edn
+      - data/schema.csv
     outs:
       - data/ignored.csv
 
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 00000000..93912036
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,27 @@
+{
+  "nodes": {
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1706826059,
+        "narHash": "sha256-N69Oab+cbt3flLvYv8fYnEHlBsWwdKciNZHUbynVEOA=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "25e3d4c0d3591c99929b1ec07883177f6ea70c9d",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-23.11",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 00000000..83b51bea
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,12 @@
+{
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
+
+  outputs = { self, nixpkgs, ... }: let
+    system = "aarch64-darwin";
+    pkgs = nixpkgs.legacyPackages."${system}";
+  in {
+    devShells."${system}".default = pkgs.mkShell {
+      buildInputs = with pkgs; [ (python3.withPackages (ps: with ps; [ duckdb ])) ];
+    };
+  };
+}