From 3b071381615c8268e51f754a4f7f41093e119430 Mon Sep 17 00:00:00 2001 From: Zane Shelby Date: Sun, 4 Feb 2024 03:07:33 -0500 Subject: [PATCH 1/2] refactor: Guess schema with DuckDB --- data/.gitignore | 4 +- dvc.yaml | 40 +++---- scripts/cgpm_hydrate.py | 10 +- scripts/guess_schema.sql | 45 ++++++++ scripts/ignore.sh | 9 ++ .../inferenceql/structure_learning/csv.clj | 37 ------- .../inferenceql/structure_learning/main.clj | 40 ------- .../inferenceql/structure_learning/schema.clj | 100 ------------------ 8 files changed, 82 insertions(+), 203 deletions(-) create mode 100644 scripts/guess_schema.sql create mode 100755 scripts/ignore.sh delete mode 100644 src/clojure/inferenceql/structure_learning/schema.clj diff --git a/data/.gitignore b/data/.gitignore index 0eb04f4a..b09f2b91 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -12,12 +12,12 @@ /nullified.csv /numericalized.csv /subsampled.csv +/summarized.csv /validated.csv # schema artifacts -/cgpm-schema.edn /mapping-table.edn -/schema.edn +/schema.csv # loom artifacts /loom-schema.json diff --git a/dvc.yaml b/dvc.yaml index f4f29fc6..6fff6222 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -36,37 +36,34 @@ stages: outs: - data/nullified.csv - guess-schema: + summarize: cmd: > - clojure -X inferenceql.structure-learning.main/guess-schema + duckdb -csv :memory: "SUMMARIZE SELECT * FROM read_csv_auto('/dev/stdin', header=true)" < data/nullified.csv - > data/schema.edn + > data/summarized.csv deps: - data/nullified.csv - params: - - schema outs: - - data/schema.edn + - data/summarized.csv - cgpm-schema: + guess-schema: cmd: > - clojure -X inferenceql.structure-learning.main/cgpm-schema - < data/schema.edn - > data/cgpm-schema.edn + duckdb -csv :memory: "$(cat scripts/guess_schema.sql)" + < data/summarized.csv + > data/schema.csv deps: - - data/schema.edn + - data/summarized.csv outs: - - data/cgpm-schema.edn + - data/schema.csv ignore: cmd: > - clojure -X inferenceql.structure-learning.main/ignore - :schema '"data/schema.edn"' + ./scripts/ignore.sh < data/nullified.csv > data/ignored.csv deps: - data/nullified.csv - - data/schema.edn + - data/schema.csv outs: - data/ignored.csv @@ -86,11 +83,14 @@ stages: loom-schema: cmd: > - clojure -X inferenceql.structure-learning.main/loom-schema - < data/schema.edn + duckdb -json :memory: ' + SELECT column_name, loom_statistical_type + FROM read_csv_auto("data/schema.csv", header=true) + WHERE loom_statistical_type IS NOT NULL' + | jq 'map({(.column_name): .loom_statistical_type}) | add' > data/loom-schema.json deps: - - data/schema.edn + - data/schema.csv outs: - data/loom-schema.json @@ -153,14 +153,14 @@ stages: --metadata {} --output data/cgpm/hydrated/{/} --data data/numericalized.csv - --schema data/cgpm-schema.edn + --schema data/schema.csv --mapping-table data/mapping-table.edn --seed $((${seed} + {#} - 1))' params: - parallel.flags - seed deps: - - data/cgpm-schema.edn + - data/schema.csv - data/cgpm/raw - data/mapping-table.edn - data/numericalized.csv diff --git a/scripts/cgpm_hydrate.py b/scripts/cgpm_hydrate.py index 258b2f24..1e89efae 100644 --- a/scripts/cgpm_hydrate.py +++ b/scripts/cgpm_hydrate.py @@ -2,6 +2,7 @@ import argparse import cgpm.utils.general as general +import duckdb import edn_format import json import pandas @@ -25,7 +26,7 @@ def main(): "--data", type=argparse.FileType("r"), help="Path to numericalized CSV." ) parser.add_argument( - "--schema", type=argparse.FileType("r"), help="Path to CGPM schema." + "--schema", help="Path to schema." ) parser.add_argument( "--mapping-table", @@ -51,16 +52,17 @@ def main(): sys.exit(1) df = pandas.read_csv(args.data) - schema = edn_format.loads(args.schema.read(), write_ply_tables=False) + schema = duckdb.read_csv(args.schema, header=True) + stattypes = dict(duckdb.sql("SELECT column_name, column_cgpm_statistical_type FROM schema").fetchall()) mapping_table = edn_format.loads(args.mapping_table.read(), write_ply_tables=False) def n_categories(column): return len(mapping_table[column]) def distarg(column): - return {"k": n_categories(column)} if schema[column] == "categorical" else None + return {"k": n_categories(column)} if stattypes[column] == "categorical" else None - cctypes = [schema[column] for column in df.columns] + cctypes = [stattypes[column] for column in df.columns] distargs = [distarg(column) for column in df.columns] if args.metadata is not None: diff --git a/scripts/guess_schema.sql b/scripts/guess_schema.sql new file mode 100644 index 00000000..086ca7a3 --- /dev/null +++ b/scripts/guess_schema.sql @@ -0,0 +1,45 @@ +SELECT + column_name, + column_type, + ( + CASE + WHEN column_type = 'VARCHAR' THEN + ( + CASE + WHEN approx_unique > 1 AND approx_unique <= 50 + THEN 'NOMINAL' + ELSE 'IGNORE' + END + ) + WHEN column_type = 'DOUBLE' THEN + ( + -- Not enough unique values. Better off modeling as nominal. + CASE + WHEN approx_unique < 20 + OR (approx_unique / count) < 0.02 + THEN 'NOMINAL' + + -- Consecutive numbers -- probably an ID? + WHEN approx_unique = count + AND count = TRY_CAST(min AS INTEGER) - TRY_CAST(max AS INTEGER) + 1 + THEN 'IGNORE' + + ELSE 'NUMERICAL' + END + ) + END + ) AS column_statistical_type, + ( + CASE + WHEN column_statistical_type = 'IGNORE' THEN 'ignore' + WHEN column_statistical_type = 'NOMINAL' THEN 'categorical' + WHEN column_statistical_type = 'NUMERICAL' THEN 'normal' + END + ) AS column_cgpm_statistical_type, + ( + CASE + WHEN column_statistical_type = 'NOMINAL' THEN 'dd' + WHEN column_statistical_type = 'NUMERICAL' THEN 'nich' + END + ) AS loom_statistical_type, +FROM read_csv_auto('/dev/stdin', header=true) diff --git a/scripts/ignore.sh b/scripts/ignore.sh new file mode 100755 index 00000000..f0999b5f --- /dev/null +++ b/scripts/ignore.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +columns=$( +duckdb -noheader -list :memory: <<-EOF | paste -s -d , - + SELECT column_name + FROM read_csv_auto('data/schema.csv', header=true) + WHERE column_statistical_type != 'IGNORE' +EOF +) +duckdb -csv :memory: "SELECT $columns FROM read_csv_auto('/dev/stdin', header=true)" diff --git a/src/clojure/inferenceql/structure_learning/csv.clj b/src/clojure/inferenceql/structure_learning/csv.clj index f0f2f284..fa3f9696 100644 --- a/src/clojure/inferenceql/structure_learning/csv.clj +++ b/src/clojure/inferenceql/structure_learning/csv.clj @@ -116,43 +116,6 @@ (recur csv (first ks) (next ks)) csv)))) -(defn heuristic-coerce - [& coll] - (try (into [] - (map #(if (nil? %) - % - (Long/parseLong %))) - coll) - (catch java.lang.NumberFormatException _ - (try (into [] - (map #(if (nil? %) - % - (Double/parseDouble %))) - coll) - (catch java.lang.NumberFormatException _ - coll))))) - -(defn apply-column - [k f coll] - (let [new-column (->> coll - (map #(get % k)) - (apply f))] - (map-indexed (fn [index m] - (let [v (get new-column index)] - (cond-> m - (some? v) (assoc k v)))) - coll))) - -(defn heuristic-coerce-all - [coll] - (let [ks (into #{} - (mapcat keys) - coll)] - (reduce (fn [acc k] - (apply-column k heuristic-coerce acc)) - coll - ks))) - (defn update-by-key "For each key k in coll if (f k) returns a function update the value for k in coll with that function." diff --git a/src/clojure/inferenceql/structure_learning/main.clj b/src/clojure/inferenceql/structure_learning/main.clj index 45077a85..b66823c0 100644 --- a/src/clojure/inferenceql/structure_learning/main.clj +++ b/src/clojure/inferenceql/structure_learning/main.clj @@ -9,7 +9,6 @@ [clojure.string :as string] [inferenceql.structure-learning.csv :as iql.csv] [inferenceql.structure-learning.dvc :as dvc] - [inferenceql.structure-learning.schema :as schema] [inferenceql.inference.gpm :as gpm] [inferenceql.query.db :as db] [inferenceql.query.io :as query.io] @@ -26,45 +25,6 @@ headers) (csv/write-csv *out*)))) -(defn guess-schema - [_] - (let [params (dvc/yaml) - params-schema (:schema params) - default-stattype (get params :default-stat-type :ignore) - guessed-schema (->> (csv/read-csv *in*) - (sequence (comp (iql.csv/as-maps) - (map #(medley/remove-vals (every-pred string? string/blank?) %)) - (map #(medley/remove-keys (set (keys params-schema)) %)))) - (iql.csv/heuristic-coerce-all) - (schema/guess default-stattype)) - schema (merge guessed-schema params-schema)] - (assert (not (every? #{:ignore} (vals schema))) - "The statistical types of the columns in data.csv can't be guessed confidently.\nAll columns are ignored. Set statistical types manually in params.yaml to fix this") - (schema/print-ignored schema) - (prn schema))) - -(defn loom-schema - [_] - (-> (edn/read *in*) - (schema/loom) - (json/generate-stream *out*))) - -(defn cgpm-schema - [_] - (-> (edn/read *in*) - (schema/cgpm) - (pr))) - -(defn ignore - [{:keys [schema]}] - (let [ignored (into #{} - (comp (filter (comp #{:ignore} val)) - (map key)) - (edn/read-string (slurp schema))) - csv (csv/read-csv *in*) - ignored-csv (apply iql.csv/dissoc csv ignored)] - (csv/write-csv *out* ignored-csv))) - (defn numericalize [{table-path :table schema-path :schema}] (let [schema (edn/read-string (slurp (str schema-path))) diff --git a/src/clojure/inferenceql/structure_learning/schema.clj b/src/clojure/inferenceql/structure_learning/schema.clj deleted file mode 100644 index ec4c5c3c..00000000 --- a/src/clojure/inferenceql/structure_learning/schema.clj +++ /dev/null @@ -1,100 +0,0 @@ -(ns inferenceql.structure-learning.schema - (:require [clojure.set])) - -(defn consecutive? - "Returns true if if the provided column is comprised entirely of consecutive integers." - [coll] - (let [sorted-coll (sort coll)] - (= sorted-coll - (take (count coll) - (iterate inc (first sorted-coll)))))) - -(defn column - "Given a key k and a collection of associative data structures coll, `column` - returns a sequence of the values in for key k. Returns a transducer if only a - key is provided." - ([k] - (map #(get % k))) - ([k coll] - (sequence (column k) coll))) - -(defn guess-stattype - "Guess the statistical type of a collection of values." - [default-type coll] - (let [max-categories 50 - dv (distinct coll) ; Distinct vals. - num-dv (count dv) - ratio-dv (/ num-dv (count coll)) ; Ratio of distinct vals to all vals. - - ;; Number of distinct values below which columns whose values can all be parsed as - ;; numbers will be considered nominal anyway. - numcat-count 20 - ;; Ratio of distinct values to total values below which columns whose values can - ;; all be parsed as numbers will be considered nominal anyway. - numcat-ratio 0.02] - (cond (= 1 num-dv) - :ignore - - (some string? coll) - (if (< num-dv max-categories) - :nominal - :ignore) - - (= num-dv (count coll)) - (if (every? integer? coll) - (if (consecutive? coll) - :ignore - :numerical) - :numerical) - - (every? number? coll) - (if (or (< num-dv numcat-count) - (< ratio-dv numcat-ratio)) - :nominal - :numerical) - - :else - (keyword default-type)))) - -(defn guess - "Guess a schema for a collection of maps." - [default-type coll] - (let [columns (into #{} (mapcat keys) coll)] - (zipmap columns - (map #(guess-stattype default-type (column % coll)) - columns)))) - -(defn loom - "Returns the Loom schema for an InferenceQL schema." - [schema] - (let [replacements {:nominal "dd" ; discrete dirichlet - :numerical "nich"}] ; normal inverse chi squared - (into {} - (comp (remove (comp #{:ignore} val)) - (map (juxt key (comp replacements val)))) - schema))) - -(defn cgpm - "Returns the CGPM schema for an InferenceQL schema." - [schema] - (let [replacements {:nominal "categorical" ; discrete dirichlet - :numerical "normal"}] - (into {} - (comp (remove (comp #{:ignore} val)) - (map (juxt (comp name key) (comp replacements val)))) - schema))) - -(defn print-ignored - "Tells at the user which columns were ignored." - [schema] - (let [ignored (keys (filter (fn [[_ v]] (= v :ignore)) schema))] - (when (not-empty ignored) - (binding [*out* *err*] - (println "") - (println "======== CAVEAT - we ignore the following columns ========") - (println "") - (mapv println ignored) - (println "") - (println "- this is fine if it was intended.") - (println "==========================================================") - (println ""))))) From 8767d7779813a940b9c652248a3aef6e89da1624 Mon Sep 17 00:00:00 2001 From: Zane Shelby Date: Fri, 23 Feb 2024 17:24:57 -0500 Subject: [PATCH 2/2] WIP --- dvc-cgpm.yaml | 25 ++++++++++++++++--------- flake.lock | 27 +++++++++++++++++++++++++++ flake.nix | 12 ++++++++++++ 3 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/dvc-cgpm.yaml b/dvc-cgpm.yaml index e9a70d56..16461c7b 100644 --- a/dvc-cgpm.yaml +++ b/dvc-cgpm.yaml @@ -36,17 +36,25 @@ stages: outs: - data/nullified.csv - guess-schema: + summarize: cmd: > - clojure -X inferenceql.structure-learning.main/guess-schema + duckdb -csv :memory: "SUMMARIZE SELECT * FROM read_csv_auto('/dev/stdin', header=true)" < data/nullified.csv - > data/schema.edn + > data/summarized.csv deps: - data/nullified.csv - params: - - schema outs: - - data/schema.edn + - data/summarized.csv + + guess-schema: + cmd: > + duckdb -csv :memory: "$(cat scripts/guess_schema.sql)" + < data/summarized.csv + > data/schema.csv + deps: + - data/summarized.csv + outs: + - data/schema.csv cgpm-schema: cmd: > @@ -60,13 +68,12 @@ stages: ignore: cmd: > - clojure -X inferenceql.structure-learning.main/ignore - :schema '"data/schema.edn"' + ./scripts/ignore.sh < data/nullified.csv > data/ignored.csv deps: - data/nullified.csv - - data/schema.edn + - data/schema.csv outs: - data/ignored.csv diff --git a/flake.lock b/flake.lock new file mode 100644 index 00000000..93912036 --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1706826059, + "narHash": "sha256-N69Oab+cbt3flLvYv8fYnEHlBsWwdKciNZHUbynVEOA=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "25e3d4c0d3591c99929b1ec07883177f6ea70c9d", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-23.11", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 00000000..83b51bea --- /dev/null +++ b/flake.nix @@ -0,0 +1,12 @@ +{ + inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11"; + + outputs = { self, nixpkgs, ... }: let + system = "aarch64-darwin"; + pkgs = nixpkgs.legacyPackages."${system}"; + in { + devShells."${system}".default = pkgs.mkShell { + buildInputs = with pkgs; [ (python3.withPackages (ps: with ps; [ duckdb ])) ]; + }; + }; +}