From fd80525799107f7f40b5aad3ed4bafe9548ed5ac Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Wed, 18 Oct 2017 11:05:24 -0500
Subject: [PATCH 1/7] add outcome_where_expr argument to match drain missing
 outcome handling

---
 lead/model/experiments.py |  7 +++++++
 lead/model/transform.py   | 10 ++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/lead/model/experiments.py b/lead/model/experiments.py
index bcdc31e..5c7493e 100644
--- a/lead/model/experiments.py
+++ b/lead/model/experiments.py
@@ -1,6 +1,13 @@
 from .workflows import *
 from copy import deepcopy
 
+def bll6_forest_where():
+    """
+    The basic temporal cross-validation workflow
+    """
+    return bll6_models(forest(), transform_search={'outcome_where_expr':[None, 'max_bll0 == max_bll0']})
+    return bll6_models(forest(), transform_search={'outcome_where_expr':['max_bll0 == max_bll0']})
+
 def bll6_forest_no_wic():
     """
     No wic features
diff --git a/lead/model/transform.py b/lead/model/transform.py
index 1a201a7..f402659 100644
--- a/lead/model/transform.py
+++ b/lead/model/transform.py
@@ -14,19 +14,23 @@ class LeadTransform(Step):
     performing feature selection and creating sample weights.
     """
     def __init__(self, inputs, outcome_expr, aggregations,
-            wic_sample_weight=0, exclude=[], include=[]):
+            outcome_where_expr=None, wic_sample_weight=0,
+            exclude=[], include=[]):
         """
         Args:
             inputs: list containing a LeadCrossValidate step
             outcome_expr: the query to perform on the auxillary information to produce an outcome variable
             aggregations: defines which of the SpacetimeAggregations to include
-            and which to drop
+                and which to drop
+            outcome_where_expr: boolean expression evaluated on aux; the outcome is kept
+                where it is true and set to NaN elsewhere. Defaults to None (keep everywhere)
             wic_sample_weight: optional different sample weight for wic kids
         """
         Step.__init__(self,
                 inputs=inputs,
                 outcome_expr=outcome_expr,
                 aggregations=aggregations,
+                outcome_where_expr=outcome_where_expr,
                 wic_sample_weight=wic_sample_weight, 
                 exclude=exclude, include=include)
 
@@ -40,6 +44,8 @@ def run(self, X, aux, train, test):
 
         """
         y = aux.eval(self.outcome_expr)
+        if self.outcome_where_expr is not None:
+            y = y.where(aux.eval(self.outcome_where_expr))
 
         logging.info('Selecting aggregations')
         aggregations = self.get_input(LeadData).aggregations

From 24eb97c65b4b69067cb0cb50437cd1c0e4ff3ff4 Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Tue, 24 Oct 2017 15:05:39 -0500
Subject: [PATCH 2/7] tested and incorporated into workflows

---
 lead/model/experiments.py | 7 -------
 lead/model/workflows.py   | 3 ++-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/lead/model/experiments.py b/lead/model/experiments.py
index 5c7493e..bcdc31e 100644
--- a/lead/model/experiments.py
+++ b/lead/model/experiments.py
@@ -1,13 +1,6 @@
 from .workflows import *
 from copy import deepcopy
 
-def bll6_forest_where():
-    """
-    The basic temporal cross-validation workflow
-    """
-    return bll6_models(forest(), transform_search={'outcome_where_expr':[None, 'max_bll0 == max_bll0']})
-    return bll6_models(forest(), transform_search={'outcome_where_expr':['max_bll0 == max_bll0']})
-
 def bll6_forest_no_wic():
     """
     No wic features
diff --git a/lead/model/workflows.py b/lead/model/workflows.py
index cafa0c8..32191ab 100644
--- a/lead/model/workflows.py
+++ b/lead/model/workflows.py
@@ -143,7 +143,8 @@ def bll6_models(estimators, cv_search={}, transform_search={}):
     transformd = dict(
         wic_sample_weight=[0],
         aggregations=aggregations.args,
-        outcome_expr=['max_bll0 >= 6']
+        outcome_expr='max_bll0 >= 6',
+        outcome_where_expr='max_bll0 == max_bll0' # NaN != NaN, so this is equivalent to max_bll0.notnull()
     )
     transformd.update(transform_search)
     return models(estimators, cvd, transformd)

From a29a4674e0e6f7beb3cf61bdfb23f140e63e9fee Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Wed, 25 Oct 2017 18:48:55 -0500
Subject: [PATCH 3/7] include (block,ward,community) pieces in address dataset

---
 lead/model/address.py | 32 +++++++++++++++++++++++++++++++-
 lead/model/data.py    |  2 +-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/lead/model/address.py b/lead/model/address.py
index 63250be..3c56415 100644
--- a/lead/model/address.py
+++ b/lead/model/address.py
@@ -6,7 +6,37 @@
 import numpy as np
 import logging
 
-addresses = FromSQL(table='output.addresses')
+# in addition to all addresses, we add all cells in the partition
+# created by intersecting blocks, wards and communities 
+# in anticipation of any new addresses in deployment
+addresses = FromSQL("""
+with blocks as (
+select
+    b.geoid10::double precision census_block_id,
+    substring(b.geoid10 for 11)::double precision census_tract_id,
+    c.area_numbe::int community_area_id,
+    w.ward::int ward_id
+from input.census_blocks b
+join input.community_areas c
+    on st_intersects(b.geom, c.geom)
+join input.wards w
+    on st_intersects(b.geom, w.geom) and st_intersects(c.geom, w.geom)
+)
+select
+    null address,
+    null address_lat,
+    null address_lng,
+    null as address_id,
+    null as building_id,
+    null as complex_id, *
+from blocks
+UNION ALL
+select address, address_lat, address_lng, 
+    address_id, building_id, complex_id,
+    census_block_id, census_tract_id, 
+    community_area_id, ward_id
+from output.addresses
+    """, tables=['output.addresses', 'input.census_blocks', 'input.census_tracts', 'input.community_areas', 'input.wards'])
 addresses.target = True
 
 class LeadAddressLeft(Step):
diff --git a/lead/model/data.py b/lead/model/data.py
index b3061ef..6432078 100644
--- a/lead/model/data.py
+++ b/lead/model/data.py
@@ -83,7 +83,7 @@ def run(self, acs, left, aux=None):
                 sample weights, and evaluation.
         """
         if self.address:
-            index_columns = ['address','date']
+            index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date']
         if not self.address:
             index_columns = ['kid_id', 'address_id', 'date']
 

From 96a06174699a7ca06bb4dd15243504a911da28b8 Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Tue, 7 Nov 2017 16:35:29 -0600
Subject: [PATCH 4/7] use drain from github in requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2bb3cec..9377c0a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-drain
+git+https://github.com/potash/drain
 git+https://github.com/potash/scikit-learn@merged/balanced-random-forest#egg=scikit-learn

From 170a54669e969802862b47c59ee7cd4cb4acbe6e Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Thu, 30 Nov 2017 18:17:56 -0600
Subject: [PATCH 5/7] fix index

---
 lead/model/data.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lead/model/data.py b/lead/model/data.py
index 6432078..7cd4d3a 100644
--- a/lead/model/data.py
+++ b/lead/model/data.py
@@ -32,8 +32,8 @@ def __init__(self, month, day, year_min, year_max, wic_lag=None, dtype=None, add
             year_max: the year to stop generating features
             wic_lag: a lag for the WIC aggregations, parsed by
                 drain.data.parse_delta, e.g. '6m' is a six month lag.
-                Defaultis to None, which is no lag.
-            dtype: the dtype to use for features. Defaults to np.float16.
+                Defaults to None, which is no lag.
+            dtype: the dtype to use for features. Defaults to np.float16 for memory efficiency.
             address: whether to build an address dataset. Defaults to False,
                 which builds a kid dataset.
         """
@@ -83,15 +83,16 @@ def run(self, acs, left, aux=None):
                 sample weights, and evaluation.
         """
         if self.address:
-            index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date']
+            index_columns = ['address', 'census_block_id', 'census_tract_id', 'community_area_id', 'date']
+            left_columns = ['ward_id', 'address_lat', 'address_lng']
         if not self.address:
             index_columns = ['kid_id', 'address_id', 'date']
+            left_columns = ['ward_id', 'community_area_id', 'address_lat', 'address_lng']
 
-        left_columns = ['ward_id', 'community_area_id', 'address_lat', 'address_lng']
         left = left[index_columns + left_columns]
 
         logging.info('Binarizing community area and ward')
-        left = data.binarize(left, ['community_area_id', 'ward_id'], astype=self.dtype)
+        left = data.binarize(left, ['community_area_id', 'ward_id'], astype=self.dtype, drop=(not self.address))
 
         logging.info('Joining aggregations')
         X = left.join([a.result for a in self.aggregation_joins] + [acs])

From 62641e26391e9e01dd7201c67bff95a844b2acc6 Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Fri, 15 Dec 2017 13:28:23 -0600
Subject: [PATCH 6/7] include ward

---
 lead/model/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lead/model/data.py b/lead/model/data.py
index 7cd4d3a..0e6a9cb 100644
--- a/lead/model/data.py
+++ b/lead/model/data.py
@@ -83,7 +83,7 @@ def run(self, acs, left, aux=None):
                 sample weights, and evaluation.
         """
         if self.address:
-            index_columns = ['address', 'census_block_id', 'census_tract_id', 'community_area_id', 'date']
+            index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date']
             left_columns = ['ward_id', 'address_lat', 'address_lng']
         if not self.address:
             index_columns = ['kid_id', 'address_id', 'date']

From 61c55192e32f24e70207b15c6ce77a883b5ccee7 Mon Sep 17 00:00:00 2001
From: Eric Potash <eric@k2co3.net>
Date: Wed, 20 Dec 2017 12:32:05 -0600
Subject: [PATCH 7/7] fix geographies

---
 lead/model/address.py | 1 +
 lead/model/data.py    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/lead/model/address.py b/lead/model/address.py
index 3c56415..27b55a5 100644
--- a/lead/model/address.py
+++ b/lead/model/address.py
@@ -21,6 +21,7 @@
     on st_intersects(b.geom, c.geom)
 join input.wards w
     on st_intersects(b.geom, w.geom) and st_intersects(c.geom, w.geom)
+group by 1,2,3,4
 )
 select
     null address,
diff --git a/lead/model/data.py b/lead/model/data.py
index 0e6a9cb..69ca99b 100644
--- a/lead/model/data.py
+++ b/lead/model/data.py
@@ -84,7 +84,7 @@ def run(self, acs, left, aux=None):
         """
         if self.address:
             index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date']
-            left_columns = ['ward_id', 'address_lat', 'address_lng']
+            left_columns = ['address_lat', 'address_lng']
         if not self.address:
             index_columns = ['kid_id', 'address_id', 'date']
             left_columns = ['ward_id', 'community_area_id', 'address_lat', 'address_lng']