From fd80525799107f7f40b5aad3ed4bafe9548ed5ac Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Wed, 18 Oct 2017 11:05:24 -0500 Subject: [PATCH 1/7] add outcome_where_expr argument to match drain missing outcome handling --- lead/model/experiments.py | 7 +++++++ lead/model/transform.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/lead/model/experiments.py b/lead/model/experiments.py index bcdc31e..5c7493e 100644 --- a/lead/model/experiments.py +++ b/lead/model/experiments.py @@ -1,6 +1,13 @@ from .workflows import * from copy import deepcopy +def bll6_forest_where(): + """ + The basic temporal cross-validation workflow + """ + return bll6_models(forest(), transform_search={'outcome_where_expr':[None, 'max_bll0 == max_bll0']}) + return bll6_models(forest(), transform_search={'outcome_where_expr':['max_bll0 == max_bll0']}) + def bll6_forest_no_wic(): """ No wic features diff --git a/lead/model/transform.py b/lead/model/transform.py index 1a201a7..f402659 100644 --- a/lead/model/transform.py +++ b/lead/model/transform.py @@ -14,19 +14,23 @@ class LeadTransform(Step): performing feature selection and creating sample weights. 
""" def __init__(self, inputs, outcome_expr, aggregations, - wic_sample_weight=0, exclude=[], include=[]): + outcome_where_expr=None, wic_sample_weight=0, + exclude=[], include=[]): """ Args: inputs: list containing a LeadCrossValidate step outcome_expr: the query to perform on the auxillary information to produce an outcome variable aggregations: defines which of the SpacetimeAggregations to include - and which to drop + and which to drop + outcome_where_expr: where to evaluate the outcome_expr, + defaults to None, which means everywhere wic_sample_weight: optional different sample weight for wic kids """ Step.__init__(self, inputs=inputs, outcome_expr=outcome_expr, aggregations=aggregations, + outcome_where_expr=outcome_where_expr, wic_sample_weight=wic_sample_weight, exclude=exclude, include=include) @@ -40,6 +44,8 @@ def run(self, X, aux, train, test): """ y = aux.eval(self.outcome_expr) + if self.outcome_where_expr is not None: + y = y.where(aux.eval(self.outcome_where_expr)) logging.info('Selecting aggregations') aggregations = self.get_input(LeadData).aggregations From 24eb97c65b4b69067cb0cb50437cd1c0e4ff3ff4 Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Tue, 24 Oct 2017 15:05:39 -0500 Subject: [PATCH 2/7] tested and incorporated into workflows --- lead/model/experiments.py | 7 ------- lead/model/workflows.py | 3 ++- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/lead/model/experiments.py b/lead/model/experiments.py index 5c7493e..bcdc31e 100644 --- a/lead/model/experiments.py +++ b/lead/model/experiments.py @@ -1,13 +1,6 @@ from .workflows import * from copy import deepcopy -def bll6_forest_where(): - """ - The basic temporal cross-validation workflow - """ - return bll6_models(forest(), transform_search={'outcome_where_expr':[None, 'max_bll0 == max_bll0']}) - return bll6_models(forest(), transform_search={'outcome_where_expr':['max_bll0 == max_bll0']}) - def bll6_forest_no_wic(): """ No wic features diff --git a/lead/model/workflows.py 
b/lead/model/workflows.py index cafa0c8..32191ab 100644 --- a/lead/model/workflows.py +++ b/lead/model/workflows.py @@ -143,7 +143,8 @@ def bll6_models(estimators, cv_search={}, transform_search={}): transformd = dict( wic_sample_weight=[0], aggregations=aggregations.args, - outcome_expr=['max_bll0 >= 6'] + outcome_expr='max_bll0 >= 6', + outcome_where_expr='max_bll0 == max_bll0' # this means max_bll0.notnull() ) transformd.update(transform_search) return models(estimators, cvd, transformd) From a29a4674e0e6f7beb3cf61bdfb23f140e63e9fee Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Wed, 25 Oct 2017 18:48:55 -0500 Subject: [PATCH 3/7] include (block,ward,community) pieces in address dataset --- lead/model/address.py | 32 +++++++++++++++++++++++++++++++- lead/model/data.py | 2 +- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/lead/model/address.py b/lead/model/address.py index 63250be..3c56415 100644 --- a/lead/model/address.py +++ b/lead/model/address.py @@ -6,7 +6,37 @@ import numpy as np import logging -addresses = FromSQL(table='output.addresses') +# in addition to all addresses, we add all cells in the partition +# created by intersecting blocks, wards and communities +# in anticipation of any new addresses in deployment +addresses = FromSQL(""" +with blocks as ( +select + b.geoid10::double precision census_block_id, + substring(b.geoid10 for 11)::double precision census_tract_id, + c.area_numbe::int community_area_id, + w.ward::int ward_id +from input.census_blocks b +join input.community_areas c + on st_intersects(b.geom, c.geom) +join input.wards w + on st_intersects(b.geom, w.geom) and st_intersects(c.geom, w.geom) +) +select + null address, + null address_lat, + null address_lng, + null as address_id, + null as building_id, + null as complex_id, * +from blocks +UNION ALL +select address, address_lat, address_lng, + address_id, building_id, complex_id, + census_block_id, census_tract_id, + community_area_id, ward_id +from output.addresses + 
""", tables=['output.addresses', 'input.census_blocks', 'input.census_tracts', 'input.community_areas', 'input.wards']) addresses.target = True class LeadAddressLeft(Step): diff --git a/lead/model/data.py b/lead/model/data.py index b3061ef..6432078 100644 --- a/lead/model/data.py +++ b/lead/model/data.py @@ -83,7 +83,7 @@ def run(self, acs, left, aux=None): sample weights, and evaluation. """ if self.address: - index_columns = ['address','date'] + index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date'] if not self.address: index_columns = ['kid_id', 'address_id', 'date'] From 96a06174699a7ca06bb4dd15243504a911da28b8 Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Tue, 7 Nov 2017 16:35:29 -0600 Subject: [PATCH 4/7] use drain from github in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2bb3cec..9377c0a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -drain +git+https://github.com/potash/drain git+https://github.com/potash/scikit-learn@merged/balanced-random-forest#egg=scikit-learn From 170a54669e969802862b47c59ee7cd4cb4acbe6e Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Thu, 30 Nov 2017 18:17:56 -0600 Subject: [PATCH 5/7] fix index --- lead/model/data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lead/model/data.py b/lead/model/data.py index 6432078..7cd4d3a 100644 --- a/lead/model/data.py +++ b/lead/model/data.py @@ -32,8 +32,8 @@ def __init__(self, month, day, year_min, year_max, wic_lag=None, dtype=None, add year_max: the year to stop generating features wic_lag: a lag for the WIC aggregations, parsed by drain.data.parse_delta, e.g. '6m' is a six month lag. - Defaultis to None, which is no lag. - dtype: the dtype to use for features. Defaults to np.float16. + Defaults to None, which is no lag. + dtype: the dtype to use for features. 
Defaults to np.float16 for memory efficiency. address: whether to build an address dataset. Defaults to False, which builds a kid dataset. """ @@ -83,15 +83,16 @@ def run(self, acs, left, aux=None): sample weights, and evaluation. """ if self.address: - index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date'] + index_columns = ['address', 'census_block_id', 'census_tract_id', 'community_area_id', 'date'] + left_columns = ['ward_id', 'address_lat', 'address_lng'] if not self.address: index_columns = ['kid_id', 'address_id', 'date'] + left_columns = ['ward_id', 'community_area_id', 'address_lat', 'address_lng'] - left_columns = ['ward_id', 'community_area_id', 'address_lat', 'address_lng'] left = left[index_columns + left_columns] logging.info('Binarizing community area and ward') - left = data.binarize(left, ['community_area_id', 'ward_id'], astype=self.dtype) + left = data.binarize(left, ['community_area_id', 'ward_id'], astype=self.dtype, drop=(not self.address)) logging.info('Joining aggregations') X = left.join([a.result for a in self.aggregation_joins] + [acs]) From 62641e26391e9e01dd7201c67bff95a844b2acc6 Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Fri, 15 Dec 2017 13:28:23 -0600 Subject: [PATCH 6/7] include ward --- lead/model/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lead/model/data.py b/lead/model/data.py index 7cd4d3a..0e6a9cb 100644 --- a/lead/model/data.py +++ b/lead/model/data.py @@ -83,7 +83,7 @@ def run(self, acs, left, aux=None): sample weights, and evaluation. 
""" if self.address: - index_columns = ['address', 'census_block_id', 'census_tract_id', 'community_area_id', 'date'] + index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date'] left_columns = ['ward_id', 'address_lat', 'address_lng'] if not self.address: index_columns = ['kid_id', 'address_id', 'date'] From 61c55192e32f24e70207b15c6ce77a883b5ccee7 Mon Sep 17 00:00:00 2001 From: Eric Potash Date: Wed, 20 Dec 2017 12:32:05 -0600 Subject: [PATCH 7/7] fix geographies --- lead/model/address.py | 1 + lead/model/data.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lead/model/address.py b/lead/model/address.py index 3c56415..27b55a5 100644 --- a/lead/model/address.py +++ b/lead/model/address.py @@ -21,6 +21,7 @@ on st_intersects(b.geom, c.geom) join input.wards w on st_intersects(b.geom, w.geom) and st_intersects(c.geom, w.geom) +group by 1,2,3,4 ) select null address, diff --git a/lead/model/data.py b/lead/model/data.py index 0e6a9cb..69ca99b 100644 --- a/lead/model/data.py +++ b/lead/model/data.py @@ -84,7 +84,7 @@ def run(self, acs, left, aux=None): """ if self.address: index_columns = ['address', 'census_block_id', 'ward_id', 'community_area_id', 'date'] - left_columns = ['ward_id', 'address_lat', 'address_lng'] + left_columns = ['address_lat', 'address_lng'] if not self.address: index_columns = ['kid_id', 'address_id', 'date'] left_columns = ['ward_id', 'community_area_id', 'address_lat', 'address_lng']