From b664d220a5a573063da86f1501a706a2a91ea875 Mon Sep 17 00:00:00 2001 From: Ben Stabler Date: Fri, 23 Apr 2021 14:45:11 -0700 Subject: [PATCH 01/14] Develop (#132) * deprecate py2.7 * Multiprocess (#130) --- .gitignore | 1 + .travis.yml | 15 +- LICENSE.txt | 2 + MANIFEST.in | 1 - docs/application_configuration.rst | 189 ++++++++++++------ docs/conf.py | 6 +- docs/getting_started.rst | 51 +++-- docs/index.rst | 75 ++++--- docs/software.rst | 10 +- docs/validation.rst | 37 ++-- example_calm/.gitignore | 2 + example_calm/configs/settings.yaml | 15 +- example_calm/configs_mp/settings.yaml | 76 +++++++ example_calm/output/.gitignore | 1 + example_calm/output_mp/.gitignore | 5 + example_calm/output_mp/log/.gitignore | 3 + example_calm/run_populationsim.py | 84 +++----- example_calm_repop/run_populationsim.py | 2 +- example_survey_weighting/run_populationsim.py | 2 +- example_test/.gitignore | 1 + example_test/convert_test_data.py | 2 +- example_test/output/.gitignore | 2 + ez_setup.py | 20 +- populationsim/assign.py | 2 +- populationsim/balancer.py | 3 +- populationsim/integerizer.py | 8 +- populationsim/lp.py | 2 +- populationsim/lp_cvx.py | 2 +- populationsim/multi_integerizer.py | 25 ++- populationsim/simul_balancer.py | 3 +- populationsim/steps/__init__.py | 10 +- populationsim/steps/expand_households.py | 16 +- populationsim/steps/final_seed_balancing.py | 2 +- populationsim/steps/initial_seed_balancing.py | 2 +- populationsim/steps/input_pre_processor.py | 8 +- .../steps/integerize_final_seed_weights.py | 2 +- populationsim/steps/meta_control_factoring.py | 6 +- populationsim/steps/repop_balancing.py | 2 +- populationsim/steps/setup_data_structures.py | 16 +- populationsim/steps/sub_balancing.py | 19 +- populationsim/steps/summarize.py | 3 +- .../steps/write_synthetic_population.py | 2 +- populationsim/tests/configs_mp/settings.yaml | 48 +++++ populationsim/tests/output/.gitignore | 2 + populationsim/tests/run_mp.py | 72 +++++++ populationsim/tests/test_flex.py | 
4 +- populationsim/tests/test_integerizer.py | 2 +- populationsim/tests/test_multi_integerizer.py | 2 +- populationsim/tests/test_steps_mp.py | 23 +++ populationsim/tests/test_tracing.py | 4 +- setup.cfg | 2 +- setup.py | 17 +- 52 files changed, 622 insertions(+), 289 deletions(-) create mode 100644 example_calm/.gitignore create mode 100644 example_calm/configs_mp/settings.yaml create mode 100644 example_calm/output_mp/.gitignore create mode 100644 example_calm/output_mp/log/.gitignore create mode 100644 example_test/.gitignore create mode 100644 populationsim/tests/configs_mp/settings.yaml create mode 100644 populationsim/tests/run_mp.py create mode 100644 populationsim/tests/test_steps_mp.py diff --git a/.gitignore b/.gitignore index 5a96d9a9..195b984a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ sandbox/ regress/ example_test_no_integerizing/ +example_mtc/ .idea .ipynb_checkpoints diff --git a/.travis.yml b/.travis.yml index ca6b90b4..cce01c80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,24 +3,27 @@ language: python sudo: false python: -- '2.7' -- '3.6' - '3.7' +- '3.8' install: -- wget http://repo.continuum.io/miniconda/Miniconda-3.7.0-Linux-x86_64.sh -O miniconda.sh +- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda -- export PATH="$HOME/miniconda/bin:$PATH" +- source "$HOME/miniconda/etc/profile.d/conda.sh" - hash -r - conda config --set always_yes yes --set changeps1 no - conda update -q conda -- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION future -- source activate test-environment +- conda info -a +- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION +- conda activate test-environment - conda install pytest pytest-cov coveralls pycodestyle - pip install . 
+- pip freeze + script: - pycodestyle populationsim - py.test --cov populationsim --cov-report term-missing + after_success: - coveralls # Build docs diff --git a/LICENSE.txt b/LICENSE.txt index 99e7272b..1654ca90 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,3 +1,5 @@ +BSD 3-Clause License + PopulationSim Contributions Copyright (C) by the contributing authors diff --git a/MANIFEST.in b/MANIFEST.in index 7bd71819..f0786354 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ include ez_setup.py -include README.rst graft example_calm graft example_calm_repop graft example_survey_weighting diff --git a/docs/application_configuration.rst b/docs/application_configuration.rst index 450fc1cc..b970c4cb 100644 --- a/docs/application_configuration.rst +++ b/docs/application_configuration.rst @@ -7,20 +7,20 @@
Application & Configuration -============================= +=========================== This section describes how to set up a new PopulationSim implementation. In order to create a new PopulationSim implementation, the user must first understand the requirements of the project in terms of geographic resolution and details desired in the synthetic population. Once the requirements of the project have been established, the next step is to prepare the inputs to PopulationSim which includes seed population tables and geographic controls. Next, PopulationSim needs to be configured for available inputs and features desired in the final synthetic population. After this, the user needs to run PopulationSim and resolve any data related errors. Finally, the user should validate the output synthetic population against the controls to understand the precision of the synthetic population compared to controls and the amount of variance in the population for each control. Selecting Geographies ----------------------- +--------------------- -PopulationSim can represent both household and person level controls at multiple geographic levels. Therefore the user must define what geographic units to use for each control. This is an art; there is not necessarily any 'right' way to define geographic areas or to determine what geographic level to use for each control. However, there are important considerations for selecting geography, discussed below. +PopulationSim can represent both household and person level controls at multiple geographic levels. Therefore the user must define what geographic units to use for each control. There is not necessarily any 'right' way to define geographic areas or to determine what geographic level to use for each control. However, there are important considerations for selecting geography, discussed below. -Traditionally, travel forecasting models have followed the sequential four-step model framework. 
This required the modeling region to be divided into zones, typically the size of census block groups or tracts. The zones used in four-step process are typically known as Transportation Analysis Zones (TAZs). The spatial boundaries of TAZs varies across modeling region and ranges from a city block to a large area in the suburb within a modeling region. If building a synthetic population for a trip-based model, or an Activity-based model (ABM) whose smallest geography is the TAZ, then there is no reason to select a smaller geoegraphical unit than the TAZ for any of the controls. +Traditionally, travel forecasting models have followed the sequential four-step model framework. This required the modeling region to be divided into zones, typically the size of census block groups or tracts. The zones used in four-step process are typically known as Transportation Analysis Zones (TAZs). The spatial boundaries of TAZs varies across modeling region and ranges from a city block to a large area in the suburb within a modeling region. If building a synthetic population for a trip-based model, or an activity-based travel model (ABM) whose smallest geography is the TAZ, then there is no reason to select a smaller geographical unit than the TAZ for any of the controls. -Activity-based models (ABMs) operate in a micro-simulation framework, where travel decisions are modeled explicitly for persons and households in the synthetic population. Many ABMs (e.g., DaySim, CT-RAMP) operate at a finer spatial resolution than TAZs, wherein all location choices (e.g., usual work location, tour destination choice) are modeled at a sub-TAZ geography. This finer geography is typically referred to as Micro-Analysis Zones (MAZs) which are smaller zones nested within TAZs. Models that represent behavior at the MAZ level requires that MAZs are used as the lowest level of control, so that the synthetic population will identify the MAZ that each household resides in. 
+ABMs operate at the individual level, where travel decisions are modeled explicitly for persons and households in the synthetic population. Many ABMs operate at a finer spatial resolution than TAZs, wherein all location choices (e.g., usual work location, tour destination choice) are modeled at a sub-TAZ geography. This finer geography is typically referred to as Micro-Analysis Zones (MAZs) which are smaller zones nested within TAZs. Models that represent behavior at the MAZ level requires that MAZs are used as the lowest level of control, so that the synthetic population will identify the MAZ that each household resides in. As discussed earlier, two main inputs to a population synthesizer are a seed sample and controls. The seed sample can come from a household travel survey or from American Community Survey (ACS) Public Use Microdata Sample (PUMS), with latter being the most common source. The PUMS data contains a sample of actual responses to the ACS, but the privacy of each household is protected by aggregating all household residential locations into relatively large regions called Public Use Microdata Areas (PUMAs). PUMAs are special non-overlapping areas that partition each state into contiguous geographic units containing no fewer than 100,000 people each. Some larger regions are composed of many PUMAs, while other, smaller regions have only one PUMA, or may even be smaller than a PUMA. It is not a problem to use PopulationSim to generate a synthetic population if the region is smaller than a PUMA; PopulationSim will 'fit' the PUMA-level population to regional control data as an initial step. @@ -49,7 +49,7 @@ The hierarchy of geographies is important when making a decision regarding contr * Seed (e.g., PUMA) * Sub-Seed (e.g., TAZ, MAZ) -The Meta geography is the entire region. Currently, PopulationSim can handle only one Meta geography. The Seed geography is the geographic resolution of the seed data. There can be one or more Seed geographies. 
PopulationSim can handle any number of nested Sub-Seed geographies. More information on PopulationSim algorithm can be found from the PopulationSim specifications in the :ref:`docs` section. +The Meta geography is the entire region. PopulationSim can handle only one Meta geography. The Seed geography is the geographic resolution of the seed data. There can be one or more Seed geographies. PopulationSim can handle any number of nested Sub-Seed geographies. More information on PopulationSim algorithm can be found from the PopulationSim specifications in the :ref:`docs` section. Geographic Cross-walk ~~~~~~~~~~~~~~~~~~~~~ @@ -74,12 +74,12 @@ After selecting the geographies, the next step is to prepare a geographic cross- Preparing seed and control data --------------------------------- +------------------------------- Seed sample ~~~~~~~~~~~ -As mentioned in previous section, the seed sample is typically obtained from the ACS PUMS. One of the main requirements for the seed sample is that it should be representative of the modeling region. In case of ACS PUMS, this can be ensured by selecting PUMAs representing the modeling region both demographically and geographically. PUMA boundaries may not perfectly line up against the modeling region boundaries and overlaps are possible. Each sub-seed geography must be assigned to a Seed geography, and each Seed geography must be assigned to a Meta geography. +As mentioned in previous section, the seed sample is typically obtained from the ACS PUMS. One of the main requirements for the seed sample is that it should be representative of the modeling region. In the case of ACS PUMS, this can be ensured by selecting PUMAs representing the modeling region both demographically and geographically. PUMA boundaries may not perfectly line up against the modeling region boundaries and overlaps are possible. Each sub-seed geography must be assigned to a Seed geography, and each Seed geography must be assigned to a Meta geography. 
The seed sample must contain all of the specified control variables, as well as any variables that are needed for the travel model but not specified as controls. For population groups that use completely separate, non-overlapping controls, such as residential population and group-quarter population, separate seed samples are prepared. In the ACS PUMS datasets, it is possible to have zero-person households in the raw data table (`NP = 0`); these records must be filtered from the seed data. PopulationSim can be set up and run separately for each population segment using the same geographic system. The outputs from each run can be combined into a unified synthetic population as a post processing step. @@ -105,7 +105,7 @@ versions, and then add the error to the largest category by subtracting it from Configuration ------------- -Below is PopulationSim's directory structure followed by a description of inputs. +Below is PopulationSim's typical directory structure followed by a description of inputs. .. image:: images/PopulationSimFolderStructure.png @@ -117,15 +117,15 @@ PopulationSim is run via **run_populationsim.py**. The user needs to first activ activate popsim python run_populationsim.py -PopulationSim is configured using the settings.YAML file. PopulationSim can be configured to run in **base** mode or **repop** mode. +PopulationSim is configured using the settings.yaml file. PopulationSim can be configured to run in **regular** mode or **repop** mode. -:base mode: +:regular mode: - The base configuration runs PopulationSim from beginning to end and produces a new synthetic population. + The regular configuration runs PopulationSim from beginning to end and produces a new synthetic population. This can run either single-process or multi-processed to save on runtime. :repop mode: - The repop configuration is used for repopulating a subset of zones for an existing synthetic population. 
The user has the option to *replace* or *append* to the existing synthetic population. These options are specified from the settings.YAML file, details can be found in the :ref:`settings` section. + The repop configuration is used for repopulating a subset of zones for an existing synthetic population. The user has the option to *replace* or *append* to the existing synthetic population. These options are specified from the settings.yaml file, details can be found in the :ref:`settings` section. The following sections describes the inputs and outputs, followed by discussion on configuring the settings file and specifying controls. @@ -134,7 +134,7 @@ The following sections describes the inputs and outputs, followed by discussion .. _inputs_outputs: Inputs & Outputs -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~ Please refer to the following definition list to understand the file names: @@ -147,17 +147,19 @@ Please refer to the following definition list to understand the file names: Working Directory Contents: -+-----------------------+----------------------------------------------------------------------------+ -| File | Description | -+=======================+============================================================================+ -| run_populationsim.py | Python script that orchestrates a PopulationSim run | -+-----------------------+----------------------------------------------------------------------------+ -| /configs | Sub-directory containing control specifications and configuration settings | -+-----------------------+----------------------------------------------------------------------------+ -| /data | Sub-directory containing all input files | -+-----------------------+----------------------------------------------------------------------------+ -| /output | Sub-directory containing all outputs, summaries and intermediate files | -+-----------------------+----------------------------------------------------------------------------+ 
++-----------------------+--------------------------------------------------------------------------------------------------------+ +| File | Description | ++=======================+========================================================================================================+ +| run_populationsim.py | Python script that orchestrates a PopulationSim run | ++-----------------------+--------------------------------------------------------------------------------------------------------+ +| /configs | Sub-directory containing control specifications and configuration settings | ++-----------------------+--------------------------------------------------------------------------------------------------------+ +| /configs_mp | Sub-directory containing configuration settings for running multi-processed if applicable | ++-----------------------+--------------------------------------------------------------------------------------------------------+ +| /data | Sub-directory containing all input files | ++-----------------------+--------------------------------------------------------------------------------------------------------+ +| /output | Sub-directory containing all outputs, summaries and intermediate files | ++-----------------------+--------------------------------------------------------------------------------------------------------+ -------------------------------------------------------------- @@ -166,15 +168,25 @@ Working Directory Contents: +--------------------+------------------------------------------------------------+ | File | Description | +====================+============================================================+ -| logging.yaml | YAML-based file for setting up logging | +| logging.yaml | yaml-based file for setting up logging | +--------------------+------------------------------------------------------------+ -| settings.yaml | YAML-based settings file to configure a PopulationSim run | +| settings.yaml | yaml-based settings file to 
configure a PopulationSim run | +--------------------+------------------------------------------------------------+ | controls.csv | CSV file to specify controls | +--------------------+------------------------------------------------------------+ -------------------------------------------------------------- +*/configs_mp* Sub-directory Contents: + ++--------------------+---------------------------------------------------------------+ +| File | Description | ++====================+===============================================================+ +| settings.yaml | additional yaml-based settings file for multiprocess running | ++--------------------+---------------------------------------------------------------+ + +-------------------------------------------------------------- + */data* Sub-directory Contents: +-------------------------------------+----------------------------------------------------------------------+ @@ -208,10 +220,10 @@ This sub-directory is populated at the end of the PopulationSim run. The table b | | | this file with the seed sample to generate a fully expanded synthetic population | +---------------------------------+----------------------------+-----------------------------------------------------------------------------------------+ | synthetic_households.csv | Final Synthetic Population | Fully expanded synthetic population of households. User can specify the attributes |br| | -| | | to be included from the *seed sample* in the *settings.YAML* file | +| | | to be included from the *seed sample* in the *settings.yaml* file | +---------------------------------+----------------------------+-----------------------------------------------------------------------------------------+ | synthetic_persons.csv | Final Synthetic Population | Fully expanded synthetic population of persons. 
User can specify the attributes to |br| | -| | | be included from the *seed sample* in the *settings.YAML* file | +| | | be included from the *seed sample* in the *settings.yaml* file | +---------------------------------+----------------------------+-----------------------------------------------------------------------------------------+ | incidence_table.csv | Intermediate | Intermediate incidence table | +---------------------------------+----------------------------+-----------------------------------------------------------------------------------------+ @@ -249,11 +261,19 @@ This sub-directory is populated at the end of the PopulationSim run. The table b .. _settings: Configuring Settings File -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~ + +PopulationSim is configured using the *configs/settings.yaml* file. The user has the flexibility to specify algorithm functionality, list geographies, invoke tracing, provide inputs specifications, select outputs, list the steps to run, and specify multiprocess settings. -PopulationSim is configured using the *configs/settings.YAML* file. The user has the flexibility to specify algorithm functionality, list geographies, invoke tracing, provide inputs specifications, select outputs and list the steps to run. The settings shown below are from the PopulationSim application for the CALM region as an example of how a run can be configured. The meta geography for CALM region is named as *Region*, the seed geography is *PUMA* and the two sub-seed geographies are *TRACT* and *TAZ*. The settings below are for this four geography application, but the user can configure PopulationSim for any number of geographies and use different geography names. +.. note:: + When running PopulationSim, multiple settings files can be specified so long as the ``inherit_settings: True`` setting is included in + subsequent files. This feature is used for the multi-processing configuration described below. 
To utilize this feature, one can run PopulationSim + with the following command: ``python run_populationsim.py -c configs_mp -c configs``. This command specifies two config folders, each with + a settings file, and the ``configs_mp`` settings inherit from the earlier ``configs`` settings. -Some of the setting are configured differently for the *repop* mode. The settings specific to the *repop* mode are described in the :ref:`settings_repop` section. +The settings shown below are from the PopulationSim application for the CALM region as an example of how a run can be configured. The meta geography for CALM region is named as *Region*, the seed geography is *PUMA* and the two sub-seed geographies are *TRACT* and *TAZ*. The settings below are for this four geography application, but the user can configure PopulationSim for any number of geographies and use different geography names. + +Some of the setting are configured differently for the *repop* mode. The settings specific to the *repop* mode are described in the :ref:`settings_repop` section. The settings specific to the *multiprocessing* setup are described in the :ref:`settings_mp` section. **Algorithm/Software Configuration**: @@ -267,6 +287,7 @@ These settings control the functionality of the PopulationSim algorithm. The set USE_SIMUL_INTEGERIZER: True USE_CVXPY: False max_expansion_factor: 30 + MAX_BALANCE_ITERATIONS_SIMULTANEOUS: 1000 +--------------------------------------+------------+---------------------------------------------------------------------------------+ | Attribute | Value | Description | @@ -299,7 +320,8 @@ These settings control the functionality of the PopulationSim algorithm. The set | | | The maximum expansion factor may have to be adjusted upwards if the target |br| | | | | is much greater than the seed number of households. 
|br| | +--------------------------------------+------------+---------------------------------------------------------------------------------+ - +| MAX_BALANCE_ITERATIONS_SIMULTANEOUS | Integer | Number of simultaneous list balancer iterations | ++--------------------------------------+------------+---------------------------------------------------------------------------------+ **Geographic Settings**: @@ -365,7 +387,7 @@ This setting is used to specify details of various inputs to PopulationSim. Belo * Geographic CrossWalk * Control data at each control geography -Note that Seed-Households, Seed-Persons and Geographic CrossWalk are all required tables and must be listed. There must be a control data file specified for each geography other than seed. For each input table, the user is required to specify an import table name, input CSV file name, index column name and column name map (only for renaming column names). The user can also specify a list of columns to be dropped from the input synthetic population seed data. An example is shown below followed by description of attributes. +Note that Seed-Households, Seed-Persons and Geographic CrossWalk are all required tables and must be listed. There must be a control data file specified for each geography other than seed. For each input table, the user is required to specify an import table name, input CSV file name, index column name and column name map (only for renaming column names). The user can also specify a list of columns to be dropped from the input synthetic population seed data. An example is shown below followed by description of attributes. :: @@ -542,7 +564,7 @@ This setting allows the user to specify the details of the expanded synthetic po -**Steps for base mode**: +**Steps for regular mode**: This setting lists the sub-modules or steps to be run by the PopulationSim orchestrator. The ActivitySim framework allows user to resume a PopulationSim run from a specific point. 
This is specified using the attribute ``resume_after``. The step, ``sub_balancing.geography`` is repeated for each sub-seed geography (the example below shows two, but there can be 0 or more). @@ -600,19 +622,78 @@ For detailed information on software implementation refer to :ref:`core_componen | :ref:`summarize` | Write aggregate summary files of controls and weights for all geographic levels to output dir | +--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +.. _settings_mp: + +Configuring Settings File for Multiprocessing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes the settings that are additionally configured for running PopulationSim with +multiprocessing to reduce runtime. PopulationSim uses ActivitySim's multiprocessing capabilities, which +are described in more detail `here `_. + +The example below can be found in the ``example_calm\configs_mp\settings.yaml`` file. The group of model steps +identified as ``mp_seed_balancing`` and starting with ``input_pre_processor`` +are run single process until the next group of model steps identified as ``mp_sub_balancing_TAZ`` and starting with +``sub_balancing.geography=TAZ`` is reached, at which time PopulationSim runs these steps in parallel using two processors +by slicing the problem into separate geographic batches based on the ``slice_geography: TRACT`` setting. It then +returns to single process with the final group of model steps identified as ``mp_summarize`` and +beginning with ``expand_households``. 
+ +:: + + inherit_settings: True + multiprocess: True + num_processes: 2 + cleanup_pipeline_after_run: True + slice_geography: TRACT + + multiprocess_steps: + - name: mp_seed_balancing + begin: input_pre_processor + - name: mp_sub_balancing_TAZ + begin: sub_balancing.geography=TAZ + num_processes: 2 + slice: + tables: + - slice_crosswalk + - crosswalk + # don't slice any tables not explicitly listed above in slice.tables + except: True + # the following tables are added by sub_balancer and should be coalesced + coalesce: + - TAZ_weights + - TAZ_weights_sparse + - trace_TAZ_weights + - name: mp_summarize + begin: expand_households + + ++-------------------------------+--------------------------------------------------------------------------------------------------------------+ +| Attribute | Description | ++===============================+==============================================================================================================+ +| inherit_settings | True means this settings file inherits settings from settings file(s) identified earlier in the run command | ++-------------------------------+--------------------------------------------------------------------------------------------------------------+ +| num_processes | Number of processors to use for multiprocessing | ++-------------------------------+--------------------------------------------------------------------------------------------------------------+ +| cleanup_pipeline_after_run | Removes multiprocess process specific intermediate pipelines at the end of the run if desired | ++-------------------------------+--------------------------------------------------------------------------------------------------------------+ +| slice_geography | The geography used to separate the problem into parallel geographic batches for balancing | ++-------------------------------+--------------------------------------------------------------------------------------------------------------+ +| 
multiprocess_steps | Specifies which steps to run single process and multiprocess | ++-------------------------------+--------------------------------------------------------------------------------------------------------------+ .. _settings_repop: Configuring Settings File for repop Mode -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sections describes the settings that are configured differently for the *repop* mode. **Input Data Tables for repop mode** -The repop mode runs over an existing synthetic population and uses the data pipeline (HDF5 file) from the base run as an input. User should copy the HDF5 file from the base outputs to the *output* folder of the repop set up. The data input which needs to be specified in this setting is the control data for the subset of geographies to be modified. Input tables for the repop mode can be specified in the same manner as base mode. However, only one geography can be controlled. In the example below, TAZ controls are specified. The controls specified in TAZ_control_data do not have to be consistent with the controls specified in the data used to control the initial population. Only those geographic units to be repopulated should be specified in the control data (for example, TAZs 314 through 317). +The repop mode runs over an existing synthetic population and uses the data pipeline (HDF5 file) from the regular run as an input. User should copy the HDF5 file from the regular outputs to the *output* folder of the repop set up. The data input which needs to be specified in this setting is the control data for the subset of geographies to be modified. Input tables for the repop mode can be specified in the same manner as regular mode. However, only one geography can be controlled. In the example below, TAZ controls are specified. 
The controls specified in TAZ_control_data do not have to be consistent with the controls specified in the data used to control the initial population. Only those geographic units to be repopulated should be specified in the control data (for example, TAZs 314 through 317). :: @@ -641,7 +722,7 @@ It should be noted that only the summary_GEOG_NAME.csv summary file is available **Steps for repop mode**: -When running PoulationSim in repop mode, the steps specified in this setting are run. As mentioned earlier, the repop mode runs over an existing synthetic population. The default value for the ``resume_after`` setting under the repop mode is *summarize* which is the last step of a base run. In other words, the repop mode starts from the last step of the base run and modifies the base synthetic population as per the new controls. The user can choose either *append* or *replace* in the ``expand_households.repop`` attribute to modify the existing synthetic population. The *append* option adds to the existing synthetic population in the specified geographies, while the *replace* option replaces any existing synthetic population with newly synthesized population in the specified geographies. +When running PopulationSim in repop mode, the steps specified in this setting are run. As mentioned earlier, the repop mode runs over an existing synthetic population. The default value for the ``resume_after`` setting under the repop mode is *summarize* which is the last step of a regular run. In other words, the repop mode starts from the last step of the regular run and modifies the regular synthetic population as per the new controls. The user can choose either *append* or *replace* in the ``expand_households.repop`` attribute to modify the existing synthetic population. 
The *append* option adds to the existing synthetic population in the specified geographies, while the *replace* option replaces any existing synthetic population with newly synthesized population in the specified geographies. :: @@ -676,27 +757,27 @@ For information on software implementation of repop balancing refer to :ref:`rep .. _settings_weighting: How to prepare PopulationSim inputs for survey weighting -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The main difference in the seed sample for population synthesis and survey weighting is that in case of survey weighting the geographic allocation is known. PopulationSim operates at multiple geographies and performs geographic allocation of the sample to match controls at lower geographies. Since it is undesirable to change geographic allocation in case of survey weighting, controls should be specified only at one geographic level – the seed geography. All the other inputs must be prepared in the same fashion as for population synthesis. **Configuring PopulationSim for survey weighting**: -Since survey weighting does not involve expanding the survey sample, integerization is not needed. Integerization can be skipped by switching off integerization in the YAML settings file as follows: +Since survey weighting does not involve expanding the survey sample, integerization is not needed. Integerization can be skipped by switching off integerization in the yaml settings file as follows: :: NO_INTEGERIZATION_EVER: True -User may want to specify the maximum and minimum limit on expansion of initial weights in the YAML settings file as follows: +User may want to specify the maximum and minimum limit on expansion of initial weights in the yaml settings file as follows: :: max_expansion_factor: 4 # Default is 30 min_expansion_factor: 0.5 -The desired output for survey weighting is a list of final weights by household ID. 
In order to achieve this, the grouping of incidence must be switched off in the YAML settings file as follows: +The desired output for survey weighting is a list of final weights by household ID. In order to achieve this, the grouping of incidence must be switched off in the yaml settings file as follows: :: @@ -705,7 +786,7 @@ The desired output for survey weighting is a list of final weights by household **Output Tables for weighting mode**: -To obtain the final weights by household ID, the seed geography weights table must be specified in the YAML settings file as below: +To obtain the final weights by household ID, the seed geography weights table must be specified in the yaml settings file as below: :: @@ -731,7 +812,7 @@ The seed_geography_weights file contains the following columns: - It should be noted that under NO_INTEGERIZATION_EVER mode the expanded_household_ids file is empty. Specifying Controls -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ The controls for a PopulationSim run are specified using the control specification CSV file. Following the ActivitySim framework, Python expressions are used for specifying control constraints. An example file is below. @@ -782,26 +863,10 @@ Some conventions for writing expressions: Error Handling & Debugging -------------------------- -It is recommended to do appropriate checks on input data before running PopulationSim. - -Checks on data inputs -~~~~~~~~~~~~~~~~~~~~~~~ - -While the PopulationSim algorithm is designed to work even with imperfect data, an error-free and consistent set of input controls guarantees optimal performance. Poor performance and errors are usually the result of inconsistent data and it is the responsibility of the user to do necessary QA/QC on the input data. Some data problems that are frequently encountered are as follows: +It is recommended to do appropriate checks on input data before running PopulationSim. 
While the PopulationSim algorithm is designed to work even with imperfect data, an error-free and consistent set of input controls guarantees optimal performance. Poor performance and errors are usually the result of inconsistent data and it is the responsibility of the user to do necessary QA/QC on the input data. Some data problems that are frequently encountered are as follows: * Miscoding of data * Inconsistent controls (for example, household-level households by size controls do not match person-level controls on total persons, or household-level workers per household controls do not match person-level workers by occupation) * Controls do not add to total number of households * Controls do not aggregate consistently across geographies - * missing or mislabelled controls - -Common run-time errors -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Below is a list of common run-time errors: - -**Tabs in settings.YAML file** - -User should not use /t (tabs) while configuring the settings.YAML file. Presence of /t would result in the error shown below. {SPACE} should be used for indenting purposes and hard returns at the end of each line. - - .. 
image:: images/YAML_Tab_Error.JPG + * Missing or mislabelled controls diff --git a/docs/conf.py b/docs/conf.py index 9470f5c6..2c5df3c5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,9 +20,9 @@ # -- Get Package Version -------------------------------------------------- with open("../setup.py") as file: lines = file.readlines() - for l in lines: - if "version" in l: - VERSION = l.replace("version='", "").replace("',", "").replace(" ", "") + for line in lines: + if "version" in line: + VERSION = line.replace("version='", "").replace("',", "").replace(" ", "") print("package version: " + VERSION) # If extensions (or modules to document with autodoc) are in another directory, diff --git a/docs/getting_started.rst b/docs/getting_started.rst index b775cfce..371d2a33 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -27,34 +27,35 @@ Installation :: - conda create -n popsim python=3.7 + conda create -n popsim python=3.8 - #Windows + # Windows activate popsim - #Mac + # Mac conda activate popsim 4. Get and install the PopulationSim package on the activated conda Python environment: :: + # best to use the conda version of pytables for consistency with activitysim + conda install pytables + pip install populationsim -.. _anaconda_notes : +.. _activitysim : -Python 2 or 3? -~~~~~~~~~~~~~~~ +ActivitySim +~~~~~~~~~~~ .. note:: - PopulationSim is a 64bit Python 2 or 3 library that uses a number of packages from the + PopulationSim is a 64bit Python 3 library that uses a number of packages from the scientific Python ecosystem, most notably `pandas `__ - and `numpy `__. It relies heavily on the - `ActivitySim `__ package. Both ActivitySim and PopulationSim - currently support Python 2, but Python 2 will be `retired `__ at the - end of 2019 so Python 3 is recommended. + and `numpy `__. It also relies heavily on the + `ActivitySim `__ package. 
The recommended way to get your own scientific Python installation is to install 64 bit Anaconda, which contains many of the libraries upon which @@ -67,7 +68,17 @@ Python 2 or 3? Run Examples ------------ -There are three examples for running PopulationSim, two created using data from the Corvallis-Albany-Lebanon Modeling (CALM) region in Oregon and the other using data from the Metro Vancouver region in British Columbia. The `example_calm`_ set-up runs PopulationSim in base mode, where a synthetic population is created for the entire modeling region. This takes approximately 12 minutes on a laptop with an Intel i7-4800MQ CPU @ 2.70GHz and 16 GB of RAM. The `example_calm_repop`_ set-up runs PopulationSim in the *repop* mode, which updates the synthetic population for a small part of the region. The `example_survey_weighting`_ set-up runs PopulationSim for the case of developing final weights for a household travel survey. More information on the configuration of PopulationSim can be found in the **Application & Configuration** section. +There are four examples for running PopulationSim, three created using data from the +Corvallis-Albany-Lebanon Modeling (CALM) region in Oregon and the other using data from +the Metro Vancouver region in British Columbia. + +1. The `example_calm`_ set-up runs PopulationSim, where a synthetic population is created single-processed for the entire modeling region. + +2. The `example_calm_mp`_ set-up runs PopulationSim `multi-processed `_, where a synthetic population is created for the entire modeling region by simultaneously balancing results using multiple processors on your computer, thereby reducing runtime. + +3. The `example_calm_repop`_ set-up runs PopulationSim in the *repop* mode, which updates the synthetic population for a small part of the region. + +4. The `example_survey_weighting`_ set-up runs PopulationSim for the case of developing final weights for a household travel survey. 
More information on the configuration of PopulationSim can be found in the **Application & Configuration** section. Example_calm ~~~~~~~~~~~~ @@ -84,6 +95,22 @@ Follow the steps below to run **example_calm** set up: * Review the outputs in the *output* folder +Example_calm_mp +~~~~~~~~~~~~~~~ + +Follow the steps below to run **example_calm_mp** multiprocessed set up: + + * Open a command prompt in the example_calm folder + * In ``configs_mp\settings.yaml``, set ``num_processes: 2`` to a reasonable number of processors for your machine + * Run the following commands: + + :: + + activate popsim + python run_populationsim.py -c configs_mp -c configs + + * Review the outputs in the *output* folder + Example_calm_repop ~~~~~~~~~~~~~~~~~~ diff --git a/docs/index.rst b/docs/index.rst index 57356320..08c065a9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,64 +5,75 @@ Introduction ============= -PopulationSim is an open platform for population synthesis and survey weighting. It emerged from Oregon DOT's desire to -build a shared, open, platform that could be easily adapted for statewide, regional, and urban -transportation planning needs. +PopulationSim is an open platform for population synthesis and survey weighting. It emerged from +`Oregon DOT `_'s desire to build a shared, open, platform that could +be easily adapted for statewide, regional, and urban transportation planning needs. What is population synthesis? ----------------------------- -Activity Based Models (ABMs) operate in a micro-simulation framework , wherein the travel choices of person and household decision-making agents are predicted by applying Monte Carlo methods to behavioral models. This requires a data set of households and persons representing the entire population in the modeling region. Population synthesis refers to the process used to create this data. - -The required inputs to population synthesis are a population sample and marginal distributions. 
The population -sample is commonly referred to as the *seed or reference sample* and the marginal distributions are referred to -as *controls or targets*. **The process of expanding the seed sample to match the marginal distribution -is termed population synthesis.** The software tool which implements this population synthesis process +Activity based travel demand models such as `ActivitySim `_ operate at an individual +level, wherein the travel choices of person and household decision-making agents are predicted by applying +Monte Carlo methods to behavioral models. This requires a data set of households and persons representing +the entire population in the modeling region. Population synthesis refers to the process used to create this data. + +The required inputs to population synthesis are a population sample and marginal distributions (or control totals). +The population sample is commonly referred to as the *seed or reference sample* and the marginal distributions are +commonly referred to as *controls or targets*. **The process of expanding the seed sample to match the marginal +distribution is termed population synthesis.** The software tool which implements this population synthesis process is termed as a **Population Synthesizer**. What does a Population Synthesizer produce? ------------------------------------------- The objective of a population synthesizer is to generate a synthetic population for -a modeling region. The main outputs from a population synthesizer include lists of persons and households -representing the entire population of the modeling region. These databases include household and person-level -attributes of interest. Examples of attributes at the household level include household income, household size, housing type, and number of vehicles. Examples of person attributes include +a modeling region. 
The main outputs from a population synthesizer include tables of persons and households +representing the entire population of the modeling region. These tables also include household and person-level +attributes of interest. Examples of attributes at the household level include household income, household size, housing +type, and number of vehicles. Examples of person attributes include age, gender, work\school status, and occupation. Depending on the use case, a population synthesizer may also produce multi-way distribution of demographic variables at different geographies to be used as an input -to aggregate travel models. In the case of PopulationSim specifically, an additional option is also included to -modify an existing regional synthetic population for a smaller geographical area. In this case, the outputs are a modified list of persons and households. +to aggregate (four-step) travel models. In the case of PopulationSim specifically, an additional option is also included to +modify an existing regional synthetic population for a smaller geographical area. In this case, the outputs are a modified +set of persons and households. How does a population synthesizer work? --------------------------------------- The main inputs to a population synthesizer are disaggregate population samples and marginal control -distributions. In the United States, the disaggregate population sample is typically obtained from the Census Public Use Microdata Sample (PUMS), but other sources, such as a household travel survey, can also be used. The seed sample should -include demographic variables corresponding to each marginal control termed as *controlled variables* (e.g., -household size, household income, etc.). The seed sample could also include other variables of interest but not -necessarily controlled via marginal controls. These are termed as *uncontrolled variables*. The seed sample should also include an initial weight on each household record. 
- -Base-year marginal distributions of person and household-level attributes of interest are available from Census. For future years, marginal distributions are either held constant, or forecasted. Marginal distributions can be for both household or person level variables and are specified at a specific geography (e.g., Block Groups, Traffic Analysis Zone or County). PopulationSim allows controls to be specified at multiple geographic levels. - -The objective of a population synthesizer is to -generate household weights which satisfies the marginal control distributions. This is achieved by use of -a data fitting technique. The most common fitting technique used by various population synthesizers is the -Iterative Proportional Fitting (IPF) procedure. Generally, the IPF procedure is used to obtain joint distributions of demographic -variables. Then, random sampling from PUMS generates the baseline synthetic population. +distributions. In the United States, the disaggregate population sample is typically obtained from the `Census Public Use +Microdata Sample (PUMS) `_, but other sources, such as a household +travel survey, can also be used. The seed sample should include demographic variables corresponding to each marginal control +termed as *controlled variables* (e.g., household size, household income, etc.). The seed sample could also include other +variables of interest but not necessarily controlled via marginal controls. These are termed as *uncontrolled variables*. +The seed sample should also include an initial weight on each household record. + +Current year marginal distributions of person and household-level attributes of interest are available from Census. For +future years, marginal distributions are either held constant, or forecasted. Marginal distributions can be for both +household or person level variables and are specified at a specific geography (e.g., Block Groups, Traffic Analysis Zone +or County). 
PopulationSim allows controls to be specified at multiple geographic levels. + +The objective of a population synthesizer is to generate household weights which satisfies the marginal control +distributions. This is achieved by use of a data fitting technique. The most common fitting technique used by various +population synthesizers is the Iterative Proportional Fitting (IPF) procedure. Generally, the IPF procedure is used +to obtain joint distributions of demographic variables. Then, random sampling from PUMS generates the baseline synthetic +population. One of the limitations of the simple IPF method is that it does not incorporate both household and person level attributes simulatenously. Some population synthesizers use a heuristic algorithm called the Iterative Proportional Updating Algorithm (IPU) to incorporate both person and household-level variables in the fitting procedure. -Besides IPF, entropy -maximization algorithms have been used as a fitting technique. In most of the entropy based methods, +Besides IPF, entropy maximization algorithms have been used as a fitting technique. In most of the entropy based methods, the relative entropy is used as the objective function. The relative entropy based optimization ensures that the least amount of new information is introduced in finding a feasible solution. The base entropy is defined by the initial weights in the seed sample. The weights generated by the entropy maximization algorithm preserves the distribution of initial weights while matching the marginal controls. This is an -advantage of the entropy maximization based procedures over the IPF based procedures. PopulationSim uses the entropy maximization based list balancing to match controls specified at various geographic levels. +advantage of the entropy maximization based procedures over the IPF based procedures. PopulationSim uses the entropy maximization +based list balancing to match controls specified at various geographic levels. 
-Once the final weights -have been assigned, seed sample is expanded using these weights to generate a synthetic population. Most +Once the final weights have been assigned, the seed sample is expanded using these weights to generate a synthetic population. Most population synthesizers create distributions using final weights and employ random sampling to expand the seed sample. PopulationSim uses Linear Programming to convert the final weights to integer values and expands -the seed sample using these integer weights. For detailed description of PopulationSim algorithm, please refer to the TRB paper link in the :ref:`docs` section. For information on software implementation refer to :ref:`core_components` and :ref:`model_steps`. To learn more about PopulationSim application and configuration, please follow the content index below. +the seed sample using these integer weights. For detailed description of PopulationSim algorithm, please refer to the TRB paper +link in the :ref:`docs` section. For information on software implementation refer to :ref:`core_components` and :ref:`model_steps`. To +learn more about PopulationSim application and configuration, please follow the content index below. How does population synthesis work for survey weighting? -------------------------------------------------------- diff --git a/docs/software.rst b/docs/software.rst index 21fbf4b1..186e2020 100644 --- a/docs/software.rst +++ b/docs/software.rst @@ -9,8 +9,8 @@ This page describes the PopulationSim software implementation and how to contrib The implementation starts with the ActivitySim framework, which serves as the foundation for the software. The framework, as briefly described -below, includes features for data pipeline management, expression handling, testing, etc. Built upon the -framework are additional core components for population synthesis such as balancers and integerizers. 
+below, includes features for data pipeline management, expression handling, multiprocessing, testing, etc. Built upon +the framework are additional core components for population synthesis such as balancers and integerizers. Built upon the population synthesis core components are the model steps that make up a PopulationSim run, such as the inputs pre-processor, setting up the data strucutres, doing the initial seed balancing, etc. @@ -42,7 +42,8 @@ being implemented in the ActivitySim framework means: * Model Orchestrator * `ORCA `__ is used for running the overall model system and for defining dynamic data tables, columns, and injectables (functions). ActivitySim wraps ORCA functionality to make a Data Pipeline tool, which allows for re-starting at any model step. - + * Support for `multiprocessing `_ to reduce runtime + * Expressions * Model expressions are in CSV files and contain Python expressions, mainly pandas/numpy expression that operate on the input data tables. This helps to avoid modifying Python code when making changes to the model calculations. @@ -236,4 +237,5 @@ Release Notes * v0.4 - transfer to ActivitySim.org * v0.4.1 - package updates * v0.4.2 - validation script in Python - * v0.4.3 - allow non-binary incidence \ No newline at end of file + * v0.4.3 - allow non-binary incidence + * v0.5 - support for multiprocessing \ No newline at end of file diff --git a/docs/validation.rst b/docs/validation.rst index a4f35c0a..70971866 100644 --- a/docs/validation.rst +++ b/docs/validation.rst @@ -9,38 +9,49 @@ Validation of Results ===================== -One of the most critical steps in the population synthesis procedure is the validation of the synthetic population. Validation can give us clues about inconsistencies among controls, data processing errors or misspecification of settings. This section provides general guidelines on validation procedures. 
- -PopulationSim reports the difference between the synthesized totals and the control totals for all the controls at each geographic level. User can select these summaries using the ``output_tables:`` token as described in the :ref:`settings` section. The :ref:`inputs_outputs` section lists all the summaries available to user. Most population synthesizers will match each control very well at a regional level; therefore such summaries are useful but not very insightful into the goodness-of-fit of the tool at lower level geographies. Users can download a `validation script `_ to generate advanced summary statistics and validation plots. This *validation script* takes summaries and outputs from a PopulationSim run to generate plots and advanced summaries. The script is configured to run for the CALM region example and includes notes on inputs and configuration settings. To download and run the CALM region example refer to the :ref:`getting_started` section. +One of the most critical steps in the population synthesis procedure is the validation of the synthetic +population. Validation can give us clues about inconsistencies among controls, data processing errors or +misspecification of settings. This section provides general guidelines on validation procedures. + +PopulationSim reports the difference between the synthesized totals and the control totals for all the controls +at each geographic level. User can select these summaries using the ``output_tables:`` token as described in +the :ref:`settings` section. The :ref:`inputs_outputs` section lists all the summaries available to user. +Most population synthesizers will match each control very well at a regional level; therefore such summaries +are useful but not very insightful into the goodness-of-fit of the tool at lower level geographies. Users +can download a `validation Jupyter Notebook `_ to +generate advanced summary statistics and validation plots. 
This validation notebook takes summaries and +outputs from a PopulationSim run and generates plots and advanced summaries. The notebook is configured to run +for the CALM region example and includes notes on inputs and configuration settings. To download and run +the CALM region example refer to the :ref:`getting_started` section. Validation Summary Statistics -------------------------------- +----------------------------- + +Statistics related to the convergence at a more disaggregate level are generated by the validation notebook. These statistics are being computed for the geography at which the controls are specified i.e. MAZ, TAZ or Meta as the case might be. The following three statistics are computed: |br| + +1. The average percentage difference between the control totals and the synthesized totals, |br| +2. The standard deviation (STDEV) of the percentage difference – this measure describes how much dispersion from the average exists, and |br| +3. The percentage root mean square error (RMSE) - an indicator of the proximity of synthesized and control totals. |br| -Statistics that inform us of convergence at a more disaggregate level are generated by the *validation script* – please note that these statistics are being computed for the geography at which the controls are specified i.e. MAZ, TAZ or Meta as the case might be. The following three statistics are computed as a part of this exercise: |br| -(1) the average percentage difference between the control totals and the synthesized totals, |br| -(2) the standard deviation (STDEV) of the percentage difference – this measure informs us of how much dispersion from the average exists, and |br| -(3) the percentage root mean square error (RMSE) - an indicator of the proximity of synthesized and control totals. |br| The number of geographies for which the control is non-zero (N) are also reported. 
Charts & Plots -------------- -The *validation script* also produces charts and plots which includes frequency distribution and expansion factor distribution plots. +The validation notebook also produces charts and plots which includes frequency distribution and expansion factor distribution plots. Validation Charts ~~~~~~~~~~~~~~~~~ The validation chart is a visualization of the disaggregate summary statistics – mean percentage difference, STDEV and RMSE of percentage differences. A form of dot and whisker plot is generated for each control where the dots are the mean percentage differences and horizontal bars are twice the STDEV or RMSE centered around zero. An example validation chart is below: - - - .. image:: images/validation.jpeg + .. image:: images/validation.jpeg Frequency Distribution Plots ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -These are simply frequency distribution plots of differences between control and synthesized values across the geography at which the controls were specified. An example frequency distribution plot is below: +These are frequency distribution plots of differences between control and synthesized values across the geography at which the controls were specified. An example frequency distribution plot is below: .. 
image:: images/hh_inc_30_60_control.png diff --git a/example_calm/.gitignore b/example_calm/.gitignore new file mode 100644 index 00000000..520c87a5 --- /dev/null +++ b/example_calm/.gitignore @@ -0,0 +1,2 @@ +*_local/ + diff --git a/example_calm/configs/settings.yaml b/example_calm/configs/settings.yaml index ea958809..1b76fdf8 100644 --- a/example_calm/configs/settings.yaml +++ b/example_calm/configs/settings.yaml @@ -38,18 +38,19 @@ input_table_list: - tablename: households filename : seed_households.csv index_col: hh_id - column_map: + rename_columns: # not sure what SERIALNO is, but it is not unique #SERIALNO: hh_id hhnum: hh_id - tablename: persons # expression_filename: seed_persons_expressions.csv filename : seed_persons.csv - column_map: + rename_columns: # SERIALNO: hh_id hhnum: hh_id SPORDER: per_num - # drop mixed type fields that appear to have been incorrectly generated + # drop unused mixed type fields to avoid PyTables pipeline performance issues + # (PyTables will pickle object types that it cannot map directly to c-types) drop_columns: - indp02 - naicsp02 @@ -120,10 +121,9 @@ output_synthetic_population: - OCCP -# Steps for base mode +# Model steps for base mode # ------------------------------------------------------------------ -run_list: - steps: +models: - input_pre_processor - setup_data_structures - initial_seed_balancing @@ -133,8 +133,9 @@ run_list: - sub_balancing.geography=TRACT - sub_balancing.geography=TAZ - expand_households + - write_data_dictionary - summarize - write_tables - write_synthetic_population - #resume_after: expand_households +resume_after: diff --git a/example_calm/configs_mp/settings.yaml b/example_calm/configs_mp/settings.yaml new file mode 100644 index 00000000..0a5212fa --- /dev/null +++ b/example_calm/configs_mp/settings.yaml @@ -0,0 +1,76 @@ +inherit_settings: True + +multiprocess: True +num_processes: 2 +cleanup_pipeline_after_run: True + +slice_geography: TRACT + +# Steps for base mode +# 
------------------------------------------------------------------ +models: + ### mp_seed_balancing step + - input_pre_processor + - setup_data_structures + - initial_seed_balancing + - meta_control_factoring + - final_seed_balancing + - integerize_final_seed_weights + - sub_balancing.geography=TRACT + ### mp_sub_balancing_TAZ step + - sub_balancing.geography=TAZ + ### mp_summarize step + - expand_households + - summarize + - write_synthetic_population + - write_data_dictionary + - write_tables + +#resume_after: integerize_final_seed_weights +resume_after: + +multiprocess_steps: + - name: mp_seed_balancing + begin: input_pre_processor + - name: mp_sub_balancing_TAZ + begin: sub_balancing.geography=TAZ + num_processes: 2 + slice: + tables: + - slice_crosswalk + - crosswalk + # don't slice any tables not explicitly listed above in slice.tables + except: True + # the following tables are added by sub_balancer and should be coalesced + coalesce: + - TAZ_weights + - TAZ_weights_sparse + - trace_TAZ_weights + - name: mp_summarize + begin: expand_households + +# Rather than using the 'except: True' wildcard, we could explicitly list the tables that should not be sliced +# and let activitysim.mp_tasks deduce which created tables should be coalesced, but that requires a pathological +# knowledge of the names of existing internal tables, whereas the coalesce directive allows us to simply specify +# the names of the new tables we expect mp_sub_balancing_TAZ to create, which seems less error-prone +# (especially since mp_tasks.coalesce_pipelines will complain if the tables in the coalesce list are not found.) 
+ +# multiprocess_steps: +# - name: mp_seed_balancing +# begin: input_pre_processor +# - name: mp_sub_balancing_TAZ +# begin: sub_balancing.geography=TAZ +# num_processes: 2 +# slice: +# tables: +# - slice_crosswalk +# - crosswalk +# except: +# - geo_cross_walk +# - TRACT_control_data +# - TRACT_controls +# - TRACT_weights +# - TRACT_weights_sparse +# - trace_TRACT_weights +# - name: mp_summarize +# begin: expand_households diff --git a/example_calm/output/.gitignore b/example_calm/output/.gitignore index 0d39926d..658e53b6 100644 --- a/example_calm/output/.gitignore +++ b/example_calm/output/.gitignore @@ -1,3 +1,4 @@ *.csv *.h5 *.log +*.txt diff --git a/example_calm/output_mp/.gitignore b/example_calm/output_mp/.gitignore new file mode 100644 index 00000000..f5f20e4d --- /dev/null +++ b/example_calm/output_mp/.gitignore @@ -0,0 +1,5 @@ +*.csv +*.h5 +*.log +*.txt +*.yaml diff --git a/example_calm/output_mp/log/.gitignore b/example_calm/output_mp/log/.gitignore new file mode 100644 index 00000000..f8c0528b --- /dev/null +++ b/example_calm/output_mp/log/.gitignore @@ -0,0 +1,3 @@ +*.txt +*.log +*.csv diff --git a/example_calm/run_populationsim.py b/example_calm/run_populationsim.py index ddb1d128..de9095f3 100644 --- a/example_calm/run_populationsim.py +++ b/example_calm/run_populationsim.py @@ -1,69 +1,39 @@ -from __future__ import print_function -import os -import logging +# ActivitySim +# See full license in LICENSE.txt. 
-from activitysim.core import config -from populationsim import steps - -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import inject - -from activitysim.core.config import handle_standard_args -from activitysim.core.tracing import print_elapsed_time +import sys +import argparse from activitysim.core.config import setting -from populationsim import lp -from populationsim import multi_integerizer - - -# Add (and handle) 'standard' activitysim arguments: -# --config : specify path to config_dir -# --output : specify path to output_dir -# --data : specify path to data_dir -# --models : specify run_list name -# --resume : resume_after -handle_standard_args() - -tracing.config_logger() - -t0 = print_elapsed_time() - -logger = logging.getLogger('populationsim') - -logger.info("GROUP_BY_INCIDENCE_SIGNATURE: %s" - % setting('GROUP_BY_INCIDENCE_SIGNATURE')) -logger.info("INTEGERIZE_WITH_BACKSTOPPED_CONTROLS: %s" - % setting('INTEGERIZE_WITH_BACKSTOPPED_CONTROLS')) -logger.info("SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS: %s" - % setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS')) -logger.info("meta_control_data: %s" - % setting('meta_control_data')) -logger.info("control_file_name: %s" - % setting('control_file_name')) - -logger.info("USE_CVXPY: %s" % lp.use_cvxpy()) -logger.info("USE_SIMUL_INTEGERIZER: %s" % multi_integerizer.use_simul_integerizer()) +from activitysim.core import inject +from activitysim.cli.run import add_run_args, run +from populationsim import steps -# get the run list (name was possibly specified on the command line with the -m option) -run_list_name = inject.get_injectable('run_list_name', 'run_list') -# run list from settings file is dict with list of 'steps' and optional 'resume_after' -run_list = setting(run_list_name) -assert 'steps' in run_list, "Did not find steps in run_list" +@inject.injectable() +def log_settings(): -# list of steps and possible resume_after in run_list -steps = run_list.get('steps') 
-resume_after = run_list.get('resume_after', None) + return [ + 'multiprocess', + 'num_processes', + 'resume_after', + 'GROUP_BY_INCIDENCE_SIGNATURE', + 'INTEGERIZE_WITH_BACKSTOPPED_CONTROLS', + 'SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', + 'meta_control_data', + 'control_file_name', + 'USE_CVXPY', + 'USE_SIMUL_INTEGERIZER' + ] -if resume_after: - print("resume_after", resume_after) -pipeline.run(models=steps, resume_after=resume_after) +if __name__ == '__main__': + assert inject.get_injectable('preload_injectables', None) -# tables will no longer be available after pipeline is closed -pipeline.close_pipeline() + parser = argparse.ArgumentParser() + add_run_args(parser) + args = parser.parse_args() -t0 = ("all models", t0) + sys.exit(run(args)) diff --git a/example_calm_repop/run_populationsim.py b/example_calm_repop/run_populationsim.py index ddb1d128..51abd2f7 100644 --- a/example_calm_repop/run_populationsim.py +++ b/example_calm_repop/run_populationsim.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import os import logging diff --git a/example_survey_weighting/run_populationsim.py b/example_survey_weighting/run_populationsim.py index 85e59795..282d306b 100755 --- a/example_survey_weighting/run_populationsim.py +++ b/example_survey_weighting/run_populationsim.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import os import logging diff --git a/example_test/.gitignore b/example_test/.gitignore new file mode 100644 index 00000000..cfb7580f --- /dev/null +++ b/example_test/.gitignore @@ -0,0 +1 @@ +*_local/ diff --git a/example_test/convert_test_data.py b/example_test/convert_test_data.py index 89e85b8a..c390a93d 100644 --- a/example_test/convert_test_data.py +++ b/example_test/convert_test_data.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import pandas as pd # settings diff --git a/example_test/output/.gitignore b/example_test/output/.gitignore index a81afc8c..3352db7b 100644 --- a/example_test/output/.gitignore +++ 
b/example_test/output/.gitignore @@ -1,2 +1,4 @@ *.csv *.h5 +*.txt +*.yaml diff --git a/ez_setup.py b/ez_setup.py index 44f749cb..6f5b3232 100644 --- a/ez_setup.py +++ b/ez_setup.py @@ -13,9 +13,7 @@ This file can also be run as a script to install or upgrade setuptools. """ -from future import standard_library -standard_library.install_aliases() -from builtins import next + import os import shutil import sys @@ -54,10 +52,10 @@ def _python_cmd(*args): def _install(archive_filename, install_args=()): with archive_context(archive_filename): # installing - log.warn('Installing Setuptools') + log.warning('Installing Setuptools') if not _python_cmd('setup.py', 'install', *install_args): - log.warn('Something went wrong during the installation.') - log.warn('See the error message above.') + log.warning('Something went wrong during the installation.') + log.warning('See the error message above.') # exitcode will be 2 return 2 @@ -65,10 +63,10 @@ def _install(archive_filename, install_args=()): def _build_egg(egg, archive_filename, to_dir): with archive_context(archive_filename): # building an egg - log.warn('Building a Setuptools egg in %s', to_dir) + log.warning('Building a Setuptools egg in %s', to_dir) _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) # returning the result - log.warn(egg) + log.warning(egg) if not os.path.exists(egg): raise IOError('Could not build the egg.') @@ -97,7 +95,7 @@ def __new__(cls, *args, **kwargs): def archive_context(filename): # extracting the archive tmpdir = tempfile.mkdtemp() - log.warn('Extracting in %s', tmpdir) + log.warning('Extracting in %s', tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) @@ -107,7 +105,7 @@ def archive_context(filename): # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) - log.warn('Now working in %s', subdir) + log.warning('Now working in %s', subdir) yield finally: @@ -300,7 +298,7 @@ def download_setuptools(version=DEFAULT_VERSION, 
download_base=DEFAULT_URL, url = download_base + zip_name saveto = os.path.join(to_dir, zip_name) if not os.path.exists(saveto): # Avoid repeated downloads - log.warn("Downloading %s", url) + log.warning("Downloading %s", url) downloader = downloader_factory() downloader(url, saveto) return os.path.realpath(saveto) diff --git a/populationsim/assign.py b/populationsim/assign.py index c6204c15..df65273d 100644 --- a/populationsim/assign.py +++ b/populationsim/assign.py @@ -51,7 +51,7 @@ def assign_variable(target, expression, df, locals_dict, df_alias=None, trace_ro def to_series(x, target=None): if x is None or np.isscalar(x): if target: - logger.warn("WARNING: assign_variables promoting scalar %s to series" % target) + logger.warning("WARNING: assign_variables promoting scalar %s to series" % target) return pd.Series([x] * len(df.index), index=df.index) return x diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 59a26b6b..fef1c172 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -1,5 +1,4 @@ -from __future__ import division -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/integerizer.py b/populationsim/integerizer.py index 0abdd982..39d4f0b1 100644 --- a/populationsim/integerizer.py +++ b/populationsim/integerizer.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. @@ -160,8 +160,8 @@ def integerize(self): if (float_weights == 0).any(): # not sure this matters... - logger.warn("Integerizer: %s zero weights out of %s" % - ((float_weights == 0).sum(), sample_count)) + logger.warning("Integerizer: %s zero weights out of %s" % + ((float_weights == 0).sum(), sample_count)) assert False if (resid_weights == 0.0).any(): @@ -313,7 +313,7 @@ def do_integerizing( logger.error("Integerizer failed for %s status %s. 
" "Returning smart-rounded original weights" % (trace_label, status)) elif status != 'OPTIMAL': - logger.warn("Integerizer status non-optimal for %s status %s." % (trace_label, status)) + logger.warning("Integerizer status non-optimal for %s status %s." % (trace_label, status)) integerized_weights = pd.Series(0, index=zero_weight_rows.index) integerized_weights.update(integerizer.weights['integerized_weight']) diff --git a/populationsim/lp.py b/populationsim/lp.py index 9ec225be..5da18d58 100644 --- a/populationsim/lp.py +++ b/populationsim/lp.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/lp_cvx.py b/populationsim/lp_cvx.py index 578fa324..11fcb434 100644 --- a/populationsim/lp_cvx.py +++ b/populationsim/lp_cvx.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/multi_integerizer.py b/populationsim/multi_integerizer.py index cf0281c6..6b673943 100644 --- a/populationsim/multi_integerizer.py +++ b/populationsim/multi_integerizer.py @@ -1,5 +1,4 @@ -from __future__ import print_function -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. @@ -147,11 +146,11 @@ def integerize(self): # how could this not be the case? 
if not (parent_hh_constraint_ge_bound == parent_max_possible_control_values).all(): print("\nSimulIntegerizer integerizing", self.trace_label) - logger.warn("parent_hh_constraint_ge_bound != parent_max_possible_control_values") - logger.warn("parent_hh_constraint_ge_bound: %s" % - parent_hh_constraint_ge_bound) - logger.warn("parent_max_possible_control_values: %s" % - parent_max_possible_control_values) + logger.warning("parent_hh_constraint_ge_bound != parent_max_possible_control_values") + logger.warning("parent_hh_constraint_ge_bound: %s" % + parent_hh_constraint_ge_bound) + logger.warning("parent_max_possible_control_values: %s" % + parent_max_possible_control_values) print("\n") # assert (parent_hh_constraint_ge_bound == parent_max_possible_control_values).all() @@ -364,7 +363,7 @@ def do_simul_integerizing( logger.info("do_simul_integerizing succeeded for %s status %s. " % (trace_label, status)) return integerized_weights_df - logger.warn("do_simul_integerizing failed for %s status %s. " % (trace_label, status)) + logger.warning("do_simul_integerizing failed for %s status %s. " % (trace_label, status)) # if simultaneous integerization failed, sequentially integerize to detect infeasible subzones # infeasible zones will be smart rounded and returned in rounded_weights_df @@ -381,7 +380,7 @@ def do_simul_integerizing( if len(feasible_zone_ids) == 0: # if all subzones are infeasible, then we don't have any feasible zones to try # so the best we can do is return rounded_weights_df - logger.warn("do_sequential_integerizing failed for all subzones %s. " % trace_label) + logger.warning("do_sequential_integerizing failed for all subzones %s. " % trace_label) logger.info("do_simul_integerizing returning smart rounded weights for %s." 
% trace_label) return rounded_weights_df @@ -389,8 +388,8 @@ def do_simul_integerizing( if len(rounded_zone_ids) == 0: # if all subzones are feasible, then there are no zones to remove in order to retry # so the best we can do is return sequentially_integerized_weights_df - logger.warn("do_simul_integerizing failed but found no infeasible sub zones %s. " - % trace_label) + logger.warning("do_simul_integerizing failed but found no infeasible sub zones %s. " + % trace_label) logger.info("do_simul_integerizing falling back to sequential integerizing for %s." % trace_label) return sequentially_integerized_weights_df @@ -398,8 +397,8 @@ def do_simul_integerizing( if len(feasible_zone_ids) == 1: # if only one zone is feasible, not much point in simul_integerizing it # so the best we can do is return do_sequential_integerizing combined results - logger.warn("do_simul_integerizing failed but found no infeasible sub zones %s. " - % trace_label) + logger.warning("do_simul_integerizing failed but found no infeasible sub zones %s. " + % trace_label) return pd.concat([sequentially_integerized_weights_df, rounded_weights_df]) # - remove the infeasible subzones and retry simul_integerizing diff --git a/populationsim/simul_balancer.py b/populationsim/simul_balancer.py index ad976a2c..99977eb7 100644 --- a/populationsim/simul_balancer.py +++ b/populationsim/simul_balancer.py @@ -1,5 +1,4 @@ -from __future__ import division -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/steps/__init__.py b/populationsim/steps/__init__.py index a5bfb9b8..b531b7ce 100644 --- a/populationsim/steps/__init__.py +++ b/populationsim/steps/__init__.py @@ -1,9 +1,8 @@ # PopulationSim # See full license in LICENSE.txt. -from __future__ import absolute_import -from activitysim.core import inject as _inject +from activitysim.core import inject from . import input_pre_processor from . 
import setup_data_structures @@ -21,7 +20,8 @@ from activitysim.core.steps.output import write_tables -@_inject.injectable(cache=True) +@inject.injectable(cache=True) def preload_injectables(): - _inject.add_step('write_data_dictionary', write_data_dictionary) - _inject.add_step('write_tables', write_tables) + inject.add_step('write_data_dictionary', write_data_dictionary) + inject.add_step('write_tables', write_tables) + return True diff --git a/populationsim/steps/expand_households.py b/populationsim/steps/expand_households.py index a7846b0d..003890e4 100644 --- a/populationsim/steps/expand_households.py +++ b/populationsim/steps/expand_households.py @@ -1,5 +1,4 @@ -from __future__ import division -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. @@ -55,6 +54,10 @@ def expand_households(): if setting('GROUP_BY_INCIDENCE_SIGNATURE'): + # get these in a repeatable order so np.random.choice behaves the same regardless of weight table order + # i.e. which could vary depending on whether we ran single or multi process due to apportioned/coalesce + expanded_weights = expanded_weights.sort_values(geography_cols + [household_id_col]) + # the household_id_col is really the group_id expanded_weights.rename(columns={household_id_col: 'group_id'}, inplace=True) @@ -74,12 +77,14 @@ def expand_households(): probs = list(df.sample_weight / df.sample_weight.sum()) group_hh_probs[group_id] = [hh_ids, probs] - # FIXME - should sample without replacement? 
+ # get a repeatable random number sequence generator for consistent choice results + prng = pipeline.get_rn_generator().get_external_rng('expand_households') + # now make a hh_id choice for each group_id in expanded_weights def chooser(group_id): hh_ids = group_hh_probs[group_id][HH_IDS] hh_probs = group_hh_probs[group_id][HH_PROBS] - return np.random.choice(hh_ids, p=hh_probs) + return prng.choice(hh_ids, p=hh_probs) expanded_weights[household_id_col] = \ expanded_weights.group_id.apply(chooser, convert_dtype=True,) @@ -109,5 +114,8 @@ def chooser(group_id): logger.info("expand_households op: %s prev hh count %s dropped %s added %s final %s" % (op, prev_hhs, dropped_hhs, added_hhs, final_hhs)) + # sort this so results will be consistent whether single or multiprocessing, GROUP_BY_INCIDENCE_SIGNATURE, etc... + expanded_weights = expanded_weights.sort_values(geography_cols + [household_id_col]) + repop = inject.get_step_arg('repop', default=False) inject.add_table('expanded_household_ids', expanded_weights, replace=repop) diff --git a/populationsim/steps/final_seed_balancing.py b/populationsim/steps/final_seed_balancing.py index 398e33b7..c21d2df6 100644 --- a/populationsim/steps/final_seed_balancing.py +++ b/populationsim/steps/final_seed_balancing.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/steps/initial_seed_balancing.py b/populationsim/steps/initial_seed_balancing.py index b75cb974..96bf0f70 100644 --- a/populationsim/steps/initial_seed_balancing.py +++ b/populationsim/steps/initial_seed_balancing.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. 
diff --git a/populationsim/steps/input_pre_processor.py b/populationsim/steps/input_pre_processor.py index 722f93b5..3bb562ce 100644 --- a/populationsim/steps/input_pre_processor.py +++ b/populationsim/steps/input_pre_processor.py @@ -7,11 +7,9 @@ import pandas as pd import numpy as np -from activitysim.core import ( - inject, - config, - input -) +from activitysim.core import inject +from activitysim.core import config +from activitysim.core import input logger = logging.getLogger(__name__) diff --git a/populationsim/steps/integerize_final_seed_weights.py b/populationsim/steps/integerize_final_seed_weights.py index a93d1fb9..8653e8e8 100644 --- a/populationsim/steps/integerize_final_seed_weights.py +++ b/populationsim/steps/integerize_final_seed_weights.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/steps/meta_control_factoring.py b/populationsim/steps/meta_control_factoring.py index d62fa1aa..ba74adc0 100644 --- a/populationsim/steps/meta_control_factoring.py +++ b/populationsim/steps/meta_control_factoring.py @@ -1,6 +1,4 @@ -from __future__ import division -from __future__ import print_function -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. 
@@ -57,7 +55,7 @@ def meta_control_factoring(settings, control_spec, incidence_table): # - if there are no meta controls, then we don't have to do anything if not (control_spec.geography == meta_geography).any(): - logger.warn("meta_control_factoring: no meta targets so nothing to do") + logger.warning("meta_control_factoring: no meta targets so nothing to do") return meta_controls_df = get_control_table(meta_geography) diff --git a/populationsim/steps/repop_balancing.py b/populationsim/steps/repop_balancing.py index cc89a429..6a46376c 100644 --- a/populationsim/steps/repop_balancing.py +++ b/populationsim/steps/repop_balancing.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/steps/setup_data_structures.py b/populationsim/steps/setup_data_structures.py index 5d0bcd33..c0d5c5af 100644 --- a/populationsim/steps/setup_data_structures.py +++ b/populationsim/steps/setup_data_structures.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. 
@@ -307,6 +307,7 @@ def setup_data_structures(settings, households, persons): """ seed_geography = setting('seed_geography') + geographies = settings['geographies'] households_df = households.to_frame() persons_df = persons.to_frame() @@ -314,10 +315,21 @@ def setup_data_structures(settings, households, persons): crosswalk_df = build_crosswalk_table() inject.add_table('crosswalk', crosswalk_df) + slice_geography = settings.get('slice_geography', None) + if slice_geography: + assert slice_geography in geographies + assert slice_geography in crosswalk_df.columns + + # only want rows for slice_geography and higher + slice_geographies = geographies[:geographies.index(slice_geography) + 1] + slice_table = crosswalk_df[slice_geographies].groupby(slice_geography).max() + # it is convenient to have slice_geography column in table as well as index + slice_table[slice_geography] = slice_table.index + inject.add_table(f"slice_crosswalk", slice_table) + control_spec = read_control_spec(setting('control_file_name', 'controls.csv')) inject.add_table('control_spec', control_spec) - geographies = settings['geographies'] for g in geographies: controls = build_control_table(g, control_spec, crosswalk_df) inject.add_table(control_table_name(g), controls) diff --git a/populationsim/steps/sub_balancing.py b/populationsim/steps/sub_balancing.py index 3efbea2c..f8b26ac1 100644 --- a/populationsim/steps/sub_balancing.py +++ b/populationsim/steps/sub_balancing.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. 
@@ -14,6 +14,7 @@ from activitysim.core.config import setting +from .helper import control_table_name from .helper import get_control_table from .helper import weight_table_name from .helper import get_weight_table @@ -244,20 +245,22 @@ def sub_balancing(settings, crosswalk, control_spec, incidence_table): seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id] seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id] + # expects seed geography is siloed by meta_geography + # (no seed_id is in more than one meta_geography zone) assert len(seed_crosswalk_df[meta_geography].unique()) == 1 # list of unique parent zone ids in this seed zone - # (there will be just one if parent geo is seed) + # (there will be just one if parent geography is seed) parent_ids = seed_crosswalk_df[parent_geography].unique() + # only want ones for which there are (non-zero) controls parent_ids = parent_controls_df.index.intersection(parent_ids) num_parent_ids = len(parent_ids) for idx, parent_id in enumerate(parent_ids, start=1): - log_msg = "balancing {}/{} seed {}, {} {}" - log_msg = log_msg.format(idx, num_parent_ids, seed_id, parent_geography, parent_id) - logger.info(log_msg) + logger.info(f"balancing {idx}/{num_parent_ids} seed {seed_id}, " + f"{parent_geography} {parent_id}") initial_weights = weights_df[weights_df[parent_geography] == parent_id] initial_weights = initial_weights.set_index(settings.get('household_id_col')) @@ -294,14 +297,12 @@ def sub_balancing(settings, crosswalk, control_spec, incidence_table): integer_weights_df = pd.concat(integer_weights_list) - # print "integer_weights_df\n", integer_weights_df.dtypes - # print integer_weights_df.head(10) - # bug - + logger.info(f"adding table {weight_table_name(geography)}") inject.add_table(weight_table_name(geography), integer_weights_df) if not NO_INTEGERIZATION_EVER: + inject.add_table(weight_table_name(geography, sparse=True), integer_weights_df[integer_weights_df['integer_weight'] > 0]) 
diff --git a/populationsim/steps/summarize.py b/populationsim/steps/summarize.py index 74e02a5b..60767149 100644 --- a/populationsim/steps/summarize.py +++ b/populationsim/steps/summarize.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import + # PopulationSim # See full license in LICENSE.txt. @@ -52,7 +52,6 @@ def summarize_geography(geography, weight_col, hh_id_col, for zone_id in zone_ids: zone_controls = controls_df.loc[zone_id].tolist() - controls.append(zone_controls) zone_row_map = results_df[geography] == zone_id diff --git a/populationsim/steps/write_synthetic_population.py b/populationsim/steps/write_synthetic_population.py index 50450054..a1c251eb 100644 --- a/populationsim/steps/write_synthetic_population.py +++ b/populationsim/steps/write_synthetic_population.py @@ -23,7 +23,7 @@ def merge_seed_data(expanded_household_ids, seed_data_df, seed_columns, trace_la # warn of any columns that aren't in seed_data_df for c in seed_columns: if c not in df_columns and c != hh_col: - logger.warn("column '%s' not in %s" % (c, trace_label)) + logger.warning("column '%s' not in %s" % (c, trace_label)) # remove any columns that aren't in seed_data_df df_columns = [c for c in seed_columns if c in df_columns] diff --git a/populationsim/tests/configs_mp/settings.yaml b/populationsim/tests/configs_mp/settings.yaml new file mode 100644 index 00000000..93a2ce1e --- /dev/null +++ b/populationsim/tests/configs_mp/settings.yaml @@ -0,0 +1,48 @@ +inherit_settings: True + +multiprocess: True +num_processes: 2 + +slice_geography: TRACT + +# Steps for base mode +# ------------------------------------------------------------------ +models: + ### mp_seed_balancing step + - input_pre_processor + - setup_data_structures + - initial_seed_balancing + - meta_control_factoring + - final_seed_balancing + - integerize_final_seed_weights + - sub_balancing.geography=TRACT + ### mp_sub_balancing_TAZ step + - sub_balancing.geography=TAZ + ### mp_summarize step + - expand_households + - 
summarize + - write_synthetic_population + - write_data_dictionary + - write_tables + +resume_after: + +multiprocess_steps: + - name: mp_seed_balancing + begin: input_pre_processor + - name: mp_sub_balancing_TAZ + begin: sub_balancing.geography=TAZ + num_processes: 2 + slice: + tables: + - slice_crosswalk + - crosswalk + # don't slice any tables not explicitly listed above in slice.tables + except: True + # the following tables are added by sub_balancer and should be coalesced + coalesce: + - TAZ_weights + - TAZ_weights_sparse + - name: mp_summarize + begin: expand_households + diff --git a/populationsim/tests/output/.gitignore b/populationsim/tests/output/.gitignore index fe43411a..d98bf24a 100644 --- a/populationsim/tests/output/.gitignore +++ b/populationsim/tests/output/.gitignore @@ -1,3 +1,5 @@ *.csv *.log *.h5 +*.txt +*.yaml diff --git a/populationsim/tests/run_mp.py b/populationsim/tests/run_mp.py new file mode 100644 index 00000000..b0a4d59e --- /dev/null +++ b/populationsim/tests/run_mp.py @@ -0,0 +1,72 @@ +import os + +import pandas as pd + +from activitysim.core import config +from activitysim.core import tracing +from activitysim.core import pipeline +from activitysim.core import inject +from activitysim.core import mp_tasks + +from populationsim import steps + +TAZ_COUNT = 36 +TAZ_100_HH_COUNT = 33 +TAZ_100_HH_REPOP_COUNT = 26 + + +def setup_dirs(): + + configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + mp_configs_dir = os.path.join(os.path.dirname(__file__), 'configs_mp') + inject.add_injectable("configs_dir", [mp_configs_dir, configs_dir]) + + output_dir = os.path.join(os.path.dirname(__file__), 'output') + inject.add_injectable("output_dir", output_dir) + + data_dir = os.path.join(os.path.dirname(__file__), 'data') + inject.add_injectable("data_dir", data_dir) + + tracing.config_logger() + + tracing.delete_output_files('csv') + tracing.delete_output_files('txt') + tracing.delete_output_files('yaml') + + +def regress(): + + 
expanded_household_ids = pipeline.get_table('expanded_household_ids') + assert isinstance(expanded_household_ids, pd.DataFrame) + taz_hh_counts = expanded_household_ids.groupby('TAZ').size() + assert len(taz_hh_counts) == TAZ_COUNT + assert taz_hh_counts.loc[100] == TAZ_100_HH_COUNT + + # output_tables action: skip + output_dir = inject.get_injectable('output_dir') + assert not os.path.exists(os.path.join(output_dir, 'households.csv')) + assert os.path.exists(os.path.join(output_dir, 'summary_DISTRICT_1.csv')) + + +def test_mp_run(): + + setup_dirs() + + run_list = mp_tasks.get_run_list() + mp_tasks.print_run_list(run_list) + + # do this after config.handle_standard_args, as command line args may override injectables + injectables = ['data_dir', 'configs_dir', 'output_dir'] + injectables = {k: inject.get_injectable(k) for k in injectables} + + # pipeline.run(models=run_list['models'], resume_after=run_list['resume_after']) + + mp_tasks.run_multiprocess(run_list, injectables) + pipeline.open_pipeline('_') + regress() + pipeline.close_pipeline() + + +if __name__ == '__main__': + + test_mp_run() diff --git a/populationsim/tests/test_flex.py b/populationsim/tests/test_flex.py index 6b8927cf..f1a0d2e9 100644 --- a/populationsim/tests/test_flex.py +++ b/populationsim/tests/test_flex.py @@ -41,8 +41,8 @@ def test_full_run2(): 'meta_control_factoring', 'final_seed_balancing', 'integerize_final_seed_weights', - 'sub_balancing.geography = DISTRICT', - 'sub_balancing.geography = TRACT', + 'sub_balancing.geography=DISTRICT', + 'sub_balancing.geography=TRACT', 'sub_balancing.geography=TAZ', 'expand_households', 'summarize', diff --git a/populationsim/tests/test_integerizer.py b/populationsim/tests/test_integerizer.py index 691aae7a..9f146274 100644 --- a/populationsim/tests/test_integerizer.py +++ b/populationsim/tests/test_integerizer.py @@ -1,4 +1,4 @@ -from __future__ import (absolute_import, print_function) + # PopulationSim # See full license in LICENSE.txt. 
diff --git a/populationsim/tests/test_multi_integerizer.py b/populationsim/tests/test_multi_integerizer.py index 94002e0a..9eace501 100644 --- a/populationsim/tests/test_multi_integerizer.py +++ b/populationsim/tests/test_multi_integerizer.py @@ -1,4 +1,4 @@ -from __future__ import print_function + # PopulationSim # See full license in LICENSE.txt. diff --git a/populationsim/tests/test_steps_mp.py b/populationsim/tests/test_steps_mp.py new file mode 100644 index 00000000..cc5d76af --- /dev/null +++ b/populationsim/tests/test_steps_mp.py @@ -0,0 +1,23 @@ +# ActivitySim +# See full license in LICENSE.txt. +import os +import subprocess + +from activitysim.core import inject + + +def teardown_function(func): + inject.clear_cache() + inject.reinject_decorated_tables() + + +def test_mp_run(): + + file_path = os.path.join(os.path.dirname(__file__), 'run_mp.py') + + subprocess.check_call(['coverage', 'run', file_path]) + + +if __name__ == '__main__': + + test_mp_run() diff --git a/populationsim/tests/test_tracing.py b/populationsim/tests/test_tracing.py index cacc29ef..29877acc 100644 --- a/populationsim/tests/test_tracing.py +++ b/populationsim/tests/test_tracing.py @@ -1,4 +1,4 @@ -from __future__ import (absolute_import, print_function) + # ActivitySim # See full license in LICENSE.txt. 
@@ -39,7 +39,7 @@ def test_config_logger(capsys): logger.info('test_config_logger') logger.info('log_info') - logger.warn('log_warn1') + logger.warning('log_warn1') out, err = capsys.readouterr() diff --git a/setup.cfg b/setup.cfg index 048118d1..0415e4ae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [pycodestyle] -max-line-length = 100 +max-line-length = 120 exclude = ./example_calm_data_prep/convert_rsg_data.py,./sandbox/ [coverage:run] diff --git a/setup.py b/setup.py index 397a7952..3b210637 100644 --- a/setup.py +++ b/setup.py @@ -5,28 +5,23 @@ setup( name='populationsim', - version='0.4.3', + version='0.5', description='Population Synthesis', author='contributing authors', author_email='ben.stabler@rsginc.com', license='BSD-3', url='https://github.com/ActivitySim/populationsim', classifiers=[ - 'Development Status :: 4 - Beta', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + 'Development Status :: 5 - Production/Stable', + 'Programming Language :: Python :: 3.8', 'License :: OSI Approved :: BSD License' ], packages=find_packages(exclude=['*.tests']), include_package_data=True, - python_requires='>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*', install_requires=[ - 'activitysim >= 0.9.2', + 'activitysim >= 0.9.9.1', 'numpy >= 1.16.1', - 'pandas >= 0.24.1', - 'ortools >= 5.1.4045, < 7.5', - 'future >= 0.16.0' + 'pandas >= 1.1.0', + 'ortools >= 5.1.4045' ] ) From a17d409d6eb49049722af1d6bb1dad4419d5e641 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 10:39:25 -0400 Subject: [PATCH 02/14] added absolute bounds to balancer.py and associated function calls --- populationsim/balancer.py | 20 +++++++++++++++++-- populationsim/steps/final_seed_balancing.py | 6 ++++++ populationsim/steps/initial_seed_balancing.py | 8 +++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/populationsim/balancer.py 
b/populationsim/balancer.py index 49e0126e..5cfc14b1 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -243,6 +243,7 @@ def np_balancer( def do_balancing(control_spec, total_hh_control_col, max_expansion_factor, min_expansion_factor, + absolute_upper_bound, absolute_lower_bound, incidence_df, control_totals, initial_weights): # incidence table should only have control columns @@ -261,6 +262,7 @@ def do_balancing(control_spec, control_importance_weights = control_spec.importance + if min_expansion_factor: # number_of_households in this seed geograpy as specified in seed_controlss @@ -270,11 +272,21 @@ def do_balancing(control_spec, lb_ratio = min_expansion_factor * float(number_of_households) / float(total_weights) lb_weights = initial_weights * lb_ratio - lb_weights = lb_weights.clip(lower=0) + + if absolute_lower_bound: + lb_weights = lb_weights.clip(lower=absolute_lower_bound) + else: + lb_weights = lb_weights.clip(lower=0) + + elif absolute_lower_bound: + lb_weights = initial_weights.clip(lower=absolute_lower_bound) else: lb_weights = None + + + if max_expansion_factor: # number_of_households in this seed geograpy as specified in seed_controlss @@ -284,7 +296,11 @@ def do_balancing(control_spec, ub_ratio = max_expansion_factor * float(number_of_households) / float(total_weights) ub_weights = initial_weights * ub_ratio - ub_weights = ub_weights.round().clip(lower=1).astype(int) + + if absolute_upper_bound: + ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower = 1).astype(int) + else: + ub_weights = ub_weights.round().clip(lower=1).astype(int) else: ub_weights = None diff --git a/populationsim/steps/final_seed_balancing.py b/populationsim/steps/final_seed_balancing.py index 398e33b7..107c94ef 100644 --- a/populationsim/steps/final_seed_balancing.py +++ b/populationsim/steps/final_seed_balancing.py @@ -68,6 +68,10 @@ def final_seed_balancing(settings, crosswalk, control_spec, incidence_table): max_expansion_factor = 
settings.get('max_expansion_factor', None) min_expansion_factor = settings.get('min_expansion_factor', None) + + absolute_upper_bound = settings.get('absolute_upper_bound', None) + absolute_lower_bound = settings.get('absolute_lower_bound', None) + relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist()) @@ -86,6 +90,8 @@ def final_seed_balancing(settings, crosswalk, control_spec, incidence_table): total_hh_control_col=total_hh_control_col, max_expansion_factor=max_expansion_factor, min_expansion_factor=min_expansion_factor, + absolute_lower_bound=absolute_lower_bound, + absolute_upper_bound=absolute_upper_bound, incidence_df=seed_incidence_df, control_totals=seed_controls_df.loc[seed_id], initial_weights=seed_incidence_df['sample_weight']) diff --git a/populationsim/steps/initial_seed_balancing.py b/populationsim/steps/initial_seed_balancing.py index b75cb974..19553585 100644 --- a/populationsim/steps/initial_seed_balancing.py +++ b/populationsim/steps/initial_seed_balancing.py @@ -66,13 +66,17 @@ def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): max_expansion_factor = settings.get('max_expansion_factor', None) min_expansion_factor = settings.get('min_expansion_factor', None) + absolute_upper_bound = settings.get('absolute_upper_bound', None) + absolute_lower_bound = settings.get('absolute_lower_bound', None) + + # run balancer for each seed geography weight_list = [] sample_weight_list = [] seed_ids = crosswalk_df[seed_geography].unique() for seed_id in seed_ids: - + logger.info("initial_seed_balancing seed id %s" % seed_id) seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id] @@ -82,6 +86,8 @@ def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): total_hh_control_col=total_hh_control_col, max_expansion_factor=max_expansion_factor, min_expansion_factor=min_expansion_factor, + absolute_upper_bound=absolute_upper_bound, + absolute_lower_bound=absolute_lower_bound, 
incidence_df=seed_incidence_df, control_totals=seed_controls_df.loc[seed_id], initial_weights=seed_incidence_df['sample_weight']) From feaff8ee98b5e6526719afb5a8fda689ecd8b275 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 10:46:08 -0400 Subject: [PATCH 03/14] removing blank lines --- populationsim/steps/final_seed_balancing.py | 3 --- populationsim/steps/initial_seed_balancing.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/populationsim/steps/final_seed_balancing.py b/populationsim/steps/final_seed_balancing.py index 107c94ef..b0c8c4d1 100644 --- a/populationsim/steps/final_seed_balancing.py +++ b/populationsim/steps/final_seed_balancing.py @@ -15,10 +15,8 @@ from .helper import weight_table_name from .helper import get_weight_table - logger = logging.getLogger(__name__) - @inject.step() def final_seed_balancing(settings, crosswalk, control_spec, incidence_table): """ @@ -72,7 +70,6 @@ def final_seed_balancing(settings, crosswalk, control_spec, incidence_table): absolute_upper_bound = settings.get('absolute_upper_bound', None) absolute_lower_bound = settings.get('absolute_lower_bound', None) - relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist()) # run balancer for each seed geography diff --git a/populationsim/steps/initial_seed_balancing.py b/populationsim/steps/initial_seed_balancing.py index 19553585..3557bfee 100644 --- a/populationsim/steps/initial_seed_balancing.py +++ b/populationsim/steps/initial_seed_balancing.py @@ -15,10 +15,8 @@ from .helper import get_control_table from .helper import weight_table_name - logger = logging.getLogger(__name__) - @inject.step() def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): """ @@ -69,7 +67,6 @@ def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): absolute_upper_bound = settings.get('absolute_upper_bound', None) absolute_lower_bound = settings.get('absolute_lower_bound', None) - # run balancer for each seed 
geography weight_list = [] sample_weight_list = [] From 3b5dec06f050e2b086a72b7308d7588b66181017 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 10:56:36 -0400 Subject: [PATCH 04/14] more white space/blank line fixes --- populationsim/balancer.py | 5 +---- populationsim/steps/final_seed_balancing.py | 3 ++- populationsim/steps/initial_seed_balancing.py | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 5cfc14b1..5b42749f 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -272,7 +272,7 @@ def do_balancing(control_spec, lb_ratio = min_expansion_factor * float(number_of_households) / float(total_weights) lb_weights = initial_weights * lb_ratio - + if absolute_lower_bound: lb_weights = lb_weights.clip(lower=absolute_lower_bound) else: @@ -284,9 +284,6 @@ def do_balancing(control_spec, else: lb_weights = None - - - if max_expansion_factor: # number_of_households in this seed geograpy as specified in seed_controlss diff --git a/populationsim/steps/final_seed_balancing.py b/populationsim/steps/final_seed_balancing.py index b0c8c4d1..f58cea85 100644 --- a/populationsim/steps/final_seed_balancing.py +++ b/populationsim/steps/final_seed_balancing.py @@ -15,8 +15,10 @@ from .helper import weight_table_name from .helper import get_weight_table + logger = logging.getLogger(__name__) + @inject.step() def final_seed_balancing(settings, crosswalk, control_spec, incidence_table): """ @@ -66,7 +68,6 @@ def final_seed_balancing(settings, crosswalk, control_spec, incidence_table): max_expansion_factor = settings.get('max_expansion_factor', None) min_expansion_factor = settings.get('min_expansion_factor', None) - absolute_upper_bound = settings.get('absolute_upper_bound', None) absolute_lower_bound = settings.get('absolute_lower_bound', None) diff --git a/populationsim/steps/initial_seed_balancing.py b/populationsim/steps/initial_seed_balancing.py index 
3557bfee..41cb40a4 100644 --- a/populationsim/steps/initial_seed_balancing.py +++ b/populationsim/steps/initial_seed_balancing.py @@ -15,8 +15,10 @@ from .helper import get_control_table from .helper import weight_table_name + logger = logging.getLogger(__name__) + @inject.step() def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): """ @@ -63,7 +65,6 @@ def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): max_expansion_factor = settings.get('max_expansion_factor', None) min_expansion_factor = settings.get('min_expansion_factor', None) - absolute_upper_bound = settings.get('absolute_upper_bound', None) absolute_lower_bound = settings.get('absolute_lower_bound', None) From 65fbd3212e698b37f7260a6bb50ebc062b737eec Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 11:04:15 -0400 Subject: [PATCH 05/14] added arguments to repop balancing step --- populationsim/steps/initial_seed_balancing.py | 2 +- populationsim/steps/repop_balancing.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/populationsim/steps/initial_seed_balancing.py b/populationsim/steps/initial_seed_balancing.py index 41cb40a4..c865ffab 100644 --- a/populationsim/steps/initial_seed_balancing.py +++ b/populationsim/steps/initial_seed_balancing.py @@ -74,7 +74,7 @@ def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table): seed_ids = crosswalk_df[seed_geography].unique() for seed_id in seed_ids: - + logger.info("initial_seed_balancing seed id %s" % seed_id) seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id] diff --git a/populationsim/steps/repop_balancing.py b/populationsim/steps/repop_balancing.py index cc89a429..a0cd564d 100644 --- a/populationsim/steps/repop_balancing.py +++ b/populationsim/steps/repop_balancing.py @@ -60,6 +60,8 @@ def repop_balancing(settings, crosswalk, control_spec, incidence_table): max_expansion_factor = settings.get('max_expansion_factor', None) 
min_expansion_factor = settings.get('min_expansion_factor', None) + absolute_upper_bound = settings.get('absolute_upper_bound', None) + absolute_lower_bound = settings.get('absolute_lower_bound', None) # run balancer for each low geography low_weight_list = [] @@ -100,7 +102,9 @@ def repop_balancing(settings, crosswalk, control_spec, incidence_table): control_spec=low_control_spec, total_hh_control_col=total_hh_control_col, max_expansion_factor=max_expansion_factor, - min_expansion_factor=min_expansion_factor, + min_expansion_factor=min_expansion_factor, + absolute_upper_bound=absolute_upper_bound, + absolute_lower_bound=absolute_lower_bound, incidence_df=seed_incidence_df, control_totals=low_controls_df.loc[low_id], initial_weights=initial_weights) From 439adc1ad1dd1315e2f120ea08e859dc8e476686 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 11:16:51 -0400 Subject: [PATCH 06/14] some more white space fixes --- populationsim/balancer.py | 4 ++-- populationsim/steps/repop_balancing.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 5b42749f..4fc99e8f 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -294,8 +294,8 @@ def do_balancing(control_spec, ub_weights = initial_weights * ub_ratio - if absolute_upper_bound: - ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower = 1).astype(int) + if absolute_upper_bound: + ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) else: ub_weights = ub_weights.round().clip(lower=1).astype(int) diff --git a/populationsim/steps/repop_balancing.py b/populationsim/steps/repop_balancing.py index a0cd564d..412cbabb 100644 --- a/populationsim/steps/repop_balancing.py +++ b/populationsim/steps/repop_balancing.py @@ -102,7 +102,7 @@ def repop_balancing(settings, crosswalk, control_spec, incidence_table): control_spec=low_control_spec, 
total_hh_control_col=total_hh_control_col, max_expansion_factor=max_expansion_factor, - min_expansion_factor=min_expansion_factor, + min_expansion_factor=min_expansion_factor, absolute_upper_bound=absolute_upper_bound, absolute_lower_bound=absolute_lower_bound, incidence_df=seed_incidence_df, From b50f2cc13b6c720094ee9ced037b1ba7a664c2d3 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 11:22:57 -0400 Subject: [PATCH 07/14] trying again to resolve white space issues --- populationsim/balancer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 4fc99e8f..1c67e5ed 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -262,10 +262,9 @@ def do_balancing(control_spec, control_importance_weights = control_spec.importance - if min_expansion_factor: - # number_of_households in this seed geograpy as specified in seed_controlss + # number_of_households in this seed geograpy as specified in seed_controls number_of_households = control_totals[total_hh_control_index] total_weights = initial_weights.sum() @@ -295,7 +294,7 @@ def do_balancing(control_spec, ub_weights = initial_weights * ub_ratio if absolute_upper_bound: - ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) + ub_weights = ub_weights.round().clip(upper=absolute_upper_bound,lower=1).astype(int) else: ub_weights = ub_weights.round().clip(lower=1).astype(int) From 5f8307583461aec89a3c60f6a01f8a625fcb247f Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Wed, 26 May 2021 11:28:22 -0400 Subject: [PATCH 08/14] added one space back --- populationsim/balancer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 1c67e5ed..6c3556ac 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -294,7 +294,7 @@ def do_balancing(control_spec, ub_weights = initial_weights * ub_ratio if 
absolute_upper_bound: - ub_weights = ub_weights.round().clip(upper=absolute_upper_bound,lower=1).astype(int) + ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) else: ub_weights = ub_weights.round().clip(lower=1).astype(int) From 4ca0f158c2c9a9f5f1a493a435665489f3435e0a Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Mon, 7 Jun 2021 14:14:45 -0400 Subject: [PATCH 09/14] missed an else step for upper bounds --- populationsim/balancer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 6c3556ac..648ddbd0 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -297,6 +297,9 @@ def do_balancing(control_spec, ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) else: ub_weights = ub_weights.round().clip(lower=1).astype(int) + + elif absolute_upper_bound: + ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) else: ub_weights = None From c3d8c1ec0a858fce6e39697d708445412d00d968 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Fri, 20 Aug 2021 08:33:54 -0400 Subject: [PATCH 10/14] removing white space --- populationsim/balancer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/populationsim/balancer.py b/populationsim/balancer.py index 74575f4f..e0e3702d 100644 --- a/populationsim/balancer.py +++ b/populationsim/balancer.py @@ -296,7 +296,7 @@ def do_balancing(control_spec, ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) else: ub_weights = ub_weights.round().clip(lower=1).astype(int) - + elif absolute_upper_bound: ub_weights = ub_weights.round().clip(upper=absolute_upper_bound, lower=1).astype(int) From 8681f458eae0ef65c003f96016bd3983fc8d98aa Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Fri, 20 Aug 2021 08:48:48 -0400 Subject: [PATCH 11/14] adding bounds to balancer test --- populationsim/tests/test_balancer.py | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/populationsim/tests/test_balancer.py b/populationsim/tests/test_balancer.py index ef8aa3b0..61a822f8 100644 --- a/populationsim/tests/test_balancer.py +++ b/populationsim/tests/test_balancer.py @@ -41,6 +41,8 @@ def test_Konduri(): control_importance_weights=control_importance_weights, lb_weights=0, ub_weights=30, + absolute_lower_bound = 1, + absolute_upper_bound = 30, master_control_index=None, max_iterations=DEFAULT_MAX_ITERATIONS ) From 56324b0683ca7c0788041a4f3a8e2a012d8b004d Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Fri, 20 Aug 2021 08:54:26 -0400 Subject: [PATCH 12/14] removing spaces around equals... --- populationsim/tests/test_balancer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/populationsim/tests/test_balancer.py b/populationsim/tests/test_balancer.py index 61a822f8..a1520760 100644 --- a/populationsim/tests/test_balancer.py +++ b/populationsim/tests/test_balancer.py @@ -41,8 +41,8 @@ def test_Konduri(): control_importance_weights=control_importance_weights, lb_weights=0, ub_weights=30, - absolute_lower_bound = 1, - absolute_upper_bound = 30, + absolute_lower_bound=1, + absolute_upper_bound=30, master_control_index=None, max_iterations=DEFAULT_MAX_ITERATIONS ) From 46433f9a009346ebff7e184a0550ee21c60e9df4 Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Fri, 20 Aug 2021 08:56:43 -0400 Subject: [PATCH 13/14] removing test; not applicable to ListBalancer --- populationsim/tests/test_balancer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/populationsim/tests/test_balancer.py b/populationsim/tests/test_balancer.py index a1520760..ef8aa3b0 100644 --- a/populationsim/tests/test_balancer.py +++ b/populationsim/tests/test_balancer.py @@ -41,8 +41,6 @@ def test_Konduri(): control_importance_weights=control_importance_weights, lb_weights=0, ub_weights=30, - absolute_lower_bound=1, - absolute_upper_bound=30, master_control_index=None, max_iterations=DEFAULT_MAX_ITERATIONS ) From 
7e18a2f52c8e1cf6322fc25d477fb67cefe739ca Mon Sep 17 00:00:00 2001 From: "leah.flake" Date: Fri, 20 Aug 2021 09:07:35 -0400 Subject: [PATCH 14/14] adding upper/lower bounds to example weighting settings --- example_survey_weighting/configs/settings.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example_survey_weighting/configs/settings.yaml b/example_survey_weighting/configs/settings.yaml index 1254a73c..4072ab16 100755 --- a/example_survey_weighting/configs/settings.yaml +++ b/example_survey_weighting/configs/settings.yaml @@ -18,7 +18,8 @@ USE_SIMUL_INTEGERIZER: True USE_CVXPY: False max_expansion_factor: 4 # Default is 30 min_expansion_factor: 0.5 - +absolute_upper_bound: 20000 +absolute_lower_bound: 1 # Geographic Settings # ------------------------------------------------------------------