From eb1801fc2636c19f7b5ef3759b6750071c7fdc3b Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Wed, 2 Oct 2024 16:49:51 +0100 Subject: [PATCH 01/15] boundary preprocessing wip --- scripts/0_preprocess_inputs.py | 47 ++++++++++ scripts/3.1_assign_primary_feasible_zones.py | 7 +- scripts/3.2.2_assign_primary_zone_work.py | 7 +- scripts/3.2.3_assign_secondary_zone.py | 8 +- scripts/3.3_assign_facility_all.py | 10 +-- src/acbm/logger_config.py | 1 + src/acbm/preprocessing.py | 94 +++++++++++++++++++- 7 files changed, 151 insertions(+), 23 deletions(-) create mode 100644 scripts/0_preprocess_inputs.py diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py new file mode 100644 index 00000000..818b3c7d --- /dev/null +++ b/scripts/0_preprocess_inputs.py @@ -0,0 +1,47 @@ +import geopandas as gpd + +import acbm +from acbm.logger_config import preprocessing_logger_logger as logger +from acbm.preprocessing import edit_boundary_resolution, filter_boundaries + +# ----- BOUNDARIES +logger.info("Preprocessing Boundary Layer") + +## Read in the boundary layer for the whole of England + +logger.info("1. Reading in the boundary layer for the whole of England") + + +boundaries = gpd.read_file( + acbm.root_path / "data/external/boundaries/oa_england.geojson" +) + +boundaries = boundaries.to_crs(epsg=4326) + +## Dissolve boundaries if resolution is MSOA + +boundary_geography = "OA" # can only be OA or MSOA +logger.info(f"2. Dissolving boundaries to {boundary_geography} level") + +boundaries = edit_boundary_resolution(boundaries, boundary_geography) + + +## Filter to study area + +logger.info("3. Filtering boundaries to specified study area") +# TODO get from config and log +# logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}") + +boundaries_filtered = filter_boundaries( + boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"] +) + +## Save the output as parquet +logger.info( + f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path" +) + +boundaries_filtered.to_file( + acbm.root_path / "data/external/boundaries/study_area_zones.geojson", + driver="GeoJSON", +) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 15b83c83..f6b40a79 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -37,16 +37,11 @@ def main(config_file): # --- Study area boundaries logger.info("Loading study area boundaries") - where_clause = "MSOA21NM LIKE '%Leeds%'" boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/oa_england.geojson", - where=where_clause, + acbm.root_path / "data/external/boundaries/study_area_zones.geojson" ) - # convert boundaries to 4326 - boundaries = boundaries.to_crs(epsg=4326) - logger.info("Study area boundaries loaded") # --- Assign activity home locations to boundaries zoning system diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 90baadfb..1552f39b 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -35,14 +35,13 @@ def main(config_file): # --- boundaries - where_clause = "MSOA21NM LIKE '%Leeds%'" + logger.info("Loading study area boundaries") boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/oa_england.geojson", - where=where_clause, + acbm.root_path / "data/external/boundaries/study_area_zones.geojson" ) - boundaries = boundaries.to_crs(epsg=4326) + logger.info("Study area boundaries loaded") # osm POI data diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index 4cff75c4..9749561f 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -45,15 +45,11 @@ def main(config_file): logger.info("Preprocessing: Adding OA21CD to the data") - where_clause = "MSOA21NM LIKE '%Leeds%'" - boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/oa_england.geojson", - where=where_clause, + acbm.root_path / "data/external/boundaries/study_area_zones.geojson" ) - # convert boundaries to 4326 - boundaries = boundaries.to_crs(epsg=4326) + logger.info("Study area boundaries loaded") # --- Assign activity home locations to boundaries zoning system diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 3e888d59..93560a71 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -49,15 +49,13 @@ def main(config_file): ) # --- Load data: Boundaries - logger.info("Loading boundaries data") - - where_clause = "MSOA21NM LIKE '%Leeds%'" + logger.info("Loading study area boundaries") boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/oa_england.geojson", - where=where_clause, + acbm.root_path / "data/external/boundaries/study_area_zones.geojson" ) - boundaries = boundaries.to_crs(epsg=4326) + + logger.info("Study area boundaries loaded") # --- Prepprocess: add zone column to POI data logger.info("Adding zone column to POI data") diff --git a/src/acbm/logger_config.py b/src/acbm/logger_config.py index b2fc82a7..e86ec635 100644 --- a/src/acbm/logger_config.py +++ b/src/acbm/logger_config.py @@ -40,6 +40,7 @@ def create_logger(name, log_file): # Create loggers for different modules +preprocessing_logger = create_logger("preprocessing", "preprocessing.log") matching_logger = create_logger("matching", "matching.log") assigning_primary_feasible_logger = create_logger( "assigning_primary_feasible", "assigning_primary_feasible.log" diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index d2169136..821a8cd9 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -4,10 +4,102 @@ import numpy as np import pandas as pd from pyproj import Transformer -from shapely import Point +from shapely.geometry import MultiPolygon, Point import acbm +# ----- PREPROCESSING BOUNDARIES + + +def edit_boundary_resolution( + study_area: gpd.GeoDataFrame, geography: str +) -> gpd.GeoDataFrame: + """ + This function takes a GeoDataFrame and a geography resolution as input and returns + a GeoDataFrame with the specified geography resolution. It dissolves OA boundaries + to MSOA boundaries if the geography resolution is set to "MSOA". Otherwise, it + retains the original OA boundaries. Currently it only works for OA and MSOA + + Parameters + ---------- + study_area : gpd.GeoDataFrame + A GeoDataFrame containing the study area boundaries + geography : str + A string specifying the geography resolution. It can be either "OA" or "MSOA" + + Returns + ------- + gpd.GeoDataFrame + A GeoDataFrame containing the study area boundaries with the specified geography + + """ + # Drop unnecessary columns + columns_to_drop = ["GlobalID", "OA21CD", "LSOA21CD", "LSOA21NM"] + study_area = study_area.drop( + columns=[col for col in columns_to_drop if col in study_area.columns] + ) + + # Dissolve based on the specified geography + if geography == "MSOA": + print("converting from OA to MSOA") + study_area = study_area.dissolve(by="MSOA21CD").reset_index() + elif geography == "OA": + print("keeping original OA boundaries") + + # Ensure all geometries are MultiPolygon + study_area["geometry"] = study_area["geometry"].apply( + lambda geom: MultiPolygon([geom]) if geom.geom_type == "Polygon" else geom + ) + + return study_area + + +# TODO: create spatial filter option +def filter_boundaries(boundaries, column, values): + """ + Filter the boundaries GeoDataFrame by the specified column and values. + + Parameters + ---------- + + boundaries: gpd.GeoDataFrame): The GeoDataFrame containing the boundaries. + column: str + The column to filter by (e.g., 'LEP22NM1', 'LAD22NM', 'rgn22nm'). + values: list + The list of values to keep in the specified column. + + Returns + ------- + gpd.GeoDataFrame + The filtered GeoDataFrame. + + Raises + ------ + ValueError + If the specified column does not exist in the GeoDataFrame. + If any of the specified values are not present in the column. + """ + + # Check if the column exists in the GeoDataFrame + if column not in boundaries.columns: + error_message = f"Column '{column}' does not exist in the GeoDataFrame." + raise ValueError(error_message) + + # Check if all values are present in the specified column + unique_values = boundaries[column].unique() + missing_values = [value for value in values if value not in unique_values] + if missing_values: + error_message = ( + f"Values {missing_values} are not present in the column '{column}'." + ) + raise ValueError(error_message) + + # Filter boundaries layer by column = values + return boundaries[boundaries[column].isin(values)] + + +# ----- MATCHING + def nts_filter_by_year( data: pd.DataFrame, psu: pd.DataFrame, years: list From c638777581b3267f5d93acf7b661fff85d569000 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:06:12 +0100 Subject: [PATCH 02/15] print out valid values if input is invalid --- src/acbm/preprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 821a8cd9..7c47fd81 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -90,7 +90,8 @@ def filter_boundaries(boundaries, column, values): missing_values = [value for value in values if value not in unique_values] if missing_values: error_message = ( - f"Values {missing_values} are not present in the column '{column}'." + f"Values {missing_values} are not present in the column '{column}'. " + f"Unique values in the column are: {unique_values}" ) raise ValueError(error_message) From c04d89aee29a1745939acaa94f1755063b939a30 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:38:53 +0100 Subject: [PATCH 03/15] update bash script --- scripts/run_pipeline.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index 5f52aaeb..b9fc340a 100755 --- a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -2,6 +2,7 @@ set -e +python scripts/0_preprocessing_inputs.py --config_file $1 python scripts/1_prep_synthpop.py --config_file $1 python scripts/2_match_households_and_individuals.py --config_file $1 python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 From d89962803d09dd111285c284af9b6587ddce032b Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Wed, 2 Oct 2024 18:10:36 +0100 Subject: [PATCH 04/15] add boundary filtering to config --- config/base.toml | 4 ++ config/base_500.toml | 4 ++ config/base_5000.toml | 4 ++ config/base_all.toml | 4 ++ scripts/0_preprocess_inputs.py | 69 ++++++++++++++++++++-------------- scripts/run_pipeline.sh | 2 +- src/acbm/config.py | 3 ++ 7 files changed, 61 insertions(+), 29 deletions(-) diff --git a/config/base.toml b/config/base.toml index 014fb0e9..85eef814 100644 --- a/config/base.toml +++ b/config/base.toml @@ -4,6 +4,10 @@ region = "leeds" number_of_households = 10000 zone_id = "OA21CD" travel_times = true +boundary_geography = "OA" +boundary_filter_column = "LEP22NM1" +boundary_filter_values = ["Leeds City Region"] + [work_assignment] use_percentages = true diff --git a/config/base_500.toml b/config/base_500.toml index 47b3df2c..be30dfef 100644 --- a/config/base_500.toml +++ b/config/base_500.toml @@ -4,6 +4,10 @@ region = "leeds" number_of_households = 500 zone_id = "OA21CD" travel_times = true +boundary_geography = "OA" +boundary_filter_column = "LEP22NM1" +boundary_filter_values = ["Leeds City Region"] + [work_assignment] use_percentages = true diff --git a/config/base_5000.toml b/config/base_5000.toml index 5eb6ec27..e75f8237 100644 --- a/config/base_5000.toml +++ b/config/base_5000.toml @@ -4,6 +4,10 @@ region = "leeds" number_of_households = 5000 zone_id = "OA21CD" travel_times = true +boundary_geography = "OA" +boundary_filter_column = "LEP22NM1" +boundary_filter_values = ["Leeds City Region"] + [work_assignment] use_percentages = true diff --git a/config/base_all.toml b/config/base_all.toml index b9964e1f..ed00b0a0 100644 --- a/config/base_all.toml +++ b/config/base_all.toml @@ -3,6 +3,10 @@ seed = 0 region = "leeds" zone_id = "OA21CD" travel_times = true +boundary_geography = "OA" +boundary_filter_column = "LEP22NM1" +boundary_filter_values = ["Leeds City Region"] + [work_assignment] use_percentages = false diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index 818b3c7d..e0716d82 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -1,47 +1,60 @@ import geopandas as gpd import acbm -from acbm.logger_config import preprocessing_logger_logger as logger +from acbm.cli import acbm_cli +from acbm.config import load_config +from acbm.logger_config import preprocessing_logger as logger from acbm.preprocessing import edit_boundary_resolution, filter_boundaries -# ----- BOUNDARIES -logger.info("Preprocessing Boundary Layer") -## Read in the boundary layer for the whole of England +@acbm_cli +def main(config_file): + config = load_config(config_file) + config.init_rng() -logger.info("1. Reading in the boundary layer for the whole of England") + # ----- BOUNDARIES + logger.info("Preprocessing Boundary Layer") + ## Read in the boundary layer for the whole of England -boundaries = gpd.read_file( - acbm.root_path / "data/external/boundaries/oa_england.geojson" -) + logger.info("1. Reading in the boundary layer for the whole of England") -boundaries = boundaries.to_crs(epsg=4326) + boundaries = gpd.read_file( + acbm.root_path / "data/external/boundaries/oa_england.geojson" + ) -## Dissolve boundaries if resolution is MSOA + boundaries = boundaries.to_crs(epsg=4326) -boundary_geography = "OA" # can only be OA or MSOA -logger.info(f"2. Dissolving boundaries to {boundary_geography} level") + ## Dissolve boundaries if resolution is MSOA -boundaries = edit_boundary_resolution(boundaries, boundary_geography) + boundary_geography = config.parameters.boundary_geography # can only be OA or MSOA + logger.info(f"2. Dissolving boundaries to {boundary_geography} level") + boundaries = edit_boundary_resolution(boundaries, boundary_geography) -## Filter to study area + ## Filter to study area -logger.info("3. Filtering boundaries to specified study area") -# TODO get from config and log -# logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}") + logger.info("3. Filtering boundaries to specified study area") + # TODO get from config and log + # logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}") -boundaries_filtered = filter_boundaries( - boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"] -) + boundaries_filtered = filter_boundaries( + # boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"] + boundaries=boundaries, + column=config.parameters.boundary_filter_column, + values=config.parameters.boundary_filter_values, + ) -## Save the output as parquet -logger.info( - f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path" -) + ## Save the output as parquet + logger.info( + f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path" + ) -boundaries_filtered.to_file( - acbm.root_path / "data/external/boundaries/study_area_zones.geojson", - driver="GeoJSON", -) + boundaries_filtered.to_file( + acbm.root_path / "data/external/boundaries/study_area_zones.geojson", + driver="GeoJSON", + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index b9fc340a..381f5700 100755 --- a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -2,7 +2,7 @@ set -e -python scripts/0_preprocessing_inputs.py --config_file $1 +python scripts/0_preprocess_inputs.py --config_file $1 python scripts/1_prep_synthpop.py --config_file $1 python scripts/2_match_households_and_individuals.py --config_file $1 python scripts/3.1_assign_primary_feasible_zones.py --config_file $1 diff --git a/src/acbm/config.py b/src/acbm/config.py index 9bf4aa94..72213202 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -13,6 +13,9 @@ class Parameters(BaseModel): number_of_households: int | None = None zone_id: str travel_times: bool + boundary_geography: str + boundary_filter_column: str + boundary_filter_values: list[str] @dataclass(frozen=True) From 29ce798a3fd5ef9f5ba642d8421a5f4f103595cf Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Wed, 2 Oct 2024 18:14:30 +0100 Subject: [PATCH 05/15] update config --- config/base.toml | 2 +- config/base_500.toml | 2 +- config/base_5000.toml | 2 +- config/base_all.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/base.toml b/config/base.toml index 85eef814..ef15a84f 100644 --- a/config/base.toml +++ b/config/base.toml @@ -3,7 +3,7 @@ seed = 0 region = "leeds" number_of_households = 10000 zone_id = "OA21CD" -travel_times = true +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" boundary_filter_column = "LEP22NM1" boundary_filter_values = ["Leeds City Region"] diff --git a/config/base_500.toml b/config/base_500.toml index be30dfef..69529f7f 100644 --- a/config/base_500.toml +++ b/config/base_500.toml @@ -3,7 +3,7 @@ seed = 0 region = "leeds" number_of_households = 500 zone_id = "OA21CD" -travel_times = true +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" boundary_filter_column = "LEP22NM1" boundary_filter_values = ["Leeds City Region"] diff --git a/config/base_5000.toml b/config/base_5000.toml index e75f8237..5a3f61af 100644 --- a/config/base_5000.toml +++ b/config/base_5000.toml @@ -3,7 +3,7 @@ seed = 0 region = "leeds" number_of_households = 5000 zone_id = "OA21CD" -travel_times = true +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" boundary_filter_column = "LEP22NM1" boundary_filter_values = ["Leeds City Region"] diff --git a/config/base_all.toml b/config/base_all.toml index ed00b0a0..dc2ec8e3 100644 --- a/config/base_all.toml +++ b/config/base_all.toml @@ -2,7 +2,7 @@ seed = 0 region = "leeds" zone_id = "OA21CD" -travel_times = true +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" boundary_filter_column = "LEP22NM1" boundary_filter_values = ["Leeds City Region"] From ce944694c46702c7c8b784be394c3cdcc45ba383 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:10:26 +0100 Subject: [PATCH 06/15] remove hardcoding --- scripts/3.1_assign_primary_feasible_zones.py | 5 ++--- src/acbm/assigning/feasible_zones_primary.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index f6b40a79..a7d9a895 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -94,7 +94,7 @@ def main(config_file): # If travel_times is not true or loading failed, create a new travel time matrix logger.info("No travel time matrix found. Creating a new travel time matrix.") # Create a new travel time matrix based on distances between zones - travel_times = zones_to_time_matrix(zones=boundaries, id_col="OA21CD") + travel_times = zones_to_time_matrix(zones=boundaries, id_col=config.zone_id) logger.info("Travel time estimates created") # --- Intrazonal trip times @@ -110,8 +110,7 @@ def main(config_file): logger.info("Creating intrazonal travel time estimates") - # TODO: use config zone_id instead of OA21CD - intrazone_times = intrazone_time(zones=boundaries, key_column="OA21CD") + intrazone_times = intrazone_time(zones=boundaries, key_column=config.zone_id) logger.info("Intrazonal travel time estimates created") diff --git a/src/acbm/assigning/feasible_zones_primary.py b/src/acbm/assigning/feasible_zones_primary.py index 5d3bd8cf..a65871dd 100644 --- a/src/acbm/assigning/feasible_zones_primary.py +++ b/src/acbm/assigning/feasible_zones_primary.py @@ -134,7 +134,7 @@ def get_possible_zones( if travel_times is None: logger.info("Travel time matrix not provided: Creating travel times estimates") - travel_times = zones_to_time_matrix(zones=boundaries, id_col="OA21CD") + travel_times = zones_to_time_matrix(zones=boundaries, id_col=zone_id) list_of_modes = activity_chains["mode"].unique() print(f"Unique modes found in the dataset are: {list_of_modes}") From 5739d89b1256aa9128181a22f9f302441912d955 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:38:53 +0100 Subject: [PATCH 07/15] remove hardcoded region --- scripts/1_prep_synthpop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/1_prep_synthpop.py b/scripts/1_prep_synthpop.py index 069b5dc2..4d3d0e11 100644 --- a/scripts/1_prep_synthpop.py +++ b/scripts/1_prep_synthpop.py @@ -13,7 +13,6 @@ def main(config_file): # Pick a region with SPC output saved path = acbm.root_path / "data/external/spc_output/raw/" - region = "leeds" # Add people and households spc_people_hh = ( From b4148cf77fedb333bfd05842081a00b8b1dffaad Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 16:04:19 +0100 Subject: [PATCH 08/15] update filtering logic to match spc zones --- scripts/0_preprocess_inputs.py | 38 +++++++++++++++++++--------- src/acbm/preprocessing.py | 45 ---------------------------------- 2 files changed, 27 insertions(+), 56 deletions(-) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index e0716d82..fe629c63 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -1,16 +1,21 @@ import geopandas as gpd +import pandas as pd +from uatk_spc import Reader import acbm from acbm.cli import acbm_cli from acbm.config import load_config from acbm.logger_config import preprocessing_logger as logger -from acbm.preprocessing import edit_boundary_resolution, filter_boundaries +from acbm.preprocessing import edit_boundary_resolution @acbm_cli def main(config_file): config = load_config(config_file) config.init_rng() + region = config.region + # Pick a region with SPC output saved + spc_path = acbm.root_path / "data/external/spc_output/raw/" # ----- BOUNDARIES logger.info("Preprocessing Boundary Layer") @@ -25,25 +30,36 @@ def main(config_file): boundaries = boundaries.to_crs(epsg=4326) - ## Dissolve boundaries if resolution is MSOA + ## --- Dissolve boundaries if resolution is MSOA boundary_geography = config.parameters.boundary_geography # can only be OA or MSOA logger.info(f"2. Dissolving boundaries to {boundary_geography} level") boundaries = edit_boundary_resolution(boundaries, boundary_geography) - ## Filter to study area + ## --- Filter to study area + # we filter using msoa21cd values, which exist regardless of the boundary resolution logger.info("3. Filtering boundaries to specified study area") - # TODO get from config and log - # logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}") - - boundaries_filtered = filter_boundaries( - # boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"] - boundaries=boundaries, - column=config.parameters.boundary_filter_column, - values=config.parameters.boundary_filter_values, + + # Step 1: Get zones from SPC (these will be 2011 MSOAs) + spc = Reader(spc_path, region, backend="pandas") + zones_in_region = list(spc.info_per_msoa.keys()) + + # Step 2: Filter boundaries to identified zones + + # a) get MSOA11CD to MSOA21CD lookup + msoa_lookup = pd.read_csv( + acbm.root_path + / "data/external/MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv" ) + # Filter msoa_lookup to include only rows where MSOA11CD is in zones_in_region + msoa_lookup_filtered = msoa_lookup[msoa_lookup["MSOA11CD"].isin(zones_in_region)] + # Extract the corresponding MSOA21CD values + msoa21cd_values = msoa_lookup_filtered["MSOA21CD"].tolist() + + # b) filter boundaries to include only rows where MSOA21CD is in msoa21cd_values + boundaries_filtered = boundaries[boundaries["MSOA21CD"].isin(msoa21cd_values)] ## Save the output as parquet logger.info( diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 7c47fd81..9b2542be 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -54,51 +54,6 @@ def edit_boundary_resolution( return study_area -# TODO: create spatial filter option -def filter_boundaries(boundaries, column, values): - """ - Filter the boundaries GeoDataFrame by the specified column and values. - - Parameters - ---------- - - boundaries: gpd.GeoDataFrame): The GeoDataFrame containing the boundaries. - column: str - The column to filter by (e.g., 'LEP22NM1', 'LAD22NM', 'rgn22nm'). - values: list - The list of values to keep in the specified column. - - Returns - ------- - gpd.GeoDataFrame - The filtered GeoDataFrame. - - Raises - ------ - ValueError - If the specified column does not exist in the GeoDataFrame. - If any of the specified values are not present in the column. - """ - - # Check if the column exists in the GeoDataFrame - if column not in boundaries.columns: - error_message = f"Column '{column}' does not exist in the GeoDataFrame." - raise ValueError(error_message) - - # Check if all values are present in the specified column - unique_values = boundaries[column].unique() - missing_values = [value for value in values if value not in unique_values] - if missing_values: - error_message = ( - f"Values {missing_values} are not present in the column '{column}'. " - f"Unique values in the column are: {unique_values}" - ) - raise ValueError(error_message) - - # Filter boundaries layer by column = values - return boundaries[boundaries[column].isin(values)] - - # ----- MATCHING From 9dc2127af142582056d1f4326eb75e8f79da62a5 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 16:08:00 +0100 Subject: [PATCH 09/15] remove filter parameters --- src/acbm/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/acbm/config.py b/src/acbm/config.py index 72213202..9805836b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -14,8 +14,6 @@ class Parameters(BaseModel): zone_id: str travel_times: bool boundary_geography: str - boundary_filter_column: str - boundary_filter_values: list[str] @dataclass(frozen=True) From bf65e49a12bff040cc97aea78b4d164334052638 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 16:20:39 +0100 Subject: [PATCH 10/15] fix column drop error --- src/acbm/preprocessing.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 9b2542be..cbf4d25b 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -33,17 +33,23 @@ def edit_boundary_resolution( A GeoDataFrame containing the study area boundaries with the specified geography """ - # Drop unnecessary columns - columns_to_drop = ["GlobalID", "OA21CD", "LSOA21CD", "LSOA21NM"] - study_area = study_area.drop( - columns=[col for col in columns_to_drop if col in study_area.columns] - ) - # Dissolve based on the specified geography if geography == "MSOA": + # Drop unnecessary columns (they are lower level than MSOA) + columns_to_drop = ["GlobalID", "OA21CD", "LSOA21CD", "LSOA21NM"] + study_area = study_area.drop( + columns=[col for col in columns_to_drop if col in study_area.columns] + ) + print("converting from OA to MSOA") study_area = study_area.dissolve(by="MSOA21CD").reset_index() + elif geography == "OA": + # Drop unnecessary columns + columns_to_drop = ["GlobalID"] + study_area = study_area.drop( + columns=[col for col in columns_to_drop if col in study_area.columns] + ) print("keeping original OA boundaries") # Ensure all geometries are MultiPolygon From 8d162755f897225dc271187ce1474f32f62d62d8 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 16:42:44 +0100 Subject: [PATCH 11/15] always keep msoa21cd --- src/acbm/preprocessing.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index cbf4d25b..95c025a1 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -12,7 +12,7 @@ def edit_boundary_resolution( - study_area: gpd.GeoDataFrame, geography: str + study_area: gpd.GeoDataFrame, geography: str, zone_id: str ) -> gpd.GeoDataFrame: """ This function takes a GeoDataFrame and a geography resolution as input and returns @@ -26,6 +26,8 @@ def edit_boundary_resolution( A GeoDataFrame containing the study area boundaries geography : str A string specifying the geography resolution. It can be either "OA" or "MSOA" + zone_id : str + The column name of the zone identifier in the study_area GeoDataFrame Returns ------- @@ -36,20 +38,16 @@ def edit_boundary_resolution( # Dissolve based on the specified geography if geography == "MSOA": # Drop unnecessary columns (they are lower level than MSOA) - columns_to_drop = ["GlobalID", "OA21CD", "LSOA21CD", "LSOA21NM"] - study_area = study_area.drop( - columns=[col for col in columns_to_drop if col in study_area.columns] - ) + study_area = study_area[[zone_id, "geometry"]] print("converting from OA to MSOA") study_area = study_area.dissolve(by="MSOA21CD").reset_index() elif geography == "OA": # Drop unnecessary columns - columns_to_drop = ["GlobalID"] - study_area = study_area.drop( - columns=[col for col in columns_to_drop if col in study_area.columns] - ) + study_area = study_area[ + [zone_id, "MSOA21CD", "geometry"] + ] # we always need MSOA21CD to filter to study area print("keeping original OA boundaries") # Ensure all geometries are MultiPolygon From 9d7d5b7a8d3ff0f1997570be30e167dab4440af2 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 16:44:30 +0100 Subject: [PATCH 12/15] add zone_id argument --- scripts/0_preprocess_inputs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py index fe629c63..ed8aa190 100644 --- a/scripts/0_preprocess_inputs.py +++ b/scripts/0_preprocess_inputs.py @@ -35,7 +35,9 @@ def main(config_file): boundary_geography = config.parameters.boundary_geography # can only be OA or MSOA logger.info(f"2. Dissolving boundaries to {boundary_geography} level") - boundaries = edit_boundary_resolution(boundaries, boundary_geography) + boundaries = edit_boundary_resolution( + study_area=boundaries, geography=boundary_geography, zone_id=config.zone_id + ) ## --- Filter to study area # we filter using msoa21cd values, which exist regardless of the boundary resolution From 7deda465a685a9b56472a509923ad1bd23906197 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Fri, 4 Oct 2024 16:45:10 +0100 Subject: [PATCH 13/15] update configs --- config/base.toml | 2 -- config/base_500.toml | 2 -- config/base_5000.toml | 3 --- config/base_all.toml | 2 -- 4 files changed, 9 deletions(-) diff --git a/config/base.toml b/config/base.toml index ef15a84f..38a4f2f8 100644 --- a/config/base.toml +++ b/config/base.toml @@ -5,8 +5,6 @@ number_of_households = 10000 zone_id = "OA21CD" travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" -boundary_filter_column = "LEP22NM1" -boundary_filter_values = ["Leeds City Region"] [work_assignment] diff --git a/config/base_500.toml b/config/base_500.toml index 69529f7f..d9164a49 100644 --- a/config/base_500.toml +++ b/config/base_500.toml @@ -5,8 +5,6 @@ number_of_households = 500 zone_id = "OA21CD" travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" -boundary_filter_column = "LEP22NM1" -boundary_filter_values = ["Leeds City Region"] [work_assignment] diff --git a/config/base_5000.toml b/config/base_5000.toml index 5a3f61af..d4f01343 100644 --- a/config/base_5000.toml +++ b/config/base_5000.toml @@ -5,9 +5,6 @@ number_of_households = 5000 zone_id = "OA21CD" travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" -boundary_filter_column = "LEP22NM1" -boundary_filter_values = ["Leeds City Region"] - [work_assignment] use_percentages = true diff --git a/config/base_all.toml b/config/base_all.toml index dc2ec8e3..bb1cc1ee 100644 --- a/config/base_all.toml +++ b/config/base_all.toml @@ -4,8 +4,6 @@ region = "leeds" zone_id = "OA21CD" travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" -boundary_filter_column = "LEP22NM1" -boundary_filter_values = ["Leeds City Region"] [work_assignment] From 346a91734603f7a062b0207eea7e8231d5c281ad Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:54:57 +0100 Subject: [PATCH 14/15] get commute_level from config --- scripts/3.2.2_assign_primary_zone_work.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 1552f39b..82ee9ee2 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -63,7 +63,7 @@ def main(config_file): # Commuting matrices (from 2021 census) # TODO: consider making this configurable - commute_level = "OA" # "OA" or "MSOA" data + commute_level = config.boundary_geography # "OA" or "MSOA" data logger.info(f"Loading commuting matrices at {commute_level} level") From 5e47d0c9e8d2b4bc74e90dcfa03630e4b03b4cc1 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:57:15 +0100 Subject: [PATCH 15/15] add error message if boundary_geography is not OA / MSOA --- src/acbm/preprocessing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py index 95c025a1..b5f3c5c6 100644 --- a/src/acbm/preprocessing.py +++ b/src/acbm/preprocessing.py @@ -50,6 +50,10 @@ def edit_boundary_resolution( ] # we always need MSOA21CD to filter to study area print("keeping original OA boundaries") + else: + msg = f"Invalid geography: '{geography}'. Expected 'OA' or 'MSOA'." + raise ValueError(msg) + # Ensure all geometries are MultiPolygon study_area["geometry"] = study_area["geometry"].apply( lambda geom: MultiPolygon([geom]) if geom.geom_type == "Polygon" else geom