-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path0_preprocess_inputs.py
78 lines (57 loc) · 2.53 KB
/
0_preprocess_inputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import geopandas as gpd
import pandas as pd
from uatk_spc import Reader
import acbm
from acbm.cli import acbm_cli
from acbm.config import load_config
from acbm.logger_config import preprocessing_logger as logger
from acbm.preprocessing import edit_boundary_resolution
@acbm_cli
def main(config_file):
    """Prepare the study-area boundary layer and write it out as GeoJSON.

    Steps, in order:

    1. Load the configuration from ``config_file`` and initialise its RNG.
    2. Read the England boundary layer and reproject it to EPSG:4326.
    3. Pass the layer through ``edit_boundary_resolution`` using the
       configured ``boundary_geography`` and ``zone_id``.
    4. Ask the SPC reader which zones belong to the configured region,
       map those MSOA11CD codes to MSOA21CD via the lookup CSV, and keep
       only the boundary rows whose MSOA21CD is in that set.
    5. Save the result to
       ``data/external/boundaries/study_area_zones.geojson`` under
       ``acbm.root_path``.

    Args:
        config_file: Forwarded unchanged to ``load_config``.

    Returns:
        None. The result of this step is the file written to disk.
    """
    cfg = load_config(config_file)
    cfg.init_rng()
    study_region = cfg.region

    # Directory with the raw SPC output; the chosen region must have SPC
    # output saved there.
    raw_spc_dir = acbm.root_path / "data/external/spc_output/raw/"

    # ----- BOUNDARIES
    logger.info("Preprocessing Boundary Layer")

    ## --- Load the England-wide layer and reproject it to EPSG:4326
    logger.info("1. Reading in the boundary layer for the whole of England")
    england_layer = gpd.read_file(
        acbm.root_path / "data/external/boundaries/oa_england.geojson"
    ).to_crs(epsg=4326)

    ## --- Bring the layer to the configured resolution
    geography = cfg.parameters.boundary_geography  # can only be OA or MSOA
    logger.info(f"2. Dissolving boundaries to {geography} level")
    zone_layer = edit_boundary_resolution(
        study_area=england_layer,
        geography=geography,
        zone_id=cfg.zone_id,
    )

    ## --- Restrict the layer to the study area
    # The filter is on MSOA21CD, which is present whatever the resolution is.
    logger.info("3. Filtering boundaries to specified study area")

    # Zones known to SPC for this region (these will be 2011 MSOAs)
    spc_reader = Reader(raw_spc_dir, study_region, backend="pandas")
    region_msoa11 = list(spc_reader.info_per_msoa.keys())

    # Translate the 2011 codes into their 2021 counterparts
    lookup = pd.read_csv(
        acbm.root_path
        / "data/external/MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv"
    )
    in_region = lookup["MSOA11CD"].isin(region_msoa11)
    region_msoa21 = lookup.loc[in_region, "MSOA21CD"].tolist()

    # Keep only the boundary rows that fall inside the region
    study_area = zone_layer.loc[zone_layer["MSOA21CD"].isin(region_msoa21)]

    ## --- Write the study-area zones to disk as GeoJSON
    logger.info(
        f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path"
    )
    study_area.to_file(
        acbm.root_path / "data/external/boundaries/study_area_zones.geojson",
        driver="GeoJSON",
    )
# Script entry point: run the preprocessing step only when this file is
# executed directly, not when it is imported.
# NOTE(review): main() is called without arguments, so the acbm_cli decorator
# presumably supplies config_file (e.g. from the command line) - confirm.
if __name__ == "__main__":
    main()