From 8656bbd3f39d37419b367637ef8176ed84282e67 Mon Sep 17 00:00:00 2001 From: Marie Laurent Date: Thu, 29 Aug 2024 17:15:23 +0200 Subject: [PATCH 1/4] feat: addition municipalities info to households and activities --- synthesis/output.py | 32 ++++++++++++++--------- synthesis/population/enriched.py | 1 + synthesis/population/spatial/locations.py | 7 +++++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/synthesis/output.py b/synthesis/output.py index b970e59b..e7d99127 100644 --- a/synthesis/output.py +++ b/synthesis/output.py @@ -69,7 +69,7 @@ def execute(context): ).drop_duplicates("household_id") df_households = df_households[[ - "household_id", + "household_id","iris_id", "commune_id", "departement_id", "car_availability", "bike_availability", "number_of_vehicles", "number_of_bikes", "income", @@ -107,9 +107,28 @@ def execute(context): df_activities["preceding_trip_index"] = df_activities["following_trip_index"].shift(1) df_activities.loc[df_activities["is_first"], "preceding_trip_index"] = -1 df_activities["preceding_trip_index"] = df_activities["preceding_trip_index"].astype(int) + # Prepare spatial data sets + df_locations = context.stage("synthesis.population.spatial.locations")[[ + "person_id", "iris_id", "commune_id","departement_id","region_id","activity_index", "geometry" + ]] + df_activities = pd.merge(df_activities, df_locations[[ + "person_id", "iris_id", "commune_id","departement_id","region_id","activity_index", "geometry" + ]], how = "left", on = ["person_id", "activity_index"]) + + # Prepare spatial activities + df_spatial = gpd.GeoDataFrame(df_activities[[ + "person_id", "household_id", "activity_index", + "preceding_trip_index", "following_trip_index", + "purpose", "start_time", "end_time", + "is_first", "is_last", "geometry" + ]], crs = df_locations.crs) + df_spatial["purpose"] = df_spatial["purpose"].astype(str) + + # Write activities df_activities = df_activities[[ "person_id", "household_id", "activity_index", + "iris_id", 
"commune_id","departement_id","region_id", "preceding_trip_index", "following_trip_index", "purpose", "start_time", "end_time", "is_first", "is_last" @@ -168,18 +187,7 @@ def execute(context): df_vehicle_types.to_csv("%s/%svehicle_types.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") df_vehicles.to_csv("%s/%svehicles.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") - # Prepare spatial data sets - df_locations = context.stage("synthesis.population.spatial.locations")[[ - "person_id", "activity_index", "geometry" - ]] - - df_activities = pd.merge(df_activities, df_locations[[ - "person_id", "activity_index", "geometry" - ]], how = "left", on = ["person_id", "activity_index"]) - # Write spatial activities - df_spatial = gpd.GeoDataFrame(df_activities, crs = df_locations.crs) - df_spatial["purpose"] = df_spatial["purpose"].astype(str) if "gpkg" in output_formats: path = "%s/%sactivities.gpkg" % (output_path, output_prefix) df_spatial.to_file(path, driver = "GPKG") diff --git a/synthesis/population/enriched.py b/synthesis/population/enriched.py index 94a9ee6b..115efa10 100644 --- a/synthesis/population/enriched.py +++ b/synthesis/population/enriched.py @@ -26,6 +26,7 @@ def execute(context): df_population = context.stage("synthesis.population.sampled")[[ "person_id", "household_id", "census_person_id", "census_household_id", + "iris_id", "commune_id", "departement_id", "age", "sex", "employed", "studies", "number_of_vehicles", "household_size", "consumption_units", "socioprofessional_class" diff --git a/synthesis/population/spatial/locations.py b/synthesis/population/spatial/locations.py index 5277fd19..2397e095 100644 --- a/synthesis/population/spatial/locations.py +++ b/synthesis/population/spatial/locations.py @@ -9,6 +9,7 @@ def configure(context): context.stage("synthesis.population.activities") context.stage("synthesis.population.sampled") + context.stage("data.spatial.iris") def 
execute(context): df_home = context.stage("synthesis.population.spatial.home.locations") @@ -57,4 +58,10 @@ def execute(context): assert not df_locations["geometry"].isna().any() df_locations = gpd.GeoDataFrame(df_locations, crs = df_home.crs) + # add municipalities + df_iris = context.stage("data.spatial.iris") + df_iris = gpd.GeoDataFrame(df_iris, crs = df_home.crs) + + df_locations = gpd.sjoin(df_locations,df_iris,how="left") + return df_locations From 01aac938c062fb33e7fc4c8c1c31338c2c14a97d Mon Sep 17 00:00:00 2001 From: Marie Laurent Date: Tue, 3 Sep 2024 17:49:44 +0200 Subject: [PATCH 2/4] update tests & improve municipalities for house --- synthesis/output.py | 41 +++++++++++++++++--------------- synthesis/population/enriched.py | 1 - tests/test_determinism.py | 12 +++++----- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/synthesis/output.py b/synthesis/output.py index e7d99127..a93eeb4d 100644 --- a/synthesis/output.py +++ b/synthesis/output.py @@ -63,23 +63,6 @@ def execute(context): output_prefix = context.config("output_prefix") output_formats = context.config("output_formats") - # Prepare households - df_households = context.stage("synthesis.population.enriched").rename( - columns = { "household_income": "income" } - ).drop_duplicates("household_id") - - df_households = df_households[[ - "household_id","iris_id", "commune_id", "departement_id", - "car_availability", "bike_availability", - "number_of_vehicles", "number_of_bikes", - "income", - "census_household_id" - ]] - if "csv" in output_formats: - df_households.to_csv("%s/%shouseholds.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") - if "parquet" in output_formats: - df_households.to_parquet("%s/%shouseholds.parquet" % (output_path, output_prefix)) - # Prepare persons df_persons = context.stage("synthesis.population.enriched").rename( columns = { "has_license": "has_driving_license" } @@ -119,11 +102,12 @@ def execute(context): # Prepare spatial
activities df_spatial = gpd.GeoDataFrame(df_activities[[ "person_id", "household_id", "activity_index", + "iris_id", "commune_id","departement_id","region_id", "preceding_trip_index", "following_trip_index", "purpose", "start_time", "end_time", "is_first", "is_last", "geometry" ]], crs = df_locations.crs) - df_spatial["purpose"] = df_spatial["purpose"].astype(str) + df_spatial = df_spatial.astype({'purpose': 'str', "departement_id": 'str'}) # Write activities df_activities = df_activities[[ @@ -139,6 +123,25 @@ def execute(context): if "parquet" in output_formats: df_activities.to_parquet("%s/%sactivities.parquet" % (output_path, output_prefix)) + # Prepare households + df_households = context.stage("synthesis.population.enriched").rename( + columns = { "household_income": "income" } + ).drop_duplicates("household_id") + + df_households = pd.merge(df_households,df_activities[df_activities["purpose"] == "home"][["household_id", + "iris_id", "commune_id","departement_id","region_id"]].drop_duplicates("household_id"),how="left") + df_households = df_households[[ + "household_id","iris_id", "commune_id", "departement_id","region_id", + "car_availability", "bike_availability", + "number_of_vehicles", "number_of_bikes", + "income", + "census_household_id" + ]] + if "csv" in output_formats: + df_households.to_csv("%s/%shouseholds.csv" % (output_path, output_prefix), sep = ";", index = None, lineterminator = "\n") + if "parquet" in output_formats: + df_households.to_parquet("%s/%shouseholds.parquet" % (output_path, output_prefix)) + # Prepare trips df_trips = context.stage("synthesis.population.trips").rename( columns = { @@ -200,7 +203,7 @@ def execute(context): df_spatial_homes = df_spatial[ df_spatial["purpose"] == "home" ].drop_duplicates("household_id")[[ - "household_id", "geometry" + "household_id","iris_id", "commune_id","departement_id","region_id", "geometry" ]] if "gpkg" in output_formats: path = "%s/%shomes.gpkg" % (output_path, output_prefix) diff --git 
a/synthesis/population/enriched.py b/synthesis/population/enriched.py index 115efa10..94a9ee6b 100644 --- a/synthesis/population/enriched.py +++ b/synthesis/population/enriched.py @@ -26,7 +26,6 @@ def execute(context): df_population = context.stage("synthesis.population.sampled")[[ "person_id", "household_id", "census_person_id", "census_household_id", - "iris_id", "commune_id", "departement_id", "age", "sex", "employed", "studies", "number_of_vehicles", "household_size", "consumption_units", "socioprofessional_class" diff --git a/tests/test_determinism.py b/tests/test_determinism.py index 763e567e..ac84087a 100644 --- a/tests/test_determinism.py +++ b/tests/test_determinism.py @@ -68,17 +68,17 @@ def _test_determinism(index, data_path, tmpdir): synpp.run(stages, config, working_directory = cache_path) REFERENCE_CSV_HASHES = { - "ile_de_france_activities.csv": "e520003e1876a9542ff1a955a6efcfdc", - "ile_de_france_households.csv": "709ce7ded8a2487e6691d4fb3374754b", + "ile_de_france_activities.csv": "53c44fb4026d2037729ee8ff1c8fb93f", + "ile_de_france_households.csv": "ca2a29ef13467326f937638f1ff8be1a", "ile_de_france_persons.csv": "ddbe9b418c915b14e888b54efbdf9b1e", "ile_de_france_trips.csv": "6c5f3427e41e683da768eeb53796a806", } REFERENCE_GPKG_HASHES = { - "ile_de_france_activities.gpkg": "9cf9a5fd8927c709927f7a940f86efbf", - "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3", - "ile_de_france_homes.gpkg": "033d1aa7a5350579cbd5e8213b9736f2", - "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8", + "ile_de_france_activities.gpkg": "50c11d1be6ef2f29a718c574da7fd8a3", + "ile_de_france_commutes.gpkg": "4a62b544376981d689c1b5fe88865398", + "ile_de_france_homes.gpkg": "cda4719021b02726164f8ace43ab20f4", + "ile_de_france_trips.gpkg": "d7581bd60a4ad1ad2c473170455f744f", } generated_csv_hashes = { From 752471c5b776b941413e1970556cc0bb75d049bc Mon Sep 17 00:00:00 2001 From: Marie Laurent Date: Mon, 16 Sep 2024 14:51:50 +0200 Subject: [PATCH 3/4] 
first try correction test & changelog --- CHANGELOG.md | 1 + tests/test_determinism.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3f30918..468795dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ **Under development** +- feat: add municipality information to households and activities - chore: update to `eqasim-java` commit `ece4932` - feat: vehicles and vehicle types are now always generated - feat: read vehicles data from zip files diff --git a/tests/test_determinism.py b/tests/test_determinism.py index 2f964f06..295c56ec 100644 --- a/tests/test_determinism.py +++ b/tests/test_determinism.py @@ -77,10 +77,10 @@ def _test_determinism(index, data_path, tmpdir): } REFERENCE_GPKG_HASHES = { - "ile_de_france_activities.gpkg": "50c11d1be6ef2f29a718c574da7fd8a3", - "ile_de_france_commutes.gpkg": "4a62b544376981d689c1b5fe88865398", - "ile_de_france_homes.gpkg": "cda4719021b02726164f8ace43ab20f4", - "ile_de_france_trips.gpkg": "d7581bd60a4ad1ad2c473170455f744f", + "ile_de_france_activities.gpkg": "9cf9a5fd8927c709927f7a940f86efbf", + "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3", + "ile_de_france_homes.gpkg": "033d1aa7a5350579cbd5e8213b9736f2", + "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8", } generated_csv_hashes = { From 94cfda9976550909a996bee3e11ea20f637414ca Mon Sep 17 00:00:00 2001 From: Marie Laurent Date: Mon, 16 Sep 2024 15:39:53 +0200 Subject: [PATCH 4/4] fix: test gpkg hashes --- tests/test_determinism.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_determinism.py b/tests/test_determinism.py index 295c56ec..e2755d7a 100644 --- a/tests/test_determinism.py +++ b/tests/test_determinism.py @@ -77,9 +77,9 @@ def _test_determinism(index, data_path, tmpdir): } REFERENCE_GPKG_HASHES = { - "ile_de_france_activities.gpkg": "9cf9a5fd8927c709927f7a940f86efbf", + "ile_de_france_activities.gpkg": 
"884eec1fd0c29904284eb4362ff89be1", "ile_de_france_commutes.gpkg": "5a4180390a69349cc655c07c5671e8d3", - "ile_de_france_homes.gpkg": "033d1aa7a5350579cbd5e8213b9736f2", + "ile_de_france_homes.gpkg": "a85e973f0e2f51031cd60170d351845e", "ile_de_france_trips.gpkg": "d0aec4033cfc184bf1b91ae13a537ef8", }