diff --git a/.gitignore b/.gitignore index c74b2f26..143eb173 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ Pipfile.lock #backend backend/data backend/.env + +.DS_Store diff --git a/API/api_worker.py b/API/api_worker.py index 096f2f0f..c538b42c 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -16,6 +16,7 @@ # Reader imports from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer +from src.post_processing.processor import PostProcessor from src.config import ALLOW_BIND_ZIP_FILTER from src.config import CELERY_BROKER_URL as celery_broker_uri from src.config import CELERY_RESULT_BACKEND as celery_backend @@ -39,6 +40,7 @@ RawDataCurrentParams, RawDataOutputType, ) +from src.post_processing.processor import PostProcessor if ENABLE_SOZIP: # Third party imports @@ -75,7 +77,12 @@ def create_readme_content(default_readme, polygon_stats): def zip_binding( - working_dir, exportname_parts, geom_dump, polygon_stats, default_readme + working_dir, + exportname_parts, + geom_dump, + polygon_stats, + geojson_stats, + default_readme, ): logging.debug("Zip Binding Started!") upload_file_path = os.path.join( @@ -88,6 +95,9 @@ def zip_binding( ), } + if geojson_stats: + additional_files["stats.json"] = geojson_stats + for name, content in additional_files.items(): temp_path = os.path.join(working_dir, name) with open(temp_path, "w") as f: @@ -209,11 +219,60 @@ def process_raw_data(self, params, user=None): file_parts, ) - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts) - inside_file_size = 0 + # Post-proccessing: Generate GeoJSON/HTML stats and transliterations polygon_stats = None + geojson_stats_html = None + geojson_stats_json = None + download_html_url = None + if "include_stats" or "include_translit" in params.dict(): + post_processor = PostProcessor( + { + "include_stats": params.include_stats, + "include_translit": params.include_translit, + } + ) + + if params.include_stats: 
+ post_processor.filters = params.filters + + post_processor.init() + + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts, post_processor.post_process_line) + + if params.include_stats: + geojson_stats_json = json.dumps(post_processor.geoJSONStats.dict()) + + # Create a HTML summary of stats + if params.include_stats_html: + tpl = "stats" + if "waterway" in post_processor.geoJSONStats.config.keys: + tpl = "stats_waterway" + elif "highway" in post_processor.geoJSONStats.config.keys: + tpl = "stats_highway" + elif "building" in post_processor.geoJSONStats.config.keys: + tpl = "stats_building" + project_root = pathlib.Path(__file__).resolve().parent + tpl_path = os.path.join( + project_root, + "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl), + ) + geojson_stats_html = post_processor.geoJSONStats.html( + tpl_path + ).build() + upload_html_path = os.path.join( + working_dir, os.pardir, f"{exportname_parts[-1]}.html" + ) + with open(upload_html_path, "w") as f: + f.write(geojson_stats_html) + + else: + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts) + + inside_file_size = 0 if "include_stats" in params.dict(): if params.include_stats: feature = { @@ -222,12 +281,14 @@ def process_raw_data(self, params, user=None): "properties": {}, } polygon_stats = PolygonStats(feature).get_summary_stats() + if bind_zip: upload_file_path, inside_file_size = zip_binding( working_dir=working_dir, exportname_parts=exportname_parts, geom_dump=geom_dump, polygon_stats=polygon_stats, + geojson_stats=geojson_stats_json, default_readme=DEFAULT_README_TEXT, ) @@ -240,6 +301,7 @@ def process_raw_data(self, params, user=None): upload_file_path = file_path inside_file_size += os.path.getsize(file_path) break # only take one file inside dir , if contains many it should be inside zip + # check if download url will be generated from s3 or not from config if 
use_s3_to_upload: file_transfer_obj = S3FileTransfer() @@ -253,7 +315,6 @@ def process_raw_data(self, params, user=None): pattern = r"(hotosm_project_)(\d+)" match = re.match(pattern, exportname) if match: - prefix = match.group(1) project_number = match.group(2) if project_number: upload_name = f"TM/{project_number}/{exportname}" @@ -272,6 +333,15 @@ def process_raw_data(self, params, user=None): upload_name, file_suffix="zip" if bind_zip else params.output_type.lower(), ) + + # If there's an HTML file, upload it too + if geojson_stats_html: + download_html_url = file_transfer_obj.upload( + upload_html_path, + upload_name, + file_suffix="html", + ) + else: # give the static file download url back to user served from fastapi static export path download_url = str(upload_file_path) @@ -297,6 +367,9 @@ def process_raw_data(self, params, user=None): } if polygon_stats: final_response["stats"] = polygon_stats + if download_html_url: + final_response["download_html_url"] = download_html_url + return final_response except Exception as ex: diff --git a/backend/field_update b/backend/field_update index b663498b..0c55c83b 100644 --- a/backend/field_update +++ b/backend/field_update @@ -114,7 +114,6 @@ class Database: try: self.cursor.execute(query) self.conn.commit() - # print(query) try: result = self.cursor.fetchall() diff --git a/backend/raw_backend b/backend/raw_backend index 5ddd55c8..4a852eb3 100644 --- a/backend/raw_backend +++ b/backend/raw_backend @@ -256,7 +256,7 @@ if __name__ == "__main__": if not args.replication: osm2pgsql.append("--drop") - print(osm2pgsql) + run_subprocess_cmd(osm2pgsql) basic_index_cmd = [ diff --git a/requirements.txt b/requirements.txt index c295c164..46eb24da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,3 +54,10 @@ psutil==5.9.8 ## logging tqdm==4.66.2 + +# stats for geojson data +geojson-stats==0.2.4 + +# transliterations +transliterate==1.10.2 + diff --git a/src/app.py b/src/app.py index fb5e82d5..3ca2ad6c 100644 --- 
a/src/app.py +++ b/src/app.py @@ -47,6 +47,7 @@ from psycopg2.extras import DictCursor from slugify import slugify from tqdm import tqdm +from .post_processing.processor import PostProcessor # Reader imports from src.config import ( @@ -640,7 +641,7 @@ def ogr_export(query, outputtype, working_dir, dump_temp_path, params): os.remove(query_path) @staticmethod - def query2geojson(con, extraction_query, dump_temp_file_path): + def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn=None): """Function written from scratch without being dependent on any library, Provides better performance for geojson binding""" # creating geojson file pre_geojson = """{"type": "FeatureCollection","features": [""" @@ -660,10 +661,12 @@ def query2geojson(con, extraction_query, dump_temp_file_path): for row in cursor: if first: first = False - f.write(row[0]) else: f.write(",") - f.write(row[0]) + if plugin_fn: + f.write(plugin_fn(row[0])) + else: + f.write((row[0])) cursor.close() # closing connection to avoid memory issues # close the writing geojson with last part f.write(post_geojson) @@ -711,7 +714,7 @@ def get_grid_id(geom, cur): country_export, ) - def extract_current_data(self, exportname): + def extract_current_data(self, exportname, plugin_fn=None): """Responsible for Extracting rawdata current snapshot, Initially it creates a geojson file , Generates query , run it with 1000 chunk size and writes it directly to the geojson file and closes the file after dump Args: exportname: takes filename as argument to create geojson file passed from routers @@ -777,6 +780,7 @@ def extract_current_data(self, exportname): country_export=country_export, ), dump_temp_file_path, + plugin_fn, ) # uses own conversion class if output_type == RawDataOutputType.SHAPEFILE.value: ( @@ -1488,8 +1492,29 @@ def process_export_format(export_format): layer_creation_options=layer_creation_options_str, query_dump_path=export_format_path, ) + run_ogr2ogr_cmd(ogr2ogr_cmd) + # Post-processing 
CONFIG_AREA = ["building"]
CONFIG_LENGTH = ["highway", "waterway"]


class GeoJSONStats(Stats):
    """Collects stats while processing GeoJSON files line by line."""

    def __init__(self, filters, *args, **kwargs):
        """
        Args:
            filters: raw-data request filters; when tag filters are present,
                matching keys are registered for per-key stats collection.
        """
        super().__init__(*args, **kwargs)

        self.config.clean = True
        # Raw-data exports nest OSM tags under properties.tags
        self.config.properties_prop = "properties.tags"

        if filters and filters.tags:
            # Area stats (building footprints) vs length stats (highway/waterway)
            for tag in CONFIG_AREA:
                if self.check_filter(filters.tags, tag):
                    self._track(tag)
                    self.config.area = True
            for tag in CONFIG_LENGTH:
                if self.check_filter(filters.tags, tag):
                    self._track(tag)
                    self.config.length = True

    def _track(self, tag):
        """Register *tag* for detailed per-key stats collection."""
        self.config.keys.append(tag)
        self.config.value_keys.append(tag)

    @staticmethod
    def _tag_in_geometry_filter(geometry_filter, tag):
        """True when *tag* occurs in a geometry filter's join_or/join_and lists."""
        if not geometry_filter:
            return False
        if geometry_filter.join_or and tag in geometry_filter.join_or:
            return True
        return bool(geometry_filter.join_and and tag in geometry_filter.join_and)

    def check_filter(self, tags, tag):
        """
        Check if a tag is present in tag filters.

        Returns:
            bool: True when found. (The original returned an implicit None on
            the not-found path; an explicit False is safer for callers.)
        """
        return any(
            self._tag_in_geometry_filter(geometry_filter, tag)
            for geometry_filter in (tags.all_geometry, tags.polygon, tags.line)
        )

    def raw_data_line_stats(self, json_object: dict):
        """
        Accumulate stats for one parsed GeoJSON feature.

        Only internal counters are updated; the object itself is not modified
        or returned.
        """
        self.get_object_stats(json_object)

    def html(self, tpl):
        """
        Return a stats Html object, generated from stats data using the
        template file *tpl*.
        """
        return Html(tpl, self)
class PostProcessor:
    """Used for post-processing data while processing GeoJSON files line by line."""

    def __init__(self, options, *args, **kwargs):
        """
        Args:
            options: dict of flags -- include_stats, include_translit and
                (optionally) include_stats_html.
        """
        # State is per-instance. The previous class-level mutable defaults
        # (options / filters / functions) were shared by every PostProcessor,
        # so repeated exports inside one worker process kept accumulating each
        # other's registered processing functions.
        self.options = options
        self.filters = {}
        self.functions = []

    def post_process_line(self, line: str):
        """
        Parse a serialized GeoJSON feature, run every registered processing
        function over it, and return it re-serialized.
        """
        line_object = json.loads(line)

        for fn in self.functions:
            fn(line_object)

        return json.dumps(line_object)

    def custom(self, categories, export_format_path, export_filename, file_export_path):
        """
        Post-process a custom export: stream the GeoJSON file through the
        registered functions, then optionally write stats.json and an HTML
        stats summary next to the export.
        """
        include_stats = self.options.get("include_stats")
        include_translit = self.options.get("include_translit")

        category_tag = ""
        length_tag = False
        if any("Roads" in element for element in categories):
            category_tag = "highway"
            length_tag = True
        elif any("Buildings" in element for element in categories):
            category_tag = "building"
        elif any("Waterways" in element for element in categories):
            # Bug fix: was "Waterway" (capitalized), which never matches the
            # lowercase OSM key and broke waterway stats collection.
            category_tag = "waterway"
            length_tag = True

        if include_stats:
            # Stats config is only touched when stats are enabled: the original
            # configured self.geoJSONStats unconditionally and crashed with
            # AttributeError on transliteration-only runs.
            # Custom exports keep tags directly under "properties".
            self.geoJSONStats.config.properties_prop = "properties"
            if category_tag:
                self.geoJSONStats.config.keys.append(category_tag)
                self.geoJSONStats.config.value_keys.append(category_tag)
                if length_tag:
                    self.geoJSONStats.config.length = True
                else:
                    self.geoJSONStats.config.area = True

        path_input = os.path.join(export_format_path, f"{export_filename}.geojson")
        path_output = os.path.join(export_format_path, f"{export_filename}-post.geojson")

        with open(path_input, "r") as input_file, open(path_output, "w") as output_file:
            for line in input_file:
                comma = False
                if line.startswith('{ "type": "Feature"'):
                    # ogr2ogr writes one feature per line, usually with a
                    # trailing ",\n" -- strip it before parsing and restore
                    # the comma afterwards.
                    if line[-2:-1] == ",":
                        json_string = line[:-2]
                        comma = True
                    else:
                        json_string = line
                    line = self.post_process_line(json_string)
                if include_translit:
                    # Only transliteration changes the features, so the
                    # rewritten file is only produced (and kept) in that case.
                    output_file.write(line + "," if comma else line)

        if include_translit:
            os.remove(path_input)
            os.rename(path_output, path_input)
        else:
            os.remove(path_output)

        if include_stats:
            with open(os.path.join(file_export_path, "stats.json"), "w") as f:
                f.write(json.dumps(self.geoJSONStats.dict()))

            # .get(): include_stats_html may be absent from the options dict
            # (the raw-data worker builds PostProcessor without it); the
            # original raised KeyError in that case.
            if self.options.get("include_stats_html"):
                tpl = f"stats_{category_tag}" if category_tag else "stats"
                project_root = pathlib.Path(__file__).resolve().parent
                tpl_path = os.path.join(project_root, f"{tpl}_tpl.html")
                geojson_stats_html = self.geoJSONStats.html(tpl_path).build()
                upload_html_path = os.path.join(file_export_path, "stats-summary.html")
                with open(upload_html_path, "w") as f:
                    f.write(geojson_stats_html)

    def init(self):
        """
        Initialize the post-processor: register the processing functions
        selected by the options flags. Must be called before processing lines.
        """
        if self.options.get("include_stats"):
            self.geoJSONStats = GeoJSONStats(self.filters)
            self.functions.append(self.geoJSONStats.raw_data_line_stats)

        if self.options.get("include_translit"):
            self.transliterator = Transliterator()
            self.functions.append(self.transliterator.translit)
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/stats_highway_tpl.html b/src/post_processing/stats_highway_tpl.html new file mode 100644 index 00000000..be09c355 --- /dev/null +++ b/src/post_processing/stats_highway_tpl.html @@ -0,0 +1,164 @@ + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/stats_tpl.html b/src/post_processing/stats_tpl.html new file mode 100644 index 00000000..7359ad13 --- /dev/null +++ b/src/post_processing/stats_tpl.html @@ -0,0 +1,161 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
+ + diff --git a/src/post_processing/stats_waterway_tpl.html b/src/post_processing/stats_waterway_tpl.html new file mode 100644 index 00000000..af11d163 --- /dev/null +++ b/src/post_processing/stats_waterway_tpl.html @@ -0,0 +1,165 @@ + + + + + + + + + + + HOT Export Stats + + + +
+
+ +
+

${count}

+

Number of Features

+

Elements identified as distinct

+
+
+

${languages_count}

+

Languages Available

+

Including local language and english

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeyCount%
Total features${count}100%
${key_0}${key_0_count}${key_0_percent}
${key_1}${key_1_count}${key_1_percent}
${key_2}${key_2_count}${key_2_percent}
${key_3}${key_3_count}${key_3_percent}
${key_4}${key_4_count}${key_4_percent}
${key_5}${key_5_count}${key_5_percent}
+
+
class Transliterator:
    """Used for transliterating names while processing GeoJSON files line by line."""

    # Property container holding the tags; a caller may repoint this at
    # "properties.tags" for raw-data exports that nest tags one level deeper.
    props = "properties"

    def __init__(self):
        self.available_language_codes = get_available_language_codes()
        self.name_tags = [f"name:{code}" for code in self.available_language_codes]

    def translit(self, line):
        """
        Transliterate name:* tags and add each result under a new tag suffixed
        with "-translit". Mutates *line* in place; existing -translit tags are
        never overwritten.
        """
        # Resolve the tag container once instead of re-branching on self.props
        # for every language code (it is loop-invariant), and write through the
        # same dict object -- identical effect on `line`.
        if self.props == "properties.tags":
            prop = line["properties"]["tags"]
        else:
            prop = line["properties"]

        for code in self.available_language_codes:
            tag = f"name:{code}"
            if tag in prop:
                translit_tag = f"{tag}-translit"
                if translit_tag not in prop:
                    prop[translit_tag] = translit(prop[tag], code, reversed=True)
example={ @@ -610,6 +618,9 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): Fields: - iso3 (Optional[str]): ISO3 Country Code. + - include_stats (bool): Include a JSON file with stats. Available for GeoJSON exports only. + - include_stats_html (bool): Include a HTML file with a stats summary. Available for GeoJSON exports only. + - include_translit (bool): Add transliterations. Available for GeoJSON exports only. - dataset (Optional[DatasetConfig]): Dataset Configurations for HDX Upload. - meta (bool): Dumps Meta db in parquet format & HDX config JSON to S3. - hdx_upload (bool): Enable/Disable uploading the dataset to HDX. @@ -624,6 +635,18 @@ class DynamicCategoriesModel(CategoriesBase, GeometryValidatorMixin): max_length=3, example="USA", ) + include_stats: Optional[bool] = Field( + default=False, + description="Include a JSON file with stats. Available for GeoJSON exports only.", + ) + include_stats_html: Optional[bool] = Field( + default=False, + description="Include a HTML file with a stats summary. Available for GeoJSON exports only.", + ) + include_translit: Optional[bool] = Field( + default=False, + description="Add transliterations. 
Available for GeoJSON exports only.", + ) geometry: Optional[ Union[Polygon, MultiPolygon, Feature, FeatureCollection] ] = Field( diff --git a/tests/test_API.py b/tests/test_API.py index 5651dffa..433b133c 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -9,6 +9,7 @@ from API.main import app client = TestClient(app) +client.base_url = "http://127.0.0.1:8000" access_token = os.environ.get("ACCESS_TOKEN") @@ -740,33 +741,6 @@ def test_snapshot_authentication_uuid(): }, "uuid": False, } - - response = client.post("/v1/snapshot/", json=payload, headers=headers) - - assert response.status_code == 200 - res = response.json() - track_link = res["track_link"] - wait_for_task_completion(track_link) - - -def test_snapshot_bind_zip(): - headers = {"access-token": access_token} - payload = { - "geometry": { - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - "bindZip": False, - } - response = client.post("/v1/snapshot/", json=payload, headers=headers) assert response.status_code == 200 @@ -955,6 +929,38 @@ def test_hdx_submit_normal_iso3(): wait_for_task_completion(track_link) +def test_hdx_submit_normal_iso3_with_stats(): + headers = {"access-token": access_token} + payload = { + "iso3": "NPL", + "hdx_upload": False, + "include_stats": True, + "include_translit": True, + "include_stats_html": True, + "categories": [ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": ["name", "highway"], + "where": "tags['highway'] IS NOT NULL", + "formats": ["geojson"], + } + } + ], + } + + response = client.post("/v1/custom/snapshot/", json=payload, headers=headers) + + assert response.status_code == 200 + 
res = response.json() + track_link = res["track_link"] + wait_for_task_completion(track_link) + + def test_hdx_submit_normal_iso3_multiple_format(): headers = {"access-token": access_token} payload = {