diff --git a/.gitignore b/.gitignore
index c74b2f26..143eb173 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ Pipfile.lock
 #backend
 backend/data
 backend/.env
+
+.DS_Store
diff --git a/API/api_worker.py b/API/api_worker.py
index 096f2f0f..c538b42c 100644
--- a/API/api_worker.py
+++ b/API/api_worker.py
@@ -16,6 +16,7 @@
 # Reader imports
 from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer
+from src.post_processing.processor import PostProcessor
 from src.config import ALLOW_BIND_ZIP_FILTER
 from src.config import CELERY_BROKER_URL as celery_broker_uri
 from src.config import CELERY_RESULT_BACKEND as celery_backend
@@ -39,6 +40,7 @@
     RawDataCurrentParams,
     RawDataOutputType,
 )
+from src.post_processing.processor import PostProcessor

 if ENABLE_SOZIP:
     # Third party imports
@@ -75,7 +77,12 @@ def create_readme_content(default_readme, polygon_stats):


 def zip_binding(
-    working_dir, exportname_parts, geom_dump, polygon_stats, default_readme
+    working_dir,
+    exportname_parts,
+    geom_dump,
+    polygon_stats,
+    geojson_stats,
+    default_readme,
 ):
     logging.debug("Zip Binding Started!")
     upload_file_path = os.path.join(
@@ -88,6 +95,9 @@ def zip_binding(
         ),
     }

+    if geojson_stats:
+        additional_files["stats.json"] = geojson_stats
+
     for name, content in additional_files.items():
         temp_path = os.path.join(working_dir, name)
         with open(temp_path, "w") as f:
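`additional_files` maps extra filenames to string content that `zip_binding` writes into the working directory before zipping, which is how the new stats payload ships inside the download. A minimal sketch of the new branch, with an illustrative path and a made-up stats payload:

```python
import json
import os

working_dir = "/tmp/export-demo"  # illustrative path
os.makedirs(working_dir, exist_ok=True)

# Hypothetical payload, shaped like json.dumps(GeoJSONStats.dict()) output.
geojson_stats = json.dumps({"count": 5207, "keys": {"building": 5207}})

additional_files = {"Readme.txt": "export readme"}  # stand-in content
if geojson_stats:
    # stats.json lands next to the export files and is zipped with them
    additional_files["stats.json"] = geojson_stats

for name, content in additional_files.items():
    with open(os.path.join(working_dir, name), "w") as f:
        f.write(content)
```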
@@ -209,11 +219,60 @@ def process_raw_data(self, params, user=None):
                 file_parts,
             )

-        geom_area, geom_dump, working_dir = RawData(
-            params, str(self.request.id)
-        ).extract_current_data(file_parts)
-        inside_file_size = 0
+        # Post-processing: generate GeoJSON/HTML stats and transliterations
         polygon_stats = None
+        geojson_stats_html = None
+        geojson_stats_json = None
+        download_html_url = None
+        if "include_stats" in params.dict() or "include_translit" in params.dict():
+            post_processor = PostProcessor(
+                {
+                    "include_stats": params.include_stats,
+                    "include_translit": params.include_translit,
+                }
+            )
+
+            if params.include_stats:
+                post_processor.filters = params.filters
+
+            post_processor.init()
+
+            geom_area, geom_dump, working_dir = RawData(
+                params, str(self.request.id)
+            ).extract_current_data(file_parts, post_processor.post_process_line)
+
+            if params.include_stats:
+                geojson_stats_json = json.dumps(post_processor.geoJSONStats.dict())
+
+                # Create an HTML summary of the stats
+                if params.include_stats_html:
+                    tpl = "stats"
+                    if "waterway" in post_processor.geoJSONStats.config.keys:
+                        tpl = "stats_waterway"
+                    elif "highway" in post_processor.geoJSONStats.config.keys:
+                        tpl = "stats_highway"
+                    elif "building" in post_processor.geoJSONStats.config.keys:
+                        tpl = "stats_building"
+                    project_root = pathlib.Path(__file__).resolve().parent
+                    tpl_path = os.path.join(
+                        project_root,
+                        "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl),
+                    )
+                    geojson_stats_html = post_processor.geoJSONStats.html(
+                        tpl_path
+                    ).build()
+                    upload_html_path = os.path.join(
+                        working_dir, os.pardir, f"{exportname_parts[-1]}.html"
+                    )
+                    with open(upload_html_path, "w") as f:
+                        f.write(geojson_stats_html)
+
+        else:
+            geom_area, geom_dump, working_dir = RawData(
+                params, str(self.request.id)
+            ).extract_current_data(file_parts)
+
+        inside_file_size = 0
         if "include_stats" in params.dict():
             if params.include_stats:
                 feature = {
@@ -222,12 +281,14 @@ def process_raw_data(self, params, user=None):
                     "properties": {},
                 }
                 polygon_stats = PolygonStats(feature).get_summary_stats()
+
         if bind_zip:
             upload_file_path, inside_file_size = zip_binding(
                 working_dir=working_dir,
                 exportname_parts=exportname_parts,
                 geom_dump=geom_dump,
                 polygon_stats=polygon_stats,
+                geojson_stats=geojson_stats_json,
                 default_readme=DEFAULT_README_TEXT,
             )

@@ -240,6 +301,7 @@ def process_raw_data(self, params, user=None):
                 upload_file_path = file_path
                 inside_file_size += os.path.getsize(file_path)
                 break  # only take one file inside dir , if contains many it should be inside zip
+
         # check if download url will be generated from s3 or not from config
         if use_s3_to_upload:
             file_transfer_obj = S3FileTransfer()
@@ -253,7 +315,6 @@ def process_raw_data(self, params, user=None):
                 pattern = r"(hotosm_project_)(\d+)"
                 match = re.match(pattern, exportname)
                 if match:
-                    prefix = match.group(1)
                     project_number = match.group(2)
                     if project_number:
                         upload_name = f"TM/{project_number}/{exportname}"
@@ -272,6 +333,15 @@ def process_raw_data(self, params, user=None):
                 upload_name,
                 file_suffix="zip" if bind_zip else params.output_type.lower(),
             )
+
+            # If there's an HTML file, upload it too
+            if geojson_stats_html:
+                download_html_url = file_transfer_obj.upload(
+                    upload_html_path,
+                    upload_name,
+                    file_suffix="html",
+                )
+
         else:
             # give the static file download url back to user served from fastapi static export path
             download_url = str(upload_file_path)
@@ -297,6 +367,9 @@ def process_raw_data(self, params, user=None):
         }
         if polygon_stats:
             final_response["stats"] = polygon_stats
+        if download_html_url:
+            final_response["download_html_url"] = download_html_url
+
         return final_response

     except Exception as ex:
diff --git a/backend/field_update b/backend/field_update
index b663498b..0c55c83b 100644
--- a/backend/field_update
+++ b/backend/field_update
@@ -114,7 +114,6 @@ class Database:
         try:
             self.cursor.execute(query)
             self.conn.commit()
-            # print(query)

             try:
                 result = self.cursor.fetchall()
diff --git a/backend/raw_backend b/backend/raw_backend
index 5ddd55c8..4a852eb3 100644
--- a/backend/raw_backend
+++ b/backend/raw_backend
@@ -256,7 +256,7 @@ if __name__ == "__main__":
         if not args.replication:
             osm2pgsql.append("--drop")

-        print(osm2pgsql)
+        run_subprocess_cmd(osm2pgsql)

         basic_index_cmd = [
diff --git a/requirements.txt b/requirements.txt
index c295c164..46eb24da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -54,3 +54,10 @@ psutil==5.9.8

 ## logging
 tqdm==4.66.2
+
+# stats for geojson data
+geojson-stats==0.2.4
+
+# transliterations
+transliterate==1.10.2
+
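Taken together, the worker now threads every streamed feature line through `PostProcessor.post_process_line`. A minimal sketch of that round trip under the module layout added in this PR (the feature line is illustrative, and the exact `dict()` output depends on the geojson-stats package):

```python
import json

from src.post_processing.processor import PostProcessor

# Options mirror the request params checked in process_raw_data above.
post_processor = PostProcessor({"include_stats": True, "include_translit": False})
post_processor.init()  # registers the stats collector (and transliterator, when enabled)

# One newline-delimited feature, as streamed out of RawData.extract_current_data.
line = json.dumps(
    {
        "type": "Feature",
        "geometry": {"type": "Point", "coordinates": [85.3, 27.7]},
        "properties": {"tags": {"building": "yes"}},
    }
)
processed = post_processor.post_process_line(line)  # returns the (possibly cleaned) line
print(post_processor.geoJSONStats.dict())  # aggregate stats collected so far
```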
diff --git a/src/app.py b/src/app.py
index fb5e82d5..3ca2ad6c 100644
--- a/src/app.py
+++ b/src/app.py
@@ -47,6 +47,7 @@
 from psycopg2.extras import DictCursor
 from slugify import slugify
 from tqdm import tqdm
+from .post_processing.processor import PostProcessor

 # Reader imports
 from src.config import (
@@ -640,7 +641,7 @@ def ogr_export(query, outputtype, working_dir, dump_temp_path, params):
         os.remove(query_path)

     @staticmethod
-    def query2geojson(con, extraction_query, dump_temp_file_path):
+    def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn=None):
         """Function written from scratch without being dependent on any library,
         Provides better performance for geojson binding"""
         # creating geojson file
         pre_geojson = """{"type": "FeatureCollection","features": ["""
@@ -660,10 +661,12 @@ def query2geojson(con, extraction_query, dump_temp_file_path):
         for row in cursor:
             if first:
                 first = False
-                f.write(row[0])
             else:
                 f.write(",")
-                f.write(row[0])
+            if plugin_fn:
+                f.write(plugin_fn(row[0]))
+            else:
+                f.write(row[0])
         cursor.close()  # closing connection to avoid memory issues
         # close the writing geojson with last part
         f.write(post_geojson)
@@ -711,7 +714,7 @@ def get_grid_id(geom, cur):
             country_export,
         )

-    def extract_current_data(self, exportname):
+    def extract_current_data(self, exportname, plugin_fn=None):
         """Responsible for Extracting rawdata current snapshot, Initially it creates a geojson file , Generates query , run it with 1000 chunk size and writes it directly to the geojson file and closes the file after dump
         Args:
             exportname: takes filename as argument to create geojson file passed from routers
@@ -777,6 +780,7 @@ def extract_current_data(self, exportname):
                     country_export=country_export,
                 ),
                 dump_temp_file_path,
+                plugin_fn,
             )  # uses own conversion class
         if output_type == RawDataOutputType.SHAPEFILE.value:
             (
@@ -1488,8 +1492,29 @@ def process_export_format(export_format):
                 layer_creation_options=layer_creation_options_str,
                 query_dump_path=export_format_path,
             )
+
             run_ogr2ogr_cmd(ogr2ogr_cmd)

+            # Post-processing GeoJSON files
+            # Adds: stats, HTML stats summary and transliterations
+            if export_format.driver_name == "GeoJSON" and (
+                self.params.include_stats or self.params.include_translit
+            ):
+                post_processor = PostProcessor(
+                    {
+                        "include_stats": self.params.include_stats,
+                        "include_translit": self.params.include_translit,
+                        "include_stats_html": self.params.include_stats_html,
+                    }
+                )
+                post_processor.init()
+                post_processor.custom(
+                    categories=self.params.categories,
+                    export_format_path=export_format_path,
+                    export_filename=export_filename,
+                    file_export_path=file_export_path,
+                )
+
             zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip")
             zip_path = self.file_to_zip(export_format_path, zip_file_path)
diff --git a/src/config.py b/src/config.py
index 3f6ab649..cc889318 100644
--- a/src/config.py
+++ b/src/config.py
@@ -334,6 +334,7 @@ def not_raises(func, *args, **kwargs):
         logging.error(
             "Error creating HDX configuration: %s, Disabling the hdx exports feature", e
         )
+        ENABLE_HDX_EXPORTS = False


 if ENABLE_HDX_EXPORTS:
diff --git a/src/post_processing/__init__.py b/src/post_processing/__init__.py
new file mode 100644
index 00000000..e69de29b
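The `plugin_fn` hook keeps `query2geojson` streaming: each row is still written straight to the file, optionally routed through a single transform that takes and returns one serialized feature. A self-contained sketch of that contract (the `rows` list stands in for the server-side cursor, and the `processed` property is illustrative):

```python
import json

def plugin_fn(line: str) -> str:
    """Example transform: parse one feature line, touch it, re-serialize it."""
    obj = json.loads(line)
    obj["properties"]["processed"] = True
    return json.dumps(obj)

rows = ['{"type": "Feature", "geometry": null, "properties": {}}']
with open("out.geojson", "w") as f:
    f.write('{"type": "FeatureCollection","features": [')
    first = True
    for row in rows:
        if first:
            first = False
        else:
            f.write(",")
        # identical to the new branch in query2geojson above
        f.write(plugin_fn(row) if plugin_fn else row)
    f.write("]}")
```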
diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py
new file mode 100644
index 00000000..9ca74d68
--- /dev/null
+++ b/src/post_processing/geojson_stats.py
@@ -0,0 +1,61 @@
+from geojson_stats.stats import Stats
+from geojson_stats.html import Html
+
+CONFIG_AREA = ["building"]
+CONFIG_LENGTH = ["highway", "waterway"]
+
+
+class GeoJSONStats(Stats):
+    """Used for collecting stats while processing GeoJSON files line by line"""
+
+    def __init__(self, filters, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.config.clean = True
+        self.config.properties_prop = "properties.tags"
+
+        if filters and filters.tags:
+            for tag in CONFIG_AREA:
+                if self.check_filter(filters.tags, tag):
+                    self.config.keys.append(tag)
+                    self.config.value_keys.append(tag)
+                    self.config.area = True
+
+            for tag in CONFIG_LENGTH:
+                if self.check_filter(filters.tags, tag):
+                    self.config.keys.append(tag)
+                    self.config.value_keys.append(tag)
+                    self.config.length = True
+
+    def check_filter(self, tags, tag):
+        """
+        Check if a tag is present in the tag filters
+        """
+        if tags.all_geometry:
+            if tags.all_geometry.join_or and tag in tags.all_geometry.join_or:
+                return True
+            if tags.all_geometry.join_and and tag in tags.all_geometry.join_and:
+                return True
+        if tags.polygon:
+            if tags.polygon.join_or and tag in tags.polygon.join_or:
+                return True
+            if tags.polygon.join_and and tag in tags.polygon.join_and:
+                return True
+        if tags.line:
+            if tags.line.join_or and tag in tags.line.join_or:
+                return True
+            if tags.line.join_and and tag in tags.line.join_and:
+                return True
+
+    def raw_data_line_stats(self, json_object: dict):
+        """
+        Process one GeoJSON feature object, updating the running stats in place
+        """
+        self.get_object_stats(json_object)
+
+    def html(self, tpl):
+        """
+        Returns a stats Html object, generated from stats data using a template
+        """
+        return Html(tpl, self)
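`check_filter` only inspects `join_or`/`join_and` on the `all_geometry`, `polygon` and `line` groups, so whether area or length stats get enabled depends entirely on the request's tag filters. A quick sketch with `SimpleNamespace` stand-ins for the project's pydantic filter models (attribute shapes assumed from this diff):

```python
from types import SimpleNamespace as NS

from src.post_processing.geojson_stats import GeoJSONStats

# Stand-ins for the request filter models; join_or/join_and map tag -> values.
tags = NS(
    all_geometry=NS(join_or={"building": []}, join_and=None),
    polygon=None,
    line=NS(join_or=None, join_and={"highway": []}),
)
filters = NS(tags=tags)

stats = GeoJSONStats(filters)
# "building" matches via all_geometry.join_or -> area stats enabled
# "highway" matches via line.join_and -> length stats enabled
print(stats.config.keys)  # expected: ['building', 'highway']
```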
diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py
new file mode 100644
index 00000000..44feccff
--- /dev/null
+++ b/src/post_processing/processor.py
@@ -0,0 +1,120 @@
+import json
+import os
+import pathlib
+
+from .geojson_stats import GeoJSONStats
+from .transliterator import Transliterator
+
+
+class PostProcessor:
+    """Used to post-process data while processing GeoJSON files line by line"""
+
+    def __init__(self, options, *args, **kwargs):
+        self.options = options
+        self.filters = {}
+        self.functions = []
+
+    def post_process_line(self, line: str):
+        """
+        Parses a line, runs the registered functions over it and returns it
+        """
+        line_object = json.loads(line)
+
+        for fn in self.functions:
+            fn(line_object)
+
+        return json.dumps(line_object)
+
+    def custom(
+        self, categories, export_format_path, export_filename, file_export_path
+    ):
+        """
+        Post-process custom exports
+        """
+        self.geoJSONStats.config.properties_prop = "properties"
+
+        category_tag = ""
+        if any("Roads" in element for element in categories):
+            category_tag = "highway"
+            self.geoJSONStats.config.length = True
+        elif any("Buildings" in element for element in categories):
+            category_tag = "building"
+            self.geoJSONStats.config.area = True
+        elif any("Waterways" in element for element in categories):
+            category_tag = "waterway"
+            self.geoJSONStats.config.length = True
+
+        if self.options["include_stats"]:
+            if category_tag:
+                self.geoJSONStats.config.keys.append(category_tag)
+                self.geoJSONStats.config.value_keys.append(category_tag)
+
+            path_input = os.path.join(export_format_path, f"{export_filename}.geojson")
+            path_output = os.path.join(
+                export_format_path, f"{export_filename}-post.geojson"
+            )
+
+            with open(path_input, "r") as input_file, open(
+                path_output, "w"
+            ) as output_file:
+                for line in input_file:
+                    comma = False
+                    if line.startswith('{ "type": "Feature"'):
+                        json_string = ""
+                        if line[-2:-1] == ",":
+                            json_string = line[:-2]
+                            comma = True
+                        else:
+                            json_string = line
+                        line = self.post_process_line(json_string)
+                    if self.options["include_translit"]:
+                        if comma:
+                            output_file.write(line + ",")
+                        else:
+                            output_file.write(line)
+
+            if self.options["include_translit"]:
+                os.remove(path_input)
+                os.rename(path_output, path_input)
+            else:
+                os.remove(path_output)
+
+            geojson_stats_json = json.dumps(self.geoJSONStats.dict())
+            with open(
+                os.path.join(file_export_path, "stats.json"),
+                "w",
+            ) as f:
+                f.write(geojson_stats_json)
+
+            if self.options["include_stats_html"]:
+                tpl = (
+                    "stats_{category_tag}".format(category_tag=category_tag)
+                    if category_tag
+                    else "stats"
+                )
+                project_root = pathlib.Path(__file__).resolve().parent
+                tpl_path = os.path.join(
+                    project_root,
+                    "{tpl}_tpl.html".format(tpl=tpl),
+                )
+                geojson_stats_html = self.geoJSONStats.html(tpl_path).build()
+                upload_html_path = os.path.join(file_export_path, "stats-summary.html")
+                with open(upload_html_path, "w") as f:
+                    f.write(geojson_stats_html)
+
+    def init(self):
+        """
+        Initialize the post-processor
+        """
+        if "include_stats" in self.options and self.options["include_stats"]:
+            self.geoJSONStats = GeoJSONStats(self.filters)
+            self.functions.append(self.geoJSONStats.raw_data_line_stats)
+
+        if "include_translit" in self.options and self.options["include_translit"]:
+            self.transliterator = Transliterator()
+            self.functions.append(self.transliterator.translit)
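In `custom()`, the GeoJSON comes from ogr2ogr, which writes one feature per line with a trailing comma on all but the last; the `comma` bookkeeping strips it before parsing and restores it on write. A minimal sketch of that loop body (the sample lines are illustrative, and the inline `post_process_line` stand-in just re-serializes):

```python
import json

def post_process_line(line: str) -> str:
    """Stand-in for PostProcessor.post_process_line: parse and re-serialize."""
    return json.dumps(json.loads(line))

lines = [
    '{ "type": "Feature", "properties": {}, "geometry": null },\n',  # mid-file line
    '{ "type": "Feature", "properties": {}, "geometry": null }\n',   # last feature
]
for line in lines:
    comma = False
    if line.startswith('{ "type": "Feature"'):
        if line[-2:-1] == ",":       # '...},\n' -> drop ',\n' before json.loads
            json_string = line[:-2]
            comma = True
        else:
            json_string = line
        line = post_process_line(json_string)
    print(line + "," if comma else line)  # restore the separator on output
```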
diff --git a/src/post_processing/stats_building_tpl.html b/src/post_processing/stats_building_tpl.html
new file mode 100644
index 00000000..97c3f659
--- /dev/null
+++ b/src/post_processing/stats_building_tpl.html
@@ -0,0 +1,165 @@
[165 lines of HTML. This template, and the three sibling templates this PR also adds (stats_tpl.html, stats_highway_tpl.html, stats_waterway_tpl.html, referenced by the tpl selection logic above), all render the same stats summary page: the notes "Elements identified as distinct" and "Including local language and english", plus a placeholder-driven table. Only the table content is recoverable here:

Key            | Count           | %
-------------- | --------------- | -----------------
Total features | ${count}        | 100%
${key_0}       | ${key_0_count}  | ${key_0_percent}
${key_1}       | ${key_1_count}  | ${key_1_percent}
${key_2}       | ${key_2_count}  | ${key_2_percent}
${key_3}       | ${key_3_count}  | ${key_3_percent}
${key_4}       | ${key_4_count}  | ${key_4_percent}
${key_5}       | ${key_5_count}  | ${key_5_percent}]
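The `${...}` tokens in these templates follow Python's `string.Template` placeholder syntax; `Html(tpl_path, stats).build()` presumably substitutes them from the collected stats. A rough illustration of that substitution step with dummy values (not the real `build()` implementation):

```python
from string import Template

row = "Total features | ${count} | 100%\n${key_0} | ${key_0_count} | ${key_0_percent}"
print(
    Template(row).safe_substitute(
        count=5207, key_0="building", key_0_count=5207, key_0_percent="100%"
    )
)
```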