DAS-2236 - Updates to methods that calculate the points used to create the dimensions #18

Open · wants to merge 16 commits into base: DAS-2208-SMAP-L3
Changes from 10 commits
244 changes: 207 additions & 37 deletions hoss/coordinate_utilities.py
@@ -5,14 +5,13 @@

import numpy as np
from netCDF4 import Dataset

# from numpy import ndarray
from pyproj import CRS, Transformer
from varinfo import VariableFromDmr, VarInfoFromDmr

from hoss.exceptions import (
IncompatibleCoordinateVariables,
InvalidCoordinateData,
InvalidCoordinateDataset,
InvalidCoordinateVariable,
MissingCoordinateVariable,
MissingVariable,
)
@@ -97,24 +96,28 @@ def get_coordinate_variables(
varinfo: VarInfoFromDmr,
requested_variables: list,
) -> tuple[list, list]:
"""This function returns latitude and longitude variables listed in the
CF-Convention coordinates metadata attribute. It returns them in a specific
order [latitude, longitude]"
"""This function returns latitude and longitude variable names from
latitude and longitude variables listed in the CF-Convention coordinates
metadata attribute. It returns them in a specific
order [latitude_name, longitude_name]"
"""

coordinate_variables_list = varinfo.get_references_for_attribute(
coordinate_variables = varinfo.get_references_for_attribute(
requested_variables, 'coordinates'
)

latitude_coordinate_variables = [
coordinate
for coordinate in coordinate_variables_list
if varinfo.get_variable(coordinate).is_latitude()
for coordinate in coordinate_variables
if varinfo.get_variable(coordinate) is not None
Member:

(Same comment for the similar change in the list comprehension for longitude_coordinate_variables)

Where do you catch any issues if there is a reference to a variable in a coordinates metadata attribute that isn't present in the file? If it's pointing to a latitude or longitude variable that isn't present in the file, at some point that's going to cause a failure when you try to access that variable. Is that gracefully handled somewhere?

(It's likely this is already nicely supported, but it's been a while since the previous PR and I just can't remember)

Member:

Separate comment - you have not added unit tests to prove this case is being handled as expected. (Needed for both latitude and longitude.)

Collaborator Author:

  • This method is first called before prefetch, so it should not fail if the variables don't exist.
  • If a variable is referenced in the metadata but not present in the file, it will fail during prefetch, which is when it is first accessed from the file. I will check whether I have a test for that.
  • There is a unit test for the function; I will add subtests for when the variables don't exist.

Collaborator Author (@sudha-murthy), Nov 20, 2024:

Member:

> if it is present in the variable and not in the file - it will fail during prefetch. That is when it is first accessed from the file. - will check if I have a test on that

That makes sense. To test that, you'd have to mock a failed response from OPeNDAP. This might be a good place to look for an example of mocking a failed request.

> There are unit tests to check for the above cases in line 205 of test_coordinate_utilities.py

That test is great, but it's not specifically for this function; it's for get_coordinate_array. The condition is present here in get_coordinate_variables, so you need to write unit tests that isolate behaviour within this function (in addition to whatever already exists for other functions).

It's entirely plausible that get_coordinate_array could be updated in some way that means those unit tests you have linked to are removed. You would then lose the coverage of those test cases for get_coordinate_variables.

Collaborator Author:

OK, but the failure case is in the functions that call it, not in get_coordinate_variables itself, right? So the tests have to be in the functions that call it.
and varinfo.get_variable(coordinate).is_latitude()
]

longitude_coordinate_variables = [
coordinate
for coordinate in coordinate_variables_list
if varinfo.get_variable(coordinate).is_longitude()
for coordinate in coordinate_variables
if varinfo.get_variable(coordinate) is not None
and varinfo.get_variable(coordinate).is_longitude()
]

return latitude_coordinate_variables, longitude_coordinate_variables
@@ -162,59 +165,226 @@ def get_coordinate_array(
return coordinate_array


def get_1D_dim_array_data_from_dimvalues(
dim_values: np.ndarray, dim_indices: np.ndarray, dim_size: int
def get_1d_dim_array_data_from_dimvalues(
dim_values: np.ndarray,
dim_indices: np.ndarray,
dim_size: int,
) -> np.ndarray:
"""
return a full dimension data array based on the 2 projected points and
grid size
return a full dimension data array based upon 2 valid projected values
(x and y) given in dim_values which are within the indices given in
dim_indices and the full dimension size provided in dim_size. The
dim_indices need to be between 0 and less than the dim_size.
returns a 1D array of size = dim_size with proper dimension array values
"""

if (dim_indices[1] != dim_indices[0]) and (dim_values[1] != dim_values[0]):
dim_resolution = (dim_values[1] - dim_values[0]) / (
dim_indices[1] - dim_indices[0]
)
else:
raise InvalidCoordinateDataset(dim_values[0], dim_indices[0])
raise InvalidCoordinateData(dim_values[0], dim_indices[0])

dim_min = dim_values[0] - (dim_resolution * dim_indices[0])
dim_max = dim_values[1] + (dim_resolution * (dim_size - 1 - dim_indices[1]))

return np.linspace(dim_min, dim_max, dim_size)
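The resolution and extent arithmetic in get_1d_dim_array_data_from_dimvalues can be sketched in isolation. The function name and sample values below are illustrative, not from the PR, but the logic mirrors the diff above: derive the spacing from two known points, extrapolate to the array edges, and fill with linspace.

```python
import numpy as np


def dim_array_from_two_points(dim_values, dim_indices, dim_size):
    """Reconstruct a full 1D dimension array from two known values at two
    known indices, assuming uniform spacing along the dimension."""
    if dim_indices[1] == dim_indices[0] or dim_values[1] == dim_values[0]:
        raise ValueError('Cannot derive a resolution from identical points')

    # Spacing between adjacent grid cells, from the two known points.
    resolution = (dim_values[1] - dim_values[0]) / (dim_indices[1] - dim_indices[0])

    # Extrapolate outward to the first and last cells of the dimension.
    dim_min = dim_values[0] - resolution * dim_indices[0]
    dim_max = dim_values[1] + resolution * (dim_size - 1 - dim_indices[1])

    return np.linspace(dim_min, dim_max, dim_size)


# Values 10.0 and 40.0 at indices 1 and 4 imply a resolution of 10.0,
# so a size-6 dimension spans 0.0 through 50.0.
full_dim = dim_array_from_two_points(np.array([10.0, 40.0]), np.array([1, 4]), 6)
```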


def get_valid_indices(
coordinate_row_col: np.ndarray, coordinate: VariableFromDmr
lat_lon_array: np.ndarray, coordinate: VariableFromDmr
) -> np.ndarray:
"""
Returns indices of a valid array without fill values if the fill
value is provided. If it is not provided, we check for valid values
for latitude and longitude
Returns an array of boolean values indicating, for a given latitude
or longitude coordinate variable, which values are valid, i.e.,
non-fill and within range. A value of True means the value is valid.
Raises InvalidCoordinateDataset for any other variable.
"""

# get_attribute_value returns a value of type `str`
coordinate_fill = coordinate.get_attribute_value('_FillValue')
if coordinate_fill is not None:
is_not_fill = ~np.isclose(coordinate_row_col, float(coordinate_fill))
is_not_fill = ~np.isclose(lat_lon_array, float(coordinate_fill))
else:
# Creates an entire array of `True` values.
is_not_fill = np.ones_like(coordinate_row_col, dtype=bool)
is_not_fill = np.ones_like(lat_lon_array, dtype=bool)

if coordinate.is_longitude():
valid_indices = np.where(
np.logical_and(
is_not_fill,
np.logical_and(
coordinate_row_col >= -180.0, coordinate_row_col <= 360.0
),
)
)[0]
valid_indices = np.logical_and(
is_not_fill,
np.logical_and(lat_lon_array >= -180.0, lat_lon_array <= 360.0),
)
elif coordinate.is_latitude():
valid_indices = np.where(
np.logical_and(
is_not_fill,
np.logical_and(coordinate_row_col >= -90.0, coordinate_row_col <= 90.0),
)
)[0]
valid_indices = np.logical_and(
is_not_fill,
np.logical_and(lat_lon_array >= -90.0, lat_lon_array <= 90.0),
)
else:
valid_indices = np.empty((0, 0))
raise InvalidCoordinateDataset(coordinate.full_name_path)

return valid_indices
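The masking logic above combines a fill-value check with a range check. A minimal standalone sketch for the latitude case (the fill value and sample array here are made up):

```python
import numpy as np

fill_value = -9999.0
latitudes = np.array([45.0, -9999.0, 120.0, -60.0])

# Exclude fill values, using isclose to tolerate float representation.
is_not_fill = ~np.isclose(latitudes, fill_value)

# Latitudes must lie within [-90, 90]; 120.0 fails this check.
in_range = np.logical_and(latitudes >= -90.0, latitudes <= 90.0)

valid = np.logical_and(is_not_fill, in_range)
```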


def get_row_col_geo_grid_points(
lat_arr: np.ndarray,
lon_arr: np.ndarray,
latitude_coordinate: VariableFromDmr,
longitude_coordinate: VariableFromDmr,
) -> tuple[list, list, list, list]:
"""
This method is used to return two sets of valid indices and the
corresponding two sets of row and col lat lon points from 2D
coordinate datasets.
"""

row_indices, col_indices = get_valid_row_col_pairs(
lat_arr, lon_arr, latitude_coordinate, longitude_coordinate
)

geo_grid_col_points = [
(lon_arr[row, col], lat_arr[row, col]) for row, col in row_indices
]

geo_grid_row_points = [
(lon_arr[row, col], lat_arr[row, col]) for row, col in col_indices
]

return (
row_indices,
col_indices,
geo_grid_row_points,
geo_grid_col_points,
)
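The list comprehensions above pair each [row, col] index with the (longitude, latitude) values at that cell. A tiny standalone illustration, with made-up coordinate arrays and indices:

```python
import numpy as np

# Hypothetical 2D coordinate arrays for a 2x3 grid.
lat_arr = np.array([[10.0, 10.0, 10.0],
                    [20.0, 20.0, 20.0]])
lon_arr = np.array([[100.0, 110.0, 120.0],
                    [100.0, 110.0, 120.0]])

# Two valid [row, col] index pairs picked along row 0.
row_indices = [[0, 0], [0, 2]]

# Look up the (lon, lat) point at each index pair, in (lon, lat) order.
geo_grid_points = [
    (lon_arr[row, col], lat_arr[row, col]) for row, col in row_indices
]
```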


def get_x_y_values_from_geographic_points(
points: list[tuple], # list of data points as tuple in (lon, lat) order
crs: CRS, # CRS object from PyProj
) -> list[tuple]: # list of X-Y points as tuple in (X, Y) order
"""Take an input list of (longitude, latitude) coordinates and project
those points to the target grid. Then return the x-y dimscales

"""
point_longitudes, point_latitudes = zip(*list(points))

from_geo_transformer = Transformer.from_crs(4326, crs)
points_x, points_y = ( # pylint: disable=unpacking-non-sequence
from_geo_transformer.transform(point_latitudes, point_longitudes)
)

return list(zip(points_x, points_y))
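The projection step above can be exercised on its own with pyproj. The sample points and the EPSG:6933 (EASE-Grid 2.0 Global) target CRS below are assumptions for illustration; note that EPSG:4326 uses (latitude, longitude) axis order, which is why the transform call takes latitudes first, matching the diff.

```python
from pyproj import CRS, Transformer

# Hypothetical sample points, in (longitude, latitude) order.
points = [(-120.0, 34.0), (-118.0, 36.0)]
crs = CRS.from_epsg(6933)  # example projected target CRS

point_longitudes, point_latitudes = zip(*points)

# EPSG:4326 axis order is (lat, lon), hence latitudes are passed first.
transformer = Transformer.from_crs(4326, crs)
points_x, points_y = transformer.transform(point_latitudes, point_longitudes)

projected = list(zip(points_x, points_y))
```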


def get_valid_row_col_pairs(
lat_arr: np.ndarray,
lon_arr: np.ndarray,
lat_coordinate: VariableFromDmr,
lon_coordinate: VariableFromDmr,
) -> tuple[list, list]:
"""
This function finds a set of indices maximally spread across
a row, and the set maximally spread across a column, with the
indices being valid in both the latitude and longitude datasets.
When interpolating between these points, the maximal spread
ensures the greatest interpolation accuracy.
"""
valid_lat_lon_mask = np.logical_and(
get_valid_indices(lat_arr, lat_coordinate),
get_valid_indices(lon_arr, lon_coordinate),
)

# get maximally spread points within rows
max_x_spread_pts = get_max_x_spread_pts(~valid_lat_lon_mask)

# Doing the same for the columns is done by transposing the valid_mask
# and then fixing the results from [x, y] to [y, x].
max_y_spread_trsp = get_max_x_spread_pts(np.transpose(~valid_lat_lon_mask))
max_y_spread_pts = [
list(np.flip(max_y_spread_trsp[0])),
list(np.flip(max_y_spread_trsp[1])),
]

return max_y_spread_pts, max_x_spread_pts
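The transpose trick above returns index pairs in [col, row] order, so each pair must be flipped back to [row, col]. A minimal sketch of that flip step, with made-up index pairs:

```python
import numpy as np

# Hypothetical result from running the row search on the transposed mask:
# each pair is in [col, row] order.
max_y_spread_trsp = [[2, 0], [2, 4]]

# Flipping each pair restores [row, col] order.
max_y_spread_pts = [
    list(np.flip(max_y_spread_trsp[0])),
    list(np.flip(max_y_spread_trsp[1])),
]
```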


def get_max_x_spread_pts(
Member:

If this function is now being used for both x and y, then it (and the local variables within it) should be renamed to ensure it is clear it isn't just for x.

Collaborator Author:

The x_spread is basically what returns the y_indices. This method is called twice, to get points for the row and column spans.

Member (@owenlittlejohns), Nov 26, 2024:

Understood - it is called twice: once to get the row and once to get the columns. That means calling the function get_max_x_spread_pts is misleading, as it implies you only call it for x. I think my original request to rename the function (and the locally scoped variables within it) still applies.

valid_mask: np.ndarray,
) -> list[list]:
"""
This function returns two data points by x, y indices that are spread farthest
from each other in the same row, i.e., have the greatest delta-x value - and
are valid data points from the valid_mask array passed in. The input array
must be a 2D Numpy mask array providing the valid data points, e.g., filtering
out fill values and out-of-range values.
- input is a Numpy mask array, e.g., marking invalid latitudes & longitudes
- returns 2 points by indices, [[y_ind, x_ind], [y_ind, x_ind]]
"""
# fill a sample array with x-index values, x_ind[i, j] = j
x_ind = np.indices((valid_mask.shape[0], valid_mask.shape[1]))[1]
# mask x_ind to hide the invalid data points
valid_x_ind = np.ma.array(x_ind, mask=valid_mask)

# ptp (peak-to-peak) finds the greatest delta-x value amongst valid points
# for each row. Result is 1D
x_ind_spread = valid_x_ind.ptp(axis=1)

# This finds which row has the greatest spread (delta-x)
max_x_spread_row = np.argmax(x_ind_spread)

# Using the row reference, find the min-x and max-x
min_x_ind = np.min(valid_x_ind[max_x_spread_row])
max_x_ind = np.max(valid_x_ind[max_x_spread_row])

return [[max_x_spread_row, min_x_ind], [max_x_spread_row, max_x_ind]]
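The masked peak-to-peak search above can be traced on a small example. The mask below is made up; following numpy.ma convention, True marks a cell to exclude (an invalid point):

```python
import numpy as np

# 3x4 invalidity mask: True = invalid cell (fill or out-of-range value).
invalid = np.array([
    [True,  False, False, True ],
    [False, False, False, False],
    [True,  True,  False, True ],
])

# x_ind[i, j] = j, so per-row peak-to-peak measures column spread.
x_ind = np.indices(invalid.shape)[1]
valid_x_ind = np.ma.array(x_ind, mask=invalid)

# Greatest delta-x amongst valid points, per row: [1, 3, 0].
spread_per_row = valid_x_ind.ptp(axis=1)

best_row = np.argmax(spread_per_row)   # row 1 has the widest spread
min_x = np.min(valid_x_ind[best_row])  # leftmost valid column in that row
max_x = np.max(valid_x_ind[best_row])  # rightmost valid column in that row
```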


def create_dimension_arrays_from_coordinates(
prefetch_dataset: Dataset,
latitude_coordinate: VariableFromDmr,
longitude_coordinate: VariableFromDmr,
crs: CRS,
projected_dimension_names: list,
) -> dict[str, np.ndarray]:
"""Generate artificial 1D dimensions scales for each
2D dimension or coordinate variable.
1) Get 2 valid geo grid points
2) convert them to a projected x-y extent
3) Generate the x-y dimscale array and return to the calling method

"""
lat_arr = get_coordinate_array(
prefetch_dataset,
latitude_coordinate.full_name_path,
)
lon_arr = get_coordinate_array(
prefetch_dataset,
longitude_coordinate.full_name_path,
)
row_size, col_size = get_row_col_sizes_from_coordinate_datasets(
lat_arr,
lon_arr,
)

row_indices, col_indices, geo_grid_row_points, geo_grid_col_points = (
get_row_col_geo_grid_points(
lat_arr,
lon_arr,
latitude_coordinate,
longitude_coordinate,
)
)
x_y_values1 = get_x_y_values_from_geographic_points(geo_grid_row_points, crs)
col_indices_for_x = np.transpose(col_indices)[1]
x_values = np.transpose(x_y_values1)[0]
x_dim = get_1d_dim_array_data_from_dimvalues(x_values, col_indices_for_x, col_size)

x_y_values2 = get_x_y_values_from_geographic_points(geo_grid_col_points, crs)
row_indices_for_y = np.transpose(row_indices)[0]
y_values = np.transpose(x_y_values2)[1]
y_dim = get_1d_dim_array_data_from_dimvalues(y_values, row_indices_for_y, row_size)

projected_y, projected_x = tuple(projected_dimension_names)
return {projected_y: y_dim, projected_x: x_dim}
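The transposes near the end of the function separate a list of pairs into parallel arrays: the x values from a list of (x, y) tuples, and the column indices from a list of [row, col] pairs. A small standalone sketch with made-up values:

```python
import numpy as np

# Hypothetical projected (x, y) pairs for two points picked along one row,
# and their corresponding [row, col] index pairs.
x_y_values = [(100.0, 5000.0), (400.0, 5000.0)]
col_indices = [[3, 1], [3, 4]]

# Transposing the list of pairs separates first elements from second:
# x values from the (x, y) tuples, column indices from the [row, col] pairs.
x_values = np.transpose(x_y_values)[0]
cols = np.transpose(col_indices)[1]
```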