Commit
Merge pull request #29 from ecmwf-projects/check-for-missing-variables
Systematic checks for missing variables
Showing 5 changed files with 219 additions and 117 deletions.
@@ -0,0 +1,85 @@
import os
from pathlib import Path

import pandas
import pytest
import sqlalchemy as sa

from cdsobs.api import run_ingestion_pipeline
from cdsobs.cdm.api import open_netcdf
from cdsobs.cdm.lite import auxiliary_variable_names
from cdsobs.ingestion.core import get_aux_vars_from_service_definition
from cdsobs.observation_catalogue.models import Catalogue
from cdsobs.service_definition.api import get_service_definition
from cdsobs.storage import S3Client
from tests.test_api import TEST_API_PARAMETERS
from tests.utils import get_test_years


@pytest.mark.parametrize("dataset_name,source", TEST_API_PARAMETERS)
def test_run_ingestion_pipeline(
    dataset_name, source, test_session, test_config, caplog, tmp_path
):
    start_year, end_year = get_test_years(source)
    service_definition = get_service_definition(dataset_name)
    os.environ["CADSOBS_AVOID_MULTIPROCESS"] = "0"
    run_ingestion_pipeline(
        dataset_name,
        service_definition,
        source,
        test_session,
        test_config,
        start_year=start_year,
        end_year=end_year,
        update=False,
    )
    # Check variables
    variable_check_results_file = Path("variable_check_results.csv")
    index_cols = ["dataset_name", "dataset_source"]
    if variable_check_results_file.exists():
        results = pandas.read_csv(variable_check_results_file, index_col=index_cols)
    else:
        results = pandas.DataFrame(
            columns=[
                "dataset_name",
                "dataset_source",
                "in_file_not_in_descriptions",
                "in_descriptions_not_in_file",
            ]
        ).set_index(index_cols)
    # Get the file
    asset = test_session.scalar(
        sa.select(Catalogue.asset).where(Catalogue.dataset == dataset_name)
    )
    s3client = S3Client.from_config(test_config.s3config)
    asset_filename = asset.split("/")[1]
    asset_local_path = Path(tmp_path, asset_filename)
    s3client.download_file(
        s3client.get_bucket_name(dataset_name), asset_filename, asset_local_path
    )
    dataset = open_netcdf(asset_local_path, decode_variables=True)
    # Get variables in file
    variables_in_file = set(
        dataset.columns.tolist() + dataset.observed_variable.unique().tolist()
    )
    # Get expected variables according to service definition file
    aux_variables = get_aux_vars_from_service_definition(service_definition, source)
    expected_variables = set(service_definition.sources[source].descriptions) - set(
        aux_variables
    )
    # Here we add some more variables to expected variables
    for v in [
        "observed_variable",
        "observation_value",
        "units",
    ] + auxiliary_variable_names:
        if v in variables_in_file:
            expected_variables.add(v)
    in_file_not_in_descriptions = tuple(variables_in_file - expected_variables)
    in_descriptions_not_in_file = tuple(expected_variables - variables_in_file)

    results.loc[(dataset_name, source), :] = pandas.Series(
        index=("in_file_not_in_descriptions", "in_descriptions_not_in_file"),
        data=[in_file_not_in_descriptions, in_descriptions_not_in_file],
    )
    results.to_csv(variable_check_results_file)
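
After a parametrized run over TEST_API_PARAMETERS, each dataset/source pair appends a row to variable_check_results.csv. The snippet below is an illustrative sketch, not part of this commit: it only assumes the file name and column names used by the test above, plus the fact that the tuples are written to CSV as their string representation (an empty tuple becomes "()").

# Sketch: list dataset/source pairs where the check recorded a mismatch.
import pandas

results = pandas.read_csv(
    "variable_check_results.csv", index_col=["dataset_name", "dataset_source"]
)
# Tuples round-trip through to_csv as strings, so "()" marks a clean result.
mismatches = results[
    (results["in_file_not_in_descriptions"] != "()")
    | (results["in_descriptions_not_in_file"] != "()")
]
print(mismatches)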