diff --git a/.gitignore b/.gitignore index 157dfe7..4aa2051 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,9 @@ nosetests.xml coverage.xml *,cover +# asv environments +.asv + # Translations *.mo *.pot diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e5b6178..3a65b70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,7 @@ repos: - id: end-of-file-fixer - id: check-docstring-first - id: check-json + exclude: "asv_bench/asv.conf.json" - id: check-yaml - id: double-quote-string-fixer diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c9d9358 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1 @@ +Xbatcher's contributor guidelines [can be found in the online documentation](https://xbatcher.readthedocs.io/en/latest/contributing.html). diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json new file mode 100644 index 0000000..a3b585c --- /dev/null +++ b/asv_bench/asv.conf.json @@ -0,0 +1,188 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "xbatcher", + + // The project's homepage + "project_url": "https://xbatcher.readthedocs.io/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. 
+ // + // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["main"], // for git + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + "install_timeout": 600, + + // the base URL to show a commit for the project. + // "show_commit_url": "http://github.com/pangeo-data/xbatcher/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.8"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge"], + + // A conda environment file that is used for environment creation. + // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. 
null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``@env`` and ``@env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "@env" variables in this matrix. + // Variables in "@env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + // "matrix": { + // "xarray": [""], + // "numpy": [""], + // "dask": [""], + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. 
+ // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html" + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. 
Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py new file mode 100644 index 0000000..1f52d08 --- /dev/null +++ b/asv_bench/benchmarks/__init__.py @@ -0,0 +1,12 @@ +def parameterized(names, params): + """ + Copied from xarray benchmarks: + https://github.com/pydata/xarray/blob/main/asv_bench/benchmarks/__init__.py#L9-L15 + """ + + def decorator(func): + func.param_names = names + func.params = params + return func + + return decorator diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py new file mode 100644 index 0000000..39ef8e0 --- /dev/null +++ b/asv_bench/benchmarks/benchmarks.py @@ -0,0 +1,147 @@ +import numpy as np +import torch +import xarray as xr + +from xbatcher import BatchGenerator +from xbatcher.loaders.torch import IterableDataset, MapDataset + +from . 
import parameterized + + +class Base: + def setup(self, *args, **kwargs): + shape = (10, 50, 100) + self.ds_3d = xr.Dataset( + { + 'foo': (['time', 'y', 'x'], np.random.rand(*shape)), + }, + { + 'x': (['x'], np.arange(shape[-1])), + 'y': (['y'], np.arange(shape[-2])), + }, + ) + + shape_4d = (10, 50, 100, 3) + self.ds_4d = xr.Dataset( + { + 'foo': (['time', 'y', 'x', 'b'], np.random.rand(*shape_4d)), + }, + { + 'x': (['x'], np.arange(shape_4d[-2])), + 'y': (['y'], np.arange(shape_4d[-3])), + 'b': (['b'], np.arange(shape_4d[-1])), + }, + ) + + self.ds_xy = xr.Dataset( + { + 'x': ( + ['sample', 'feature'], + np.random.random((shape[-1], shape[0])), + ), + 'y': (['sample'], np.random.random(shape[-1])), + }, + ) + + +class Generator(Base): + @parameterized(['preload_batch'], ([True, False])) + def time_batch_preload(self, preload_batch): + """ + Construct a generator on a chunked DataSet with and without preloading + batches. + """ + ds_dask = self.ds_xy.chunk({'sample': 2}) + BatchGenerator( + ds_dask, input_dims={'sample': 2}, preload_batch=preload_batch + ) + + @parameterized( + ['input_dims', 'batch_dims', 'input_overlap'], + ( + [{'x': 5}, {'x': 10}, {'x': 5, 'y': 5}, {'x': 10, 'y': 5}], + [{}, {'x': 20}, {'x': 30}], + [{}, {'x': 1}, {'x': 2}], + ), + ) + def time_batch_input(self, input_dims, batch_dims, input_overlap): + """ + Benchmark simple batch generation case. + """ + BatchGenerator( + self.ds_3d, + input_dims=input_dims, + batch_dims=batch_dims, + input_overlap=input_overlap, + ) + + @parameterized( + ['input_dims', 'concat_input_dims'], + ([{'x': 5}, {'x': 10}, {'x': 5, 'y': 5}], [True, False]), + ) + def time_batch_concat(self, input_dims, concat_input_dims): + """ + Construct a generator on a DataSet with and without concatenating + chunks specified by ``input_dims`` into the batch dimension. 
+ """ + BatchGenerator( + self.ds_3d, + input_dims=input_dims, + concat_input_dims=concat_input_dims, + ) + + @parameterized( + ['input_dims', 'batch_dims', 'concat_input_dims'], + ( + [{'x': 5}, {'x': 5, 'y': 5}], + [{}, {'x': 10}, {'x': 10, 'y': 10}], + [True, False], + ), + ) + def time_batch_concat_4d(self, input_dims, batch_dims, concat_input_dims): + """ + Construct a generator on a DataSet with and without concatenating + chunks specified by ``input_dims`` into the batch dimension. + """ + BatchGenerator( + self.ds_4d, + input_dims=input_dims, + batch_dims=batch_dims, + concat_input_dims=concat_input_dims, + ) + + +class Accessor(Base): + @parameterized( + ['input_dims'], + ([{'x': 2}, {'x': 4}, {'x': 2, 'y': 2}, {'x': 4, 'y': 2}]), + ) + def time_accessor_input_dim(self, input_dims): + """ + Benchmark simple batch generation case using xarray accessor + Equivalent to subset of ``time_batch_input()``. + """ + self.ds_3d.batch.generator(input_dims=input_dims) + + +class TorchLoader(Base): + def setup(self, *args, **kwargs): + super().setup(*args, **kwargs) + self.x_gen = BatchGenerator(self.ds_xy['x'], {'sample': 10}) + self.y_gen = BatchGenerator(self.ds_xy['y'], {'sample': 10}) + + def time_map_dataset(self): + """ + Benchmark MapDataset integration with torch DataLoader. + """ + dataset = MapDataset(self.x_gen, self.y_gen) + loader = torch.utils.data.DataLoader(dataset) + next(iter(loader)) + + def time_iterable_dataset(self): + """ + Benchmark IterableDataset integration with torch DataLoader. 
+ """ + dataset = IterableDataset(self.x_gen, self.y_gen) + loader = torch.utils.data.DataLoader(dataset) + next(iter(loader)) diff --git a/dev-requirements.txt b/dev-requirements.txt index 34f20d8..3e14ae5 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -3,4 +3,5 @@ torch coverage pytest-cov adlfs +asv -r requirements.txt diff --git a/doc/contributing.rst b/doc/contributing.rst new file mode 100644 index 0000000..95445f8 --- /dev/null +++ b/doc/contributing.rst @@ -0,0 +1,55 @@ +.. _contributing: + +************************ +Contributing to xbatcher +************************ + +.. note:: + + Large parts of this document came from the `Xarray Contributing + Guide <https://docs.xarray.dev/en/stable/contributing.html>`_, which is based + on the `Pandas Contributing Guide + <https://pandas.pydata.org/pandas-docs/stable/contributing.html>`_. + +Running the performance test suite +---------------------------------- + +*xbatcher* is starting a suite of benchmarking tests using +`asv <https://github.com/airspeed-velocity/asv>`__ to enable easy monitoring of +the performance of critical operations. These benchmarks are all found in the +``asv_bench`` directory. + +To use all features of asv, you will need either ``conda`` or ``virtualenv``. +For more details please check the `asv installation webpage +<https://asv.readthedocs.io/en/latest/installing.html>`_. + +To install asv:: + + pip install git+https://github.com/airspeed-velocity/asv + +If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: + + asv continuous -f 1.1 main <my-branch> + +You can replace ``my-branch`` with the name of the branch you are working on. +The output will include "BENCHMARKS NOT SIGNIFICANTLY CHANGED" if the +benchmarks did not change by more than 10%. + +The command uses ``conda`` by default for creating the benchmark +environments. If you want to use virtualenv instead, write:: + + asv continuous -f 1.1 -E virtualenv main <my-branch> + +The ``-E virtualenv`` option should be added to all ``asv`` commands +that run benchmarks. The default value is defined in ``asv.conf.json``. + +If you want to only run a specific group of tests from a file, you can do it +using ``.`` as a separator. 
For example:: + + asv continuous -f 1.1 main HEAD -b benchmarks.Generator.time_batch_preload + +will only run the ``Generator.time_batch_preload`` benchmark defined in +``benchmarks.py``. + +Information on how to write a benchmark and how to use asv can be found in the +`asv documentation <https://asv.readthedocs.io/en/latest/writing_benchmarks.html>`_. diff --git a/doc/index.rst b/doc/index.rst index baaab0b..d52ff69 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -64,3 +64,4 @@ or via a built-in `Xarray accessor