Polish4 temp #11

Open

wants to merge 257 commits into base: polish3

257 commits
1cb35cc
fixes
djstrong Feb 6, 2024
55f274b
fix: change the cbd_mc to be CATEGORIES-based
kacpermilan Feb 6, 2024
35af374
fix: typo in cbd_mc.yaml
kacpermilan Feb 6, 2024
9540f16
fix: typo in cbd_mc.yaml
kacpermilan Feb 6, 2024
d3d7d01
update polish groups
djstrong Feb 27, 2024
c4679ce
fix regex tasks; add benchmark groups
djstrong Mar 1, 2024
7fc327d
fix stderr aggregation
djstrong Mar 5, 2024
e14e593
add perplexity task
djstrong Mar 10, 2024
bc61568
belebele mc
djstrong Mar 10, 2024
85eb77f
Update task_guide.md (#1316)
djstrong Jan 18, 2024
0632a05
Update polemo2_in.yaml (#1318)
lintangsutawika Jan 19, 2024
bb879de
don't pass extra kwargs to mamba any more (#1328)
haileyschoelkopf Jan 22, 2024
6414edd
Fix Issue regarding stderr (#1327)
lintangsutawika Jan 22, 2024
66783f6
Add `local-completions` support using OpenAI interface (#1277)
mgoin Jan 22, 2024
f0ba560
fallback to classname when LM doesnt have config (#1334)
nairbv Jan 22, 2024
9dd448b
fix a trailing whitespace that breaks a lint job (#1335)
nairbv Jan 22, 2024
4f263af
skip "benchmarks" in changed_tasks (#1336)
baberabb Jan 23, 2024
0d8d549
Update migrated HF dataset paths (#1332)
haileyschoelkopf Jan 23, 2024
268d252
Don't use `get_task_dict()` in task registration / initialization (#1…
haileyschoelkopf Jan 23, 2024
82e319d
manage default (greedy) gen_kwargs in vllm (#1341)
baberabb Jan 23, 2024
0938c13
modified default gen_kwargs to work better with CLI; changed prompt_l…
baberabb Jan 24, 2024
97361ed
update links to task_guide.md (#1348)
haileyschoelkopf Jan 24, 2024
ca3a895
`Filter` docs not offset by `doc_id` (#1349)
baberabb Jan 25, 2024
2eeaf15
Add FAQ on `lm_eval.tasks.initialize_tasks()` to README (#1330)
haileyschoelkopf Jan 25, 2024
d467d2f
Refix issue regarding stderr (#1357)
thnkinbtfly Jan 26, 2024
f41ac12
Add causalLM OpenVino models (#1290)
NoushNabi Jan 26, 2024
154f5fa
Apply some best practices and guideline recommendations to code (#1363)
LSinev Jan 28, 2024
b43d9d9
serialize callable functions in config (#1367)
baberabb Jan 29, 2024
2b31cfb
delay filter init; remove `*args` (#1369)
baberabb Jan 30, 2024
cdc41c4
Fix unintuitive `--gen_kwargs` behavior (#1329)
haileyschoelkopf Jan 31, 2024
b39e8da
Publish to pypi (#1194)
anjor Jan 31, 2024
0a39c84
Make dependencies compatible with PyPI (#1378)
haileyschoelkopf Jan 31, 2024
b7513d3
Add support for RWKV models with World tokenizer (#1374)
PicoCreator Jan 31, 2024
7d068d2
add bypass metric (#1156)
baberabb Jan 31, 2024
b284735
Expand docs, update CITATION.bib (#1227)
haileyschoelkopf Feb 1, 2024
80c158c
Hf: minor egde cases (#1380)
baberabb Feb 1, 2024
d55e918
Enable override of printed `n-shot` in table (#1379)
haileyschoelkopf Feb 1, 2024
d6b65f1
Faster Task and Group Loading, Allow Recursive Groups (#1321)
lintangsutawika Feb 1, 2024
5810eac
Fix for https://github.com/EleutherAI/lm-evaluation-harness/issues/13…
pminervini Feb 2, 2024
bad70e7
fix on --task list (#1387)
lintangsutawika Feb 2, 2024
09ca8ff
Support for Inf2 optimum class [WIP] (#1364)
michaelfeil Feb 5, 2024
590bcc7
Update README.md (#1398)
mycoalchen Feb 6, 2024
4ed48ca
Fix confusing `write_out.py` instructions in README (#1371)
haileyschoelkopf Feb 6, 2024
77b79a0
Use Pooled rather than Combined Variance for calculating stderr of ta…
haileyschoelkopf Feb 6, 2024
ca8c608
adding hf_transfer (#1400)
michaelfeil Feb 6, 2024
79378a8
`batch_size` with `auto` defaults to 1 if `No executable batch size f…
pminervini Feb 7, 2024
a04bf2b
use reversed task hierarchy for print (#1414)
haileyschoelkopf Feb 9, 2024
f2220c7
Fixes https://github.com/EleutherAI/lm-evaluation-harness/issues/1416…
pminervini Feb 10, 2024
5b0db7a
Fix watchdog timeout (#1404)
JeevanBhoot Feb 10, 2024
8d82b49
Evaluate (#1385)
baberabb Feb 11, 2024
80e0a4f
Add multilingual ARC task (#1419)
uanu2002 Feb 11, 2024
5c1b249
Add multilingual TruthfulQA task (#1420)
uanu2002 Feb 11, 2024
66e9620
[m_mmul] added multilingual evaluation from alexandrainst/m_mmlu (#1358)
giux78 Feb 12, 2024
c2c361c
Added seeds to `evaluator.simple_evaluate` signature (#1412)
Am1n3e Feb 12, 2024
af3ca77
Fix: task weighting by subtask size ; update Pooled Stderr formula sl…
haileyschoelkopf Feb 13, 2024
205c870
Refactor utilities into a separate model utils file. (#1429)
baberabb Feb 14, 2024
71bbba4
Update README.md (#1430)
davidbhoffmann Feb 15, 2024
d027702
improve hf_hub activation (#1438)
michaelfeil Feb 18, 2024
8315c1f
Correct typo in task name (#1443)
larekrow Feb 19, 2024
ba89cd6
update bbh, gsm8k, mmlu parsing logic and prompts (Orca2 bbh_cot_zero…
thnkinbtfly Feb 19, 2024
f3e993d
Add a new task HaeRae-Bench (#1445)
h-albert-lee Feb 20, 2024
44254b3
Group reqs by context (#1425)
baberabb Feb 20, 2024
c51d0ce
Add a new task GPQA (the part without CoT) (#1434)
uanu2002 Feb 20, 2024
7dc04ed
Added KMMLU evaluation method and changed ReadMe (#1447)
h-albert-lee Feb 21, 2024
fbd9bf6
Add TemplateLM boilerplate LM class (#1279)
anjor Feb 22, 2024
0d1af67
Log which subtasks were called with which groups (#1456)
haileyschoelkopf Feb 22, 2024
b8bee2c
PR fixing the issue #1391 (wrong contexts in the mgsm task) (#1440)
leocnj Feb 22, 2024
cf1577a
feat: Add Weights and Biases support (#1339)
ayulockin Feb 22, 2024
dd5bee9
Fixed generation args issue affection OpenAI completion model (#1458)
Am1n3e Feb 22, 2024
be5a419
update parsing logic of mgsm following gsm8k (#1462)
thnkinbtfly Feb 23, 2024
4024ebb
Adding documentation for Weights and Biases CLI interface (#1466)
veekaybee Feb 23, 2024
8a4827a
Add environment and transformers version logging in results dump (#1464)
LSinev Feb 24, 2024
72d40c9
Apply code autoformatting with Ruff to tasks/*.py an *__init__.py (#1…
LSinev Feb 26, 2024
053cf56
setting trust_remote_code (#1467)
veekaybee Feb 26, 2024
e112b37
add arabic mmlu (#1402)
khalil-Hennara Feb 26, 2024
420556e
Add Gemma support (Add flag to control BOS token usage) (#1465)
haileyschoelkopf Feb 26, 2024
06a4347
Revert "setting trust_remote_code (#1467)" (#1474)
haileyschoelkopf Feb 26, 2024
af2d9f6
Create a means for caching task registration and request building. Ad…
inf3rnus Feb 26, 2024
9600d59
Cont metrics (#1475)
lintangsutawika Feb 26, 2024
7fe8dcb
Refactor `evaluater.evaluate` (#1441)
baberabb Feb 27, 2024
77ffeef
add multilingual mmlu eval (#1484)
jordane95 Feb 27, 2024
6093c0c
update name of val split in truthfulqa multilingual (#1488)
haileyschoelkopf Feb 27, 2024
814f36e
Fix AttributeError in huggingface.py When 'model_type' is Missing (#1…
richwardle Feb 27, 2024
c463825
fix duplicated kwargs in some model init (#1495)
lchu-ibm Feb 28, 2024
47d0899
Add multilingual truthfulqa targets (#1499)
jordane95 Mar 1, 2024
0413dee
always include EOS token in stopsequences if possible (#1480)
haileyschoelkopf Mar 1, 2024
d579c8b
Improve data-parallel request partitioning for VLLM (#1477)
haileyschoelkopf Mar 1, 2024
8146103
modify `WandbLogger` to accept arbitrary kwargs (#1491)
baberabb Mar 1, 2024
30141ce
Vllm update DP+TP (#1508)
baberabb Mar 3, 2024
706e10b
Setting trust_remote_code to True for HuggingFace datasets compatibil…
veekaybee Mar 3, 2024
40b0917
Cleaning up unused unit tests (#1516)
veekaybee Mar 4, 2024
4f19431
French Bench (#1500)
ManuelFay Mar 4, 2024
512de72
Hotfix: fix TypeError in `--trust_remote_code` (#1517)
haileyschoelkopf Mar 4, 2024
b915040
Fix minor edge cases (#951 #1503) (#1520)
haileyschoelkopf Mar 4, 2024
2c652b5
Openllm benchmark (#1526)
baberabb Mar 5, 2024
175bc29
Add a new task GPQA (the part CoT and generative) (#1482)
uanu2002 Mar 5, 2024
5c8105c
Add EQ-Bench as per #1459 (#1511)
pbevan1 Mar 6, 2024
44f9421
Add WMDP Multiple-choice (#1534)
justinphan3110 Mar 6, 2024
c9f39fa
Adding new task : KorMedMCQA (#1530)
sean0042 Mar 6, 2024
7aedaf9
Update docs on LM.loglikelihood_rolling abstract method (#1532)
haileyschoelkopf Mar 6, 2024
8c1c093
update printed num-fewshot ; prevent fewshots from erroneously being …
haileyschoelkopf Mar 6, 2024
f238713
Cleanup and fixes (Task, Instance, and a little bit of *evaluate) (#1…
LSinev Mar 6, 2024
3b419af
Update installation commands in openai_completions.py and contributin…
naem1023 Mar 6, 2024
6997af7
Add compatibility for vLLM's new Logprob object (#1549)
Yard1 Mar 9, 2024
74d9a95
Fix incorrect `max_gen_toks` generation kwarg default in code2_text. …
cosmo3769 Mar 9, 2024
8d5e277
Support jinja templating for task descriptions (#1553)
HishamAlyahya Mar 10, 2024
7ffd0d1
Update generate_until_template_yaml (#1546)
haileyschoelkopf Mar 11, 2024
58cda52
Update ifeval.yaml (#1506)
haileyschoelkopf Mar 11, 2024
1858b54
add Arabic EXAMS benchmark (#1498)
khalil-Hennara Mar 11, 2024
5298fc0
AGIEval (#1359)
haileyschoelkopf Mar 11, 2024
94f7159
cli_evaluate calls simple_evaluate with the same verbosity. (#1563)
Wongboo Mar 12, 2024
ee0e166
add manual tqdm disabling management (#1569)
artemorloff Mar 13, 2024
28e568d
Fix README section on vllm integration (#1579)
eitanturok Mar 15, 2024
df6ee7a
Fix Jinja template for Advanced AI Risk (#1587)
RylanSchaeffer Mar 15, 2024
c6edcdb
Proposed approach for testing CLI arg parsing (#1566)
veekaybee Mar 17, 2024
0dc609d
Patch for Seq2Seq Model predictions (#1584)
lintangsutawika Mar 17, 2024
baa917f
Add start date in results.json (#1592)
djstrong Mar 17, 2024
53c11f7
Cleanup for v0.4.2 release (#1573)
haileyschoelkopf Mar 18, 2024
6e52d16
Fix eval_logger import for mmlu/_generate_configs.py (#1593)
noufmitla Mar 18, 2024
8cd155f
use BOS token in loglikelihood (#1588)
djstrong Mar 18, 2024
1ea55eb
Revert "Patch for Seq2Seq Model predictions (#1584)" (#1601)
haileyschoelkopf Mar 19, 2024
5a304c9
fix gen_kwargs arg reading (#1607)
artemorloff Mar 19, 2024
39a0b3a
fix until arg processing (#1608)
artemorloff Mar 19, 2024
a513931
Fixes to Loglikelihood prefix token / VLLM (#1611)
haileyschoelkopf Mar 20, 2024
7d8eeba
Add ACLUE task (#1614)
haonan-li Mar 21, 2024
45ed815
OpenAI Completions -- fix passing of unexpected 'until' arg (#1612)
haileyschoelkopf Mar 21, 2024
9064d35
add logging of model args (#1619)
baberabb Mar 22, 2024
7c7e4fd
Add vLLM FAQs to README (#1625) (#1633)
haileyschoelkopf Mar 25, 2024
f970123
peft Version Assertion (#1635)
LameloBally Mar 25, 2024
048c0d3
Seq2seq fix (#1604)
lintangsutawika Mar 25, 2024
9f50796
Integration of NeMo models into LM Evaluation Harness library (#1598)
sergiopperez Mar 26, 2024
f0b04a0
Fix conditional import for Nemo LM class (#1641)
haileyschoelkopf Mar 27, 2024
fa2acde
Fix SuperGlue's ReCoRD task following regression in v0.4 refactoring …
orsharir Mar 28, 2024
b948d14
Add Latxa paper evaluation tasks for Basque (#1654)
juletx Apr 1, 2024
da93b8a
Fix CLI --batch_size arg for openai-completions/local-completions (#1…
mgoin Apr 1, 2024
cf10ee7
Patch QQP prompt (#1661)
haileyschoelkopf Apr 4, 2024
76a7c23
TMMLU+ implementation (#1394)
ZoneTwelve Apr 5, 2024
6786e82
Anthropic Chat API (#1594)
tryumanshow Apr 5, 2024
98693bf
correction bug EleutherAI#1664 (#1670)
nicho2 Apr 7, 2024
c374e6f
Update README.md (#1680)
haileyschoelkopf Apr 8, 2024
8518800
Add delta weights model loading (#1712)
KonradSzafer Apr 16, 2024
8103925
Add `neuralmagic` models for `sparseml` and `deepsparse` (#1674)
mgoin Apr 16, 2024
a56bf85
fix error when appending eot_token_id for generate_until tasks (#1699)
sergiopperez Apr 18, 2024
a09b018
Adding retries and rate limit to toxicity tasks (#1620)
sator-labs Apr 18, 2024
6687de7
reference `--tasks list` in README (#1726)
nairbv Apr 25, 2024
fe92e5a
Add XNLIeu: a dataset for cross-lingual NLI in Basque (#1694)
juletx Apr 25, 2024
d69d54d
Fix Parameter Propagation for Tasks that have `include` (#1749)
lintangsutawika Apr 25, 2024
f38e8a1
Support individual scrolls datasets (#1740)
giorgossideris Apr 26, 2024
7cd59dd
Add filter registry decorator (#1750)
lozhn Apr 26, 2024
dabce43
remove duplicated `num_fewshot: 0` (#1769)
chujiezheng May 1, 2024
f4281a4
Pile 10k new task (#1758)
mukobi May 1, 2024
c51925d
Fix m_arc choices (#1760)
jordane95 May 1, 2024
e2bc623
upload new tasks (#1728)
simran-arora May 1, 2024
df05e78
vllm lora support (#1756)
bcicc May 2, 2024
af14500
Add option to set OpenVINO config (#1730)
helena-intel May 2, 2024
ba53c71
evaluation tracker implementation (#1766)
KonradSzafer May 3, 2024
da3067f
eval tracker args fix (#1777)
KonradSzafer May 3, 2024
ffc6594
limit fix (#1785)
KonradSzafer May 5, 2024
d261c2f
remove echo parameter in OpenAI completions API (#1779)
djstrong May 5, 2024
29812e7
Fix README: change`----hf_hub_log_args` to `--hf_hub_log_args` (#1776)
MuhammadBinUsman03 May 5, 2024
45c5f41
Fix bug in setting until kwarg in openai completions (#1784)
ciaranby May 5, 2024
615b2dd
Provide ability for custom sampler for ConfigurableTask (#1616)
LSinev May 6, 2024
59c553a
Update `--tasks list` option in interface documentation (#1792)
sepiatone May 6, 2024
4e63a32
Fix Caching Tests ; Remove `pretrained=gpt2` default (#1775)
haileyschoelkopf May 7, 2024
ea773e4
link to the example output on the hub (#1798)
KonradSzafer May 7, 2024
aa4e118
Re-add Hendrycks MATH (no sympy checking, no Minerva hardcoded prompt…
haileyschoelkopf May 7, 2024
b3e8661
Logging Updates (Alphabetize table printouts, fix eval tracker bug) (…
haileyschoelkopf May 7, 2024
c864ea2
Initial integration of the Unitxt to LM eval harness (#1615)
yoavkatz May 7, 2024
bba2bf6
add task for mmlu evaluation in arc multiple choice format (#1745)
jonabur May 8, 2024
a137c3e
Update flag `--hf_hub_log_args` in interface documentation (#1806)
sepiatone May 8, 2024
cd0b2ba
Copal task (#1803)
Erland366 May 9, 2024
6bcb05e
Adding tinyBenchmarks datasets (#1545)
LucWeber May 13, 2024
e888fb6
interface doc update (#1807)
KonradSzafer May 13, 2024
ab46906
Fix links in README guiding to another branch (#1838)
LSinev May 14, 2024
5759d86
Fix: support PEFT/LoRA with added tokens (#1828)
mapmeld May 19, 2024
b542fd9
fixed incorrect check for task type (replace `~` with `not`) (#1865)
zafstojano May 21, 2024
d02eb34
fixed docs typos (#1863)
zafstojano May 21, 2024
21f36dd
Unpin vllm in dependencies (#1874)
edgan8 May 23, 2024
5ca629a
Fix outdated links to the latest links in `docs` (#1876)
oneonlee May 24, 2024
2b93289
[HFLM]Use Accelerate's API to reduce hard-coded CUDA code (#1880)
statelesshz May 24, 2024
e6223c0
Fix `batch_size=auto` for HF Seq2Seq models (#1765) (#1790)
haileyschoelkopf May 24, 2024
8329adb
Fix Brier Score (#1847)
lintangsutawika May 24, 2024
e3ec75f
Fix for bootstrap_iters = 0 case (#1715) (#1789)
haileyschoelkopf May 24, 2024
ee44bf2
add mmlu tasks from pile-t5 (#1710)
lintangsutawika May 24, 2024
83f9d66
Bigbench fix (#1686)
lintangsutawika May 24, 2024
fe6fb1a
Rename `lm_eval.logging -> lm_eval.loggers` (#1858)
haileyschoelkopf May 26, 2024
b69aecc
Updated vllm imports in vllm_causallms.py (#1890)
mgoin May 28, 2024
d177975
[HFLM]Add support for Ascend NPU (#1886)
statelesshz May 30, 2024
bbc1216
`higher_is_better` tickers in output table (#1893)
zafstojano May 30, 2024
ebc3807
Add dataset card when pushing to HF hub (#1898)
KonradSzafer May 31, 2024
105b516
Making hardcoded few shots compatible with the chat template mechanis…
clefourrier May 31, 2024
acc4029
Try to make existing tests run little bit faster (#1905)
LSinev May 31, 2024
e53f271
Fix fewshot seed only set when overriding num_fewshot (#1914)
LSinev Jun 3, 2024
85550b3
Complete task list from pr 1727 (#1901)
anthony-dipofi Jun 3, 2024
0f995d9
Add chat template (#1873)
KonradSzafer Jun 3, 2024
aceb0ce
Multiple Choice Questions and Large Languages Models: A Case Study wi…
maximegmd Jun 5, 2024
55c36de
Modify pre-commit hook to check merge conflicts accidentally committe…
LSinev Jun 5, 2024
2d1ffb9
[add] fld logical formula task (#1931)
MorishT Jun 6, 2024
c63d56a
Add new Lambada translations (#1897)
zafstojano Jun 6, 2024
17fcd25
Implement NoticIA (#1912)
ikergarcia1996 Jun 6, 2024
58264ac
Add The Arabic version of the PICA benchmark (#1917)
khalil-Hennara Jun 7, 2024
66e2c9d
Update siqa.yaml (#1909)
haileyschoelkopf Jun 7, 2024
1865671
Update basque-glue (#1913)
zhabuye Jun 7, 2024
eaf6696
Test output table layout consistency (#1916)
zafstojano Jun 7, 2024
a0c1aeb
polqa
djstrong Mar 22, 2024
4320c18
update polish benchmarks
chrisociepa Jan 17, 2024
ff41506
update polish benchmarks
djstrong Mar 25, 2024
15950dd
Add task definitions: 8tags, dyk, ppc, psc, belebele PL (regex), pole…
chrisociepa Jan 17, 2024
a107ca9
task definitions fixes
djstrong Jan 18, 2024
6b8e7b3
Polish benchmark
djstrong Jan 18, 2024
8568c6e
fix regex tasks; add benchmark groups
djstrong Jan 22, 2024
ca605fd
feat: add polish CBD and KLEJ NER benchmarks
kacpermilan Feb 5, 2024
18e618e
fix regex tasks; add benchmark groups
djstrong Mar 1, 2024
76a4f36
update polish benchmarks
chrisociepa Jan 17, 2024
8f0d25c
update polish benchmarks
djstrong Mar 25, 2024
4972634
feat: add the PoQuAD dataset
kacpermilan May 12, 2024
6c4b0a1
fix: tune the open-book prompt
kacpermilan May 13, 2024
d880314
fix psc regex
djstrong Jun 2, 2024
f876552
fix poquad
djstrong Jun 2, 2024
02dd644
polish eq-bench
djstrong Jun 2, 2024
637afd1
polish eq-bench
djstrong Jun 2, 2024
53039c2
polish eq-bench
djstrong Jun 2, 2024
6d5e657
polish eq-bench
djstrong Jun 2, 2024
38e954a
polish eq-bench
djstrong Jun 2, 2024
88e0034
polish eq-bench
djstrong Jun 2, 2024
458cdc7
polish eq-bench
djstrong Jun 2, 2024
fea7b68
polish eq-bench
djstrong Jun 2, 2024
2693979
polish eq-bench
djstrong Jun 2, 2024
776a4d3
polish eq-bench
djstrong Jun 2, 2024
53f1dfc
polish eq-bench
djstrong Jun 2, 2024
87b2160
polish eq-bench
djstrong Jun 2, 2024
8998552
polish eq-bench
djstrong Jun 2, 2024
b6c4ac3
polish eq-bench
djstrong Jun 2, 2024
f9ca054
polish eq-bench
djstrong Jun 2, 2024
29959a3
polish eq-bench
djstrong Jun 2, 2024
aaaac9d
polish eq-bench
djstrong Jun 2, 2024
db195a2
polish eq-bench
djstrong Jun 3, 2024
d96cd84
polish eq-bench
djstrong Jun 3, 2024
2df424f
polish eq-bench
djstrong Jun 3, 2024
badffa9
polish eq-bench
djstrong Jun 4, 2024
c3a1bec
polish eq-bench
djstrong Jun 8, 2024
1ca2260
fgd
djstrong Jun 8, 2024
ee8f8be
fgd
djstrong Jun 8, 2024
5d61f54
generate until <|im_end|>
djstrong Jun 13, 2024
10a79a2
powuad; pes; hash fix
djstrong Aug 2, 2024
8819b64
fix multiple choice openai
djstrong Aug 2, 2024
45f6010
fix multiple choice openai
djstrong Aug 2, 2024
8974fc2
fix multiple choice openai
djstrong Aug 2, 2024
0bea423
fix belebele
djstrong Aug 13, 2024
21d0ea9
polish pes split
djstrong Aug 22, 2024
78 changes: 78 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,78 @@
name: Publish Python distribution to PyPI

on:
  push:
    tags:
      - '*'

jobs:
  build:
    name: Build distribution
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"

      - name: Install pypa/build
        run: >-
          python3 -m
          pip install
          build
          --user
      - name: Build a binary wheel and a source tarball
        run: python3 -m build
      - name: Store the distribution packages
        uses: actions/upload-artifact@v3
        with:
          name: python-package-distributions
          path: dist/

  publish-to-pypi:
    name: >-
      Publish Python distribution to PyPI
    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
    needs:
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/lm_eval
    permissions:
      id-token: write  # IMPORTANT: mandatory for trusted publishing

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

  publish-to-testpypi:
    name: Publish Python distribution to TestPyPI
    needs:
      - build
    runs-on: ubuntu-latest

    environment:
      name: testpypi
      url: https://test.pypi.org/p/lm_eval

    permissions:
      id-token: write  # IMPORTANT: mandatory for trusted publishing

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
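
Note: this workflow runs only on tag pushes. As a minimal sketch of how a release would be cut under that assumption (the tag name below is hypothetical):

```bash
# Pushing a tag triggers the build job and, from it, the PyPI publish job
git tag -a v0.4.2 -m "Release v0.4.2"
git push origin v0.4.2
```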
2 changes: 1 addition & 1 deletion .github/workflows/unit_tests.yml
@@ -56,7 +56,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
          pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
5 changes: 5 additions & 0 deletions .gitignore
@@ -16,3 +16,8 @@ temp
# IPython
profile_default/
ipython_config.py
# don't track (the default location of) the cached requests
lm_eval/caching/.cache
# don't track files created by wandb
wandb
examples/wandb
7 changes: 4 additions & 3 deletions .pre-commit-config.yaml
@@ -2,14 +2,15 @@
exclude: ^tests/testdata/
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    rev: v4.5.0
    hooks:
      - id: check-added-large-files
      - id: check-ast
      - id: check-byte-order-marker
      - id: check-case-conflict
      - id: check-json
      - id: check-merge-conflict
        args: [--assume-in-merge]
      - id: check-symlinks
      - id: check-yaml
        args: ["--unsafe"]
@@ -29,7 +30,7 @@ repos:
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.1.8
    rev: v0.2.2
    hooks:
      # Run the linter.
      - id: ruff
@@ -38,7 +39,7 @@
      # Run the formatter.
      - id: ruff-format
  - repo: https://github.com/codespell-project/codespell
    rev: v2.1.0
    rev: v2.2.6
    hooks:
      - id: codespell
        exclude: >
203 changes: 175 additions & 28 deletions README.md

Large diffs are not rendered by default.

81 changes: 81 additions & 0 deletions docs/CONTRIBUTING.md
@@ -0,0 +1,81 @@
# Contributing to LM Evaluation Harness

Welcome, and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback, appreciate the time you spend with our library, and hope you find it useful!

We intend LM Evaluation Harness to be a broadly useful and extensible tool for evaluating language models.

## Important Resources

Information about LM Evaluation Harness is located in several places:

- Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
- We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
- We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai).

## Code Style

LM Evaluation Harness uses [ruff](https://github.com/astral-sh/ruff) for linting via [pre-commit](https://pre-commit.com/).

You can install linters and dev tools via

```pip install lm_eval[dev]``` or ```pip install -e ".[dev]"```

Then, run

```pre-commit install```

in order to ensure linters and other checks will be run upon committing.
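
To run the same checks manually across the whole repository (a standard pre-commit invocation, offered here as an optional extra step), you can use:

```
pre-commit run --all-files
```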

## Testing

We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:

```
python -m pytest --ignore=tests/tests_master --ignore=tests/extra
```
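
To iterate on a single test module during development, a narrower invocation along these lines also works (the file path below is illustrative, not a required entry point):

```
python -m pytest tests/test_evaluator.py -v
```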

## Contributor License Agreement

We ask that new contributors agree to a Contributor License Agreement affirming that EleutherAI has the rights to use your contribution to our library.
First-time pull requests will have a reply added by @CLAassistant containing instructions for how to confirm this, and we require it before merging your PR.


## Contribution Best Practices

We recommend a few best practices to make your contributions or reported errors easier to assist with.

**For Pull Requests:**
- PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution.
- New features should have appropriate documentation added alongside them.
- Aim for code maintainability, and minimize code copying.
- If opening a task, try to share test results on the task using a publicly-available model, and if any public results are available on the task, compare to them.

**For Feature Requests:**
- Provide a short paragraph's worth of description. What is the feature you are requesting? What is its motivation, and what is an example use case? How does it differ from what is currently supported?

**For Bug Reports**:
- Provide a short description of the bug.
- Provide a *reproducible example*--what is the command you run with our library that results in this error? Have you tried any other steps to resolve it?
- Provide a *full error traceback* of the error that occurs, if applicable. A one-line error message or small screenshot snippet is unhelpful without the surrounding context.
- Note what version of the codebase you are using, and any specifics of your environment and setup that may be relevant.

**For Requesting New Tasks**:
- Provide a 1-2 sentence description of what the task is and what it evaluates.
- Provide a link to the paper introducing the task.
- Provide a link to where the dataset can be found.
- Provide a link to a paper containing results on an open-source model on the task, for use in comparisons and implementation validation.
- If applicable, link to any codebase that has implemented the task (especially the original publication's codebase, if existent).

## How Can I Get Involved?

To quickly get started, we maintain a list of good first issues, which can be found [on our project board](https://github.com/orgs/EleutherAI/projects/25/views/8) or by [filtering GH Issues](https://github.com/EleutherAI/lm-evaluation-harness/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3A%22help+wanted%22). These are typically smaller code changes or self-contained features which can be added without extensive familiarity with library internals, and we recommend new contributors consider taking a stab at one of these first if they are feeling uncertain where to begin.

There are a number of distinct ways to contribute to LM Evaluation Harness, and all are extremely helpful! A sampling of ways to contribute includes:
- **Implementing and verifying new evaluation tasks**: Is there a task you'd like to see LM Evaluation Harness support? Consider opening an issue requesting it, or helping add it! Verifying and cross-checking task implementations with their original versions is also a very valuable form of assistance in ensuring standardized evaluation.
- **Improving documentation** - Improvements to the documentation, or notes on pain points and gaps in it, help us improve the library's user experience and the clarity and coverage of our docs.
- **Testing and devops** - We are very grateful for any assistance in adding tests for the library that can be run for new PRs, and other devops workflows.
- **Adding new modeling / inference library integrations** - We hope to support a broad range of commonly-used inference libraries popular among the community, and welcome PRs for new integrations, so long as they are documented properly and maintainable.
- **Proposing or Contributing New Features** - We want LM Evaluation Harness to support a broad range of evaluation usecases. If you have a feature that is not currently supported but desired, feel free to open an issue describing the feature and, if applicable, how you intend to implement it. We would be happy to give feedback on the cleanest way to implement new functionalities and are happy to coordinate with interested contributors via GH discussions or via discord.

We hope that this has been helpful, and appreciate your interest in contributing! Further questions can be directed to [our Discord](https://discord.gg/eleutherai).
8 changes: 4 additions & 4 deletions docs/README.md
@@ -4,7 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!

## Table of Contents

* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md)
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/task_guide.md).
* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](./interface.md)
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
7 changes: 2 additions & 5 deletions docs/decontamination.md
@@ -2,15 +2,14 @@

## Usage

Simply add a "--decontamination_ngrams_path" when running \__main\__.py. The provided directory should contain
The provided directory should contain
the ngram files and info.json produced in "Pile Ngram Generation" further down.

```bash
python -m lm_eval \
--model gpt2 \
--device 0 \
--tasks sciq \
--decontamination_ngrams_path path/containing/training/set/ngrams
--tasks sciq
```

## Background
@@ -70,5 +69,3 @@ python -m scripts/clean_training_data/compress_and_package \
-output path/to/final/directory \
-procs 8
```

Congratulations, the final directory can now be passed to lm-evaluation-harness with the "--decontamination_ngrams_path" argument.