Skip to content

Commit

Permalink
2023-08-08 nightly release (3f98fb9)
Browse files Browse the repository at this point in the history
  • Loading branch information
pytorchbot committed Aug 8, 2023
1 parent 618aad9 commit c9178fd
Show file tree
Hide file tree
Showing 30 changed files with 627 additions and 86 deletions.
49 changes: 42 additions & 7 deletions docs/source/_templates/autosummary/bundle_class.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

.. autoclass:: {{ fullname }}()

{%- if name in ["RNNTBundle.FeatureExtractor", "RNNTBundle.TokenProcessor"] %}
{%- set support_classes = [] %}
{%- if name in ["RNNTBundle.FeatureExtractor", "RNNTBundle.TokenProcessor", "Wav2Vec2FABundle.Tokenizer"] %}
{%- set methods = ["__call__"] %}
{%- elif name == "Wav2Vec2FABundle.Aligner" %}
{%- set attributes = [] %}
{%- set methods = ["__call__"] %}
{%- set support_classes = ["Token"] %}
{%- elif name == "Tacotron2TTSBundle.TextProcessor" %}
{%- set attributes = ["tokens"] %}
{%- set methods = ["__call__"] %}
Expand All @@ -21,12 +26,17 @@
{%- set methods = ["__call__"] %}
{% endif %}

..
ATTRIBUTES
{%- if attributes %}

Properties
----------

{%- endif %}

{%- for item in attributes %}
{%- if not item.startswith('_') %}

{{ item | underline("-") }}
{{ item | underline("~") }}

.. container:: py attribute

Expand All @@ -35,17 +45,42 @@
{%- endif %}
{%- endfor %}

..
METHODS
{%- if methods %}

Methods
-------

{%- endif %}

{%- for item in methods %}
{%- if item != "__init__" %}

{{item | underline("-") }}
{{item | underline("~") }}

.. container:: py attribute

.. automethod:: {{[fullname, item] | join('.')}}

{%- endif %}
{%- endfor %}

{%- if support_classes %}

Support Structures
------------------

{%- endif %}

{%- for item in support_classes %}

{% set components = item.split('.') %}

{{ components[-1] | underline("~") }}

.. container:: py attribute

.. autoclass:: {{[fullname, item] | join('.')}}
:members:


{%- endfor %}
9 changes: 9 additions & 0 deletions docs/source/functional.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,16 @@ Utility
preemphasis
deemphasis
speed

Forced Alignment
----------------
.. autosummary::
:toctree: generated
:nosignatures:

forced_align
merge_tokens
TokenSpan


Filtering
Expand Down
32 changes: 32 additions & 0 deletions docs/source/pipelines.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,38 @@ Pretrained Models
HUBERT_ASR_LARGE
HUBERT_ASR_XLARGE

wav2vec 2.0 / HuBERT - Forced Alignment
---------------------------------------

Interface
~~~~~~~~~

``Wav2Vec2FABundle`` bundles a pre-trained model and its associated dictionary. Additionally, it supports appending a ``star`` token dimension.

.. image:: https://download.pytorch.org/torchaudio/doc-assets/pipelines-wav2vec2fabundle.png

.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/bundle_class.rst

Wav2Vec2FABundle
Wav2Vec2FABundle.Tokenizer
Wav2Vec2FABundle.Aligner

.. rubric:: Tutorials using ``Wav2Vec2FABundle``

.. minigallery:: torchaudio.pipelines.Wav2Vec2FABundle

Pretrained Models
~~~~~~~~~~~~~~~~~

.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/bundle_data.rst

MMS_FA

.. _Tacotron2:

Expand Down
9 changes: 9 additions & 0 deletions docs/source/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -570,3 +570,12 @@ @incollection{45611
URL = {https://arxiv.org/abs/1609.09430},
booktitle = {International Conference on Acoustics, Speech and Signal Processing (ICASSP)}
}

@misc{pratap2023scaling,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
year={2023},
eprint={2305.13516},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
62 changes: 62 additions & 0 deletions test/torchaudio_unittest/functional/functional_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,68 @@ def test_forced_align_fail(self, targets_dtype):
with self.assertRaisesRegex(RuntimeError, r"blank must be within \[0, num classes\)"):
hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)

def _assert_tokens(self, first, second):
    """Assert that two sequences of token spans match element by element.

    Args:
        first: Sequence of objects exposing ``token``, ``score``, ``start``
            and ``end`` attributes.
        second: Sequence of the same kind of objects to compare against.

    Raises:
        AssertionError: (via ``self.assertEqual``) if the sequences differ in
            length or any pair of spans differs in one of the four attributes.
    """
    # Use the test-case assertion rather than a bare ``assert`` so the check
    # survives ``python -O`` and reports both lengths on failure.
    self.assertEqual(len(first), len(second))

    for f, s in zip(first, second):
        self.assertEqual(f.token, s.token)
        self.assertEqual(f.score, s.score)
        self.assertEqual(f.start, s.start)
        self.assertEqual(f.end, s.end)

# Each case is (expected spans, token ids, per-position scores).
# ``F.TokenSpan`` is built positionally as (token, start, end, score) and
# token id 0 is passed to ``merge_tokens`` as the blank.
@parameterized.expand(
    [
        ([], [], []),
        ([F.TokenSpan(1, 0, 1, 1.0)], [1], [1.0]),
        ([F.TokenSpan(1, 0, 2, 0.5)], [1, 1], [0.4, 0.6]),
        ([F.TokenSpan(1, 0, 3, 0.6)], [1, 1, 1], [0.5, 0.6, 0.7]),
        ([F.TokenSpan(1, 0, 1, 0.8), F.TokenSpan(2, 1, 2, 0.9)], [1, 2], [0.8, 0.9]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(2, 1, 3, 0.5)], [1, 2, 2], [1.0, 0.4, 0.6]),
        ([F.TokenSpan(1, 0, 1, 0.8), F.TokenSpan(1, 2, 3, 1.0)], [1, 0, 1], [0.8, 0.9, 1.0]),
        ([F.TokenSpan(1, 0, 1, 0.8), F.TokenSpan(2, 2, 3, 1.0)], [1, 0, 2], [0.8, 0.9, 1.0]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(1, 2, 4, 0.5)], [1, 0, 1, 1], [1.0, 0.1, 0.4, 0.6]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(2, 2, 4, 0.5)], [1, 0, 2, 2], [1.0, 0.1, 0.4, 0.6]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(1, 3, 4, 0.4)], [1, 0, 0, 1], [1.0, 0.9, 0.7, 0.4]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(2, 3, 4, 0.4)], [1, 0, 0, 2], [1.0, 0.9, 0.7, 0.4]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(1, 3, 5, 0.5)], [1, 0, 0, 1, 1], [1.0, 0.9, 0.8, 0.6, 0.4]),
        ([F.TokenSpan(1, 0, 1, 1.0), F.TokenSpan(2, 3, 5, 0.5)], [1, 0, 0, 2, 2], [1.0, 0.9, 0.8, 0.6, 0.4]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(2, 2, 3, 0.5)], [1, 1, 2], [1.0, 0.8, 0.5]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(1, 3, 4, 0.7)], [1, 1, 0, 1], [1.0, 0.8, 0.1, 0.7]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(2, 3, 4, 0.7)], [1, 1, 0, 2], [1.0, 0.8, 0.1, 0.7]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(1, 3, 5, 0.4)], [1, 1, 0, 1, 1], [1.0, 0.8, 0.1, 0.5, 0.3]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(2, 3, 5, 0.4)], [1, 1, 0, 2, 2], [1.0, 0.8, 0.1, 0.5, 0.3]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(1, 4, 5, 0.3)], [1, 1, 0, 0, 1], [1.0, 0.8, 0.1, 0.5, 0.3]),
        ([F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(2, 4, 5, 0.3)], [1, 1, 0, 0, 2], [1.0, 0.8, 0.1, 0.5, 0.3]),
        (
            [F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(1, 4, 6, 0.2)],
            [1, 1, 0, 0, 1, 1],
            [1.0, 0.8, 0.6, 0.5, 0.3, 0.1],
        ),
        (
            [F.TokenSpan(1, 0, 2, 0.9), F.TokenSpan(2, 4, 6, 0.2)],
            [1, 1, 0, 0, 2, 2],
            [1.0, 0.8, 0.6, 0.5, 0.3, 0.1],
        ),
    ]
)
def test_merge_repeated_tokens(self, expected, tokens, scores):
    """Check ``F.merge_tokens`` against hand-written expected spans.

    The case is first run as given, then re-run with 0, 1 or 2 blank tokens
    added before and after the sequence. Padding must only shift the
    ``start``/``end`` of each expected span by the prefix length.

    Args:
        expected: List of ``F.TokenSpan`` the merge should produce.
        tokens: List of integer token ids fed to ``merge_tokens``.
        scores: List of float scores, one per entry of ``tokens``.
    """
    scores_ = torch.tensor(scores, dtype=torch.float32, device=self.device)
    tokens_ = torch.tensor(tokens, dtype=torch.int64, device=self.device)
    spans = F.merge_tokens(tokens_, scores_, blank=0)
    self._assert_tokens(spans, expected)

    # Append blanks at the beginning and at the end.
    for num_prefix, num_suffix in itertools.product([0, 1, 2], repeat=2):
        padded_tokens = ([0] * num_prefix) + tokens + ([0] * num_suffix)
        padded_scores = ([0.1] * num_prefix) + scores + ([0.1] * num_suffix)
        tokens_ = torch.tensor(padded_tokens, dtype=torch.int64, device=self.device)
        scores_ = torch.tensor(padded_scores, dtype=torch.float32, device=self.device)
        # Leading blanks move every span right by ``num_prefix``; trailing
        # blanks leave the expected spans unchanged.
        expected_ = [F.TokenSpan(s.token, s.start + num_prefix, s.end + num_prefix, s.score) for s in expected]
        spans = F.merge_tokens(tokens_, scores_, blank=0)
        self._assert_tokens(spans, expected_)


class FunctionalCPUOnly(TestBaseMixin):
def test_melscale_fbanks_no_warning_high_n_freq(self):
Expand Down
5 changes: 4 additions & 1 deletion torchaudio/_internal/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from torch.hub import download_url_to_file, load_state_dict_from_url
try:
from .fb import download_url_to_file, load_state_dict_from_url
except ImportError:
from torch.hub import download_url_to_file, load_state_dict_from_url


__all__ = [
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/cmuarctic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar

URL = "aew"
Expand Down
3 changes: 2 additions & 1 deletion torchaudio/datasets/cmudict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from pathlib import Path
from typing import Iterable, List, Tuple, Union

from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file


_CHECKSUMS = {
"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4", # noqa: E501
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/dr_vctk.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip


Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/gtzan.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar

# The following lists prefixed with `filtered_` provide a filtered split
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/librilight_limited.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.librispeech import _get_librispeech_metadata
from torchaudio.datasets.utils import _extract_tar

Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from typing import Tuple, Union

from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar, _load_waveform

URL = "train-clean-100"
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/librispeech_biasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from typing import List, Tuple, Union

from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar, _load_waveform

URL = "train-clean-100"
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/libritts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar

URL = "train-clean-100"
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/ljspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar


Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/musdb_hq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import torch
import torchaudio
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip

_URL = "https://zenodo.org/record/3338373/files/musdb18hq.zip"
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/quesst14.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from typing import Optional, Tuple, Union

import torch
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar, _load_waveform


Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/speechcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from typing import Optional, Tuple, Union

from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar, _load_waveform

FOLDER_IN_ARCHIVE = "SpeechCommands"
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/tedlium.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar


Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/vctk.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip

URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/voxceleb1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from typing import List, Tuple, Union

from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip, _load_waveform


Expand Down
2 changes: 1 addition & 1 deletion torchaudio/datasets/yesno.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar


Expand Down
Loading

0 comments on commit c9178fd

Please sign in to comment.