diff --git a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py index 7416f0f8b9a..9d615b4a93a 100644 --- a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py +++ b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py @@ -2,18 +2,16 @@ Forced alignment for multilingual data ====================================== -**Author**: `Xiaohui Zhang `__ - -This tutorial shows how to compute forced alignments for speech data -from multiple non-English languages using ``torchaudio``'s CTC forced alignment -API described in `CTC forced alignment tutorial <./forced_alignment_tutorial.html>`__ -and the multilingual Wav2vec2 model proposed in the paper `Scaling -Speech Technology to 1,000+ -Languages `__. - -The model was trained on 23K of audio data from 1100+ languages using -the `uroman vocabulary `__ -as targets. +**Authors**: `Xiaohui Zhang `__, `Moto Hira `__. + +This tutorial shows how to align transcripts to speech for non-English languages. + +The process of aligning a non-English (normalized) transcript is identical to aligning +an English (normalized) transcript, and the process for English is covered in detail in +`CTC forced alignment tutorial <./ctc_forced_alignment_api_tutorial.html>`__. +In this tutorial, we use TorchAudio's high-level API, +:py:class:`torchaudio.pipelines.Wav2Vec2FABundle`, which packages the pre-trained +model, tokenizer and aligner, to perform the forced alignment with less code. """ import torch @@ -25,114 +23,109 @@ import torchaudio print(torch.__version__) print(torchaudio.__version__) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(device) -from dataclasses import dataclass ###################################################################### -# Preparation -# ----------- # -from typing import Dict, List +from typing import List import IPython import matplotlib.pyplot as plt -from torchaudio.functional import forced_align - ###################################################################### +# Creating the pipeline +# --------------------- # - -SAMPLE_RATE = 16000 - - -###################################################################### +# First, we instantiate the model and pre/post-processing pipelines. # -# Here we define utility functions for computing the frame-level -# alignments (using the API :py:func:`torchaudio.functional.forced_align`), -# token-level and word-level alignments. -# For the detail of these functions please refer to -# `CTC forced alignment API tutorial <./ctc_forced_alignment_api_tutorial.html>`__. +# The following diagram illustrates the process of alignment. # +# .. image:: https://download.pytorch.org/torchaudio/doc-assets/pipelines-wav2vec2fabundle.png +# +# The waveform is passed to an acoustic model, which produces a sequence of +# probability distributions over tokens. +# The transcript is passed to the tokenizer, which converts the transcript into +# a sequence of tokens. +# The aligner takes the results from the acoustic model and the tokenizer and +# generates timestamps for each token. +# +# .. note:: +# +# This process expects that the input transcript is already normalized. +# The process of normalization, which involves romanization of non-English +# languages, is language-dependent, so it is not covered in this tutorial, +# but we will briefly look into it. +# +# The acoustic model and the tokenizer must use the same set of tokens.
+# To facilitate the creation of matching processors, +# :py:class:`~torchaudio.pipelines.Wav2Vec2FABundle` associates a +# pre-trained acoustic model and a tokenizer. +# :py:data:`torchaudio.pipelines.MMS_FA` is one such instance. +# +# The following code instantiates a pre-trained acoustic model, a tokenizer +# which uses the same set of tokens as the model, and an aligner. +# +from torchaudio.pipelines import MMS_FA as bundle model = bundle.get_model() model.to(device) -@dataclass -class TokenSpan: - index: int # index of token in transcript - start: int # start time (inclusive) - end: int # end time (exclusive) - score: float - - def __len__(self) -> int: - return self.end - self.start +tokenizer = bundle.get_tokenizer() +aligner = bundle.get_aligner() ###################################################################### +# .. note:: +# +# The model instantiated by :py:data:`~torchaudio.pipelines.MMS_FA`'s +# :py:meth:`~torchaudio.pipelines.Wav2Vec2FABundle.get_model` +# method by default includes the feature dimension for the ``<star>`` token. +# You can disable this by passing ``with_star=False``. # - - -@dataclass -class WordSpan: - token_spans: List[TokenSpan] - score: float - ###################################################################### +# The acoustic model of :py:data:`~torchaudio.pipelines.MMS_FA` was +# created and open-sourced as part of the research project, +# `Scaling Speech Technology to 1,000+ Languages +# `__. +# It was trained with 23,000 hours of audio from 1100+ languages. # -def align_emission_and_tokens(emission: torch.Tensor, tokens: List[int]): - device = emission.device - targets = torch.tensor([tokens], dtype=torch.int32, device=device) - input_lengths = torch.tensor([emission.size(1)], device=device) - target_lengths = torch.tensor([targets.size(1)], device=device) +# The tokenizer simply maps the normalized characters to integers. +# You can check the mapping as follows: - aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths, 0) print(bundle.get_dict()) - scores = scores.exp() # convert back to probability - aligned_tokens, scores = aligned_tokens[0], scores[0] # remove batch dimension - return aligned_tokens, scores - - -def merge_tokens(tokens, scores, blank=0) -> List[TokenSpan]: - prev_token = blank - i = start = -1 - spans = [] - for t, token in enumerate(tokens): - if token != prev_token: - if prev_token != blank: - spans.append(TokenSpan(i, start, t, scores[start:t].mean().item())) - if token != blank: - i += 1 - start = t - prev_token = token - if prev_token != blank: - spans.append(TokenSpan(i, start, len(tokens), scores[start:].mean().item())) - return spans ###################################################################### +# +# The aligner internally uses :py:func:`torchaudio.functional.forced_align` +# and :py:func:`torchaudio.functional.merge_tokens` to infer the +# timestamps of the input tokens. +# +# The details of the underlying mechanism are covered in +# `CTC forced alignment API tutorial <./ctc_forced_alignment_api_tutorial.html>`__, +# so please refer to it. -def merge_words(token_spans: List[TokenSpan], transcript: List[str]) -> List[WordSpan]: - def _score(t_spans): - return sum(s.score * len(s) for s in t_spans) / sum(len(s) for s in t_spans) ###################################################################### +# We define a utility function that performs the forced alignment with +# the above model, the tokenizer and the aligner.
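+#
+# .. note::
+#
+#    For reference, the core of what the aligner does inside this utility function
+#    can be sketched roughly as follows. This is a simplified, hypothetical
+#    illustration (the function name is made up, and details such as batching and
+#    the grouping of spans per word are omitted); in practice, simply use the
+#    aligner returned by ``bundle.get_aligner()``.
+#
+#    .. code-block:: python
+#
+#       from torchaudio.functional import forced_align, merge_tokens
+#
+#       def sketch_align(emission, token_ids):
+#           # emission: (1, num_frames, num_tokens) output of the acoustic model
+#           targets = torch.tensor([token_ids], dtype=torch.int32, device=emission.device)
+#           input_lengths = torch.tensor([emission.size(1)], device=emission.device)
+#           target_lengths = torch.tensor([targets.size(1)], device=emission.device)
+#           # frame-level alignment and per-frame scores
+#           aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths, 0)
+#           # forced_align returns log-probability scores; exponentiate them and merge
+#           # repeated frames into token-level spans
+#           return merge_tokens(aligned_tokens[0], scores[0].exp())
+#
+#    The resulting spans carry the same kind of information (token, start, end and
+#    score) as the spans that the bundled aligner returns grouped per word.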
+# +def compute_alignments(waveform: torch.Tensor, transcript: List[str]): + with torch.inference_mode(): + emission, _ = model(waveform.to(device)) + token_spans = aligner(emission[0], tokenizer(transcript)) + return emission, token_spans - word_spans = [] - i = 0 - for words in transcript: - j = i + len(words) - word_spans.append(WordSpan(token_spans[i:j], _score(token_spans[i:j]))) - i = j - return word_spans +###################################################################### +# We also define utility functions for plotting the result and previewing +# the audio segments. -def compute_alignments(emission: torch.Tensor, transcript: List[str], dictionary: Dict[str, int]): - tokens = [dictionary[c] for word in transcript for c in word] - aligned_tokens, scores = align_emission_and_tokens(emission, tokens) - token_spans = merge_tokens(aligned_tokens, scores) - word_spans = merge_words(token_spans, transcript) - return word_spans +# Compute average score weighted by the span length +def _score(spans): + return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans) -###################################################################### -# - -# utility function for plotting word alignments -def plot_alignments(waveform, word_spans, emission, transcript, sample_rate=SAMPLE_RATE): +def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bundle.sample_rate): ratio = waveform.size(1) / emission.size(1) / sample_rate fig, axes = plt.subplots(2, 1) @@ -141,150 +134,68 @@ def plot_alignments(waveform, word_spans, emission, transcript, sample_rate=SAMP axes[0].set_xticks([]) axes[1].specgram(waveform[0], Fs=sample_rate) - for w_span, chars in zip(word_spans, transcript): - t_spans = w_span.token_spans + for t_spans, chars in zip(token_spans, transcript): t0, t1 = t_spans[0].start, t_spans[-1].end + axes[0].axvspan(t0 - 0.5, t1 - 0.5, facecolor="None", hatch="/", edgecolor="white") axes[1].axvspan(ratio * t0, ratio * t1, facecolor="None", hatch="/", edgecolor="white") - axes[1].annotate(f"{w_span.score:.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False) + axes[1].annotate(f"{_score(t_spans):.2f}", (ratio * t0, sample_rate * 0.51), annotation_clip=False) for span, char in zip(t_spans, chars): - axes[1].annotate(char, (span.start * ratio, sample_rate * 0.55), annotation_clip=False) + t0 = span.start * ratio + axes[1].annotate(char, (t0, sample_rate * 0.55), annotation_clip=False) axes[1].set_xlabel("time [second]") fig.tight_layout() - return IPython.display.Audio(waveform, rate=sample_rate) ###################################################################### # - - -def preview_word(waveform, word_span, num_frames, transcript, sample_rate=SAMPLE_RATE): +def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate): ratio = waveform.size(1) / num_frames - t0 = word_span.token_spans[0].start - t1 = word_span.token_spans[-1].end - x0 = int(ratio * t0) - x1 = int(ratio * t1) - print(f"{transcript} ({word_span.score:.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec") + x0 = int(ratio * spans[0].start) + x1 = int(ratio * spans[-1].end) + print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec") segment = waveform[:, x0:x1] return IPython.display.Audio(segment.numpy(), rate=sample_rate) ###################################################################### -# Aligning multilingual data +# Normalizing the transcript # -------------------------- # -# Here we show examples of computing 
forced alignments of utterances in -# 5 languages using the multilingual Wav2vec2 model, with the alignments visualized. -# One can also play the whole audio and audio segments aligned with each word, in -# order to verify the alignment quality. Here we first load the model and dictionary. -# - -from torchaudio.models import wav2vec2_model - -model = wav2vec2_model( - extractor_mode="layer_norm", - extractor_conv_layer_config=[ - (512, 10, 5), - (512, 3, 2), - (512, 3, 2), - (512, 3, 2), - (512, 3, 2), - (512, 2, 2), - (512, 2, 2), - ], - extractor_conv_bias=True, - encoder_embed_dim=1024, - encoder_projection_dropout=0.0, - encoder_pos_conv_kernel=128, - encoder_pos_conv_groups=16, - encoder_num_layers=24, - encoder_num_heads=16, - encoder_attention_dropout=0.0, - encoder_ff_interm_features=4096, - encoder_ff_interm_dropout=0.1, - encoder_dropout=0.0, - encoder_layer_norm_first=True, - encoder_layer_drop=0.1, - aux_num_out=31, -) - - -model.load_state_dict( - torch.hub.load_state_dict_from_url( - "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt" - ) -) -model.eval() -model.to(device) - - -def get_emission(waveform): - with torch.inference_mode(): - # NOTE: this step is essential - waveform = torch.nn.functional.layer_norm(waveform, waveform.shape) - emission, _ = model(waveform) - return torch.log_softmax(emission, dim=-1) - - -# Construct the dictionary -# '@' represents the OOV token -# and are fairseq's legacy tokens, which're not used. -# token is omitted as we do not use it in this tutorial -dictionary = { - "": 0, - "": 1, - "": 2, - "@": 3, - "a": 4, - "i": 5, - "e": 6, - "n": 7, - "o": 8, - "u": 9, - "t": 10, - "s": 11, - "r": 12, - "m": 13, - "k": 14, - "l": 15, - "d": 16, - "g": 17, - "h": 18, - "y": 19, - "b": 20, - "p": 21, - "w": 22, - "c": 23, - "v": 24, - "j": 25, - "z": 26, - "f": 27, - "'": 28, - "q": 29, - "x": 30, -} - - -###################################################################### -# Before aligning the speech with transcripts, we need to make sure -# the transcripts are already romanized. Here are the BASH commands -# required for saving raw transcript to a file, downloading the uroman -# romanizer and using it to obtain romanized transcripts, and PyThon -# commands required for further normalizing the romanized transcript. +# The transcripts passed to the pipeline must be normalized beforehand. +# The exact process of normalization depends on language. +# +# Languages that do not have explicit word boundaries +# (such as Chinese, Japanese and Korean) require segmentation first. +# There are dedicated tools for this, but let's say we have a segmented +# transcript. +# +# The first step of normalization is romanization. +# `uroman `__ is a tool that +# supports many languages. +# +# Here are the BASH commands to romanize the input text file and write +# the output to another text file using ``uroman``. # # .. code-block:: bash # -# Save the raw transcript to a file -# echo 'raw text' > text.txt -# git clone https://github.com/isi-nlp/uroman -# uroman/bin/uroman.pl < text.txt > text_romanized.txt +# $ echo "Cette page concerne des événements d'actualité qui se sont produits durant l'année 1882" > text.txt +# $ uroman/bin/uroman.pl < text.txt > text_romanized.txt +# $ cat text_romanized.txt +# +# .. code-block:: text +# +# Cette page concerne des evenements d'actualite qui se sont produits durant l'annee 1882 +# +# The next step is to remove non-alphabet characters and punctuation. +# The following snippet normalizes the romanized transcript.
# - ###################################################################### # .. code-block:: python # # import re +# +# # def normalize_uroman(text): # text = text.lower() # text = text.replace("’", "'") @@ -292,80 +203,99 @@ def get_emission(waveform): # text = re.sub(' +', ' ', text) # return text.strip() # -# file = "text_romanized.txt" -# f = open(file, "r") -# lines = f.readlines() -# text_normalized = normalize_uroman(lines[0].strip()) # - +# with open("text_romanized.txt", "r") as f: +# for line in f: +# text_normalized = normalize_uroman(line) +# print(text_normalized) +# +# Running the script on the above example produces the following. +# +# .. code-block:: text +# +# cette page concerne des evenements d'actualite qui se sont produits durant l'annee +# +# Note that, in this example, since "1882" was not romanized by ``uroman``, +# it was removed in the normalization step. +# To avoid this, one needs to romanize numbers, but this is known to be a non-trivial task. +# ###################################################################### +# Aligning transcripts to speech +# ------------------------------ +# +# Now we perform the forced alignment for multiple languages. +# +# # German # ~~~~~~ -speech_file = torchaudio.utils.download_asset("tutorial-assets/10349_8674_000087.flac", progress=False) - text_raw = "aber seit ich bei ihnen das brot hole" text_normalized = "aber seit ich bei ihnen das brot hole" -print("Raw Transcript: ", text_raw) -print("Normalized Transcript: ", text_normalized) +url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac" +waveform, sample_rate = torchaudio.load( + url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate) ) ###################################################################### # - -waveform, _ = torchaudio.load(speech_file, frame_offset=int(0.5 * SAMPLE_RATE), num_frames=int(2.5 * SAMPLE_RATE)) - -emission = get_emission(waveform.to(device)) -num_frames = emission.size(1) +assert sample_rate == bundle.sample_rate ###################################################################### # transcript = text_normalized.split() -word_spans = compute_alignments(emission, transcript, dictionary) +tokens = tokenizer(transcript) + +emission, token_spans = compute_alignments(waveform, transcript) +num_frames = emission.size(1) -plot_alignments(waveform, word_spans, emission, transcript) +plot_alignments(waveform, token_spans, emission, transcript) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) ###################################################################### # -preview_word(waveform, word_spans[0], num_frames, transcript[0]) +preview_word(waveform, token_spans[0], num_frames, transcript[0]) ###################################################################### # -preview_word(waveform, word_spans[1], num_frames, transcript[1]) +preview_word(waveform, token_spans[1], num_frames, transcript[1]) ###################################################################### # -preview_word(waveform, word_spans[2], num_frames, transcript[2]) +preview_word(waveform, token_spans[2], num_frames, transcript[2]) ###################################################################### # -preview_word(waveform, word_spans[3], num_frames, transcript[3]) +preview_word(waveform, token_spans[3], num_frames, transcript[3]) ###################################################################### # -preview_word(waveform,
word_spans[4], num_frames, transcript[4]) +preview_word(waveform, token_spans[4], num_frames, transcript[4]) ###################################################################### # -preview_word(waveform, word_spans[5], num_frames, transcript[5]) +preview_word(waveform, token_spans[5], num_frames, transcript[5]) ###################################################################### # -preview_word(waveform, word_spans[6], num_frames, transcript[6]) +preview_word(waveform, token_spans[6], num_frames, transcript[6]) ###################################################################### # -preview_word(waveform, word_spans[7], num_frames, transcript[7]) +preview_word(waveform, token_spans[7], num_frames, transcript[7]) ###################################################################### # Chinese @@ -379,276 +309,277 @@ def get_emission(waveform): # However this is not needed if you only want character-level alignments. # -speech_file = torchaudio.utils.download_asset("tutorial-assets/mvdr/clean_speech.wav", progress=False) - text_raw = "关 服务 高端 产品 仍 处于 供不应求 的 局面" text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian" -print("Raw Transcript: ", text_raw) -print("Normalized Transcript: ", text_normalized) - ###################################################################### # -waveform, _ = torchaudio.load(speech_file) +url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav" +waveform, sample_rate = torchaudio.load(url) waveform = waveform[0:1] -emission = get_emission(waveform.to(device)) -num_frames = emission.size(1) +###################################################################### +# +assert sample_rate == bundle.sample_rate ###################################################################### # transcript = text_normalized.split() -word_spans = compute_alignments(emission, transcript, dictionary) +emission, token_spans = compute_alignments(waveform, transcript) +num_frames = emission.size(1) + +plot_alignments(waveform, token_spans, emission, transcript) -plot_alignments(waveform, word_spans, emission, transcript) +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) ###################################################################### # -preview_word(waveform, word_spans[0], num_frames, transcript[0]) +preview_word(waveform, token_spans[0], num_frames, transcript[0]) ###################################################################### # -preview_word(waveform, word_spans[1], num_frames, transcript[1]) +preview_word(waveform, token_spans[1], num_frames, transcript[1]) ###################################################################### # -preview_word(waveform, word_spans[2], num_frames, transcript[2]) +preview_word(waveform, token_spans[2], num_frames, transcript[2]) ###################################################################### # -preview_word(waveform, word_spans[3], num_frames, transcript[3]) +preview_word(waveform, token_spans[3], num_frames, transcript[3]) ###################################################################### # -preview_word(waveform, word_spans[4], num_frames, transcript[4]) +preview_word(waveform, token_spans[4], num_frames, transcript[4]) ###################################################################### # -preview_word(waveform, word_spans[5], num_frames, transcript[5]) +preview_word(waveform, token_spans[5], num_frames, transcript[5]) 
###################################################################### # -preview_word(waveform, word_spans[6], num_frames, transcript[6]) +preview_word(waveform, token_spans[6], num_frames, transcript[6]) ###################################################################### # -preview_word(waveform, word_spans[7], num_frames, transcript[7]) +preview_word(waveform, token_spans[7], num_frames, transcript[7]) ###################################################################### # -preview_word(waveform, word_spans[8], num_frames, transcript[8]) +preview_word(waveform, token_spans[8], num_frames, transcript[8]) ###################################################################### # Polish # ~~~~~~ -speech_file = torchaudio.utils.download_asset("tutorial-assets/5090_1447_000088.flac", progress=False) - text_raw = "wtedy ujrzałem na jego brzuchu okrągłą czarną ranę" text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane" -print("Raw Transcript: ", text_raw) -print("Normalized Transcript: ", text_normalized) +url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac" +waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate)) ###################################################################### # - -waveform, _ = torchaudio.load(speech_file, num_frames=int(4.5 * SAMPLE_RATE)) - -emission = get_emission(waveform.to(device)) -num_frames = emission.size(1) +assert sample_rate == bundle.sample_rate ###################################################################### # transcript = text_normalized.split() -word_spans = compute_alignments(emission, transcript, dictionary) +emission, token_spans = compute_alignments(waveform, transcript) +num_frames = emission.size(1) -plot_alignments(waveform, word_spans, emission, transcript) +plot_alignments(waveform, token_spans, emission, transcript) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) ###################################################################### # -preview_word(waveform, word_spans[0], num_frames, transcript[0]) +preview_word(waveform, token_spans[0], num_frames, transcript[0]) ###################################################################### # -preview_word(waveform, word_spans[1], num_frames, transcript[1]) +preview_word(waveform, token_spans[1], num_frames, transcript[1]) ###################################################################### # -preview_word(waveform, word_spans[2], num_frames, transcript[2]) +preview_word(waveform, token_spans[2], num_frames, transcript[2]) ###################################################################### # -preview_word(waveform, word_spans[3], num_frames, transcript[3]) +preview_word(waveform, token_spans[3], num_frames, transcript[3]) ###################################################################### # -preview_word(waveform, word_spans[4], num_frames, transcript[4]) +preview_word(waveform, token_spans[4], num_frames, transcript[4]) ###################################################################### # -preview_word(waveform, word_spans[5], num_frames, transcript[5]) +preview_word(waveform, token_spans[5], num_frames, transcript[5]) ###################################################################### # -preview_word(waveform, word_spans[6], num_frames, transcript[6]) +preview_word(waveform, token_spans[6], num_frames, transcript[6]) ###################################################################### # 
-preview_word(waveform, word_spans[7], num_frames, transcript[7]) +preview_word(waveform, token_spans[7], num_frames, transcript[7]) ###################################################################### # Portuguese # ~~~~~~~~~~ -speech_file = torchaudio.utils.download_asset("tutorial-assets/6566_5323_000027.flac", progress=False) - text_raw = "na imensa extensão onde se esconde o inconsciente imortal" text_normalized = "na imensa extensao onde se esconde o inconsciente imortal" -print("Raw Transcript: ", text_raw) -print("Normalized Transcript: ", text_normalized) +url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac" +waveform, sample_rate = torchaudio.load( + url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate) +) ###################################################################### # - -waveform, _ = torchaudio.load(speech_file, frame_offset=int(SAMPLE_RATE), num_frames=int(4.6 * SAMPLE_RATE)) - -emission = get_emission(waveform.to(device)) -num_frames = emission.size(1) +assert sample_rate == bundle.sample_rate ###################################################################### # transcript = text_normalized.split() -word_spans = compute_alignments(emission, transcript, dictionary) +emission, token_spans = compute_alignments(waveform, transcript) +num_frames = emission.size(1) + +plot_alignments(waveform, token_spans, emission, transcript) -plot_alignments(waveform, word_spans, emission, transcript) +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) ###################################################################### # -preview_word(waveform, word_spans[0], num_frames, transcript[0]) +preview_word(waveform, token_spans[0], num_frames, transcript[0]) ###################################################################### # -preview_word(waveform, word_spans[1], num_frames, transcript[1]) +preview_word(waveform, token_spans[1], num_frames, transcript[1]) ###################################################################### # -preview_word(waveform, word_spans[2], num_frames, transcript[2]) +preview_word(waveform, token_spans[2], num_frames, transcript[2]) ###################################################################### # -preview_word(waveform, word_spans[3], num_frames, transcript[3]) +preview_word(waveform, token_spans[3], num_frames, transcript[3]) ###################################################################### # -preview_word(waveform, word_spans[4], num_frames, transcript[4]) +preview_word(waveform, token_spans[4], num_frames, transcript[4]) ###################################################################### # -preview_word(waveform, word_spans[5], num_frames, transcript[5]) +preview_word(waveform, token_spans[5], num_frames, transcript[5]) ###################################################################### # -preview_word(waveform, word_spans[6], num_frames, transcript[6]) +preview_word(waveform, token_spans[6], num_frames, transcript[6]) ###################################################################### # -preview_word(waveform, word_spans[7], num_frames, transcript[7]) +preview_word(waveform, token_spans[7], num_frames, transcript[7]) ###################################################################### # -preview_word(waveform, word_spans[8], num_frames, transcript[8]) +preview_word(waveform, token_spans[8], num_frames, transcript[8]) 
###################################################################### # Italian # ~~~~~~~ -speech_file = torchaudio.utils.download_asset("tutorial-assets/642_529_000025.flac", progress=False) - text_raw = "elle giacean per terra tutte quante" text_normalized = "elle giacean per terra tutte quante" -print("Raw Transcript: ", text_raw) -print("Normalized Transcript: ", text_normalized) +url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac" +waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate)) ###################################################################### # - -waveform, _ = torchaudio.load(speech_file, num_frames=int(4 * SAMPLE_RATE)) - -emission = get_emission(waveform.to(device)) -num_frames = emission.size(1) +assert sample_rate == bundle.sample_rate ###################################################################### # transcript = text_normalized.split() -word_spans = compute_alignments(emission, transcript, dictionary) +emission, token_spans = compute_alignments(waveform, transcript) +num_frames = emission.size(1) -plot_alignments(waveform, word_spans, emission, transcript) +plot_alignments(waveform, token_spans, emission, transcript) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) ###################################################################### # -preview_word(waveform, word_spans[0], num_frames, transcript[0]) +preview_word(waveform, token_spans[0], num_frames, transcript[0]) ###################################################################### # -preview_word(waveform, word_spans[1], num_frames, transcript[1]) +preview_word(waveform, token_spans[1], num_frames, transcript[1]) ###################################################################### # -preview_word(waveform, word_spans[2], num_frames, transcript[2]) +preview_word(waveform, token_spans[2], num_frames, transcript[2]) ###################################################################### # -preview_word(waveform, word_spans[3], num_frames, transcript[3]) +preview_word(waveform, token_spans[3], num_frames, transcript[3]) ###################################################################### # -preview_word(waveform, word_spans[4], num_frames, transcript[4]) +preview_word(waveform, token_spans[4], num_frames, transcript[4]) ###################################################################### # -preview_word(waveform, word_spans[5], num_frames, transcript[5]) +preview_word(waveform, token_spans[5], num_frames, transcript[5]) ###################################################################### # Conclusion @@ -664,7 +595,6 @@ def get_emission(waveform): # --------------- # # Thanks to `Vineel Pratap `__ and `Zhaoheng -# Ni `__ for working on the forced aligner API, and -# `Moto Hira `__ for providing alignment merging and -# visualization utilities. +# Ni `__ for developing and open-sourcing the +# forced aligner API. #