From f038e6003c1fc5fa7bcc9873b1492d9c73515e66 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Thu, 25 Mar 2021 19:34:08 +0800 Subject: [PATCH 01/25] hugginface tokenizer and Neural LM training pipeline. This commit is mainly about hugginface tokenizer and a draft transformer/RNN based LM training pipeline. --- egs/librispeech/asr/nnlm/local/data.py | 54 +++ .../asr/nnlm/local/download_lm_train_data.py | 42 +++ .../asr/nnlm/local/huggingface_tokenizer.py | 98 ++++++ egs/librispeech/asr/nnlm/local/model.py | 154 ++++++++ egs/librispeech/asr/nnlm/main.py | 331 ++++++++++++++++++ egs/librispeech/asr/nnlm/run.sh | 61 ++++ 6 files changed, 740 insertions(+) create mode 100644 egs/librispeech/asr/nnlm/local/data.py create mode 100644 egs/librispeech/asr/nnlm/local/download_lm_train_data.py create mode 100644 egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py create mode 100644 egs/librispeech/asr/nnlm/local/model.py create mode 100644 egs/librispeech/asr/nnlm/main.py create mode 100644 egs/librispeech/asr/nnlm/run.sh diff --git a/egs/librispeech/asr/nnlm/local/data.py b/egs/librispeech/asr/nnlm/local/data.py new file mode 100644 index 00000000..7cdb4c2d --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/data.py @@ -0,0 +1,54 @@ +import os +from io import open +import torch + + +class Dictionary(object): + + def __init__(self): + self.word2idx = {} + self.idx2word = [] + self.idx2word.append('') + self.word2idx[''] = 0 + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + # self.word2idx[word] = len(self.idx2word) + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + + +class Corpus(object): + + def __init__(self, path): + self.dictionary = Dictionary() + self.train = self.tokenize(os.path.join(path, 'train.tokens')) + self.valid = self.tokenize(os.path.join(path, 'valid.tokens')) + self.test = self.tokenize(os.path.join(path, 'test.tokens')) + + def tokenize(self, path): + """Tokenizes a text file.""" + assert os.path.exists(path) + # Add words to the dictionary + with open(path, 'r', encoding="utf8") as f: + for line in f: + words = line.split() + [''] + for word in words: + self.dictionary.add_word(word) + + # Tokenize file content + with open(path, 'r', encoding="utf8") as f: + idss = [] + for line in f: + words = line.split() + [''] + ids = [] + for word in words: + ids.append(self.dictionary.word2idx[word]) + idss.append(torch.tensor(ids).type(torch.int64)) + # ids = torch.cat(idss) + + return idss diff --git a/egs/librispeech/asr/nnlm/local/download_lm_train_data.py b/egs/librispeech/asr/nnlm/local/download_lm_train_data.py new file mode 100644 index 00000000..d9ff066a --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/download_lm_train_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import os +import logging +from google_drive_downloader import GoogleDriveDownloader as gdd +from pathlib import Path + +# librispeech-lm-norm.txt is 4G +# train_960_text is 48M, which is stands for the sum of {train_clean_360, train_clean_100, train_other_500} +# here only train_960_text used to verify the whole pipeline +# A copy of train_960_text: "htts://drive.google.com/file/d/1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A/view?usp=sharing" +# local_path: "/ceph-ly/open-source/snowfall/egs/librispeech/asr/simple_v1/data/local/lm_train/train_960_text" + + +def download_librispeech_train_960_text(): + train_960_text = 
"./data/lm_train/librispeech_train_960_text" + if not os.path.exists(train_960_text): + Path(os.path.dirname(train_960_text)).mkdir(parents=True, + exist_ok=True) + + logging.info("downloading train_960_text of librispeech.") + gdd.download_file_from_google_drive( + file_id='1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A', + dest_path=train_960_text, + unzip=False) + else: + logging.info( + "train_960_text of librispeech is already downloaded. You may should check that" + ) + + +def main(): + logging.getLogger().setLevel(logging.INFO) + + download_librispeech_train_960_text() + + +if __name__ == '__main__': + main() diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py new file mode 100644 index 00000000..8f2cff43 --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# reference: https://huggingface.co/docs/tokenizers/python/latest/quicktour.html +import argparse +import logging +import os +import shutil +from pathlib import Path +from tokenizers import Tokenizer +from tokenizers.models import WordPiece +from tokenizers import normalizers +from tokenizers.normalizers import Lowercase, NFD, StripAccents +from tokenizers.pre_tokenizers import Whitespace +from tokenizers.trainers import WordPieceTrainer +from tokenizers import decoders + + +def get_args(): + parser = argparse.ArgumentParser( + description='train and tokenize with huggingface tokenizer') + parser.add_argument('--train-file', + type=str, + help="""file to train tokenizer""") + parser.add_argument('--vocab-size', + type=int, + default=1000, + help="""number of tokens of the tokenizer""") + parser.add_argument('--tokenizer-path', + type=str, + help="path to save or load tokenizer") + parser.add_argument('--test-file', + type=str, + help="""file to be tokenized""") + args = parser.parse_args() + return args + + +def train_tokenizer(train_files, save_path, vocab_size): + if os.path.exists(save_path): + logging.warning( + "{} already exists. Please check that.".format(save_path)) + return + else: + Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True) + + tokenizer = Tokenizer(WordPiece(unk_token='[UNK]')) + tokenizer.normalizer = normalizers.Sequence( + [NFD(), Lowercase(), StripAccents()]) + tokenizer.pre_tokenizer = Whitespace() + + # default vocab_size=30000 + # here set vocab_size=1000 for accelerating + trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]']) + tokenizer.train(train_files, trainer) + tokenizer.save(save_path) + + +def tokenize_text(test_file, tokenizer_path): + if not os.path.exists(tokenizer_path): + logging.warning( + "Tokenizer {} does not exist. Please check that.".format( + tokenizer_path)) + return + tokenizer = Tokenizer.from_file(tokenizer_path) + tokenizer.decoder = decoders.WordPiece() + tokenized_file = "{}.tokens".format(test_file) + # tokenized_ids = "{}.ids".format(test_file) + if os.path.exists(tokenized_file): + logging.warning( + "The input file seems already tokenized. 
Buckupping previous result" + ) + shutil.copyfile(tokenized_file, "{}.bk".format(tokenized_file)) + logging.warning("Tokenizing {}.".format(test_file)) + fout = open(tokenized_file, 'w') + with open(test_file) as f: + for line in f: + line = line.strip() + output = tokenizer.encode(line) + fout.write(" ".join(output.tokens) + '\n') + + fout.close() + + +def main(): + args = get_args() + if args.train_file is not None: + train_files = [args.train_file] + train_tokenizer(train_files, args.tokenizer_path, args.vocab_size) + + if args.test_file is not None: + tokenize_text(args.test_file, args.tokenizer_path) + + +if __name__ == '__main__': + main() diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py new file mode 100644 index 00000000..3767302f --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -0,0 +1,154 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +class RNNModel(nn.Module): + """Container module with an encoder, a recurrent module, and a decoder.""" + + def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): + super(RNNModel, self).__init__() + self.ntoken = ntoken + self.drop = nn.Dropout(dropout) + # import pdb; pdb.set_trace() + self.encoder = nn.Embedding(ntoken, ninp, padding_idx=0) + if rnn_type in ['LSTM', 'GRU']: + self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) + else: + try: + nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] + except KeyError: + raise ValueError( """An invalid option for `--model` was supplied, + options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") + self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + + # Optionally tie weights as in: + # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) + # https://arxiv.org/abs/1608.05859 + # and + # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016) + # https://arxiv.org/abs/1611.01462 + if tie_weights: + if nhid != ninp: + raise ValueError('When using the tied flag, nhid must be equal to emsize') + self.decoder.weight = self.encoder.weight + + self.init_weights() + + self.rnn_type = rnn_type + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + nn.init.uniform_(self.encoder.weight, -initrange, initrange) + nn.init.zeros_(self.decoder.weight) + nn.init.uniform_(self.decoder.weight, -initrange, initrange) + + def forward(self, input, hidden): + # import pdb; pdb.set_trace() + emb = self.drop(self.encoder(input)) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + decoded = decoded.view(-1, self.ntoken) + return F.log_softmax(decoded, dim=1), hidden + + def init_hidden(self, bsz): + weight = next(self.parameters()) + if self.rnn_type == 'LSTM': + return (weight.new_zeros(self.nlayers, bsz, self.nhid), + weight.new_zeros(self.nlayers, bsz, self.nhid)) + else: + return weight.new_zeros(self.nlayers, bsz, self.nhid) + +# Temporarily leave PositionalEncoding module here. Will be moved somewhere else. +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. 
math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + +class TransformerModel(nn.Module): + """Container module with an encoder, a recurrent or transformer module, and a decoder.""" + + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): + super(TransformerModel, self).__init__() + try: + from torch.nn import TransformerEncoder, TransformerEncoderLayer + except: + raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + self.model_type = 'Transformer' + self.src_mask = None + self.pos_encoder = PositionalEncoding(ninp, dropout) + encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.decoder = nn.Linear(ninp, ntoken) + + self.init_weights() + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def init_weights(self): + initrange = 0.1 + nn.init.uniform_(self.encoder.weight, -initrange, initrange) + nn.init.zeros_(self.decoder.weight) + nn.init.uniform_(self.decoder.weight, -initrange, initrange) + + def forward(self, src, has_mask=True): + if has_mask: + device = src.device + if self.src_mask is None or self.src_mask.size(0) != len(src): + mask = self._generate_square_subsequent_mask(len(src)).to(device) + self.src_mask = mask + else: + self.src_mask = None + + src = self.encoder(src) * math.sqrt(self.ninp) + src = self.pos_encoder(src) + output = self.transformer_encoder(src, self.src_mask) + output = self.decoder(output) + return F.log_softmax(output, dim=-1) diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py new file mode 100644 index 00000000..6d81ecde --- /dev/null +++ b/egs/librispeech/asr/nnlm/main.py @@ -0,0 +1,331 @@ +# coding: utf-8 +import argparse +import time +import math +import os +import sys +import torch +import torch.nn as nn +import torch.onnx + +sys.path.insert(0, './local/') + +import data +import model + +parser = argparse.ArgumentParser( + description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') +parser.add_argument('--data', + type=str, + 
default='./data/lm_train/', + help='location of the data corpus') +parser.add_argument( + '--model', + type=str, + default='LSTM', + help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') +parser.add_argument('--emsize', + type=int, + default=200, + help='size of word embeddings') +parser.add_argument('--nhid', + type=int, + default=200, + help='number of hidden units per layer') +parser.add_argument('--nlayers', type=int, default=2, help='number of layers') +parser.add_argument('--lr', + type=float, + default=20, + help='initial learning rate') +parser.add_argument('--clip', + type=float, + default=0.25, + help='gradient clipping') +parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit') +parser.add_argument('--batch_size', + type=int, + default=30, + metavar='N', + help='batch size') +parser.add_argument('--bptt', type=int, default=35, help='sequence length') +parser.add_argument('--dropout', + type=float, + default=0.2, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--tied', + action='store_true', + help='tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, help='random seed') +parser.add_argument('--cuda', action='store_true', help='use CUDA') +parser.add_argument('--log-interval', + type=int, + default=200, + metavar='N', + help='report interval') +parser.add_argument('--save', + type=str, + default='model.pt', + help='path to save the final model') +parser.add_argument('--onnx-export', + type=str, + default='', + help='path to export the final model in onnx format') + +parser.add_argument( + '--nhead', + type=int, + default=2, + help='the number of heads in the encoder/decoder of the transformer model') +parser.add_argument('--dry-run', + action='store_true', + help='verify the code and the model') + +args = parser.parse_args() + +# Set the random seed manually for reproducibility. +torch.manual_seed(args.seed) +if torch.cuda.is_available(): + if not args.cuda: + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda" + ) + +device = torch.device("cuda" if args.cuda else "cpu") + +############################################################################### +# Load data +############################################################################### + +corpus = data.Corpus(args.data) + +# Starting from sequential data, batchify arranges the dataset into columns. +# For instance, with the alphabet as the sequence and batch size 4, we'd get +# ┌ a g m s ┐ +# │ b h n t │ +# │ c i o u │ +# │ d j p v │ +# │ e k q w │ +# └ f l r x ┘. +# These columns are treated as independent by the model, which means that the +# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient +# batch processing. + + +def batchify(data, bsz): + # Work out how cleanly we can divide the dataset into bsz parts. + nbatch = len(data) // bsz + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * bsz) + # Evenly divide the data across the bsz batches. 
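+    # For instance (purely illustrative, using the alphabet example above):
+    # with 26 token ids and bsz = 4, nbatch = 26 // 4 = 6, the trailing
+    # 'y' and 'z' are trimmed, and data.view(4, -1).t() has shape (6, 4),
+    # i.e. 6 time steps for each of the 4 independent columns.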
+ data = data.view(bsz, -1).t().contiguous() + return data.to(device) + + +eval_batch_size = args.batch_size +# train_data = batchify(corpus.train, args.batch_size) +# val_data = batchify(corpus.valid, eval_batch_size) +# test_data = batchify(corpus.test, eval_batch_size) + +train_data = corpus.train +val_data = corpus.valid +test_data = corpus.test +############################################################################### +# Build the model +############################################################################### + +ntokens = len(corpus.dictionary) +if args.model == 'Transformer': + model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout).to(device) +else: + model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, + args.nlayers, args.dropout, args.tied).to(device) + +criterion = nn.NLLLoss(ignore_index=0) + +############################################################################### +# Training code +############################################################################### + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +# get_batch subdivides the source data into chunks of length args.bptt. +# If source is equal to the example output of the batchify function, with +# a bptt-limit of 2, we'd get the following two Variables for i = 0: +# ┌ a g m s ┐ ┌ b h n t ┐ +# └ b h n t ┘ └ c i o u ┘ +# Note that despite the name of the function, the subdivison of data is not +# done along the batch dimension (i.e. dimension 1), since that was handled +# by the batchify function. The chunks are along dimension 0, corresponding +# to the seq_len dimension in the LSTM. + +# def get_batch(source, i): +# seq_len = min(args.bptt, len(source) - 1 - i) +# data = source[i:i+seq_len] +# target = source[i+1:i+1+seq_len].view(-1) +# return data, target + + +def get_batch(source, i, pad_index=0, batch_size=args.batch_size): + batch = source[i * batch_size:(i + 1) * batch_size] + # import pdb; pdb.set_trace() + seq_lens = [len(batch[i]) for i in range(batch_size)] + seq_len = max(seq_lens) + 1 + data_padded = [] + target = [] + for data in batch: + # import pdb; pdb.set_trace() + # print("{} {}".format(seq_len,len(data))) + padding = torch.tensor([pad_index for _ in range(seq_len - len(data))]) + data = torch.cat((data, padding), dim=0) + data_padded.append(data[:-1]) + target.append(data[1:]) + + # import pdb; pdb.set_trace() + # data_padded: (Length, Batch_size) + # target: (Batch_size, Batch_size, Batch_size, ***) + try: + data_padded = torch.stack(data_padded).to(device).transpose(0, 1) + target = torch.flatten(torch.stack(target).to(device).transpose(0, 1)) + return data_padded, target + except: + import pdb + pdb.set_trace() + + #return torch.stack(data_padded).to(device), torch.stack(target).to(device) + + +def evaluate(data_source): + # Turn on evaluation mode which disables dropout. + model.eval() + total_loss = 0. 
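+    # Note: batches below come from the utterance-level get_batch() above;
+    # each batch loss is weighted by len(data), the padded sequence length.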
+ ntokens = len(corpus.dictionary) + if args.model != 'Transformer': + hidden = model.init_hidden(eval_batch_size) + with torch.no_grad(): + for i in range(0, len(data_source) // eval_batch_size - 1): + data, targets = get_batch(data_source, i) + if args.model == 'Transformer': + output = model(data) + output = output.view(-1, ntokens) + else: + output, hidden = model(data, hidden) + hidden = repackage_hidden(hidden) + total_loss += len(data) * criterion(output, targets).item() + return total_loss / (len(data_source) - 1) + + +def train(): + # Turn on training mode which enables dropout. + batch_size = args.batch_size + model.train() + total_loss = 0. + start_time = time.time() + ntokens = len(corpus.dictionary) + if args.model != 'Transformer': + hidden = model.init_hidden(batch_size) + for batch_idx in range(0, len(train_data) // batch_size - 1): + data, targets = get_batch(train_data, batch_idx) + # Starting each batch, we detach the hidden state from how it was previously produced. + # If we didn't, the model would try backpropagating all the way to start of the dataset. + model.zero_grad() + if args.model == 'Transformer': + output = model(data) + output = output.view(-1, ntokens) + else: + hidden = repackage_hidden(hidden) + output, hidden = model(data, hidden) + + # import pdb; pdb.set_trace() + loss = criterion(output, targets) + loss.backward() + + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) + for p in model.parameters(): + p.data.add_(p.grad, alpha=-lr) + + #import pdb; pdb.set_trace() + total_loss += loss.item() + + if batch_idx % args.log_interval == 0 and batch_idx > 0: + cur_loss = total_loss / args.log_interval + elapsed = time.time() - start_time + print( + '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' + 'loss {:5.2f} | ppl {:8.2f}'.format( + epoch, batch_idx, + len(train_data) // batch_size, lr, + elapsed * 1000 / args.log_interval, cur_loss, + math.exp(cur_loss))) + total_loss = 0 + start_time = time.time() + if args.dry_run: + break + + +def export_onnx(path, batch_size, seq_len): + print('The model is also exported in ONNX format at {}'.format( + os.path.realpath(args.onnx_export))) + model.eval() + dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view( + -1, batch_size).to(device) + hidden = model.init_hidden(batch_size) + torch.onnx.export(model, (dummy_input, hidden), path) + + +# Loop over epochs. +lr = args.lr +best_val_loss = None + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in range(1, args.epochs + 1): + epoch_start_time = time.time() + train() + val_loss = evaluate(val_data) + print('-' * 89) + print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' + 'valid ppl {:8.2f}'.format(epoch, + (time.time() - epoch_start_time), + val_loss, math.exp(val_loss))) + print('-' * 89) + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + with open(args.save, 'wb') as f: + torch.save(model, f) + best_val_loss = val_loss + else: + # Anneal the learning rate if no improvement has been seen in the validation dataset. + lr /= 4.0 +except KeyboardInterrupt: + print('-' * 89) + print('Exiting from training early') + +# Load the best saved model. 
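+# Note: torch.save() above pickled the whole nn.Module (not just a state_dict),
+# so the model definitions under ./local/ must remain importable here.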
+with open(args.save, 'rb') as f: + model = torch.load(f) + # after load the rnn params are not a continuous chunk of memory + # this makes them a continuous chunk, and will speed up forward pass + # Currently, only rnn model supports flatten_parameters function. + if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + model.rnn.flatten_parameters() + +# Run on test data. +test_loss = evaluate(test_data) +print('=' * 89) +print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( + test_loss, math.exp(test_loss))) +print('=' * 89) + +if len(args.onnx_export) > 0: + # Export the model in ONNX format. + export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh new file mode 100644 index 00000000..3cfd5e18 --- /dev/null +++ b/egs/librispeech/asr/nnlm/run.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# Copyright 2020 Xiaomi Corporation (Author: Liyong Guo) +# Apache 2.0 + +# References: +# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/train_rnnlm.sh +# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/prepare_rnnlm_dir.sh +# https://github.com/pytorch/examples/tree/master/word_language_model +# https://huggingface.co/docs/tokenizers/python/latest/quicktour.html + +# Example of how to use HuggingFace tokenizer and train {RNN, Transformer} based LMs + +set -e +stage=$1 + +lm_train=data/lm_train/ +full_text=$lm_train/librispeech_train_960_text +tokenizer=$lm_train/tokenizer-librispeech_train_960.json +if [ $stage -eq 1 ]; then + python3 ./local/download_lm_train_data.py +fi +if [ $stage -eq 2 ]; then + echo "training tokenizer" + python3 local/huggingface_tokenizer.py \ + --train-file=$full_text \ + --tokenizer-path=$tokenizer +fi + +if [ $stage -eq 3 ]; then + echo "tokenize a file" + python3 local/huggingface_tokenizer.py \ + --test-file=$full_text \ + --tokenizer-path=$tokenizer +fi + +if [ $stage -eq 4 ]; then + echo "split all data into train/valid/test" + + full_tokens=${full_text}.tokens + valid_test_fraction=10 # currently 5 percent for valid and 5 percent for test + valid_test_tokens=$lm_train/valid_test.tokens + train_tokens=$lm_train/train.tokens + + num_utts_total=$(wc -l <$full_tokens ) + num_valid_test=$(($num_utts_total/${valid_test_fraction})) + set +x + shuf -n $num_valid_test $full_tokens > $valid_test_tokens + + comm -3 <(sort $valid_test_tokens) <(sort $full_tokens) > $train_tokens + shuf -n $(($num_valid_test/2)) $valid_test_tokens > $lm_train/valid.tokens + comm -3 <(sort $lm_train/valid.tokens) <(sort $valid_test_tokens) > $lm_train/test.tokens + +fi + + +if [ $stage -eq 5 ]; then + python main.py \ + --cuda \ + --model Transformer +fi From e9482d25349a0367775e3b0470b67b3099362ce8 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Mon, 29 Mar 2021 21:52:18 +0800 Subject: [PATCH 02/25] draft of class LMDataset --- egs/librispeech/asr/nnlm/local/dataset.py | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 egs/librispeech/asr/nnlm/local/dataset.py diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py new file mode 100644 index 00000000..6792b4d8 --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence + + +class CollateFunc(object): + '''Collate function for 
LMDataset + ''' + + def __init__(self, pad_index): + self.pad_index = pad_index + + def __call__(self, batch): + # xs: input sequence + # ys: label sequence + xs = batch + # ys = batch + xs_pad = pad_sequence( + [torch.from_numpy(x).int() for x in xs, True, self.pad_index]) + ys_pad = xs_pad + return xs_pad, ys_pad + + +class LMDataset(Dataset): + + def __init__(self, text_file: str): + '''Dataset to load Language Model train/dev text data + + Args: + text_file: text file, one utt per line. + ''' + assert os.path.exists( + text_file), "text_file: {} does not exist, please check that." + self.data = [] + with open(text_file, 'r') as f: + for line in f: + text = line.strip().split() + assert len(text) > 0 + text_id = text2id(text) + token_id = text_id2token_id(text_id) + self.data.append(token_id) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + def text2id(text: list[str]) -> list[int]: + pass + + def text_id2token_id(text_id: list[int]) -> list[int]: + pass + + +if __name__ == '__main__': + train_file = "./data/local/lm/train.txt" + # dev_file = "./data/local/lm/dev.txt" + dataset = LMDataset(train_file) + data_loader = DataLoader(dataset, + batch_size=1, + shuffle=True, + num_workers=0, + collaate_fn=collate_func) + for i, batch in enumerate(data_loader): + print(i) + print(batch) From 135bfdb7985318e3ab77319ac8cc2de239dc2a9e Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Mon, 29 Mar 2021 22:07:43 +0800 Subject: [PATCH 03/25] a dummy implementation of LMDataset --- egs/librispeech/asr/nnlm/local/dataset.py | 36 ++++++++++++++--------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 6792b4d8..8fb98fa7 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -5,22 +5,27 @@ from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pad_sequence +from typing import List + +import os class CollateFunc(object): '''Collate function for LMDataset ''' - def __init__(self, pad_index): + def __init__(self, pad_index=0): self.pad_index = pad_index def __call__(self, batch): + import pdb + pdb.set_trace() # xs: input sequence # ys: label sequence xs = batch # ys = batch - xs_pad = pad_sequence( - [torch.from_numpy(x).int() for x in xs, True, self.pad_index]) + xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs], True, + self.pad_index) ys_pad = xs_pad return xs_pad, ys_pad @@ -31,7 +36,7 @@ def __init__(self, text_file: str): '''Dataset to load Language Model train/dev text data Args: - text_file: text file, one utt per line. + text_file: text file, text for one utt per line. ''' assert os.path.exists( text_file), "text_file: {} does not exist, please check that." 
@@ -40,8 +45,8 @@ def __init__(self, text_file: str): for line in f: text = line.strip().split() assert len(text) > 0 - text_id = text2id(text) - token_id = text_id2token_id(text_id) + text_id = self.text2id(text) + token_id = self.text_id2token_id(text_id) self.data.append(token_id) def __len__(self): @@ -50,22 +55,25 @@ def __len__(self): def __getitem__(self, idx): return self.data[idx] - def text2id(text: list[str]) -> list[int]: - pass + def text2id(self, text: List[str]) -> List[int]: + # A dumpy implementation + return [i for i in range(len(text))] - def text_id2token_id(text_id: list[int]) -> list[int]: - pass + def text_id2token_id(self, text_id: List[int]) -> List[int]: + # A dumpy implementation + return [i for i in range(len(text_id))] if __name__ == '__main__': - train_file = "./data/local/lm/train.txt" - # dev_file = "./data/local/lm/dev.txt" - dataset = LMDataset(train_file) + # train_file = "./data/nnlm/text/librispeech.txt" + dev_file = "./data/nnlm/text/dev.txt" + dataset = LMDataset(dev_file) + collate_func = CollateFunc() data_loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=0, - collaate_fn=collate_func) + collate_fn=collate_func) for i, batch in enumerate(data_loader): print(i) print(batch) From 88e0d49d559860134bfdf244b38bf25c84fa2c56 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 11:11:40 +0800 Subject: [PATCH 04/25] collate function of NNLM --- egs/librispeech/asr/nnlm/local/dataset.py | 34 +++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 8fb98fa7..50cc8f73 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -7,7 +7,9 @@ from torch.nn.utils.rnn import pad_sequence from typing import List +import numpy as np import os +import torch class CollateFunc(object): @@ -15,18 +17,20 @@ class CollateFunc(object): ''' def __init__(self, pad_index=0): + # pad_index should be identical to ignore_index of torch.nn.NLLLoss self.pad_index = pad_index - def __call__(self, batch): - import pdb - pdb.set_trace() - # xs: input sequence - # ys: label sequence - xs = batch - # ys = batch - xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs], True, - self.pad_index) - ys_pad = xs_pad + def __call__(self, batch: List[List[int]]): + '''batch contains token_id. + batch can be viewd as a ragged 2-d array, with a row represent a token_id. + token_id reprents a tokenized text, whose format is: + token_id token_id token_id *** + ''' + data_pad = pad_sequence( + [torch.from_numpy(np.array(x)).float() for x in batch], True, + self.pad_index) + xs_pad = data_pad[:, :-1] + ys_pad = data_pad[:, 1:] return xs_pad, ys_pad @@ -42,10 +46,14 @@ def __init__(self, text_file: str): text_file), "text_file: {} does not exist, please check that." self.data = [] with open(text_file, 'r') as f: + # a line represent a piece of text, e.g. 
+ # DELAWARE IS NOT AFRAID OF DOGS for line in f: text = line.strip().split() assert len(text) > 0 text_id = self.text2id(text) + # token_id format: + # token_id token_id token_id *** token_id = self.text_id2token_id(text_id) self.data.append(token_id) @@ -70,10 +78,12 @@ def text_id2token_id(self, text_id: List[int]) -> List[int]: dataset = LMDataset(dev_file) collate_func = CollateFunc() data_loader = DataLoader(dataset, - batch_size=1, + batch_size=2, shuffle=True, num_workers=0, collate_fn=collate_func) for i, batch in enumerate(data_loader): - print(i) + xs, ys = batch + print(xs) + print(ys) print(batch) From 27b1863fbf5e5d7b99910960a58fd4834e1e2d26 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 30 Mar 2021 16:02:53 +0800 Subject: [PATCH 05/25] add scripts to process word piece lexicons. --- .flake8 | 14 ++ egs/librispeech/asr/nnlm/scripts/lexicon.py | 53 +++++ egs/librispeech/asr/nnlm/scripts/util.py | 207 ++++++++++++++++++ egs/librispeech/asr/nnlm/scripts/util_test.py | 153 +++++++++++++ 4 files changed, 427 insertions(+) create mode 100644 .flake8 create mode 100644 egs/librispeech/asr/nnlm/scripts/lexicon.py create mode 100644 egs/librispeech/asr/nnlm/scripts/util.py create mode 100755 egs/librispeech/asr/nnlm/scripts/util_test.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..8cebb5dd --- /dev/null +++ b/.flake8 @@ -0,0 +1,14 @@ +[flake8] +show-source=true +statistics=true +max-line-length=80 +exclude = + .git, + +ignore = + # E127 continuation line over-indented for visual indent + E127, + # F401, import but not used + F401, + # W504, line break after binary operator + W504, diff --git a/egs/librispeech/asr/nnlm/scripts/lexicon.py b/egs/librispeech/asr/nnlm/scripts/lexicon.py new file mode 100644 index 00000000..859f1b48 --- /dev/null +++ b/egs/librispeech/asr/nnlm/scripts/lexicon.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# + +from pathlib import Path +from typing import Union + +from util import create_ragged_lexicon +from util import read_lexicon +from util import read_mapping + +import torch +import k2 + + +class Lexicon(object): + + def __init__(self, lexicon_filename: Union[Path, str], + word2id_filename: Union[Path, str], + piece2id_filename: Union[Path, str]) -> None: + ''' + Args: + lexicon_filename: + Path to the lexicon file. Each line in it consists of + spaces separated columns. The first column is a word + and the remaining columns are the word pieces of this word. + word2id_filename: + Path to the file that maps a word to an ID. + piece2id_filename: + Path to the file that maps a word piece to an ID. + ''' + lexicon = read_lexicon(lexicon_filename) + word2id = read_mapping(word2id_filename) + piece2id = read_mapping(piece2id_filename) + + self.lexicon = create_ragged_lexicon(lexicon=lexicon, + word2id=word2id, + piece2id=piece2id) + + def word_seq_to_word_piece_seq(self, words: torch.Tensor) -> torch.Tensor: + '''Convert a word sequence to a word piece seq. + + Args: + words: + A 1-D torch.Tensor of dtype torch.int32 containing word IDs. + Returns: + Return a 1-D torch.Tensor containing the IDs of the + corresponding word pieces. 
+ ''' + assert words.ndim == 1 + assert words.dtype == torch.int32 + + ans = k2.index(self.lexicon, words) + return ans.values() diff --git a/egs/librispeech/asr/nnlm/scripts/util.py b/egs/librispeech/asr/nnlm/scripts/util.py new file mode 100644 index 00000000..7c053e60 --- /dev/null +++ b/egs/librispeech/asr/nnlm/scripts/util.py @@ -0,0 +1,207 @@ +# Copyright (c) 2021 Xiaomi Corp. (authors: Fangjun Kuang) + +from pathlib import Path +from typing import Dict +from typing import List +from typing import Set +from typing import Tuple +from typing import Union + +import k2 + + +def read_mapping(filename: Union[str, Path]) -> Dict[str, int]: + '''Read a file that contains ID mappings. + + Each line in the file contains two fields separated by spaces. + The first field is a token and the second is its integer ID. + + An example file may look like the following:: + + a 1 + b 2 + hello 3 + + Args: + filename: + Filename containing the mapping. + Returns: + Return a dict that maps a token to an integer. + ''' + filename = Path(filename) + assert filename.is_file(), f'{filename} is not a file' + + ans: Dict[str, int] = dict() + seen: Set[int] = set() + + with open(filename) as f: + for line in f: + line = line.strip() + if len(line) == 0: + continue # skip empty lines + + splits = line.split() + assert len(splits) == 2, \ + f"Invalid line '{line}'.\n" \ + 'Each line should contain exactly two columns' + + key = splits[0] + value = int(splits[1]) + assert key not in ans, \ + f"Duplicate key '{key}' in line '{line}'" + + assert value not in seen, \ + f"Duplicate ID '{value}' in line '{line}'" + ans[key] = value + seen.add(value) + return ans + + +def convert_tokens_to_ids(tokens: List[str], + mapping: Dict[str, int]) -> List[int]: + '''Convert a list of tokens to its corresponding IDs. + + Caution: + We require that there are no OOVs. That is, every token + present in `tokens` has a corresponding ID in `mapping`. + + Args: + tokens: + A list of str representing tokens. + mapping: + A map that maps a token to an integer. + Returns: + A list of integers that are the IDs of the input tokens. + ''' + ans = [] + for t in tokens: + assert t in mapping, f"token '{t}' does not have an ID" + ans.append(mapping[t]) + return ans + + +def convert_lexicon_to_mappings( + filename: Union[str, Path] +) -> Tuple[Dict[str, int], Dict[str, int]]: # noqa + '''Generate IDs for tokens from a lexicon file. + + Each line in the lexicon consists of spaces separated columns. + The first column is the word and the remaining columns are its + word pieces. We require that each word has a unique decomposition + into word pieces. + + Args: + filename: + The lexicon file. + Returns: + Return a tuple containing two mappings: + - The first dict maps a word to an ID + - The second dict maps a word piece to an ID + ''' + filename = Path(filename) + assert filename.is_file(), f'File {filename} is not a file' + + words: Set[str] = set() + pieces: Set[str] = set() + + with open(filename) as f: + for line in f: + line = line.strip() + if len(line) == 0: + continue # skip empty lines + splits = line.split() + assert len(splits) >= 2, \ + f"Invalid line '{line}'.' 
\ + 'Expecting at least two columns" + + assert splits[0] not in words, "'Duplicate word '{splits[0]}'" + words.add(splits[0]) + + for p in splits[1:]: + pieces.add(p) + + words = list(words) + pieces = list(pieces) + words.sort() + pieces.sort() + + word2id: Dict[str, int] = dict() + piece2id: Dict[str, int] = dict() + + for i, w in enumerate(words): + word2id[w] = i + + for i, p in enumerate(pieces): + piece2id[p] = i + + return word2id, piece2id + + +def read_lexicon(lexicon_filename: Union[Path, str]) -> Dict[str, List[str]]: + '''Read a lexicon file. + + Each line in the lexicon consists of spaces separated columns. + The first column is the word and the remaining columns are the + corresponding word pieces. + + Args: + lexicon_filename: + Path to the lexicon. + Returns: + Return a dict mapping a word to its word pieces. + ''' + lexicon_filename = Path(lexicon_filename) + assert lexicon_filename.is_file(), f'File {lexicon_filename} is not a file' + + ans: Dict[str, List[str]] = dict() + + with open(lexicon_filename) as f: + for line in f: + line = line.strip() + if len(line) == 0: + continue # skip empty lines + + splits = line.split() + assert len(splits) >= 2, \ + f"Invalid line '{line}'" \ + 'Expected a line with at least two fields' + word = splits[0] + + assert word not in ans, \ + f"Duplicate word '{word}' in line '{line}'" + ans[word] = splits[1:] + return ans + + +def create_ragged_lexicon(lexicon: Dict[str, List[str]], + word2id: Dict[str, int], + piece2id: Dict[str, int]) -> k2.RaggedInt: + ''' + Args: + lexicon: + A dict that maps a word to word pieces. + word2id: + A dict that maps a word to an ID. + + CAUTION: + We require that word IDs are contiguous. For instance, if + there are 3 words, then the word IDs are 0, 1, and 2. + piece2id: + A dict that maps a word piece to an ID. + ''' + # First, check that word IDs are contiguous + id2word = {i: w for w, i in word2id.items()} + ids = list(id2word.keys()) + ids.sort() + # we assume that word IDs are contiguous + expected_ids = list(range(ids[-1] + 1)) + assert ids == expected_ids + + values = [] + for i in ids: + word = id2word[i] + pieces = lexicon[word] + pieces_id = convert_tokens_to_ids(pieces, piece2id) + values.append(pieces_id) + + return k2.create_ragged2(values) diff --git a/egs/librispeech/asr/nnlm/scripts/util_test.py b/egs/librispeech/asr/nnlm/scripts/util_test.py new file mode 100755 index 00000000..273be1f1 --- /dev/null +++ b/egs/librispeech/asr/nnlm/scripts/util_test.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +from pathlib import Path + +import os +import tempfile + +from lexicon import Lexicon +from util import convert_lexicon_to_mappings +from util import convert_tokens_to_ids +from util import read_lexicon +from util import read_mapping + +import torch + + +def get_temp_filename() -> str: + '''Return a temporary file. + + The caller is expected to remove the returned file. + ''' + with tempfile.NamedTemporaryFile(delete=False) as tmp: + name = tmp.name + tmp.close() + return name + + +def generate_mapping_file() -> str: + '''Generate a temporary mapping file for testing. + + Caution: + The caller is responsible to delete the returned file after using it. + + Returns: + A temporary file that contains an example mapping. + ''' + s = ''' + a 1 + b 2 + hello 3 + ''' + filename = get_temp_filename() + with open(filename, 'w') as f: + f.write(s) + return filename + + +def generate_lexicon_file() -> str: + '''Generate a temporary lexicon file for testing. 
+ + Caution: + The caller is responsible to delete the returned file after using it. + + Returns: + A temporary file that contains an example lexicon. + ''' + s = ''' + tom to m + the the + piper p ip er + son so n + ''' + filename = get_temp_filename() + with open(filename, 'w') as f: + f.write(s) + return filename + + +def test_read_mapping_file(): + filename = generate_mapping_file() + mapping = read_mapping(filename) + os.remove(filename) + assert mapping['a'] == 1 + assert mapping['b'] == 2 + assert mapping['hello'] == 3 + + +def test_convert_tokens_to_ids(): + filename = generate_mapping_file() + mapping = read_mapping(filename) + os.remove(filename) + + tokens = ['b', 'a', 'a', 'hello', 'a', 'a', 'b'] + ids = convert_tokens_to_ids(tokens=tokens, mapping=mapping) + assert ids == [2, 1, 1, 3, 1, 1, 2] + + +def test_convert_lexicon_to_mappings(): + filename = generate_lexicon_file() + word2id, piece2id = convert_lexicon_to_mappings(filename) + print(word2id) + print(piece2id) + os.remove(filename) + + +def test_read_lexicon(): + filename = generate_lexicon_file() + lexicon = read_lexicon(filename) + os.remove(filename) + print(lexicon) + + +def test_lexicon(): + lexicon_filename = generate_lexicon_file() + + word2id, piece2id = convert_lexicon_to_mappings(lexicon_filename) + + word2id_filename = get_temp_filename() + piece2id_filename = get_temp_filename() + # piper: 0 + # son: 1 + # the 2 + # tome 3 + + # er 0 + # ip 1 + # m 2 + # n 3 + # p 4 + # so 5 + # the 6 + # to 7 + + with open(word2id_filename, 'w') as f: + for w, i in word2id.items(): + f.write(f'{w} {i}\n') + + with open(piece2id_filename, 'w') as f: + for p, i in piece2id.items(): + f.write(f'{p} {i}\n') + + lexicon = Lexicon(lexicon_filename, word2id_filename, piece2id_filename) + words = ['the', 'son', 'tom', 'piper', 'the'] + word_ids = convert_tokens_to_ids(words, word2id) + word_piece_ids = lexicon.word_seq_to_word_piece_seq( + torch.tensor(word_ids, dtype=torch.int32)) + # the so n to m p ip er the + # 6 5 3 7 2 4 1 0 6 + expected_word_piece_ids = torch.tensor([6, 5, 3, 7, 2, 4, 1, 0, 6]) + + assert torch.all(torch.eq(word_piece_ids, expected_word_piece_ids)) + + os.remove(lexicon_filename) + os.remove(word2id_filename) + os.remove(piece2id_filename) + + +if __name__ == '__main__': + test_read_mapping_file() + test_convert_tokens_to_ids() + test_convert_lexicon_to_mappings() + test_read_lexicon() + test_lexicon() From 47bf358ecac609428582a5d6793c17e5bc5a6fd6 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 16:33:03 +0800 Subject: [PATCH 06/25] trainer --- egs/librispeech/asr/nnlm/local/trainer.py | 101 ++++++ egs/librispeech/asr/nnlm/main.py | 424 +++++----------------- 2 files changed, 199 insertions(+), 326 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/local/trainer.py diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py new file mode 100644 index 00000000..50813b96 --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import logging +import math +import torch + + +# references: +# https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py +# https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py +# https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py +# https://www.jianshu.com/p/c88df856dbc8 +class Trainer(object): + + 
def __init__(self, + device, + model=None, + criterion=None, + optimizer=None, + train_data_loader=None, + dev_data_loader=None, + ntokens=None, + batch_size=1, + epoch=0, + num_epochs=10, + log_interval=10, + writer=None): + self.device = device + self.model = model + self.criterion = criterion + self.optimizer = optimizer + self.ntokens = ntokens + self.batch_size = batch_size + self.epoch = epoch + self.num_epochs = num_epochs + self.train_data_loader = train_data_loader + self.dev_data_loader = dev_data_loader + self.iterations = 0 + self.writer = writer + self.log_interval = log_interval + + def run(self): + for epoch in range(self.num_epochs): + if self.train_data_loader is not None: + self.train() + + if self.dev_data_loader is not None: + self.eval() + + self.epoch += 1 + + def train(self): + self.model.train() + num_total_batch = len(self.train_data_loader) + for batch_idx, batch in enumerate(self.train_data_loader): + batch_input, batch_target = batch + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + self.model.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + target = torch.flatten(batch_target.transpose(0, 1)) + loss = self.criterion(prediction, target) + self.optimizer.zero_grad() + self.optimizer.step() + + self.writer.add_scalar('train_loss', loss, self.iterations) + + self.iterations += 1 + if batch_idx % self.log_interval == 0: + log_str = 'TRAIN Batch {}/{} loss {:.6f} ppl {:.6f} at epoch {}'.format( + batch_idx, num_total_batch, loss.item(), + math.exp(loss.item()), self.epoch) + logging.info(log_str) + + def eval(self): + self.model.eval() + total_loss = 0.0 + num_total_batch = len(self.dev_data_loader) + for batch_idx, batch in enumerate(self.dev_data_loader): + batch_input, batch_target = batch + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + self.model.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + target = torch.flatten(batch_target.transpose(0, 1)) + loss = self.criterion(prediction, target) + total_loss += loss * self.batch_size + + loss = total_loss / (num_total_batch * self.batch_size) + ppl = math.exp(loss) + self.writer.add_scalar('dev_ppl', ppl, self.epoch) + log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( + loss.item(), ppl, self.epoch) + logging.info(log_str) diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 6d81ecde..cce38a26 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -1,331 +1,103 @@ -# coding: utf-8 +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# Reference: +# https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py import argparse -import time -import math -import os -import sys + +import logging import torch import torch.nn as nn -import torch.onnx +import torch.optim as optim +import sys sys.path.insert(0, './local/') - -import data -import model - -parser = argparse.ArgumentParser( - description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', - type=str, - default='./data/lm_train/', - help='location of the data corpus') -parser.add_argument( - '--model', - type=str, - default='LSTM', - help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', - type=int, - default=200, - help='size of 
word embeddings') -parser.add_argument('--nhid', - type=int, - default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, help='number of layers') -parser.add_argument('--lr', - type=float, - default=20, - help='initial learning rate') -parser.add_argument('--clip', - type=float, - default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit') -parser.add_argument('--batch_size', - type=int, - default=30, - metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, help='sequence length') -parser.add_argument('--dropout', - type=float, - default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', - action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, help='random seed') -parser.add_argument('--cuda', action='store_true', help='use CUDA') -parser.add_argument('--log-interval', - type=int, - default=200, - metavar='N', - help='report interval') -parser.add_argument('--save', - type=str, - default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', - type=str, - default='', - help='path to export the final model in onnx format') - -parser.add_argument( - '--nhead', - type=int, - default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', - action='store_true', - help='verify the code and the model') - -args = parser.parse_args() - -# Set the random seed manually for reproducibility. -torch.manual_seed(args.seed) -if torch.cuda.is_available(): - if not args.cuda: - print( - "WARNING: You have a CUDA device, so you should probably run with --cuda" - ) - -device = torch.device("cuda" if args.cuda else "cpu") - -############################################################################### -# Load data -############################################################################### - -corpus = data.Corpus(args.data) - -# Starting from sequential data, batchify arranges the dataset into columns. -# For instance, with the alphabet as the sequence and batch size 4, we'd get -# ┌ a g m s ┐ -# │ b h n t │ -# │ c i o u │ -# │ d j p v │ -# │ e k q w │ -# └ f l r x ┘. -# These columns are treated as independent by the model, which means that the -# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient -# batch processing. - - -def batchify(data, bsz): - # Work out how cleanly we can divide the dataset into bsz parts. - nbatch = len(data) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the bsz batches. 
- data = data.view(bsz, -1).t().contiguous() - return data.to(device) - - -eval_batch_size = args.batch_size -# train_data = batchify(corpus.train, args.batch_size) -# val_data = batchify(corpus.valid, eval_batch_size) -# test_data = batchify(corpus.test, eval_batch_size) - -train_data = corpus.train -val_data = corpus.valid -test_data = corpus.test -############################################################################### -# Build the model -############################################################################### - -ntokens = len(corpus.dictionary) -if args.model == 'Transformer': - model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout).to(device) -else: - model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, - args.nlayers, args.dropout, args.tied).to(device) - -criterion = nn.NLLLoss(ignore_index=0) - -############################################################################### -# Training code -############################################################################### - - -def repackage_hidden(h): - """Wraps hidden states in new Tensors, to detach them from their history.""" - - if isinstance(h, torch.Tensor): - return h.detach() - else: - return tuple(repackage_hidden(v) for v in h) - - -# get_batch subdivides the source data into chunks of length args.bptt. -# If source is equal to the example output of the batchify function, with -# a bptt-limit of 2, we'd get the following two Variables for i = 0: -# ┌ a g m s ┐ ┌ b h n t ┐ -# └ b h n t ┘ └ c i o u ┘ -# Note that despite the name of the function, the subdivison of data is not -# done along the batch dimension (i.e. dimension 1), since that was handled -# by the batchify function. The chunks are along dimension 0, corresponding -# to the seq_len dimension in the LSTM. - -# def get_batch(source, i): -# seq_len = min(args.bptt, len(source) - 1 - i) -# data = source[i:i+seq_len] -# target = source[i+1:i+1+seq_len].view(-1) -# return data, target - - -def get_batch(source, i, pad_index=0, batch_size=args.batch_size): - batch = source[i * batch_size:(i + 1) * batch_size] - # import pdb; pdb.set_trace() - seq_lens = [len(batch[i]) for i in range(batch_size)] - seq_len = max(seq_lens) + 1 - data_padded = [] - target = [] - for data in batch: - # import pdb; pdb.set_trace() - # print("{} {}".format(seq_len,len(data))) - padding = torch.tensor([pad_index for _ in range(seq_len - len(data))]) - data = torch.cat((data, padding), dim=0) - data_padded.append(data[:-1]) - target.append(data[1:]) - - # import pdb; pdb.set_trace() - # data_padded: (Length, Batch_size) - # target: (Batch_size, Batch_size, Batch_size, ***) - try: - data_padded = torch.stack(data_padded).to(device).transpose(0, 1) - target = torch.flatten(torch.stack(target).to(device).transpose(0, 1)) - return data_padded, target - except: - import pdb - pdb.set_trace() - - #return torch.stack(data_padded).to(device), torch.stack(target).to(device) - - -def evaluate(data_source): - # Turn on evaluation mode which disables dropout. - model.eval() - total_loss = 0. 
- ntokens = len(corpus.dictionary) - if args.model != 'Transformer': - hidden = model.init_hidden(eval_batch_size) - with torch.no_grad(): - for i in range(0, len(data_source) // eval_batch_size - 1): - data, targets = get_batch(data_source, i) - if args.model == 'Transformer': - output = model(data) - output = output.view(-1, ntokens) - else: - output, hidden = model(data, hidden) - hidden = repackage_hidden(hidden) - total_loss += len(data) * criterion(output, targets).item() - return total_loss / (len(data_source) - 1) - - -def train(): - # Turn on training mode which enables dropout. - batch_size = args.batch_size - model.train() - total_loss = 0. - start_time = time.time() - ntokens = len(corpus.dictionary) - if args.model != 'Transformer': - hidden = model.init_hidden(batch_size) - for batch_idx in range(0, len(train_data) // batch_size - 1): - data, targets = get_batch(train_data, batch_idx) - # Starting each batch, we detach the hidden state from how it was previously produced. - # If we didn't, the model would try backpropagating all the way to start of the dataset. - model.zero_grad() - if args.model == 'Transformer': - output = model(data) - output = output.view(-1, ntokens) - else: - hidden = repackage_hidden(hidden) - output, hidden = model(data, hidden) - - # import pdb; pdb.set_trace() - loss = criterion(output, targets) - loss.backward() - - # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - for p in model.parameters(): - p.data.add_(p.grad, alpha=-lr) - - #import pdb; pdb.set_trace() - total_loss += loss.item() - - if batch_idx % args.log_interval == 0 and batch_idx > 0: - cur_loss = total_loss / args.log_interval - elapsed = time.time() - start_time - print( - '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch_idx, - len(train_data) // batch_size, lr, - elapsed * 1000 / args.log_interval, cur_loss, - math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - if args.dry_run: - break - - -def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}'.format( - os.path.realpath(args.onnx_export))) - model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view( - -1, batch_size).to(device) - hidden = model.init_hidden(batch_size) - torch.onnx.export(model, (dummy_input, hidden), path) - - -# Loop over epochs. -lr = args.lr -best_val_loss = None - -# At any point you can hit Ctrl + C to break out of training early. -try: - for epoch in range(1, args.epochs + 1): - epoch_start_time = time.time() - train() - val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, - (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) - # Save the model if the validation loss is the best we've seen so far. - if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: - torch.save(model, f) - best_val_loss = val_loss - else: - # Anneal the learning rate if no improvement has been seen in the validation dataset. - lr /= 4.0 -except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') - -# Load the best saved model. 
-with open(args.save, 'rb') as f: - model = torch.load(f) - # after load the rnn params are not a continuous chunk of memory - # this makes them a continuous chunk, and will speed up forward pass - # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: - model.rnn.flatten_parameters() - -# Run on test data. -test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) - -if len(args.onnx_export) > 0: - # Export the model in ONNX format. - export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt) +from dataset import LMDataset, CollateFunc +from model import TransformerModel +from trainer import Trainer +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader + + +def get_args(): + parser = argparse.ArgumentParser( + description='training Neural Language Model') + parser.add_argument('--train_text', + default='data/nnlm/text/librispeech.txt', + help='train data file') + parser.add_argument('--dev_text', + default='data/nnlm/text/dev.txt', + help='dev data file') + parser.add_argument('--batch_size', type=int, default=4) + parser.add_argument('--ntokens', type=int, default=3000) + parser.add_argument('--emsize', type=int, default=128) + parser.add_argument('--nhead', type=int, default=4) + parser.add_argument('--nhid', type=int, default=128) + parser.add_argument('--nlayers', type=int, default=6) + parser.add_argument('--dropout', type=int, default=0.2) + parser.add_argument('--model_dir', + default='./exp/', + help='path to save model') + parser.add_argument('--tensorboard_dir', + default='tensorboard', + help='path to save tensorboard log') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this local rank, -1 for cpu') + + args = parser.parse_args() + + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + #Set random seed + torch.manual_seed(2021) + collate_func = CollateFunc() + + train_dataset = LMDataset(args.train_text) + dev_dataset = LMDataset(args.dev_text) + + train_data_loader = DataLoader(train_dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_func) + + dev_data_loader = DataLoader(dev_dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_func) + + ntokens = args.ntokens + model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout) + optimizer = optim.Adam(model.parameters()) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + criterion = nn.NLLLoss(ignore_index=0) + exp_dir = 'exp-nnlm' + writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') + trainer = Trainer(device, + model, + criterion, + optimizer, + train_data_loader=train_data_loader, + dev_data_loader=dev_data_loader, + ntokens=ntokens, + batch_size=args.batch_size, + epoch=0, + writer=writer) + trainer.run() + + +if __name__ == '__main__': + main() From d8aaabdddfcc84ed6dc27fe5bbb9c3752b54ac5a Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 17:50:13 +0800 Subject: [PATCH 07/25] generate lexicon --- .../asr/nnlm/local/generate_lexicon.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 egs/librispeech/asr/nnlm/local/generate_lexicon.py diff --git 
a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py new file mode 100644 index 00000000..337ffb4e --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import argparse +from tokenizers import Tokenizer +from tokenizers.models import WordPiece +from tokenizers import decoders + + +def get_args(): + parser = argparse.ArgumentParser( + description='generate words.txt tokens.txt and lexicon.txt') + parser.add_argument('--lexicon-path', + default='data/nnlm/lexicon', + type=str, + help="path to save lexicon files") + parser.add_argument('--tokenizer-path', + type=str, + default='./data/lm_train/tokenizer-librispeech.json', + help="path to load tokenizer") + parser.add_argument('--train-file', + default='data/nnlm/text/librispeech.txt', + type=str, + help="""file to be tokenized""") + args = parser.parse_args() + return args + + +def generate_tokens(args): + tokenizer = Tokenizer.from_file(args.tokenizer_path) + symbols = tokenizer.get_vocab() + tokens_file = '{}/tokens.txt'.format(args.lexicon_path) + tokens_f = open(tokens_file, 'w') + for idx, sym in enumerate(symbols): + tokens_f.write('{} {}\n'.format(sym, idx)) + + tokens_f.close() + + +def generate_lexicon(args, words): + lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path) + lf = open(lexicon_file, 'w') + tokenizer = Tokenizer.from_file(args.tokenizer_path) + tokenizer.decoder = decoders.WordPiece() + for word in words: + output = tokenizer.encode(word) + tokens = " ".join(output.tokens) + lf.write("{}\t{}\n".format(word, tokens)) + lf.close() + + +def load_words(args): + words = [] + tokens_file = '{}/words.txt'.format(args.lexicon_path) + special_words = [ + '', '!SIL', '', '', '', '', '#0' + ] + + with open(tokens_file) as f: + for line in f: + arr = line.strip().split() + if arr[0] not in special_words: + words.append(arr[0]) + + return words + + +def main(): + args = get_args() + generate_tokens(args) + words = load_words(args) + generate_lexicon(args, words) + + +if __name__ == '__main__': + main() From c44f99da9c0027f34981e359ff439c1f31c16b96 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 17:57:57 +0800 Subject: [PATCH 08/25] check text length in dataset.py --- egs/librispeech/asr/nnlm/local/data.py | 54 ----------------------- egs/librispeech/asr/nnlm/local/dataset.py | 10 +++-- 2 files changed, 6 insertions(+), 58 deletions(-) delete mode 100644 egs/librispeech/asr/nnlm/local/data.py diff --git a/egs/librispeech/asr/nnlm/local/data.py b/egs/librispeech/asr/nnlm/local/data.py deleted file mode 100644 index 7cdb4c2d..00000000 --- a/egs/librispeech/asr/nnlm/local/data.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from io import open -import torch - - -class Dictionary(object): - - def __init__(self): - self.word2idx = {} - self.idx2word = [] - self.idx2word.append('') - self.word2idx[''] = 0 - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - # self.word2idx[word] = len(self.idx2word) - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - - -class Corpus(object): - - def __init__(self, path): - self.dictionary = Dictionary() - self.train = self.tokenize(os.path.join(path, 'train.tokens')) - self.valid = self.tokenize(os.path.join(path, 'valid.tokens')) - self.test = self.tokenize(os.path.join(path, 
'test.tokens')) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r', encoding="utf8") as f: - for line in f: - words = line.split() + [''] - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r', encoding="utf8") as f: - idss = [] - for line in f: - words = line.split() + [''] - ids = [] - for word in words: - ids.append(self.dictionary.word2idx[word]) - idss.append(torch.tensor(ids).type(torch.int64)) - # ids = torch.cat(idss) - - return idss diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 50cc8f73..41b4dad0 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -22,12 +22,12 @@ def __init__(self, pad_index=0): def __call__(self, batch: List[List[int]]): '''batch contains token_id. - batch can be viewd as a ragged 2-d array, with a row represent a token_id. + batch can be viewd as a ragged 2-d array, with a row represents a token_id. token_id reprents a tokenized text, whose format is: token_id token_id token_id *** ''' data_pad = pad_sequence( - [torch.from_numpy(np.array(x)).float() for x in batch], True, + [torch.from_numpy(np.array(x)).long() for x in batch], True, self.pad_index) xs_pad = data_pad[:, :-1] ys_pad = data_pad[:, 1:] @@ -50,12 +50,14 @@ def __init__(self, text_file: str): # DELAWARE IS NOT AFRAID OF DOGS for line in f: text = line.strip().split() - assert len(text) > 0 + if len(text) == 0: + continue text_id = self.text2id(text) # token_id format: # token_id token_id token_id *** token_id = self.text_id2token_id(text_id) - self.data.append(token_id) + if len(token_id) >= 2: + self.data.append(token_id) def __len__(self): return len(self.data) From b13954d8d54e945747c0468452c9373f2f199fc2 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 18:00:18 +0800 Subject: [PATCH 09/25] remove shuf/comm commands --- egs/librispeech/asr/nnlm/run.sh | 48 +++++++++++++-------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index 3cfd5e18..c7603ac9 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -5,6 +5,7 @@ # References: # https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/train_rnnlm.sh +# https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 # https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/prepare_rnnlm_dir.sh # https://github.com/pytorch/examples/tree/master/word_language_model # https://huggingface.co/docs/tokenizers/python/latest/quicktour.html @@ -15,45 +16,34 @@ set -e stage=$1 lm_train=data/lm_train/ -full_text=$lm_train/librispeech_train_960_text -tokenizer=$lm_train/tokenizer-librispeech_train_960.json -if [ $stage -eq 1 ]; then - python3 ./local/download_lm_train_data.py +tokenizer=$lm_train/tokenizer-librispeech.json + +text=data/local/lm/librispeech-lm-norm.txt.gz +text_dir=data/nnlm/text +train_text=$text_dir/librispeech.txt +if [ $stage -eq 0 ]; then + mkdir -p $text_dir + if [ ! -f $text ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + fi + echo -n >$text_dir/dev.txt + # hold out one in every 2000 lines as dev data. 
+ gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$train_text fi + + if [ $stage -eq 2 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ - --train-file=$full_text \ + --train-file=$train_text \ --tokenizer-path=$tokenizer fi if [ $stage -eq 3 ]; then - echo "tokenize a file" - python3 local/huggingface_tokenizer.py \ - --test-file=$full_text \ - --tokenizer-path=$tokenizer + echo "generate lexicon" + python local/generate_lexicon.py fi -if [ $stage -eq 4 ]; then - echo "split all data into train/valid/test" - - full_tokens=${full_text}.tokens - valid_test_fraction=10 # currently 5 percent for valid and 5 percent for test - valid_test_tokens=$lm_train/valid_test.tokens - train_tokens=$lm_train/train.tokens - - num_utts_total=$(wc -l <$full_tokens ) - num_valid_test=$(($num_utts_total/${valid_test_fraction})) - set +x - shuf -n $num_valid_test $full_tokens > $valid_test_tokens - - comm -3 <(sort $valid_test_tokens) <(sort $full_tokens) > $train_tokens - shuf -n $(($num_valid_test/2)) $valid_test_tokens > $lm_train/valid.tokens - comm -3 <(sort $lm_train/valid.tokens) <(sort $valid_test_tokens) > $lm_train/test.tokens - -fi - - if [ $stage -eq 5 ]; then python main.py \ --cuda \ From 775d4775f9989b93680a19da35ab3dfd884fcb8c Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 20:44:59 +0800 Subject: [PATCH 10/25] beta version of training pipeline --- egs/librispeech/asr/nnlm/local/dataset.py | 17 ++++-- .../asr/nnlm/local/generate_lexicon.py | 25 +++++---- .../asr/nnlm/local/huggingface_tokenizer.py | 2 +- egs/librispeech/asr/nnlm/local/model.py | 53 +++++++++++++++---- egs/librispeech/asr/nnlm/main.py | 21 ++++++-- egs/librispeech/asr/nnlm/scripts/lexicon.py | 4 +- egs/librispeech/asr/nnlm/scripts/util.py | 21 ++++---- 7 files changed, 102 insertions(+), 41 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 41b4dad0..20fe34bd 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -6,6 +6,7 @@ from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pad_sequence from typing import List +from util import convert_tokens_to_ids import numpy as np import os @@ -36,12 +37,13 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str): + def __init__(self, text_file: str, lexicon): '''Dataset to load Language Model train/dev text data Args: text_file: text file, text for one utt per line. ''' + self.lexicon = lexicon assert os.path.exists( text_file), "text_file: {} does not exist, please check that." self.data = [] @@ -49,13 +51,20 @@ def __init__(self, text_file: str): # a line represent a piece of text, e.g. 
# DELAWARE IS NOT AFRAID OF DOGS for line in f: - text = line.strip().split() + # import pdb + # pdb.set_trace() + text = line.strip().lower().split() + # print(text) if len(text) == 0: continue - text_id = self.text2id(text) + word_id = convert_tokens_to_ids(text, self.lexicon.word2id) + if len(word_id) == 0: + continue + word_id = torch.from_numpy(np.array(word_id, dtype="int32")) + + token_id = self.lexicon.word_seq_to_word_piece_seq(word_id) # token_id format: # token_id token_id token_id *** - token_id = self.text_id2token_id(text_id) if len(token_id) >= 2: self.data.append(token_id) diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index 337ffb4e..0fa62afa 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -34,35 +34,42 @@ def generate_tokens(args): tokens_file = '{}/tokens.txt'.format(args.lexicon_path) tokens_f = open(tokens_file, 'w') for idx, sym in enumerate(symbols): - tokens_f.write('{} {}\n'.format(sym, idx)) + tokens_f.write('{} {}\n'.format(sym.lower(), idx)) tokens_f.close() def generate_lexicon(args, words): + special_words = [ + '', '!SIL', '', '', '', '', '#0' + ] lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path) lf = open(lexicon_file, 'w') tokenizer = Tokenizer.from_file(args.tokenizer_path) tokenizer.decoder = decoders.WordPiece() for word in words: - output = tokenizer.encode(word) - tokens = " ".join(output.tokens) - lf.write("{}\t{}\n".format(word, tokens)) + if word not in special_words: + output = tokenizer.encode(word) + tokens = ' '.join(output.tokens) + else: + tokens = '[unk]' + lf.write("{}\t{}\n".format(word.lower(), tokens.lower())) lf.close() def load_words(args): words = [] tokens_file = '{}/words.txt'.format(args.lexicon_path) - special_words = [ - '', '!SIL', '', '', '', '', '#0' - ] + # special_words = [ + # '', '!SIL', '', '', '', '', '#0' + # ] + # special_words = [] with open(tokens_file) as f: for line in f: arr = line.strip().split() - if arr[0] not in special_words: - words.append(arr[0]) + # if arr[0] not in special_words: + words.append(arr[0]) return words diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py index 8f2cff43..8779fb2d 100644 --- a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py +++ b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py @@ -26,7 +26,7 @@ def get_args(): help="""file to train tokenizer""") parser.add_argument('--vocab-size', type=int, - default=1000, + default=10000, help="""number of tokens of the tokenizer""") parser.add_argument('--tokenizer-path', type=str, diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index 3767302f..fcd7b8fc 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -1,26 +1,48 @@ +# reference: +# https://github.com/pytorch/examples/blob/master/word_language_model/model.py import math import torch import torch.nn as nn import torch.nn.functional as F + class RNNModel(nn.Module): """Container module with an encoder, a recurrent module, and a decoder.""" - def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): + def __init__(self, + rnn_type, + ntoken, + ninp, + nhid, + nlayers, + dropout=0.5, + tie_weights=False): super(RNNModel, self).__init__() self.ntoken = ntoken self.drop = nn.Dropout(dropout) # import pdb; pdb.set_trace() 
self.encoder = nn.Embedding(ntoken, ninp, padding_idx=0) if rnn_type in ['LSTM', 'GRU']: - self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) + self.rnn = getattr(nn, rnn_type)(ninp, + nhid, + nlayers, + dropout=dropout) else: try: - nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] + nonlinearity = { + 'RNN_TANH': 'tanh', + 'RNN_RELU': 'relu' + }[rnn_type] except KeyError: - raise ValueError( """An invalid option for `--model` was supplied, - options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") - self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) + raise ValueError( + """An invalid option for `--model` was supplied, + options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" + ) + self.rnn = nn.RNN(ninp, + nhid, + nlayers, + nonlinearity=nonlinearity, + dropout=dropout) self.decoder = nn.Linear(nhid, ntoken) # Optionally tie weights as in: @@ -31,7 +53,8 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weigh # https://arxiv.org/abs/1611.01462 if tie_weights: if nhid != ninp: - raise ValueError('When using the tied flag, nhid must be equal to emsize') + raise ValueError( + 'When using the tied flag, nhid must be equal to emsize') self.decoder.weight = self.encoder.weight self.init_weights() @@ -63,6 +86,7 @@ def init_hidden(self, bsz): else: return weight.new_zeros(self.nlayers, bsz, self.nhid) + # Temporarily leave PositionalEncoding module here. Will be moved somewhere else. class PositionalEncoding(nn.Module): r"""Inject some information about the relative or absolute position of the tokens @@ -87,7 +111,9 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): pe = torch.zeros(max_len, d_model) position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + div_term = torch.exp( + torch.arange(0, d_model, 2).float() * + (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0).transpose(0, 1) @@ -107,6 +133,7 @@ def forward(self, x): x = x + self.pe[:x.size(0), :] return self.dropout(x) + class TransformerModel(nn.Module): """Container module with an encoder, a recurrent or transformer module, and a decoder.""" @@ -115,7 +142,9 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): try: from torch.nn import TransformerEncoder, TransformerEncoderLayer except: - raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + raise ImportError( + 'TransformerEncoder module does not exist in PyTorch 1.1 or lower.' 
+ ) self.model_type = 'Transformer' self.src_mask = None self.pos_encoder = PositionalEncoding(ninp, dropout) @@ -129,7 +158,8 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): def _generate_square_subsequent_mask(self, sz): mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill( + mask == 1, float(0.0)) return mask def init_weights(self): @@ -142,7 +172,8 @@ def forward(self, src, has_mask=True): if has_mask: device = src.device if self.src_mask is None or self.src_mask.size(0) != len(src): - mask = self._generate_square_subsequent_mask(len(src)).to(device) + mask = self._generate_square_subsequent_mask( + len(src)).to(device) self.src_mask = mask else: self.src_mask = None diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index cce38a26..a3eb8c38 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -14,6 +14,9 @@ import sys sys.path.insert(0, './local/') +sys.path.insert(0, './scripts/') +from lexicon import Lexicon + from dataset import LMDataset, CollateFunc from model import TransformerModel from trainer import Trainer @@ -30,8 +33,8 @@ def get_args(): parser.add_argument('--dev_text', default='data/nnlm/text/dev.txt', help='dev data file') - parser.add_argument('--batch_size', type=int, default=4) - parser.add_argument('--ntokens', type=int, default=3000) + parser.add_argument('--batch_size', type=int, default=256) + parser.add_argument('--ntokens', type=int, default=10000) parser.add_argument('--emsize', type=int, default=128) parser.add_argument('--nhead', type=int, default=4) parser.add_argument('--nhid', type=int, default=128) @@ -45,8 +48,12 @@ def get_args(): help='path to save tensorboard log') parser.add_argument('--gpu', type=int, - default=-1, + default=1, help='gpu id for this local rank, -1 for cpu') + parser.add_argument('--lexicon-path', + default='data/nnlm/lexicon', + type=str, + help="path to save lexicon files") args = parser.parse_args() @@ -61,9 +68,13 @@ def main(): #Set random seed torch.manual_seed(2021) collate_func = CollateFunc() + lexicon_filename = '{}/lexicon.txt'.format(args.lexicon_path) + word2id_filename = '{}/words.txt'.format(args.lexicon_path) + piece2id_filename = '{}/tokens.txt'.format(args.lexicon_path) - train_dataset = LMDataset(args.train_text) - dev_dataset = LMDataset(args.dev_text) + lexicon = Lexicon(lexicon_filename, word2id_filename, piece2id_filename) + train_dataset = LMDataset(args.train_text, lexicon) + dev_dataset = LMDataset(args.dev_text, lexicon) train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, diff --git a/egs/librispeech/asr/nnlm/scripts/lexicon.py b/egs/librispeech/asr/nnlm/scripts/lexicon.py index 859f1b48..f882f06a 100644 --- a/egs/librispeech/asr/nnlm/scripts/lexicon.py +++ b/egs/librispeech/asr/nnlm/scripts/lexicon.py @@ -29,11 +29,11 @@ def __init__(self, lexicon_filename: Union[Path, str], Path to the file that maps a word piece to an ID. 
''' lexicon = read_lexicon(lexicon_filename) - word2id = read_mapping(word2id_filename) + self.word2id = read_mapping(word2id_filename) piece2id = read_mapping(piece2id_filename) self.lexicon = create_ragged_lexicon(lexicon=lexicon, - word2id=word2id, + word2id=self.word2id, piece2id=piece2id) def word_seq_to_word_piece_seq(self, words: torch.Tensor) -> torch.Tensor: diff --git a/egs/librispeech/asr/nnlm/scripts/util.py b/egs/librispeech/asr/nnlm/scripts/util.py index 7c053e60..dda33da1 100644 --- a/egs/librispeech/asr/nnlm/scripts/util.py +++ b/egs/librispeech/asr/nnlm/scripts/util.py @@ -36,7 +36,7 @@ def read_mapping(filename: Union[str, Path]) -> Dict[str, int]: with open(filename) as f: for line in f: - line = line.strip() + line = line.strip().lower() if len(line) == 0: continue # skip empty lines @@ -57,8 +57,8 @@ def read_mapping(filename: Union[str, Path]) -> Dict[str, int]: return ans -def convert_tokens_to_ids(tokens: List[str], - mapping: Dict[str, int]) -> List[int]: +def convert_tokens_to_ids(tokens: List[str], mapping: Dict[str, + int]) -> List[int]: '''Convert a list of tokens to its corresponding IDs. Caution: @@ -75,14 +75,17 @@ def convert_tokens_to_ids(tokens: List[str], ''' ans = [] for t in tokens: - assert t in mapping, f"token '{t}' does not have an ID" - ans.append(mapping[t]) + # assert t in mapping, f"token '{t}' does not have an ID" + if t in mapping: + ans.append(mapping[t]) + else: + ans.append(mapping['']) return ans def convert_lexicon_to_mappings( - filename: Union[str, Path] -) -> Tuple[Dict[str, int], Dict[str, int]]: # noqa + filename: Union[str, + Path]) -> Tuple[Dict[str, int], Dict[str, int]]: # noqa '''Generate IDs for tokens from a lexicon file. Each line in the lexicon consists of spaces separated columns. @@ -173,8 +176,8 @@ def read_lexicon(lexicon_filename: Union[Path, str]) -> Dict[str, List[str]]: return ans -def create_ragged_lexicon(lexicon: Dict[str, List[str]], - word2id: Dict[str, int], +def create_ragged_lexicon(lexicon: Dict[str, List[str]], word2id: Dict[str, + int], piece2id: Dict[str, int]) -> k2.RaggedInt: ''' Args: From d415ed0773c8d76adde59900c7daf944c50db6fd Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 20:49:56 +0800 Subject: [PATCH 11/25] remove unused file --- .../asr/nnlm/local/download_lm_train_data.py | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 egs/librispeech/asr/nnlm/local/download_lm_train_data.py diff --git a/egs/librispeech/asr/nnlm/local/download_lm_train_data.py b/egs/librispeech/asr/nnlm/local/download_lm_train_data.py deleted file mode 100644 index d9ff066a..00000000 --- a/egs/librispeech/asr/nnlm/local/download_lm_train_data.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) -# Apache 2.0 - -import os -import logging -from google_drive_downloader import GoogleDriveDownloader as gdd -from pathlib import Path - -# librispeech-lm-norm.txt is 4G -# train_960_text is 48M, which is stands for the sum of {train_clean_360, train_clean_100, train_other_500} -# here only train_960_text used to verify the whole pipeline -# A copy of train_960_text: "htts://drive.google.com/file/d/1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A/view?usp=sharing" -# local_path: "/ceph-ly/open-source/snowfall/egs/librispeech/asr/simple_v1/data/local/lm_train/train_960_text" - - -def download_librispeech_train_960_text(): - train_960_text = "./data/lm_train/librispeech_train_960_text" - if not os.path.exists(train_960_text): - 
Path(os.path.dirname(train_960_text)).mkdir(parents=True, - exist_ok=True) - - logging.info("downloading train_960_text of librispeech.") - gdd.download_file_from_google_drive( - file_id='1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A', - dest_path=train_960_text, - unzip=False) - else: - logging.info( - "train_960_text of librispeech is already downloaded. You may should check that" - ) - - -def main(): - logging.getLogger().setLevel(logging.INFO) - - download_librispeech_train_960_text() - - -if __name__ == '__main__': - main() From 4937232c65a1a2e607825887fec567497c2419a2 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Thu, 1 Apr 2021 19:26:13 +0800 Subject: [PATCH 12/25] add dependency and fix known bugs scripts to install tokenizers fix training bugs port online tokenization to offline tokenization load/save checkpoint --- egs/librispeech/asr/nnlm/local/common.py | 42 +++++++++++ egs/librispeech/asr/nnlm/local/dataset.py | 42 +++-------- .../asr/nnlm/local/generate_lexicon.py | 39 +++++++--- .../asr/nnlm/local/huggingface_tokenizer.py | 26 ++++--- egs/librispeech/asr/nnlm/local/trainer.py | 34 +++++++-- egs/librispeech/asr/nnlm/main.py | 57 +++++++------- egs/librispeech/asr/nnlm/requirements.txt | 1 + egs/librispeech/asr/nnlm/run.sh | 75 ++++++++++++++++--- 8 files changed, 222 insertions(+), 94 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/local/common.py create mode 100644 egs/librispeech/asr/nnlm/requirements.txt diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py new file mode 100644 index 00000000..f561cf1d --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/common.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# modified from https://github.com/k2-fsa/snowfall/blob/master/snowfall/common.py to save/load non-Acoustic Model +import logging +import os +import torch + +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +Pathlike = Union[str, Path] +Info = Union[dict, None] + + +def load_checkpoint(filename: Pathlike, + model: torch.nn.Module, + info: Info = None) -> Dict[str, Any]: + logging.info('load checkpoint from {}'.format(filename)) + + checkpoint = torch.load(filename, map_location='cpu') + + model.load_state_dict(checkpoint['state_dict']) + + return checkpoint + + +def save_checkpoint(filename: Pathlike, + model: torch.nn.Module, + info: Info = None) -> None: + if not os.path.exists(os.path.dirname(filename)): + Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True) + logging.info(f'Save checkpoint to {filename}') + checkpoint = { + 'state_dict': model.state_dict(), + } + if info is not None: + checkpoint.update(info) + + torch.save(checkpoint, filename) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 20fe34bd..eb17491b 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -3,10 +3,10 @@ # Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) # Apache 2.0 +import time from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pad_sequence from typing import List -from util import convert_tokens_to_ids import numpy as np import os @@ -37,35 +37,22 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, lexicon): + def __init__(self, text_file: str): '''Dataset to load Language Model train/dev text data Args: 
text_file: text file, text for one utt per line. ''' - self.lexicon = lexicon assert os.path.exists( - text_file), "text_file: {} does not exist, please check that." + text_file + ), "text_file: {} does not exist, please check that.".format(text_file) self.data = [] with open(text_file, 'r') as f: - # a line represent a piece of text, e.g. - # DELAWARE IS NOT AFRAID OF DOGS - for line in f: - # import pdb - # pdb.set_trace() - text = line.strip().lower().split() - # print(text) - if len(text) == 0: - continue - word_id = convert_tokens_to_ids(text, self.lexicon.word2id) - if len(word_id) == 0: - continue - word_id = torch.from_numpy(np.array(word_id, dtype="int32")) - - token_id = self.lexicon.word_seq_to_word_piece_seq(word_id) - # token_id format: - # token_id token_id token_id *** - if len(token_id) >= 2: + for idx, line in enumerate(f): + token_id = [int(i) for i in line.strip().split()] + # TODO(Liyong Guo): add bos_id and eos_id to each piece of example + # then each valid example should be longer than 2 + if len(token_id) > 2: self.data.append(token_id) def __len__(self): @@ -74,18 +61,9 @@ def __len__(self): def __getitem__(self, idx): return self.data[idx] - def text2id(self, text: List[str]) -> List[int]: - # A dumpy implementation - return [i for i in range(len(text))] - - def text_id2token_id(self, text_id: List[int]) -> List[int]: - # A dumpy implementation - return [i for i in range(len(text_id))] - if __name__ == '__main__': - # train_file = "./data/nnlm/text/librispeech.txt" - dev_file = "./data/nnlm/text/dev.txt" + dev_file = "./data/nnlm/text/dev.txt.tokens" dataset = LMDataset(dev_file) collate_func = CollateFunc() data_loader = DataLoader(dataset, diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index 0fa62afa..3b50ecf5 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -4,6 +4,7 @@ # Apache 2.0 import argparse +import collections from tokenizers import Tokenizer from tokenizers.models import WordPiece from tokenizers import decoders @@ -29,17 +30,41 @@ def get_args(): def generate_tokens(args): + ''' Extract symbols and there corresponding ids from a tokenizer, + and save as tokens.txt. + An example file looks like: + a 1 + b 2 + c 3 + ... + it 100 + sh 101 + + ''' + tokenizer = Tokenizer.from_file(args.tokenizer_path) symbols = tokenizer.get_vocab() tokens_file = '{}/tokens.txt'.format(args.lexicon_path) tokens_f = open(tokens_file, 'w') - for idx, sym in enumerate(symbols): - tokens_f.write('{} {}\n'.format(sym.lower(), idx)) + id2sym = dict((v, k.lower()) for k, v in symbols.items()) + for idx in range(len(symbols)): + assert idx in id2sym + tokens_f.write('{} {}\n'.format(id2sym[idx], idx)) tokens_f.close() def generate_lexicon(args, words): + ''' Tokenize every word in words.txt and save as lexicont.txt. + Each line represents a word and its tokenized representation, i.e. a sequence of tokens. a word and its tokens are seprated by a table. 
+ + An example file looks like: + + abbreviating abb ##re ##via ##ting + abbreviation abb ##re ##via ##t ##ion + abbreviations abb ##re ##via ##t ##ions + + ''' special_words = [ '', '!SIL', '', '', '', '', '#0' ] @@ -48,7 +73,8 @@ def generate_lexicon(args, words): tokenizer = Tokenizer.from_file(args.tokenizer_path) tokenizer.decoder = decoders.WordPiece() for word in words: - if word not in special_words: + if not (word.upper() in special_words or + word.lower() in special_words): output = tokenizer.encode(word) tokens = ' '.join(output.tokens) else: @@ -60,16 +86,11 @@ def generate_lexicon(args, words): def load_words(args): words = [] tokens_file = '{}/words.txt'.format(args.lexicon_path) - # special_words = [ - # '', '!SIL', '', '', '', '', '#0' - # ] - # special_words = [] with open(tokens_file) as f: for line in f: arr = line.strip().split() - # if arr[0] not in special_words: - words.append(arr[0]) + words.append(arr[0].lower()) return words diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py index 8779fb2d..5a260b9f 100644 --- a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py +++ b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py @@ -41,8 +41,8 @@ def get_args(): def train_tokenizer(train_files, save_path, vocab_size): if os.path.exists(save_path): logging.warning( - "{} already exists. Please check that.".format(save_path)) - return + "{} already exists. Backing up that.".format(save_path)) + shutil.move(save_path, '{}'.format(save_path)) else: Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True) @@ -52,34 +52,42 @@ def train_tokenizer(train_files, save_path, vocab_size): tokenizer.pre_tokenizer = Whitespace() # default vocab_size=30000 - # here set vocab_size=1000 for accelerating trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]']) tokenizer.train(train_files, trainer) tokenizer.save(save_path) def tokenize_text(test_file, tokenizer_path): + ''' + tokenize text + input format looks like: + BOY IS BETTER UNBORN THAN + BRAVE OFFICER + + + output format looks like: + 355 127 794 4824 346 370 + 1330 1898 + ''' if not os.path.exists(tokenizer_path): - logging.warning( - "Tokenizer {} does not exist. Please check that.".format( - tokenizer_path)) + logging.warning("Tokenizer {} does not exist.".format(tokenizer_path)) return tokenizer = Tokenizer.from_file(tokenizer_path) tokenizer.decoder = decoders.WordPiece() tokenized_file = "{}.tokens".format(test_file) - # tokenized_ids = "{}.ids".format(test_file) if os.path.exists(tokenized_file): logging.warning( "The input file seems already tokenized. 
Buckupping previous result" ) - shutil.copyfile(tokenized_file, "{}.bk".format(tokenized_file)) + shutil.move(tokenized_file, "{}.bk".format(tokenized_file)) logging.warning("Tokenizing {}.".format(test_file)) fout = open(tokenized_file, 'w') with open(test_file) as f: for line in f: line = line.strip() output = tokenizer.encode(line) - fout.write(" ".join(output.tokens) + '\n') + if len(output.ids) > 0: + fout.write(' '.join([str(i) for i in output.ids]) + '\n') fout.close() diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 50813b96..03b9f95b 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -7,6 +7,8 @@ import math import torch +from common import load_checkpoint, save_checkpoint + # references: # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py @@ -26,7 +28,9 @@ def __init__(self, batch_size=1, epoch=0, num_epochs=10, - log_interval=10, + clip=0.25, + log_interval=100, + model_dir="exp-nnlm/models/", writer=None): self.device = device self.model = model @@ -41,6 +45,8 @@ def __init__(self, self.iterations = 0 self.writer = writer self.log_interval = log_interval + self.clip = clip + self.model_dir = model_dir def run(self): for epoch in range(self.num_epochs): @@ -49,13 +55,17 @@ def run(self): if self.dev_data_loader is not None: self.eval() + save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, epoch), + self.model) self.epoch += 1 def train(self): self.model.train() + total_loss = 0 num_total_batch = len(self.train_data_loader) for batch_idx, batch in enumerate(self.train_data_loader): + self.optimizer.zero_grad() batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) @@ -65,18 +75,28 @@ def train(self): prediction = batch_output.view(-1, self.ntokens) target = torch.flatten(batch_target.transpose(0, 1)) loss = self.criterion(prediction, target) - self.optimizer.zero_grad() + loss.backward() + + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) self.optimizer.step() self.writer.add_scalar('train_loss', loss, self.iterations) self.iterations += 1 - if batch_idx % self.log_interval == 0: + total_loss += loss.item() + if batch_idx % self.log_interval == 0 and batch_idx > 0: + cur_loss = total_loss / self.log_interval log_str = 'TRAIN Batch {}/{} loss {:.6f} ppl {:.6f} at epoch {}'.format( - batch_idx, num_total_batch, loss.item(), - math.exp(loss.item()), self.epoch) + batch_idx, num_total_batch, cur_loss, math.exp(cur_loss), + self.epoch) logging.info(log_str) + total_loss = 0.0 + if batch_idx % 10000 == 0 and batch_idx > 0: + save_checkpoint( + "./exp/nn-lm/models/epoch_{}-batch_{}.pt".format( + self.epoch, batch_idx), self.model) + @torch.no_grad() def eval(self): self.model.eval() total_loss = 0.0 @@ -91,9 +111,9 @@ def eval(self): prediction = batch_output.view(-1, self.ntokens) target = torch.flatten(batch_target.transpose(0, 1)) loss = self.criterion(prediction, target) - total_loss += loss * self.batch_size + total_loss += loss - loss = total_loss / (num_total_batch * self.batch_size) + loss = total_loss / num_total_batch ppl = math.exp(loss) self.writer.add_scalar('dev_ppl', ppl, self.epoch) log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index a3eb8c38..358c7592 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ 
b/egs/librispeech/asr/nnlm/main.py @@ -8,17 +8,17 @@ import argparse import logging +import os import torch import torch.nn as nn import torch.optim as optim import sys sys.path.insert(0, './local/') -sys.path.insert(0, './scripts/') -from lexicon import Lexicon from dataset import LMDataset, CollateFunc from model import TransformerModel +from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter from torch.utils.data import DataLoader @@ -27,21 +27,30 @@ def get_args(): parser = argparse.ArgumentParser( description='training Neural Language Model') - parser.add_argument('--train_text', - default='data/nnlm/text/librispeech.txt', + parser.add_argument('--train_token', + default='data/nnlm/text/librispeech.txt.tokens', help='train data file') - parser.add_argument('--dev_text', - default='data/nnlm/text/dev.txt', + parser.add_argument('--dev_token', + default='data/nnlm/text/dev.txt.tokens', help='dev data file') - parser.add_argument('--batch_size', type=int, default=256) + parser.add_argument('--batch_size', type=int, default=60) parser.add_argument('--ntokens', type=int, default=10000) - parser.add_argument('--emsize', type=int, default=128) - parser.add_argument('--nhead', type=int, default=4) - parser.add_argument('--nhid', type=int, default=128) - parser.add_argument('--nlayers', type=int, default=6) + parser.add_argument('--emsize', type=int, default=200) + parser.add_argument('--nhead', type=int, default=2) + parser.add_argument('--nhid', type=int, default=200) + parser.add_argument('--nlayers', type=int, default=2) + parser.add_argument('--num_epochs', type=int, default=50) parser.add_argument('--dropout', type=int, default=0.2) + parser.add_argument('--lr', + type=float, + default=1e-2, + help='initial learning rate') + parser.add_argument('--clip', + type=float, + default=50.0, + help='gradient clipping') parser.add_argument('--model_dir', - default='./exp/', + default='./exp-nnlm/models/', help='path to save model') parser.add_argument('--tensorboard_dir', default='tensorboard', @@ -50,10 +59,6 @@ def get_args(): type=int, default=1, help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--lexicon-path', - default='data/nnlm/lexicon', - type=str, - help="path to save lexicon files") args = parser.parse_args() @@ -68,22 +73,18 @@ def main(): #Set random seed torch.manual_seed(2021) collate_func = CollateFunc() - lexicon_filename = '{}/lexicon.txt'.format(args.lexicon_path) - word2id_filename = '{}/words.txt'.format(args.lexicon_path) - piece2id_filename = '{}/tokens.txt'.format(args.lexicon_path) - lexicon = Lexicon(lexicon_filename, word2id_filename, piece2id_filename) - train_dataset = LMDataset(args.train_text, lexicon) - dev_dataset = LMDataset(args.dev_text, lexicon) + train_dataset = LMDataset(args.train_token) + dev_dataset = LMDataset(args.dev_token) train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False, - num_workers=0, + num_workers=10, collate_fn=collate_func) dev_data_loader = DataLoader(dev_dataset, - batch_size=args.batch_size, + batch_size=20, shuffle=False, num_workers=0, collate_fn=collate_func) @@ -91,12 +92,15 @@ def main(): ntokens = args.ntokens model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout) - optimizer = optim.Adam(model.parameters()) + optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) use_cuda = args.gpu >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') 
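# Why NLLLoss(ignore_index=...) below: the criterion consumes log-probabilities
# and skips target positions carrying the ignored id.  A small self-contained
# illustration with invented toy tensors and pad id 0:
import torch
import torch.nn as nn

pad_id = 0
criterion = nn.NLLLoss(ignore_index=pad_id)
log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)   # 4 positions, 10 classes
targets = torch.tensor([3, 5, pad_id, pad_id])              # last two positions are padding
loss = criterion(log_probs, targets)                        # padded positions add nothing
# Note: any real token that happens to share the ignored id is also skipped,
# which is why a dedicated pad index at the end of the vocabulary is safer.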
+ print(device) criterion = nn.NLLLoss(ignore_index=0) exp_dir = 'exp-nnlm' writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') + + Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) trainer = Trainer(device, model, criterion, @@ -106,6 +110,9 @@ def main(): ntokens=ntokens, batch_size=args.batch_size, epoch=0, + num_epochs=args.num_epochs, + clip=args.clip, + model_dir=args.model_dir, writer=writer) trainer.run() diff --git a/egs/librispeech/asr/nnlm/requirements.txt b/egs/librispeech/asr/nnlm/requirements.txt new file mode 100644 index 00000000..fb4a0dd1 --- /dev/null +++ b/egs/librispeech/asr/nnlm/requirements.txt @@ -0,0 +1 @@ +tokenizers==0.10.0 diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index c7603ac9..9953080b 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -15,37 +15,88 @@ set -e stage=$1 -lm_train=data/lm_train/ -tokenizer=$lm_train/tokenizer-librispeech.json +exp=exp-nnlm +tokenizer=$exp/tokenizer-librispeech.json text=data/local/lm/librispeech-lm-norm.txt.gz text_dir=data/nnlm/text -train_text=$text_dir/librispeech.txt +all_train_text=$text_dir/librispeech.txt +# there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. +# Now only $train_pieces data is used for debugging pipeline +train_pieces=100000 # 5 times of dev.txt +# uncomment follwoing line to use all_train_text +# train_pieces= +dev_text=$text_dir/dev.txt + + +mkdir -p $text_dir + +if [ $stage -eq -1 ]; then + # env for experiment ../simple_v1 is expected to have been built. + echo "Install extra dependencies" + pip install -r requirements.txt +fi + if [ $stage -eq 0 ]; then - mkdir -p $text_dir + # reference: + # https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 + # use the same data seperation method to kaldi whose result can be used as a baseline if [ ! -f $text ]; then - wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm fi echo -n >$text_dir/dev.txt # hold out one in every 2000 lines as dev data. - gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$train_text + gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$all_train_text fi +if [ ! -z "$train_pieces" ]; then + train_text=$text_dir/${train_pieces}_librispeech.txt + if [ $train_text -ot $all_train_text ] || [ ! -f $train_text ]; then + # if [ ! 
-f $train_text) || $train_text -ot $all_train_text ]; then + head -n $train_pieces $all_train_text > $train_text + fi +else + train_text=$all_train_text +fi -if [ $stage -eq 2 ]; then + +if [ $stage -eq 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ --train-file=$train_text \ --tokenizer-path=$tokenizer fi -if [ $stage -eq 3 ]; then - echo "generate lexicon" - python local/generate_lexicon.py + +if [ $stage -eq 2 ]; then + echo "tokenize train and dev files" + for text in $dev_text $train_text; do + python3 local/huggingface_tokenizer.py \ + --test-file=$text \ + --tokenizer-path=$tokenizer + done fi -if [ $stage -eq 5 ]; then +if [ $stage -eq 3 ]; then + echo "start to train" python main.py \ - --cuda \ + --train_token ${train_text}.tokens \ --model Transformer fi + +if [ $stage -eq 4 ]; then + # generate words.txt tokens.txt and lexicion.txt + # which is used in future rescore process + lexicon_path=./data/nnlm/lexicon + mkdir -p $lexicon_path + words_txt=../simple_v1/data/lang_nosp/words.txt + if [ -f $words_txt ]; then + cp $words_txt $lexicon_path + else + echo "please set words_txt path of your previous experiment" + echo "the NN-LM trained LM is used as a rescore module, \ + currently the same words.txt with previous experiment is prefered" + fi + echo "generate lexicon" + python local/generate_lexicon.py +fi From 61863dbabffaaf710f60c6ea403efdf43e387424 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 2 Apr 2021 23:14:49 +0800 Subject: [PATCH 13/25] fix various bugs with vocab_size=2000, epochs=50 tokens ppl of train: around 80 of dev: 119 --- egs/librispeech/asr/nnlm/local/dataset.py | 21 +++++-- egs/librispeech/asr/nnlm/local/model.py | 17 +++++- egs/librispeech/asr/nnlm/local/trainer.py | 73 +++++++++++++++++------ egs/librispeech/asr/nnlm/main.py | 60 ++++++++++++++----- egs/librispeech/asr/nnlm/run.sh | 29 ++++++--- 5 files changed, 153 insertions(+), 47 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index eb17491b..be7904d7 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -17,8 +17,9 @@ class CollateFunc(object): '''Collate function for LMDataset ''' - def __init__(self, pad_index=0): + def __init__(self, pad_index=None): # pad_index should be identical to ignore_index of torch.nn.NLLLoss + # and padding_idx in torch.nn.Embedding self.pad_index = pad_index def __call__(self, batch: List[List[int]]): @@ -27,22 +28,30 @@ def __call__(self, batch: List[List[int]]): token_id reprents a tokenized text, whose format is: token_id token_id token_id *** ''' + # data_pad: [batch_size, seq_len] + # each seq_len always different data_pad = pad_sequence( [torch.from_numpy(np.array(x)).long() for x in batch], True, self.pad_index) - xs_pad = data_pad[:, :-1] - ys_pad = data_pad[:, 1:] + data_pad = data_pad.t().contiguous() + # xs_pad, ys_pad: [max_seq_len, batch_size] + # max_seq_len is the maximum lenght in current batch + xs_pad = data_pad[:-1, :] + ys_pad = data_pad[1:, :] return xs_pad, ys_pad class LMDataset(Dataset): - def __init__(self, text_file: str): + def __init__(self, text_file: str, ntokens=None): '''Dataset to load Language Model train/dev text data Args: text_file: text file, text for one utt per line. 
''' + self.bos_id = ntokens - 3 + self.eos_id = ntokens - 2 + self.pad_index = ntokens - 1 assert os.path.exists( text_file ), "text_file: {} does not exist, please check that.".format(text_file) @@ -50,8 +59,10 @@ def __init__(self, text_file: str): with open(text_file, 'r') as f: for idx, line in enumerate(f): token_id = [int(i) for i in line.strip().split()] - # TODO(Liyong Guo): add bos_id and eos_id to each piece of example + # add bos_id and eos_id to each piece of example # then each valid example should be longer than 2 + token_id.insert(0, self.bos_id) + token_id.append(self.eos_id) if len(token_id) > 2: self.data.append(token_id) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index fcd7b8fc..bc3833dc 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -20,8 +20,7 @@ def __init__(self, super(RNNModel, self).__init__() self.ntoken = ntoken self.drop = nn.Dropout(dropout) - # import pdb; pdb.set_trace() - self.encoder = nn.Embedding(ntoken, ninp, padding_idx=0) + self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) if rnn_type in ['LSTM', 'GRU']: self.rnn = getattr(nn, rnn_type)(ninp, nhid, @@ -150,7 +149,7 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): self.pos_encoder = PositionalEncoding(ninp, dropout) encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.encoder = nn.Embedding(ntoken, ninp) + self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) self.ninp = ninp self.decoder = nn.Linear(ninp, ntoken) @@ -169,6 +168,8 @@ def init_weights(self): nn.init.uniform_(self.decoder.weight, -initrange, initrange) def forward(self, src, has_mask=True): + # src: [seq—len, batch_size] + # len(src) is seq_len if has_mask: device = src.device if self.src_mask is None or self.src_mask.size(0) != len(src): @@ -178,6 +179,16 @@ def forward(self, src, has_mask=True): else: self.src_mask = None + # mask: [seq_len, seq_len] + # looks like: + # tensor([[0., -inf, -inf, ..., -inf, -inf, -inf], + # [0., 0., -inf, ..., -inf, -inf, -inf], + # [0., 0., 0., ..., -inf, -inf, -inf], + # ..., + # [0., 0., 0., ..., 0., -inf, -inf], + # [0., 0., 0., ..., 0., 0., -inf], + # [0., 0., 0., ..., 0., 0., 0.]], device='cuda:0') + src = self.encoder(src) * math.sqrt(self.ninp) src = self.pos_encoder(src) output = self.transformer_encoder(src, self.src_mask) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 03b9f95b..e99cfb9c 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -8,13 +8,24 @@ import torch from common import load_checkpoint, save_checkpoint - +from model import TransformerModel, RNNModel # references: # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py # https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py # https://www.jianshu.com/p/c88df856dbc8 + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + class Trainer(object): def __init__(self, @@ -47,6 +58,7 @@ def __init__(self, self.log_interval = log_interval self.clip = clip 
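# Toy walk-through of how the special ids introduced above fit together
# (vocab_size here is illustrative; the real value comes from the tokenizer):
vocab_size = 5                                   # tokenizer ids 0 .. 4
ntokens = vocab_size + 3                         # 8 output classes
bos_id, eos_id, pad_index = ntokens - 3, ntokens - 2, ntokens - 1   # 5, 6, 7

line = "2 4 1"                                   # one line of a *.tokens file
token_id = [int(i) for i in line.split()]
token_id = [bos_id] + token_id + [eos_id]        # [5, 2, 4, 1, 6]
# After padding, inputs are token_id[:-1] and targets token_id[1:]; pad_index
# doubles as the Embedding padding_idx and the NLLLoss ignore_index.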
self.model_dir = model_dir + self.num_infinite_grad_norm = 0 def run(self): for epoch in range(self.num_epochs): @@ -65,20 +77,37 @@ def train(self): total_loss = 0 num_total_batch = len(self.train_data_loader) for batch_idx, batch in enumerate(self.train_data_loader): - self.optimizer.zero_grad() + # batch_input, batch_target: [max_seq_len, batch_size] + # max_seq_len is the maximum lenght in current batch batch_input, batch_target = batch + assert batch_input.shape[1] == self.batch_size + assert batch_target.shape[1] == self.batch_size batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) self.model.to(self.device) - batch_output = self.model(batch_input) - - prediction = batch_output.view(-1, self.ntokens) - target = torch.flatten(batch_target.transpose(0, 1)) + if isinstance(self.model, TransformerModel): + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + else: + # reinitiate hidden for everch batch + # as batches are independent on each other + hidden = self.model.init_hidden(batch_input.shape[1]) + prediction, _ = self.model(batch_input, hidden) + + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... + target = batch_target.view(-1) loss = self.criterion(prediction, target) loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) - self.optimizer.step() + grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), + self.clip) + if torch.isfinite(grad_norm): + self.optimizer.step() + else: + self.num_infinite_grad_norm += 1 + self.optimizer.zero_grad() self.writer.add_scalar('train_loss', loss, self.iterations) @@ -90,30 +119,40 @@ def train(self): batch_idx, num_total_batch, cur_loss, math.exp(cur_loss), self.epoch) logging.info(log_str) + logging.info('infinite grad_norm detected {} times'.format( + self.num_infinite_grad_norm)) total_loss = 0.0 if batch_idx % 10000 == 0 and batch_idx > 0: save_checkpoint( - "./exp/nn-lm/models/epoch_{}-batch_{}.pt".format( - self.epoch, batch_idx), self.model) + "{}/epoch_{}-batch_{}.pt".format(self.model_dir, + self.epoch, batch_idx), + self.model) @torch.no_grad() def eval(self): self.model.eval() total_loss = 0.0 - num_total_batch = len(self.dev_data_loader) + total_examples = 0 for batch_idx, batch in enumerate(self.dev_data_loader): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) self.model.to(self.device) - batch_output = self.model(batch_input) - - prediction = batch_output.view(-1, self.ntokens) - target = torch.flatten(batch_target.transpose(0, 1)) + if isinstance(self.model, TransformerModel): + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + else: + hidden = self.model.init_hidden(batch_input.shape[1]) + prediction, _ = self.model(batch_input, hidden) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
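# Toy illustration of the flattening order described above (values invented):
import torch
batch_target = torch.tensor([[11, 21, 31],
                             [12, 22, 32]])      # [max_seq_len=2, batch_size=3]
flat = batch_target.view(-1)                     # tensor([11, 21, 31, 12, 22, 32])
# Time step 1 of every example comes first, matching the row order produced by
# batch_output.view(-1, self.ntokens) for a [seq_len, batch, ntokens] output.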
+ target = batch_target.view(-1) loss = self.criterion(prediction, target) - total_loss += loss + total_loss += loss * batch_input.shape[1] + total_examples += batch_input.shape[1] - loss = total_loss / num_total_batch + loss = total_loss / total_examples ppl = math.exp(loss) self.writer.add_scalar('dev_ppl', ppl, self.epoch) log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 358c7592..2fb6bc6a 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -16,8 +16,9 @@ sys.path.insert(0, './local/') +from common import load_checkpoint from dataset import LMDataset, CollateFunc -from model import TransformerModel +from model import TransformerModel, RNNModel from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter @@ -34,7 +35,7 @@ def get_args(): default='data/nnlm/text/dev.txt.tokens', help='dev data file') parser.add_argument('--batch_size', type=int, default=60) - parser.add_argument('--ntokens', type=int, default=10000) + parser.add_argument('--vocab_size', type=int, default=10000) parser.add_argument('--emsize', type=int, default=200) parser.add_argument('--nhead', type=int, default=2) parser.add_argument('--nhid', type=int, default=200) @@ -59,6 +60,15 @@ def get_args(): type=int, default=1, help='gpu id for this local rank, -1 for cpu') + parser.add_argument( + '--model_iter', + type=int, + default=-1, + help='resume from trained model; if -1 training from scratch') + parser.add_argument('--model_type', + type=str, + default='Transformer', + help='model type') args = parser.parse_args() @@ -70,33 +80,53 @@ def main(): logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') - #Set random seed + # Set random seed torch.manual_seed(2021) - collate_func = CollateFunc() - - train_dataset = LMDataset(args.train_token) - dev_dataset = LMDataset(args.dev_token) - + # args.vocab_size: number of tokens in tokenizer.get_vocab + # + 2: one for eos_id, another for pad_idx + # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] + # bos_id: ntokens - 3 + # eos_id: ntokens - 2 + # pad_idx: ntokens - 1 + ntokens = args.vocab_size + 3 + pad_index = ntokens - 1 + + collate_func = CollateFunc(pad_index=pad_index) + + train_dataset = LMDataset(args.train_token, ntokens=ntokens) + dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) + + # To debug dataset.py, set shuffle=False and num_workers=0 + # then examples will be loaded as the sequence they are in {train, dev}.tokens train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, - shuffle=False, - num_workers=10, + shuffle=True, + num_workers=0, + drop_last=True, collate_fn=collate_func) dev_data_loader = DataLoader(dev_dataset, batch_size=20, shuffle=False, num_workers=0, + drop_last=True, collate_fn=collate_func) - ntokens = args.ntokens - model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout) + if 'Trasformer' == args.model_type: + model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout) + else: + model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, + args.dropout, False) + + if args.model_iter > 0: + model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) + load_checkpoint(model_path, model) optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) use_cuda = args.gpu >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') print(device) - criterion = nn.NLLLoss(ignore_index=0) + criterion = nn.NLLLoss(ignore_index=pad_index) exp_dir = 'exp-nnlm' writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') @@ -109,7 +139,7 @@ def main(): dev_data_loader=dev_data_loader, ntokens=ntokens, batch_size=args.batch_size, - epoch=0, + epoch=args.model_iter + 1, num_epochs=args.num_epochs, clip=args.clip, model_dir=args.model_dir, diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index 9953080b..dba4c443 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -28,6 +28,14 @@ train_pieces=100000 # 5 times of dev.txt # train_pieces= dev_text=$text_dir/dev.txt +# vocab_size of huggingface tokenizer +vocab_size=2000 +# for neural models, number of final classes is: +# ntokens = $vocab_size + 3 +# while: bos_id = ntokens - 3 +# eos_id = ntokens - 2 +# pad_index = ntokens - 1 + mkdir -p $text_dir @@ -59,12 +67,12 @@ else train_text=$all_train_text fi - if [ $stage -eq 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ - --train-file=$train_text \ - --tokenizer-path=$tokenizer + --train-file $train_text \ + --vocab-size $vocab_size \ + --tokenizer-path $tokenizer fi @@ -72,16 +80,21 @@ if [ $stage -eq 2 ]; then echo "tokenize train and dev files" for text in $dev_text $train_text; do python3 local/huggingface_tokenizer.py \ - --test-file=$text \ - --tokenizer-path=$tokenizer + --test-file $text \ + --tokenizer-path $tokenizer done fi if [ $stage -eq 3 ]; then echo "start to train" + # model_iter if for resume training + # -1 means train from scratch python main.py \ + --model_iter 48 \ --train_token ${train_text}.tokens \ - --model Transformer + --vocab_size $vocab_size \ + --model_type Transformer + fi if [ $stage -eq 4 ]; then @@ -98,5 +111,7 @@ if [ $stage -eq 4 ]; then currently the same words.txt with previous experiment is prefered" fi echo "generate lexicon" - python local/generate_lexicon.py + python local/generate_lexicon.py \ + 
--tokenizer-path $tokenizer + fi From d4dccae36a6db4815dc0c52b0826b8ab8f89fa30 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 2 Apr 2021 23:39:45 +0800 Subject: [PATCH 14/25] compute word_ppl from token_ppl --- egs/librispeech/asr/nnlm/compute_word_ppl.py | 145 +++++++++++++++++++ egs/librispeech/asr/nnlm/local/trainer.py | 69 +++++++++ egs/librispeech/asr/nnlm/run.sh | 29 ++-- 3 files changed, 234 insertions(+), 9 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/compute_word_ppl.py diff --git a/egs/librispeech/asr/nnlm/compute_word_ppl.py b/egs/librispeech/asr/nnlm/compute_word_ppl.py new file mode 100644 index 00000000..d7d2f47b --- /dev/null +++ b/egs/librispeech/asr/nnlm/compute_word_ppl.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# Reference: +# https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py +import argparse + +import logging +import os +import torch +import torch.nn as nn +import torch.optim as optim +import sys + +sys.path.insert(0, './local/') + +from common import load_checkpoint +from dataset import LMDataset, CollateFunc +from model import TransformerModel, RNNModel +from pathlib import Path +from trainer import Trainer +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader + + +def get_args(): + parser = argparse.ArgumentParser( + description='training Neural Language Model') + parser.add_argument('--train_token', + default='data/nnlm/text/librispeech.txt.tokens', + help='train token file') + parser.add_argument('--dev_token', + default='data/nnlm/text/dev.txt.tokens', + help='dev token file') + parser.add_argument('--dev_txt', + default='data/nnlm/text/dev.txt', + help='dev txt file, used to compute word ppl') + parser.add_argument('--batch_size', type=int, default=60) + parser.add_argument('--vocab_size', type=int, default=2000) + parser.add_argument('--emsize', type=int, default=200) + parser.add_argument('--nhead', type=int, default=2) + parser.add_argument('--nhid', type=int, default=200) + parser.add_argument('--nlayers', type=int, default=2) + parser.add_argument('--num_epochs', type=int, default=50) + parser.add_argument('--dropout', type=int, default=0.2) + parser.add_argument('--lr', + type=float, + default=1e-2, + help='initial learning rate') + parser.add_argument('--clip', + type=float, + default=50.0, + help='gradient clipping') + parser.add_argument('--model_dir', + default='./exp-nnlm/models/', + help='path to save model') + parser.add_argument('--tensorboard_dir', + default='tensorboard', + help='path to save tensorboard log') + parser.add_argument('--gpu', + type=int, + default=1, + help='gpu id for this local rank, -1 for cpu') + parser.add_argument( + '--model_iter', + type=int, + default=19, + help='resume from trained model; if -1 training from scratch') + parser.add_argument('--model_type', + type=str, + default='Transformer', + help='model type') + + args = parser.parse_args() + + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + # Set random seed + torch.manual_seed(2021) + # args.vocab_size: number of tokens in tokenizer.get_vocab + # + 2: one for eos_id, another for pad_idx + # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] + # bos_id: ntokens - 3 + # eos_id: ntokens - 2 + # pad_idx: ntokens - 1 + ntokens = args.vocab_size + 3 + pad_index = ntokens - 1 + + collate_func = CollateFunc(pad_index=pad_index) + + dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) + + dev_data_loader = DataLoader(dev_dataset, + batch_size=1, + shuffle=False, + num_workers=0, + drop_last=False, + collate_fn=collate_func) + + if 'Trasformer' == args.model_type: + model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout) + else: + model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, + args.dropout, False) + + if args.model_iter > 0: + model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) + load_checkpoint(model_path, model) + optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + print(device) + criterion = nn.NLLLoss(ignore_index=pad_index) + exp_dir = 'exp-nnlm' + writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') + + Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) + trainer = Trainer(device, + model, + criterion, + optimizer, + train_data_loader=None, + dev_data_loader=dev_data_loader, + ntokens=ntokens, + batch_size=args.batch_size, + epoch=args.model_iter + 1, + num_epochs=args.num_epochs, + clip=args.clip, + model_dir=args.model_dir, + writer=writer) + + trainer.get_word_ppl(args.dev_txt) + + +if __name__ == '__main__': + main() diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index e99cfb9c..24eb2546 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -5,6 +5,7 @@ import logging import math +import numpy as np import torch from common import load_checkpoint, save_checkpoint @@ -134,6 +135,11 @@ def eval(self): total_loss = 0.0 total_examples = 0 for batch_idx, batch in enumerate(self.dev_data_loader): + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... + # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) @@ -158,3 +164,66 @@ def eval(self): log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( loss.item(), ppl, self.epoch) logging.info(log_str) + + def get_word_counts(self, dev_txt: str): + word_counts = [] + with open(dev_txt, 'r') as f: + for line in f: + # +1: for append + word_counts.append(len(line.split()) + 1) + + return word_counts + + def compute_words_ppl(self, tokens_loss, tokens_counts, word_counts): + assert len(tokens_loss) == len(tokens_counts) + assert len(word_counts) == len(tokens_counts) + words_ppl = [ + math.exp(tokens_loss[i] * tokens_counts[i] / word_counts[i]) + for i in range(len(word_counts)) + ] + word_ppl = np.mean(words_ppl) + return word_ppl + + @torch.no_grad() + def get_word_ppl(self, dev_txt: str): + word_counts = self.get_word_counts(dev_txt) + tokens_ppl = [] + tokens_loss = [] + tokens_counts = [] + + self.model.eval() + for batch_idx, batch in enumerate(self.dev_data_loader): + if batch_idx % 1000 == 0 and batch_idx > 0: + logging.info('{}/{} computed'.format( + batch_idx, len(self.dev_data_loader))) + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... 
+ # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... + batch_input, batch_target = batch + # batch_size == 1 to get loss and ppl for each seq + assert batch_input.shape[1] == 1 + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + self.model.to(self.device) + if isinstance(self.model, TransformerModel): + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + else: + hidden = self.model.init_hidden(batch_input.shape[1]) + prediction, _ = self.model(batch_input, hidden) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... + target = batch_target.view(-1) + loss = self.criterion(prediction, target).item() + ppl = math.exp(loss) + tokens_ppl.append(ppl) + tokens_loss.append(loss) + tokens_counts.append(len(target)) + word_ppl = self.compute_words_ppl(tokens_loss, tokens_counts, + word_counts) + token_ppl = np.mean(tokens_ppl) + logging.info('token_ppl: {}, word_ppl: {}'.format(token_ppl, word_ppl)) + return word_ppl, token_ppl diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index dba4c443..e0c443a9 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -23,13 +23,13 @@ text_dir=data/nnlm/text all_train_text=$text_dir/librispeech.txt # there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. # Now only $train_pieces data is used for debugging pipeline -train_pieces=100000 # 5 times of dev.txt +train_pieces=300000 # 15 times of dev.txt # uncomment follwoing line to use all_train_text # train_pieces= dev_text=$text_dir/dev.txt # vocab_size of huggingface tokenizer -vocab_size=2000 +vocab_size=3000 # for neural models, number of final classes is: # ntokens = $vocab_size + 3 # while: bos_id = ntokens - 3 @@ -39,13 +39,13 @@ vocab_size=2000 mkdir -p $text_dir -if [ $stage -eq -1 ]; then +if [ $stage -le -1 ]; then # env for experiment ../simple_v1 is expected to have been built. 
echo "Install extra dependencies" pip install -r requirements.txt fi -if [ $stage -eq 0 ]; then +if [ $stage -le 0 ]; then # reference: # https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 # use the same data seperation method to kaldi whose result can be used as a baseline @@ -67,7 +67,7 @@ else train_text=$all_train_text fi -if [ $stage -eq 1 ]; then +if [ $stage -le 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ --train-file $train_text \ @@ -76,7 +76,7 @@ if [ $stage -eq 1 ]; then fi -if [ $stage -eq 2 ]; then +if [ $stage -le 2 ]; then echo "tokenize train and dev files" for text in $dev_text $train_text; do python3 local/huggingface_tokenizer.py \ @@ -85,19 +85,30 @@ if [ $stage -eq 2 ]; then done fi -if [ $stage -eq 3 ]; then +if [ $stage -le 3 ]; then echo "start to train" # model_iter if for resume training # -1 means train from scratch python main.py \ - --model_iter 48 \ + --model_iter -1 \ --train_token ${train_text}.tokens \ --vocab_size $vocab_size \ --model_type Transformer fi -if [ $stage -eq 4 ]; then +if [ $stage -le 4 ]; then + echo "start to train" + # model_iter if for resume training + # -1 means train from scratch + python compute_word_ppl.py \ + --model_iter 40 \ + --vocab_size $vocab_size \ + --model_type Transformer + +fi + +if [ $stage -le 5 ]; then # generate words.txt tokens.txt and lexicion.txt # which is used in future rescore process lexicon_path=./data/nnlm/lexicon From a4d5f1b2051397073a3720f8f62112afd838e247 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 3 Apr 2021 08:50:25 +0800 Subject: [PATCH 15/25] add results.md --- egs/librispeech/asr/nnlm/RESULTS.md | 11 +++++++++++ egs/librispeech/asr/nnlm/local/trainer.py | 11 ++++++++--- egs/librispeech/asr/nnlm/main.py | 4 +++- egs/librispeech/asr/nnlm/run.sh | 3 ++- 4 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/RESULTS.md diff --git a/egs/librispeech/asr/nnlm/RESULTS.md b/egs/librispeech/asr/nnlm/RESULTS.md new file mode 100644 index 00000000..1db693ed --- /dev/null +++ b/egs/librispeech/asr/nnlm/RESULTS.md @@ -0,0 +1,11 @@ +##tokens ppl train_pieces=300000 # 15 times of dev.txt + +###vocab_size=2000 + epochs=50 train/dev perplexity was 80.0 / 119.0 + +###vocab_size=3000 + dev perplexity of random initialized model is around 2998.13 + epochs=1 train/dev perplexity was around 120 / 137.67 + epochs=2 train/dev perplexity was around 113 / 132.51 + epochs=3 train/dev perplexity was around 111 / 130.09 + epochs=4 train/dev perplexity was around 109 / 130.29 diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 24eb2546..dc9b6e1e 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -62,17 +62,22 @@ def __init__(self, self.num_infinite_grad_norm = 0 def run(self): - for epoch in range(self.num_epochs): + # save and eval initialized moel + if 0 == self.epoch: + save_checkpoint("{}/epoch_0.pt".format(self.model_dir), self.model) + self.eval() + + for epoch in range(self.epoch, self.num_epochs): if self.train_data_loader is not None: self.train() + self.epoch += 1 if self.dev_data_loader is not None: self.eval() + save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, epoch), self.model) - self.epoch += 1 - def train(self): self.model.train() total_loss = 0 diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 2fb6bc6a..6fed85e6 100644 --- 
a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -101,7 +101,7 @@ def main(): train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, - num_workers=0, + num_workers=10, drop_last=True, collate_fn=collate_func) @@ -131,6 +131,7 @@ def main(): writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) + log_interval = max(100, len(train_data_loader) // 20) trainer = Trainer(device, model, criterion, @@ -142,6 +143,7 @@ def main(): epoch=args.model_iter + 1, num_epochs=args.num_epochs, clip=args.clip, + log_interval=log_interval, model_dir=args.model_dir, writer=writer) trainer.run() diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index e0c443a9..bf86de2f 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -98,7 +98,8 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - echo "start to train" + # TODO: this module is in developing + echo "compute word ppl from token ppl" # model_iter if for resume training # -1 means train from scratch python compute_word_ppl.py \ From 53e2d1e330da2cb0fef910c6e7389f97654f4640 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 3 Apr 2021 23:57:26 +0800 Subject: [PATCH 16/25] compute word_ppl from token_ppl --- egs/librispeech/asr/nnlm/local/trainer.py | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index dc9b6e1e..9520ef9f 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -179,16 +179,6 @@ def get_word_counts(self, dev_txt: str): return word_counts - def compute_words_ppl(self, tokens_loss, tokens_counts, word_counts): - assert len(tokens_loss) == len(tokens_counts) - assert len(word_counts) == len(tokens_counts) - words_ppl = [ - math.exp(tokens_loss[i] * tokens_counts[i] / word_counts[i]) - for i in range(len(word_counts)) - ] - word_ppl = np.mean(words_ppl) - return word_ppl - @torch.no_grad() def get_word_ppl(self, dev_txt: str): word_counts = self.get_word_counts(dev_txt) @@ -227,8 +217,18 @@ def get_word_ppl(self, dev_txt: str): tokens_ppl.append(ppl) tokens_loss.append(loss) tokens_counts.append(len(target)) - word_ppl = self.compute_words_ppl(tokens_loss, tokens_counts, - word_counts) - token_ppl = np.mean(tokens_ppl) + + assert len(tokens_loss) == len(tokens_counts) + assert len(word_counts) == len(tokens_counts) + sentence_log_prob = [ + tokens_loss[i] * tokens_counts[i] + for i in range(len(tokens_counts)) + ] + total_log_prob = np.sum(sentence_log_prob) + total_words = np.sum(word_counts) + total_tokens = np.sum(tokens_counts) + + word_ppl = math.exp(total_log_prob / total_words) + token_ppl = math.exp(total_log_prob / total_tokens) logging.info('token_ppl: {}, word_ppl: {}'.format(token_ppl, word_ppl)) return word_ppl, token_ppl From b226a3a94a131f4af87460f96f5bc3b3ffd792b7 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 11:44:31 +0800 Subject: [PATCH 17/25] support yaml configuration --- .../asr/nnlm/conf/lm_small_transformer.yaml | 43 +++++ .../asr/nnlm/conf/lm_transformer.yaml | 44 +++++ egs/librispeech/asr/nnlm/local/dataset.py | 9 +- egs/librispeech/asr/nnlm/local/model.py | 123 ++++--------- egs/librispeech/asr/nnlm/local/trainer.py | 28 ++- egs/librispeech/asr/nnlm/main.py | 168 ++++++++---------- egs/librispeech/asr/nnlm/run.sh | 26 ++- 7 
files changed, 229 insertions(+), 212 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml create mode 100644 egs/librispeech/asr/nnlm/conf/lm_transformer.yaml diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml new file mode 100644 index 00000000..01bd5d05 --- /dev/null +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -0,0 +1,43 @@ + +gpu: 1 +tensorboard_dir: 'exp-nnlm/tensorobard' + +# network architecture equivalent configuration to +# https://github.com/pytorch/examples/blob/master/word_language_model/main.py +model_module: transformer +transformer_conf: + embed_unit: 200 + attention_heads: 2 + nlayers: 2 + linear_units: 200 + dropout: 0.2 + +shared_conf: + ntoken: 5003 + batch_size: 30 + +optimizer_conf: + lr: 0.02 + weight_decay: 0.005 + +trainer_conf: + num_epochs: 50 + clip: 0.25 + model_dir: './exp-nnlm/models/' + + +dataset_conf: + train_token: 'data/nnlm/text/300000_librispeech.txt.tokens' + dev_token: 'data/nnlm/text/dev.txt.tokens' + +dataloader_conf: + train: + batch_size: 20 + shuffle: True + num_workers: 10 + drop_last: True + dev: + batch_size: 20 + shuffle: False + num_workers: 10 + drop_last: False diff --git a/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml new file mode 100644 index 00000000..21851ded --- /dev/null +++ b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml @@ -0,0 +1,44 @@ +# modified from: +# https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/conf/tuning/lm_transformer.yaml + +gpu: 1 +tensorboard_dir: 'exp-nnlm/tensorobard' + +# network architecture +model_module: transformer +transformer_conf: + embed_unit: 128 + attention_heads: 8 + nlayers: 16 + linear_units: 2048 + dropout: 0.2 + +shared_conf: + ntoken: 5003 + batch_size: 30 + +optimizer_conf: + lr: 0.02 + weight_decay: 0.005 + +trainer_conf: + num_epochs: 50 + clip: 0.25 + model_dir: './exp-nnlm/models/' + + +dataset_conf: + train_token: 'data/nnlm/text/librispeech.txt.tokens' + dev_token: 'data/nnlm/text/dev.txt.tokens' + +dataloader_conf: + train: + batch_size: 20 + shuffle: True + num_workers: 10 + drop_last: True + dev: + batch_size: 20 + shuffle: False + num_workers: 10 + drop_last: False diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index be7904d7..e32db6fc 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -43,15 +43,15 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, ntokens=None): + def __init__(self, text_file: str, ntoken:int): '''Dataset to load Language Model train/dev text data Args: text_file: text file, text for one utt per line. 
''' - self.bos_id = ntokens - 3 - self.eos_id = ntokens - 2 - self.pad_index = ntokens - 1 + self.bos_id = ntoken - 3 + self.eos_id = ntoken - 2 + self.pad_index = ntoken - 1 assert os.path.exists( text_file ), "text_file: {} does not exist, please check that.".format(text_file) @@ -59,6 +59,7 @@ def __init__(self, text_file: str, ntokens=None): with open(text_file, 'r') as f: for idx, line in enumerate(f): token_id = [int(i) for i in line.strip().split()] + # https://github.com/espnet/espnet/blob/master/espnet/lm/lm_utils.py#L179 # add bos_id and eos_id to each piece of example # then each valid example should be longer than 2 token_id.insert(0, self.bos_id) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index bc3833dc..e9daeb52 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -6,86 +6,6 @@ import torch.nn.functional as F -class RNNModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, - rnn_type, - ntoken, - ninp, - nhid, - nlayers, - dropout=0.5, - tie_weights=False): - super(RNNModel, self).__init__() - self.ntoken = ntoken - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) - if rnn_type in ['LSTM', 'GRU']: - self.rnn = getattr(nn, rnn_type)(ninp, - nhid, - nlayers, - dropout=dropout) - else: - try: - nonlinearity = { - 'RNN_TANH': 'tanh', - 'RNN_RELU': 'relu' - }[rnn_type] - except KeyError: - raise ValueError( - """An invalid option for `--model` was supplied, - options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" - ) - self.rnn = nn.RNN(ninp, - nhid, - nlayers, - nonlinearity=nonlinearity, - dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - # Optionally tie weights as in: - # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) - # https://arxiv.org/abs/1608.05859 - # and - # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016) - # https://arxiv.org/abs/1611.01462 - if tie_weights: - if nhid != ninp: - raise ValueError( - 'When using the tied flag, nhid must be equal to emsize') - self.decoder.weight = self.encoder.weight - - self.init_weights() - - self.rnn_type = rnn_type - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - nn.init.uniform_(self.encoder.weight, -initrange, initrange) - nn.init.zeros_(self.decoder.weight) - nn.init.uniform_(self.decoder.weight, -initrange, initrange) - - def forward(self, input, hidden): - # import pdb; pdb.set_trace() - emb = self.drop(self.encoder(input)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - decoded = decoded.view(-1, self.ntoken) - return F.log_softmax(decoded, dim=1), hidden - - def init_hidden(self, bsz): - weight = next(self.parameters()) - if self.rnn_type == 'LSTM': - return (weight.new_zeros(self.nlayers, bsz, self.nhid), - weight.new_zeros(self.nlayers, bsz, self.nhid)) - else: - return weight.new_zeros(self.nlayers, bsz, self.nhid) - - # Temporarily leave PositionalEncoding module here. Will be moved somewhere else. 
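For reference, the `PositionalEncoding` class kept below precomputes the standard sinusoidal table, PE(pos, 2i) = sin(pos / 10000^(2i / d_model)) and PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model)), and the forward pass adds it to the input by broadcasting over the batch dimension, as the new shape comments note. A small standalone shape check that mirrors that buffer construction, with toy values for `d_model` and `max_len`:

```python
import math
import torch

d_model, max_len = 8, 16                          # toy values
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)   # [max_len, 1]
div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                     (-math.log(10000.0) / d_model))                  # [d_model / 2]
pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)      # even channels
pe[:, 1::2] = torch.cos(position * div_term)      # odd channels
pe = pe.unsqueeze(0).transpose(0, 1)              # [max_len, 1, d_model]

x = torch.zeros(10, 4, d_model)                   # [seq_len, batch_size, d_model]
y = x + pe[:x.size(0), :]                         # broadcasts over the batch dimension
print(y.shape)                                    # torch.Size([10, 4, 8])
```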
class PositionalEncoding(nn.Module): r"""Inject some information about the relative or absolute position of the tokens @@ -115,6 +35,7 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) + # pe: [max_len, 1, d_model] pe = pe.unsqueeze(0).transpose(0, 1) self.register_buffer('pe', pe) @@ -129,6 +50,9 @@ def forward(self, x): >>> output = pos_encoder(x) """ + # x: [seq_len, batch_size, d_model] + # self.pe: [max_len, 1, d_model] + # add with broadcasting x = x + self.pe[:x.size(0), :] return self.dropout(x) @@ -136,7 +60,24 @@ def forward(self, x): class TransformerModel(nn.Module): """Container module with an encoder, a recurrent or transformer module, and a decoder.""" - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): + def __init__(self, + ntoken: int, + embed_unit: int, + attention_heads: int, + linear_units: int, + nlayers: int, + dropout: float = 0.5): + ''' + ntoken: usually vocab_size + 3; 1 for , 1 for , 1 for + embed_unit: the number of input channels + attention_heads: parallel attention attention_headss + linear_units: the dimension of the feedforward network model. + feedforward contains two Linear modules. + self.linear1 = Linear(d_model, dim_feedforward) + self.linear2 = Linear(dim_feedforward, d_model) + so for a torch.nn.TransformerEncoder layer, the output dimension equals to input_dimension. + + ''' super(TransformerModel, self).__init__() try: from torch.nn import TransformerEncoder, TransformerEncoderLayer @@ -144,14 +85,18 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): raise ImportError( 'TransformerEncoder module does not exist in PyTorch 1.1 or lower.' ) + attention_head_dim = embed_unit / attention_heads + assert attention_head_dim * attention_heads == embed_unit, "embed_dim must be divisible by num_attention_headss" + self.model_type = 'Transformer' self.src_mask = None - self.pos_encoder = PositionalEncoding(ninp, dropout) - encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.pos_encoder = PositionalEncoding(embed_unit, dropout) + encoder_layers = TransformerEncoderLayer(embed_unit, attention_heads, + linear_units, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) - self.ninp = ninp - self.decoder = nn.Linear(ninp, ntoken) + self.encoder = nn.Embedding(ntoken, embed_unit, padding_idx=ntoken - 1) + self.embed_unit = embed_unit + self.decoder = nn.Linear(embed_unit, ntoken) self.init_weights() @@ -168,7 +113,7 @@ def init_weights(self): nn.init.uniform_(self.decoder.weight, -initrange, initrange) def forward(self, src, has_mask=True): - # src: [seq—len, batch_size] + # src: [seq_len, batch_size] # len(src) is seq_len if has_mask: device = src.device @@ -189,7 +134,9 @@ def forward(self, src, has_mask=True): # [0., 0., 0., ..., 0., 0., -inf], # [0., 0., 0., ..., 0., 0., 0.]], device='cuda:0') - src = self.encoder(src) * math.sqrt(self.ninp) + # after self.encoder + # src: [seq_len, batch_size, channel] + src = self.encoder(src) * math.sqrt(self.embed_unit) src = self.pos_encoder(src) output = self.transformer_encoder(src, self.src_mask) output = self.decoder(output) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 9520ef9f..d25bbe10 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ 
b/egs/librispeech/asr/nnlm/local/trainer.py @@ -9,7 +9,7 @@ import torch from common import load_checkpoint, save_checkpoint -from model import TransformerModel, RNNModel +from model import TransformerModel # references: # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py @@ -36,8 +36,7 @@ def __init__(self, optimizer=None, train_data_loader=None, dev_data_loader=None, - ntokens=None, - batch_size=1, + ntoken=None, epoch=0, num_epochs=10, clip=0.25, @@ -48,8 +47,7 @@ def __init__(self, self.model = model self.criterion = criterion self.optimizer = optimizer - self.ntokens = ntokens - self.batch_size = batch_size + self.ntoken = ntoken self.epoch = epoch self.num_epochs = num_epochs self.train_data_loader = train_data_loader @@ -67,16 +65,13 @@ def run(self): save_checkpoint("{}/epoch_0.pt".format(self.model_dir), self.model) self.eval() - for epoch in range(self.epoch, self.num_epochs): + for _ in range(self.epoch, self.num_epochs): if self.train_data_loader is not None: self.train() - self.epoch += 1 if self.dev_data_loader is not None: self.eval() - save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, epoch), - self.model) def train(self): self.model.train() @@ -86,15 +81,13 @@ def train(self): # batch_input, batch_target: [max_seq_len, batch_size] # max_seq_len is the maximum lenght in current batch batch_input, batch_target = batch - assert batch_input.shape[1] == self.batch_size - assert batch_target.shape[1] == self.batch_size batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) self.model.to(self.device) if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntokens) + prediction = batch_output.view(-1, self.ntoken) else: # reinitiate hidden for everch batch # as batches are independent on each other @@ -128,12 +121,17 @@ def train(self): logging.info('infinite grad_norm detected {} times'.format( self.num_infinite_grad_norm)) total_loss = 0.0 - if batch_idx % 10000 == 0 and batch_idx > 0: save_checkpoint( "{}/epoch_{}-batch_{}.pt".format(self.model_dir, self.epoch, batch_idx), self.model) + save_checkpoint( + "{}/epoch_{}.pt".format(self.model_dir, self.epoch), + self.model) + + self.epoch += 1 + @torch.no_grad() def eval(self): self.model.eval() @@ -152,7 +150,7 @@ def eval(self): if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntokens) + prediction = batch_output.view(-1, self.ntoken) else: hidden = self.model.init_hidden(batch_input.shape[1]) prediction, _ = self.model(batch_input, hidden) @@ -205,7 +203,7 @@ def get_word_ppl(self, dev_txt: str): if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntokens) + prediction = batch_output.view(-1, self.ntoken) else: hidden = self.model.init_hidden(batch_input.shape[1]) prediction, _ = self.model(batch_input, hidden) diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 6fed85e6..710b8898 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -4,6 +4,7 @@ # Apache 2.0 # Reference: +# https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py # https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py import argparse @@ -13,139 +14,126 @@ import torch.nn as nn import torch.optim as optim import sys +import yaml sys.path.insert(0, './local/') from common 
import load_checkpoint from dataset import LMDataset, CollateFunc -from model import TransformerModel, RNNModel +from model import TransformerModel from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter from torch.utils.data import DataLoader +from typing import List, Dict def get_args(): parser = argparse.ArgumentParser( description='training Neural Language Model') - parser.add_argument('--train_token', - default='data/nnlm/text/librispeech.txt.tokens', - help='train data file') - parser.add_argument('--dev_token', - default='data/nnlm/text/dev.txt.tokens', - help='dev data file') - parser.add_argument('--batch_size', type=int, default=60) - parser.add_argument('--vocab_size', type=int, default=10000) - parser.add_argument('--emsize', type=int, default=200) - parser.add_argument('--nhead', type=int, default=2) - parser.add_argument('--nhid', type=int, default=200) - parser.add_argument('--nlayers', type=int, default=2) - parser.add_argument('--num_epochs', type=int, default=50) - parser.add_argument('--dropout', type=int, default=0.2) - parser.add_argument('--lr', - type=float, - default=1e-2, - help='initial learning rate') - parser.add_argument('--clip', - type=float, - default=50.0, - help='gradient clipping') - parser.add_argument('--model_dir', - default='./exp-nnlm/models/', - help='path to save model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='path to save tensorboard log') - parser.add_argument('--gpu', + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--vocab_size', type=int, default=3000) + parser.add_argument('--resume_model_iter', type=int, - default=1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument( - '--model_iter', - type=int, - default=-1, - help='resume from trained model; if -1 training from scratch') - parser.add_argument('--model_type', - type=str, - default='Transformer', - help='model type') + default=-1, + help='resume from trained model;') args = parser.parse_args() return args +def validate_configs(configs: Dict, required_fields: List) -> bool: + not_exist_fields = [] + for field in required_fields: + if field not in configs or configs[field] is None: + not_exist_fields.append(field) + if len(not_exist_fields) > 0: + assert False, 'set following required fields {}'.format( + ' '.join(not_exist_fields)) + return True + + +def extract_configs(args) -> Dict: + assert os.path.exists(args.config), '{} does not exist'.format(args.cofnig) + required_fields = [ + 'model_module', 'shared_conf', 'optimizer_conf', 'trainer_conf', + 'dataset_conf' + ] + with open(args.config, 'r') as f: + configs = yaml.load(f, Loader=yaml.FullLoader) + validate_configs(configs, required_fields) + + model_conf = '{}_conf'.format(configs['model_module']) + ntoken = configs['shared_conf']['ntoken'] + + configs[model_conf]['ntoken'] = ntoken + configs['trainer_conf']['ntoken'] = ntoken + + assert 'model_dir' in configs['trainer_conf'] + model_dir = configs['trainer_conf']['model_dir'] + Path(os.path.dirname(model_dir)).mkdir(parents=True, exist_ok=True) + + return configs + + def main(): args = get_args() + configs = extract_configs(args) logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') # Set random seed torch.manual_seed(2021) - # args.vocab_size: number of tokens in tokenizer.get_vocab - # + 2: one for eos_id, another for pad_idx - # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] - # bos_id: ntokens - 3 - # eos_id: ntokens - 2 - # pad_idx: ntokens - 1 - ntokens = args.vocab_size + 3 - pad_index = ntokens - 1 + ntoken = args.vocab_size + 3 + assert ntoken == configs['shared_conf']['ntoken'] + + # Data + pad_index = ntoken - 1 collate_func = CollateFunc(pad_index=pad_index) - train_dataset = LMDataset(args.train_token, ntokens=ntokens) - dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) + train_dataset = LMDataset(configs['dataset_conf']['train_token'], + ntoken=ntoken) + dev_dataset = LMDataset(configs['dataset_conf']['dev_token'], + ntoken=ntoken) - # To debug dataset.py, set shuffle=False and num_workers=0 - # then examples will be loaded as the sequence they are in {train, dev}.tokens train_data_loader = DataLoader(train_dataset, - batch_size=args.batch_size, - shuffle=True, - num_workers=10, - drop_last=True, - collate_fn=collate_func) + collate_fn=collate_func, + **configs['dataloader_conf']['train']) dev_data_loader = DataLoader(dev_dataset, - batch_size=20, - shuffle=False, - num_workers=0, - drop_last=True, - collate_fn=collate_func) - - if 'Trasformer' == args.model_type: - model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout) - else: - model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, - args.dropout, False) - - if args.model_iter > 0: - model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) + collate_fn=collate_func, + **configs['dataloader_conf']['dev']) + + # initialize or resume model + if configs['model_module'] == 'transformer': + model = TransformerModel(**configs['transformer_conf']) + + if args.resume_model_iter > 0: + model_dir = configs['trainer_conf']['model_dir'] + model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) + assert os.path.exists(model_path) load_checkpoint(model_path, model) - optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() + + optimizer = optim.AdamW(model.parameters(), **configs['optimizer_conf']) + use_cuda = configs['gpu'] >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') - print(device) criterion = nn.NLLLoss(ignore_index=pad_index) - exp_dir = 'exp-nnlm' - writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') - Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) + writer = SummaryWriter(log_dir=configs['tensorboard_dir']) + log_interval = max(100, len(train_data_loader) // 20) - trainer = Trainer(device, - model, - criterion, - optimizer, + trainer = Trainer(device=device, + model=model, + criterion=criterion, + optimizer=optimizer, train_data_loader=train_data_loader, dev_data_loader=dev_data_loader, - ntokens=ntokens, - batch_size=args.batch_size, - epoch=args.model_iter + 1, - num_epochs=args.num_epochs, - clip=args.clip, + epoch=args.resume_model_iter + 1, log_interval=log_interval, - model_dir=args.model_dir, - writer=writer) + writer=writer, + **configs['trainer_conf']) trainer.run() diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index bf86de2f..ffa82e8a 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -3,14 +3,8 @@ # Copyright 2020 Xiaomi Corporation (Author: Liyong Guo) # Apache 2.0 -# References: -# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/train_rnnlm.sh -# 
https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 -# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/prepare_rnnlm_dir.sh -# https://github.com/pytorch/examples/tree/master/word_language_model -# https://huggingface.co/docs/tokenizers/python/latest/quicktour.html -# Example of how to use HuggingFace tokenizer and train {RNN, Transformer} based LMs +# Example of how to use HuggingFace tokenizer and train Transformer based LMs set -e stage=$1 @@ -22,20 +16,22 @@ text=data/local/lm/librispeech-lm-norm.txt.gz text_dir=data/nnlm/text all_train_text=$text_dir/librispeech.txt # there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. -# Now only $train_pieces data is used for debugging pipeline +# use $train_pieces data to validate pipeline train_pieces=300000 # 15 times of dev.txt # uncomment follwoing line to use all_train_text # train_pieces= dev_text=$text_dir/dev.txt # vocab_size of huggingface tokenizer -vocab_size=3000 +vocab_size=5000 # for neural models, number of final classes is: # ntokens = $vocab_size + 3 # while: bos_id = ntokens - 3 # eos_id = ntokens - 2 # pad_index = ntokens - 1 +# lm_config=conf/lm_transformer.yaml +lm_config=conf/lm_small_transformer.yaml mkdir -p $text_dir @@ -60,13 +56,14 @@ fi if [ ! -z "$train_pieces" ]; then train_text=$text_dir/${train_pieces}_librispeech.txt if [ $train_text -ot $all_train_text ] || [ ! -f $train_text ]; then - # if [ ! -f $train_text) || $train_text -ot $all_train_text ]; then head -n $train_pieces $all_train_text > $train_text fi else train_text=$all_train_text fi +# Reference: huggingface tokenizer +# https://huggingface.co/docs/tokenizers/python/latest/quicktour.html if [ $stage -le 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ @@ -87,13 +84,12 @@ fi if [ $stage -le 3 ]; then echo "start to train" - # model_iter if for resume training + # resume_model_iter is for resume training # -1 means train from scratch python main.py \ - --model_iter -1 \ - --train_token ${train_text}.tokens \ - --vocab_size $vocab_size \ - --model_type Transformer + --config $lm_config \ + --vocab_size $vocab_size + --resume_model_iter -1 fi From 89ece61a617d4ae2b326ed8327c8182ecd7677ca Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 11:57:19 +0800 Subject: [PATCH 18/25] update results with nvocab=5000 --- egs/librispeech/asr/nnlm/RESULTS.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/egs/librispeech/asr/nnlm/RESULTS.md b/egs/librispeech/asr/nnlm/RESULTS.md index 1db693ed..0018015f 100644 --- a/egs/librispeech/asr/nnlm/RESULTS.md +++ b/egs/librispeech/asr/nnlm/RESULTS.md @@ -1,4 +1,4 @@ -##tokens ppl train_pieces=300000 # 15 times of dev.txt +##tokens ppl with train_pieces=300000 # 15 times of dev.txt ###vocab_size=2000 epochs=50 train/dev perplexity was 80.0 / 119.0 @@ -9,3 +9,11 @@ epochs=2 train/dev perplexity was around 113 / 132.51 epochs=3 train/dev perplexity was around 111 / 130.09 epochs=4 train/dev perplexity was around 109 / 130.29 + +###vocab_size=5000 + dev perplexity of random initialized model is around 6844.12 + epochs=1 train/dev perplexity was around 898 / 984.12 + epochs=2 train/dev perplexity was around 964 / 982.52 + epochs=3 train/dev perplexity was around 908 / 1020.44 + epochs=4 train/dev perplexity was around 914 / 1030.31 + epochs=4 train/dev perplexity was around 916 / 975.74 From c3f88116e0bf3c25b13b920672dc6b75b1c64cb9 Mon 
Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 17:53:13 +0800 Subject: [PATCH 19/25] fix reviews --- egs/librispeech/asr/nnlm/local/common.py | 2 +- egs/librispeech/asr/nnlm/local/dataset.py | 11 +++++----- .../asr/nnlm/local/generate_lexicon.py | 21 ++++++++++++------- egs/librispeech/asr/nnlm/local/model.py | 5 +++-- egs/librispeech/asr/nnlm/local/trainer.py | 8 +++---- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py index f561cf1d..770ae6e1 100644 --- a/egs/librispeech/asr/nnlm/local/common.py +++ b/egs/librispeech/asr/nnlm/local/common.py @@ -12,7 +12,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union Pathlike = Union[str, Path] -Info = Union[dict, None] +Info = Optional[dict] def load_checkpoint(filename: Pathlike, diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index e32db6fc..f932b197 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -43,7 +43,7 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, ntoken:int): + def __init__(self, text_file: str, ntoken: int): '''Dataset to load Language Model train/dev text data Args: @@ -57,15 +57,16 @@ def __init__(self, text_file: str, ntoken:int): ), "text_file: {} does not exist, please check that.".format(text_file) self.data = [] with open(text_file, 'r') as f: - for idx, line in enumerate(f): + for line in f: token_id = [int(i) for i in line.strip().split()] + # Empty line exists in librispeech.txt. Disregrad that. + if len(token_id) == 0: + continue # https://github.com/espnet/espnet/blob/master/espnet/lm/lm_utils.py#L179 # add bos_id and eos_id to each piece of example - # then each valid example should be longer than 2 token_id.insert(0, self.bos_id) token_id.append(self.eos_id) - if len(token_id) > 2: - self.data.append(token_id) + self.data.append(token_id) def __len__(self): return len(self.data) diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index 3b50ecf5..e542b00d 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -30,16 +30,21 @@ def get_args(): def generate_tokens(args): - ''' Extract symbols and there corresponding ids from a tokenizer, + ''' Extract symbols and the corresponding ids from a tokenizer, and save as tokens.txt. - An example file looks like: - a 1 - b 2 - c 3 + A real token.txt with nvocab=10000 is: + [unk] 0 + ' 1 + a 2 + b 3 + c 4 ... 
- it 100 - sh 101 - + patty 9994 + neatly 9995 + stormy 9996 + daddy 9997 + ##enon 9998 + remarkably 9999 ''' tokenizer = Tokenizer.from_file(args.tokenizer_path) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index e9daeb52..fd37601f 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -88,7 +88,6 @@ def __init__(self, attention_head_dim = embed_unit / attention_heads assert attention_head_dim * attention_heads == embed_unit, "embed_dim must be divisible by num_attention_headss" - self.model_type = 'Transformer' self.src_mask = None self.pos_encoder = PositionalEncoding(embed_unit, dropout) encoder_layers = TransformerEncoderLayer(embed_unit, attention_heads, @@ -135,9 +134,11 @@ def forward(self, src, has_mask=True): # [0., 0., 0., ..., 0., 0., 0.]], device='cuda:0') # after self.encoder - # src: [seq_len, batch_size, channel] + # src: [seq_len, batch_size, embed_unit] src = self.encoder(src) * math.sqrt(self.embed_unit) src = self.pos_encoder(src) + + # output: [seq_len, batch_size, ntoken] output = self.transformer_encoder(src, self.src_mask) output = self.decoder(output) return F.log_softmax(output, dim=-1) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index d25bbe10..6269db7e 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -58,6 +58,7 @@ def __init__(self, self.clip = clip self.model_dir = model_dir self.num_infinite_grad_norm = 0 + self.model.to(device) def run(self): # save and eval initialized moel @@ -72,7 +73,6 @@ def run(self): if self.dev_data_loader is not None: self.eval() - def train(self): self.model.train() total_loss = 0 @@ -83,7 +83,6 @@ def train(self): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) - self.model.to(self.device) if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) @@ -126,9 +125,8 @@ def train(self): self.epoch, batch_idx), self.model) - save_checkpoint( - "{}/epoch_{}.pt".format(self.model_dir, self.epoch), - self.model) + save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, self.epoch), + self.model) self.epoch += 1 From d1b803b24f28340e79cc2a09cde140958d795668 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 18:28:35 +0800 Subject: [PATCH 20/25] fixed reviews --- egs/librispeech/asr/nnlm/local/dataset.py | 22 ++++++++++++++----- .../asr/nnlm/local/generate_lexicon.py | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index f932b197..e4f59616 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -35,7 +35,7 @@ def __call__(self, batch: List[List[int]]): self.pad_index) data_pad = data_pad.t().contiguous() # xs_pad, ys_pad: [max_seq_len, batch_size] - # max_seq_len is the maximum lenght in current batch + # max_seq_len is the maximum length in current batch xs_pad = data_pad[:-1, :] ys_pad = data_pad[1:, :] return xs_pad, ys_pad @@ -43,20 +43,30 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, ntoken: int): + def __init__(self, token_file: str, ntoken: int): '''Dataset to load Language Model train/dev text data Args: - text_file: text file, text for one utt per line. 
+ token_file: each line is a tokenized text, looks like: + token_id token_id *** token_id token_id + + A real example is: + + 485 135 974 255 1220 33 35 377 + 2130 1960 + + when loaded, / is added to compose input/target + ''' self.bos_id = ntoken - 3 self.eos_id = ntoken - 2 self.pad_index = ntoken - 1 assert os.path.exists( - text_file - ), "text_file: {} does not exist, please check that.".format(text_file) + token_file + ), "token_file: {} does not exist, please check that.".format( + token_file) self.data = [] - with open(text_file, 'r') as f: + with open(token_file, 'r') as f: for line in f: token_id = [int(i) for i in line.strip().split()] # Empty line exists in librispeech.txt. Disregrad that. diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index e542b00d..30249453 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -51,7 +51,7 @@ def generate_tokens(args): symbols = tokenizer.get_vocab() tokens_file = '{}/tokens.txt'.format(args.lexicon_path) tokens_f = open(tokens_file, 'w') - id2sym = dict((v, k.lower()) for k, v in symbols.items()) + id2sym = {idx: sym.lower() for sym, idx in symbols.items()} for idx in range(len(symbols)): assert idx in id2sym tokens_f.write('{} {}\n'.format(id2sym[idx], idx)) From c45d31fe1a7fa2c5aa9b47966b6032fe12c02f20 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 10 Apr 2021 12:45:17 +0800 Subject: [PATCH 21/25] support multi-gpu training with ddp --- egs/librispeech/asr/nnlm/compute_word_ppl.py | 2 +- .../asr/nnlm/conf/lm_small_transformer.yaml | 7 +- .../asr/nnlm/conf/lm_transformer.yaml | 7 +- egs/librispeech/asr/nnlm/local/common.py | 2 +- egs/librispeech/asr/nnlm/local/trainer.py | 81 ++++++++++--------- egs/librispeech/asr/nnlm/main.py | 27 +++++-- egs/librispeech/asr/nnlm/run.sh | 6 +- 7 files changed, 75 insertions(+), 57 deletions(-) diff --git a/egs/librispeech/asr/nnlm/compute_word_ppl.py b/egs/librispeech/asr/nnlm/compute_word_ppl.py index d7d2f47b..89f4d899 100644 --- a/egs/librispeech/asr/nnlm/compute_word_ppl.py +++ b/egs/librispeech/asr/nnlm/compute_word_ppl.py @@ -18,7 +18,7 @@ from common import load_checkpoint from dataset import LMDataset, CollateFunc -from model import TransformerModel, RNNModel +from model import TransformerModel from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 01bd5d05..903b5302 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -14,7 +14,6 @@ transformer_conf: shared_conf: ntoken: 5003 - batch_size: 30 optimizer_conf: lr: 0.02 @@ -32,12 +31,10 @@ dataset_conf: dataloader_conf: train: - batch_size: 20 - shuffle: True + batch_size: 60 num_workers: 10 drop_last: True dev: - batch_size: 20 - shuffle: False + batch_size: 60 num_workers: 10 drop_last: False diff --git a/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml index 21851ded..eaeb28b6 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml @@ -15,7 +15,6 @@ transformer_conf: shared_conf: ntoken: 5003 - batch_size: 30 optimizer_conf: lr: 0.02 @@ -33,12 +32,10 @@ dataset_conf: dataloader_conf: train: - batch_size: 20 - shuffle: True + 
batch_size: 60 num_workers: 10 drop_last: True dev: - batch_size: 20 - shuffle: False + batch_size: 60 num_workers: 10 drop_last: False diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py index 770ae6e1..365ab964 100644 --- a/egs/librispeech/asr/nnlm/local/common.py +++ b/egs/librispeech/asr/nnlm/local/common.py @@ -34,7 +34,7 @@ def save_checkpoint(filename: Pathlike, Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True) logging.info(f'Save checkpoint to {filename}') checkpoint = { - 'state_dict': model.state_dict(), + 'state_dict': model.module.state_dict(), } if info is not None: checkpoint.update(info) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 6269db7e..f03bb1f5 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -7,6 +7,7 @@ import math import numpy as np import torch +import torch.distributed as dist from common import load_checkpoint, save_checkpoint from model import TransformerModel @@ -44,7 +45,6 @@ def __init__(self, model_dir="exp-nnlm/models/", writer=None): self.device = device - self.model = model self.criterion = criterion self.optimizer = optimizer self.ntoken = ntoken @@ -58,12 +58,16 @@ def __init__(self, self.clip = clip self.model_dir = model_dir self.num_infinite_grad_norm = 0 - self.model.to(device) + self.model = model + self.world_size = dist.get_world_size() + self.local_rank = dist.get_rank() def run(self): # save and eval initialized moel if 0 == self.epoch: - save_checkpoint("{}/epoch_0.pt".format(self.model_dir), self.model) + if self.local_rank == 0: + save_checkpoint("{}/epoch_0.pt".format(self.model_dir), + self.model) self.eval() for _ in range(self.epoch, self.num_epochs): @@ -83,15 +87,8 @@ def train(self): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) - if isinstance(self.model, TransformerModel): - batch_output = self.model(batch_input) - - prediction = batch_output.view(-1, self.ntoken) - else: - # reinitiate hidden for everch batch - # as batches are independent on each other - hidden = self.model.init_hidden(batch_input.shape[1]) - prediction, _ = self.model(batch_input, hidden) + batch_output = self.model(batch_input) + prediction = batch_output.view(-1, self.ntoken) # target: [max_seq_len * batch_size] # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
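The trainer hunks that follow save checkpoints only on rank 0 (note the switch to `model.module.state_dict()` in `common.py` above, which unwraps the DDP wrapper) and combine the per-rank dev statistics with `dist.all_gather` before computing perplexity. A minimal single-process sketch of that aggregation pattern; the gloo backend, the address/port settings, and the numbers are assumptions for illustration only:

```python
import os
import torch
import torch.distributed as dist

# a single-process group, just to exercise the collective API
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

world_size = dist.get_world_size()
total_loss = torch.tensor([123.4])       # per-rank sum of loss * num_examples (made up)
total_examples = torch.tensor([32.0])    # per-rank number of dev examples (made up)

loss_list = [torch.zeros_like(total_loss) for _ in range(world_size)]
examples_list = [torch.zeros_like(total_examples) for _ in range(world_size)]
dist.all_gather(loss_list, total_loss)
dist.all_gather(examples_list, total_examples)

global_loss = sum(t.item() for t in loss_list)
global_examples = sum(t.item() for t in examples_list)
print(global_loss / global_examples)     # length-weighted dev loss over all ranks
dist.destroy_process_group()
```

A `dist.all_reduce` with `op=dist.ReduceOp.SUM` on each tensor would produce the same totals with less bookkeeping; the sketch keeps the `all_gather` form to match the code in the patch.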
@@ -117,24 +114,26 @@ def train(self): batch_idx, num_total_batch, cur_loss, math.exp(cur_loss), self.epoch) logging.info(log_str) - logging.info('infinite grad_norm detected {} times'.format( - self.num_infinite_grad_norm)) + if self.num_infinite_grad_norm > 0: + logging.info('infinite grad_norm detected {} times'.format( + self.num_infinite_grad_norm)) total_loss = 0.0 - save_checkpoint( - "{}/epoch_{}-batch_{}.pt".format(self.model_dir, - self.epoch, batch_idx), - self.model) - - save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, self.epoch), - self.model) + if self.local_rank == 0: + save_checkpoint( + "{}/epoch_{}-batch_{}.pt".format( + self.model_dir, self.epoch, batch_idx), self.model) self.epoch += 1 + if self.local_rank == 0: + save_checkpoint( + "{}/epoch_{}.pt".format(self.model_dir, self.epoch), + self.model) @torch.no_grad() def eval(self): self.model.eval() - total_loss = 0.0 - total_examples = 0 + total_loss = torch.tensor([0.0]).to(self.device) + total_examples = torch.tensor([0.0]).to(self.device) for batch_idx, batch in enumerate(self.dev_data_loader): # batch_input: [seq_len, batch_size] # with contents: token_id token_id .... @@ -144,14 +143,9 @@ def eval(self): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) - self.model.to(self.device) - if isinstance(self.model, TransformerModel): - batch_output = self.model(batch_input) + batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntoken) - else: - hidden = self.model.init_hidden(batch_input.shape[1]) - prediction, _ = self.model(batch_input, hidden) + prediction = batch_output.view(-1, self.ntoken) # target: [max_seq_len * batch_size] # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
target = batch_target.view(-1) @@ -159,12 +153,27 @@ def eval(self): total_loss += loss * batch_input.shape[1] total_examples += batch_input.shape[1] - loss = total_loss / total_examples - ppl = math.exp(loss) - self.writer.add_scalar('dev_ppl', ppl, self.epoch) - log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( - loss.item(), ppl, self.epoch) - logging.info(log_str) + total_loss_list = [ + torch.zeros_like(total_loss) for _ in range(self.world_size) + ] + total_examples_list = [ + torch.zeros_like(total_examples) for _ in range(self.world_size) + ] + dist.all_gather(total_loss_list, total_loss) + dist.all_gather(total_examples_list, total_examples) + total_loss = 0 + total_examples = 0 + for loss, examples in zip(total_loss_list, total_examples_list): + total_loss += loss + total_examples += examples + + if self.local_rank == 0: + loss = total_loss / total_examples + ppl = math.exp(loss) + self.writer.add_scalar('dev_ppl', ppl, self.epoch) + log_str = 'dev examples: {} dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( + int(total_examples.item()), loss.item(), ppl, self.epoch) + logging.info(log_str) def get_word_counts(self, dev_txt: str): word_counts = [] diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 710b8898..7f7b9ae8 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -11,6 +11,7 @@ import logging import os import torch +import torch.distributed as dist import torch.nn as nn import torch.optim as optim import sys @@ -33,6 +34,7 @@ def get_args(): description='training Neural Language Model') parser.add_argument('--config', required=True, help='config file') parser.add_argument('--vocab_size', type=int, default=3000) + parser.add_argument('--local_rank', type=int, default=0) parser.add_argument('--resume_model_iter', type=int, default=-1, @@ -89,6 +91,11 @@ def main(): ntoken = args.vocab_size + 3 assert ntoken == configs['shared_conf']['ntoken'] + dist.init_process_group('nccl') + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + print(device) + # Data pad_index = ntoken - 1 collate_func = CollateFunc(pad_index=pad_index) @@ -98,27 +105,33 @@ def main(): dev_dataset = LMDataset(configs['dataset_conf']['dev_token'], ntoken=ntoken) + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, shuffle=True) train_data_loader = DataLoader(train_dataset, + sampler=train_sampler, collate_fn=collate_func, **configs['dataloader_conf']['train']) + dev_sampler = torch.utils.data.distributed.DistributedSampler( + dev_dataset, shuffle=False) dev_data_loader = DataLoader(dev_dataset, + sampler=dev_sampler, collate_fn=collate_func, **configs['dataloader_conf']['dev']) # initialize or resume model if configs['model_module'] == 'transformer': model = TransformerModel(**configs['transformer_conf']) + if args.resume_model_iter > 0: + model_dir = configs['trainer_conf']['model_dir'] + model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) + assert os.path.exists(model_path) + load_checkpoint(model_path, model) + model = torch.nn.parallel.DistributedDataParallel( + model.to(device), [args.local_rank]) - if args.resume_model_iter > 0: - model_dir = configs['trainer_conf']['model_dir'] - model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) - assert os.path.exists(model_path) - load_checkpoint(model_path, model) optimizer = optim.AdamW(model.parameters(), **configs['optimizer_conf']) - use_cuda = configs['gpu'] >= 
0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') criterion = nn.NLLLoss(ignore_index=pad_index) writer = SummaryWriter(log_dir=configs['tensorboard_dir']) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index ffa82e8a..b9cdaec5 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -86,9 +86,11 @@ if [ $stage -le 3 ]; then echo "start to train" # resume_model_iter is for resume training # -1 means train from scratch - python main.py \ + # python main.py \ + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python -m torch.distributed.launch --nproc_per_node=4 main.py \ --config $lm_config \ - --vocab_size $vocab_size + --vocab_size $vocab_size \ --resume_model_iter -1 fi From 1d38c218042fbdbfd40c9a7fd1036b5cb0396c71 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Wed, 14 Apr 2021 21:48:02 +0800 Subject: [PATCH 22/25] n-best rescoring result with 8-layer transformer lm --- egs/librispeech/asr/nnlm/RESULTS.md | 19 -- egs/librispeech/asr/nnlm/compute_word_ppl.py | 131 ++------ .../asr/nnlm/conf/lm_small_transformer.yaml | 26 +- egs/librispeech/asr/nnlm/local/evaluator.py | 246 ++++++++++++++ egs/librispeech/asr/nnlm/local/model.py | 3 + egs/librispeech/asr/nnlm/run.sh | 15 +- .../simple_v1/mmi_att_transformer_decode.py | 255 ++++++++++----- snowfall/decoding/lm_rescore.py | 306 ++++++++++++++++++ 8 files changed, 785 insertions(+), 216 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/local/evaluator.py create mode 100644 snowfall/decoding/lm_rescore.py diff --git a/egs/librispeech/asr/nnlm/RESULTS.md b/egs/librispeech/asr/nnlm/RESULTS.md index 0018015f..e69de29b 100644 --- a/egs/librispeech/asr/nnlm/RESULTS.md +++ b/egs/librispeech/asr/nnlm/RESULTS.md @@ -1,19 +0,0 @@ -##tokens ppl with train_pieces=300000 # 15 times of dev.txt - -###vocab_size=2000 - epochs=50 train/dev perplexity was 80.0 / 119.0 - -###vocab_size=3000 - dev perplexity of random initialized model is around 2998.13 - epochs=1 train/dev perplexity was around 120 / 137.67 - epochs=2 train/dev perplexity was around 113 / 132.51 - epochs=3 train/dev perplexity was around 111 / 130.09 - epochs=4 train/dev perplexity was around 109 / 130.29 - -###vocab_size=5000 - dev perplexity of random initialized model is around 6844.12 - epochs=1 train/dev perplexity was around 898 / 984.12 - epochs=2 train/dev perplexity was around 964 / 982.52 - epochs=3 train/dev perplexity was around 908 / 1020.44 - epochs=4 train/dev perplexity was around 914 / 1030.31 - epochs=4 train/dev perplexity was around 916 / 975.74 diff --git a/egs/librispeech/asr/nnlm/compute_word_ppl.py b/egs/librispeech/asr/nnlm/compute_word_ppl.py index 89f4d899..5453395a 100644 --- a/egs/librispeech/asr/nnlm/compute_word_ppl.py +++ b/egs/librispeech/asr/nnlm/compute_word_ppl.py @@ -4,74 +4,45 @@ # Apache 2.0 # Reference: +# https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py # https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py import argparse import logging import os import torch +import torch.distributed as dist import torch.nn as nn import torch.optim as optim import sys +import yaml sys.path.insert(0, './local/') from common import load_checkpoint -from dataset import LMDataset, CollateFunc -from model import TransformerModel +from evaluator import Evaluator +# from model import TransformerModel from pathlib import Path -from trainer import Trainer -from torch.utils.tensorboard import SummaryWriter -from torch.utils.data import DataLoader 
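# How "word ppl from token ppl" is obtained in this script: the LM is trained
# on subword tokens, so the accumulated negative log-likelihood of a text is
# normalized either by the token count or by the word count, i.e.
#   token_ppl = exp(total_loss / token_count)
#   word_ppl  = exp(total_loss / word_count)
# Since every word maps to at least one token (token_count >= word_count),
# word-level perplexity is always >= token-level perplexity.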
+from typing import List, Dict def get_args(): parser = argparse.ArgumentParser( - description='training Neural Language Model') - parser.add_argument('--train_token', - default='data/nnlm/text/librispeech.txt.tokens', - help='train token file') - parser.add_argument('--dev_token', - default='data/nnlm/text/dev.txt.tokens', - help='dev token file') - parser.add_argument('--dev_txt', - default='data/nnlm/text/dev.txt', - help='dev txt file, used to compute word ppl') - parser.add_argument('--batch_size', type=int, default=60) - parser.add_argument('--vocab_size', type=int, default=2000) - parser.add_argument('--emsize', type=int, default=200) - parser.add_argument('--nhead', type=int, default=2) - parser.add_argument('--nhid', type=int, default=200) - parser.add_argument('--nlayers', type=int, default=2) - parser.add_argument('--num_epochs', type=int, default=50) - parser.add_argument('--dropout', type=int, default=0.2) - parser.add_argument('--lr', - type=float, - default=1e-2, - help='initial learning rate') - parser.add_argument('--clip', - type=float, - default=50.0, - help='gradient clipping') - parser.add_argument('--model_dir', - default='./exp-nnlm/models/', - help='path to save model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='path to save tensorboard log') - parser.add_argument('--gpu', - type=int, - default=1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument( - '--model_iter', - type=int, - default=19, - help='resume from trained model; if -1 training from scratch') - parser.add_argument('--model_type', + description='compute token/word ppl of txt') + parser.add_argument('--config', + help='config file', + default='conf/lm_small_transformer.yaml') + parser.add_argument('--vocab_size', type=int, default=5000) + parser.add_argument('--model', type=str, - default='Transformer', - help='model type') + default='exp-nnlm/models/epoch_30.pt', + help='full path of loaded model') + parser.add_argument('--tokenizer_path', + type=str, + default='exp-nnlm/tokenizer-librispeech.json') + parser.add_argument('--txt_file', + type=str, + default='data/nnlm/text/dev.txt') args = parser.parse_args() @@ -85,60 +56,16 @@ def main(): # Set random seed torch.manual_seed(2021) - # args.vocab_size: number of tokens in tokenizer.get_vocab - # + 2: one for eos_id, another for pad_idx - # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] - # bos_id: ntokens - 3 - # eos_id: ntokens - 2 - # pad_idx: ntokens - 1 - ntokens = args.vocab_size + 3 - pad_index = ntokens - 1 - - collate_func = CollateFunc(pad_index=pad_index) - - dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) - - dev_data_loader = DataLoader(dev_dataset, - batch_size=1, - shuffle=False, - num_workers=0, - drop_last=False, - collate_fn=collate_func) - - if 'Trasformer' == args.model_type: - model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout) - else: - model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, - args.dropout, False) - - if args.model_iter > 0: - model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) - load_checkpoint(model_path, model) - optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') + + # device = torch.device("cuda", args.local_rank) + device = torch.device('cpu') print(device) - criterion = nn.NLLLoss(ignore_index=pad_index) - exp_dir = 'exp-nnlm' - writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') - - Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) - trainer = Trainer(device, - model, - criterion, - optimizer, - train_data_loader=None, - dev_data_loader=dev_data_loader, - ntokens=ntokens, - batch_size=args.batch_size, - epoch=args.model_iter + 1, - num_epochs=args.num_epochs, - clip=args.clip, - model_dir=args.model_dir, - writer=writer) - - trainer.get_word_ppl(args.dev_txt) + + evaluator = Evaluator(device=device, + model_path=args.model, + config_file=args.config, + tokenizer_path=args.tokenizer_path) + evaluator.compute_ppl(txt_file=args.txt_file) if __name__ == '__main__': diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 903b5302..4ee16290 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -7,34 +7,38 @@ tensorboard_dir: 'exp-nnlm/tensorobard' model_module: transformer transformer_conf: embed_unit: 200 - attention_heads: 2 - nlayers: 2 - linear_units: 200 + attention_heads: 8 + nlayers: 8 + linear_units: 2048 dropout: 0.2 shared_conf: ntoken: 5003 optimizer_conf: - lr: 0.02 - weight_decay: 0.005 + # for Adam + lr: 0.0003 + weight_decay: 0.001 + # for SGD + # lr: 0.01 + # weight_decay: 0.001 trainer_conf: - num_epochs: 50 + num_epochs: 60 clip: 0.25 model_dir: './exp-nnlm/models/' dataset_conf: - train_token: 'data/nnlm/text/300000_librispeech.txt.tokens' + train_token: 'data/nnlm/text/librispeech.txt.tokens' dev_token: 'data/nnlm/text/dev.txt.tokens' dataloader_conf: train: - batch_size: 60 - num_workers: 10 + batch_size: 256 + num_workers: 0 drop_last: True dev: - batch_size: 60 - num_workers: 10 + batch_size: 20 + num_workers: 0 drop_last: False diff --git a/egs/librispeech/asr/nnlm/local/evaluator.py b/egs/librispeech/asr/nnlm/local/evaluator.py new file mode 100644 index 00000000..7eff021d --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/evaluator.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import logging +import os +import yaml +import math +import numpy as np +import torch +import torch.distributed as dist +from torch.nn.utils.rnn import pad_sequence +import torch.nn as nn 
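# Sketch of the id layout and batching convention the Evaluator below relies
# on: with ntoken = vocab_size + 3, the last three ids are reserved for
# bos (ntoken - 3), eos (ntoken - 2) and padding (ntoken - 1); padded batches
# are transposed to [max_seq_len, batch_size], and input/target are the same
# tensor shifted by one step for next-token prediction. Standalone,
# illustrative example only (the toy token ids are made up):
import torch
from torch.nn.utils.rnn import pad_sequence


def example_shifted_batch():
    vocab_size = 5000
    ntoken = vocab_size + 3
    bos_id, eos_id, pad_id = ntoken - 3, ntoken - 2, ntoken - 1
    seqs = [[7, 8, 9], [3, 4]]  # toy tokenizer output for two sentences
    seqs = [torch.tensor([bos_id] + s + [eos_id]) for s in seqs]
    data = pad_sequence(seqs, batch_first=True, padding_value=pad_id)
    data = data.t().contiguous()        # [batch, len] -> [max_seq_len, batch]
    xs, ys = data[:-1, :], data[1:, :]  # predict token t+1 from tokens <= t
    return xs, ys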
+ +from model import TransformerModel + +from tokenizers import Tokenizer +from tokenizers.models import WordPiece +from tokenizers import decoders +from common import load_checkpoint +from model import TransformerModel +from typing import Dict, List + +import k2 + + +def word_seqs_to_list_str(word_seqs: k2.RaggedInt, + symbol_table: k2.SymbolTable) -> List[str]: + ''' + Args: + word_seqs:[path][word] + ''' + word_ids = word_seqs.values() + words = [symbol_table.get(word_idx.item()) for word_idx in word_ids] + ragged_shape = word_seqs.row_splits(1) + sentences = [] + for idx, start_idx in enumerate(ragged_shape[:-1]): + sentences.append(' '.join(words[start_idx:ragged_shape[idx + 1]])) + return sentences + + +def validate_configs(configs: Dict, required_fields: List) -> bool: + not_exist_fields = [] + for field in required_fields: + if field not in configs or configs[field] is None: + not_exist_fields.append(field) + if len(not_exist_fields) > 0: + assert False, 'set following required fields {}'.format( + ' '.join(not_exist_fields)) + return True + + +def extract_configs(config_file) -> Dict: + assert os.path.exists(config_file), '{} does not exist'.format(cofnig_file) + required_fields = [ + 'model_module', + 'shared_conf', + ] + with open(config_file, 'r') as f: + configs = yaml.load(f, Loader=yaml.FullLoader) + validate_configs(configs, required_fields) + + model_conf = '{}_conf'.format(configs['model_module']) + ntoken = configs['shared_conf']['ntoken'] + + assert 'model_dir' in configs['trainer_conf'] + configs[model_conf]['ntoken'] = ntoken + + return configs + + +class Evaluator(object): + + def __init__(self, + device, + model_path, + config_file=None, + tokenizer_path=None, + words_txt=None, + batch_size=1): + self.device = device + configs = extract_configs(config_file) + if configs['model_module'] == 'transformer': + model = TransformerModel(**configs['transformer_conf']) + if model_path is not None: + assert os.path.exists(model_path) + load_checkpoint(model_path, model) + self.model = model + self.ntoken = model.ntoken + self.batch_size = batch_size + self.word_count = 0 + self.token_count = 0 + self.total_examples = 0 + self.model.to(self.device) + self.model.eval() + + self.tokenizer = Tokenizer.from_file(tokenizer_path) + self.tokenizer.decoder = decoders.WordPiece() + self.bos_id = self.ntoken - 3 + self.eos_id = self.ntoken - 2 + self.pad_index = self.ntoken - 1 + if words_txt is not None: + self.symbol_table = k2.SymbolTable.from_file(words_txt) + + self.criterion = nn.NLLLoss(ignore_index=self.pad_index, + reduction='mean') + + def set_criterion(self, doing_rescore: bool): + if doing_rescore: + self.criterion = nn.NLLLoss(ignore_index=self.pad_index, + reduction='sum') + else: + self.criterion = nn.NLLLoss(ignore_index=self.pad_index, + reduction='mean') + def reset_count_variables(self): + self.word_count = 0 + self.token_count = 0 + self.total_examples = 0 + + def batchify(self, txt_f): + batch = [] + + for line in txt_f: + self.total_examples += 1 + line = line.strip().lower() + + token_id = self.tokenizer.encode(line).ids + # +1 for + self.word_count += len(line.split()) + 1 + # +1 for + self.token_count += len(token_id) + 1 + token_id.insert(0, self.bos_id) + token_id.append(self.eos_id) + batch.append(token_id) + if len(batch) == self.batch_size: + # data_pad: [batch_size, seq_len] + # each seq_len always different + data_pad = pad_sequence( + [torch.from_numpy(np.array(x)).long() for x in batch], + True, self.pad_index) + data_pad = data_pad.t().contiguous() + # 
xs_pad, ys_pad: [max_seq_len, batch_size] + # max_seq_len is the maximum length in current batch + xs_pad = data_pad[:-1, :] + ys_pad = data_pad[1:, :] + yield xs_pad, ys_pad + batch = [] + + @torch.no_grad() + def compute_ppl(self, txt_file: str): + self.set_criterion(doing_rescore=False) + # total_loss = torch.tensor([0.0]).to(self.device) + # total_examples = torch.tensor([0.0]).to(self.device) + # for batch_idx, batch in enumerate(self.dev_data_loader): + total_loss = 0.0 + txt_f = open(txt_file, 'r') + for batch_input, batch_target in self.batchify(txt_f): + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... + # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntoken) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... + target = batch_target.view(-1) + loss = self.criterion(prediction, target) + total_loss += loss * batch_input.shape[0] + + loss = total_loss / self.token_count + token_ppl = math.exp(total_loss / self.token_count) + word_ppl = math.exp(total_loss / self.word_count) + log_str = 'dev examples: {} dev loss is {:.6f} and token_ppl {:.6f} word_ppl {}'.format( + int(self.total_examples), loss.item(), token_ppl, word_ppl) + logging.info(log_str) + txt_f.close() + self.reset_count_variables() + + def batchify_sentences(self, sentences: List[str]): + batch = [] + for line in sentences: + self.total_examples += 1 + token_id = self.tokenizer.encode(line).ids + # print('token_id: ', token_id) + # +1 for + self.word_count += len(line.split()) + 1 + # +1 for + self.token_count += len(token_id) + 1 + + token_id.insert(0, self.bos_id) + token_id.append(self.eos_id) + batch.append(token_id) + if len(batch) == self.batch_size: + # data_pad: [batch_size, seq_len] + # each seq_len always different + data_pad = pad_sequence( + [torch.from_numpy(np.array(x)).long() for x in batch], + True, self.pad_index) + data_pad = data_pad.t().contiguous() + # xs_pad, ys_pad: [max_seq_len, batch_size] + # max_seq_len is the maximum length in current batch + xs_pad = data_pad[:-1, :] + ys_pad = data_pad[1:, :] + yield xs_pad, ys_pad + batch = [] + + @torch.no_grad() + def score_sentences(self, sentences: List[str]) -> torch.tensor: + ''' + Args: + sentences: each element is a sentence, words seperated by whitespace + ''' + total_loss = 0.0 + average_negative_logp = [] + for batch_input, batch_target in self.batchify_sentences(sentences): + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... + # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntoken) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
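            # Note: when called from score_word_seqs() the criterion uses
            # reduction='sum', so with the default batch_size of 1 the loss
            # below is the total negative log-likelihood of one sentence
            # (padding ignored); the returned values are later negated in
            # lm_rescore.py and used as LM scores for n-best rescoring.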
+ target = batch_target.view(-1) + loss = self.criterion(prediction, target) + average_negative_logp.append(loss.item()) + self.reset_count_variables() + return torch.tensor(average_negative_logp).to(self.device) + + @torch.no_grad() + def score_word_seqs(self, word_seqs: k2.RaggedInt, doing_rescore:bool = True) -> torch.tensor: + ''' + used when rescoring + ''' + self.set_criterion(doing_rescore=True) + sentences = word_seqs_to_list_str(word_seqs, self.symbol_table) + return self.score_sentences(sentences) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index fd37601f..71c2feb5 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -99,6 +99,9 @@ def __init__(self, self.init_weights() + # used by evaluator + self.ntoken = ntoken + def _generate_square_subsequent_mask(self, sz): mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill( diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index b9cdaec5..343ef473 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -17,9 +17,9 @@ text_dir=data/nnlm/text all_train_text=$text_dir/librispeech.txt # there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. # use $train_pieces data to validate pipeline -train_pieces=300000 # 15 times of dev.txt +# train_pieces=300000 # 15 times of dev.txt # uncomment follwoing line to use all_train_text -# train_pieces= +train_pieces= dev_text=$text_dir/dev.txt # vocab_size of huggingface tokenizer @@ -88,6 +88,7 @@ if [ $stage -le 3 ]; then # -1 means train from scratch # python main.py \ export CUDA_VISIBLE_DEVICES=0,1,2,3 + # python -m torch.distributed.launch --nproc_per_node=4 test.py \ python -m torch.distributed.launch --nproc_per_node=4 main.py \ --config $lm_config \ --vocab_size $vocab_size \ @@ -96,14 +97,8 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - # TODO: this module is in developing echo "compute word ppl from token ppl" - # model_iter if for resume training - # -1 means train from scratch - python compute_word_ppl.py \ - --model_iter 40 \ - --vocab_size $vocab_size \ - --model_type Transformer + python compute_word_ppl.py fi @@ -125,3 +120,5 @@ if [ $stage -le 5 ]; then --tokenizer-path $tokenizer fi + +# cut -f 2- -d" " /home/storage15/huangying/tools/espnet/egs/librispeech/asr1/data/dev/text > data/dev/text diff --git a/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py b/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py index cfe25c9e..879a9fe7 100755 --- a/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py +++ b/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py @@ -11,19 +11,20 @@ import os import torch from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Union -from lhotse import CutSet, load_manifest -from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from snowfall.common import average_checkpoint, store_transcripts from snowfall.common import find_first_disambig_symbol from snowfall.common import get_texts +from snowfall.common import write_error_stats from snowfall.common import load_checkpoint from snowfall.common import setup_logger +from snowfall.common import str2bool +from snowfall.data import LibriSpeechAsrDataModule from snowfall.decoding.graph 
import compile_HLG +from snowfall.decoding.lm_rescore import decode_with_lm_rescoring from snowfall.models import AcousticModel from snowfall.models.transformer import Transformer from snowfall.models.conformer import Conformer @@ -31,9 +32,18 @@ from snowfall.training.mmi_graph import create_bigram_phone_lm from snowfall.training.mmi_graph import get_phone_symbols +from evaluator import Evaluator -def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, - device: Union[str, torch.device], HLG: Fsa, symbols: SymbolTable): + +def decode(dataloader: torch.utils.data.DataLoader, + model: AcousticModel, + device: Union[str, torch.device], + HLG: Fsa, + symbols: SymbolTable, + num_paths: int, + G: k2.Fsa, + use_whole_lattice: bool, + evaluator=None): tot_num_cuts = len(dataloader.dataset.cuts) num_cuts = 0 results = [] # a list of pair (ref_words, hyp_words) @@ -43,7 +53,8 @@ def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, supervision_segments = torch.stack( (supervisions['sequence_idx'], (((supervisions['start_frame'] - 1) // 2 - 1) // 2), - (((supervisions['num_frames'] - 1) // 2 - 1) // 2)), 1).to(torch.int32) + (((supervisions['num_frames'] - 1) // 2 - 1) // 2)), + 1).to(torch.int32) supervision_segments = torch.clamp(supervision_segments, min=0) indices = torch.argsort(supervision_segments[:, 2], descending=True) supervision_segments = supervision_segments[indices] @@ -71,8 +82,22 @@ def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, lattices = k2.intersect_dense_pruned(HLG, dense_fsa_vec, 20.0, 7.0, 30, 10000) - # lattices = k2.intersect_dense(HLG, dense_fsa_vec, 10.0) - best_paths = k2.shortest_path(lattices, use_double_scores=True) + if G is None: + best_paths = k2.shortest_path(lattices, use_double_scores=True) + elif evaluator is not None: + best_paths = decode_with_lm_rescoring( + lattices, + evaluator=evaluator, + G=None, + num_paths=num_paths, + use_whole_lattice=use_whole_lattice) + else: + best_paths = decode_with_lm_rescoring( + lattices, + G, + num_paths=num_paths, + use_whole_lattice=use_whole_lattice) + assert best_paths.shape[0] == len(texts) hyps = get_texts(best_paths, indices) assert len(hyps) == len(texts) @@ -157,56 +182,82 @@ def print_transition_probabilities(P: k2.Fsa, phone_symbol_table: SymbolTable, def get_parser(): parser = argparse.ArgumentParser() - parser.add_argument( - '--model-type', - type=str, - default="conformer", - choices=["transformer", "conformer"], - help="Model type.") - parser.add_argument( - '--epoch', - type=int, - default=10, - help="Decoding epoch.") - parser.add_argument( - '--max-duration', - type=int, - default=1000.0, - help="Maximum pooled recordings duration (seconds) in a single batch.") + parser.add_argument('--model-type', + type=str, + default="conformer", + choices=["transformer", "conformer"], + help="Model type.") + parser.add_argument('--epoch', + type=int, + default=10, + help="Decoding epoch.") parser.add_argument( '--avg', type=int, default=5, help="Number of checkpionts to average. Automaticly select " - "consecutive checkpoints before checkpoint specified by'--epoch'. ") - parser.add_argument( - '--att-rate', - type=float, - default=0.0, - help="Attention loss rate.") - parser.add_argument( - '--nhead', - type=int, - default=4, - help="Number of attention heads in transformer.") + "consecutive checkpoints before checkpoint specified by'--epoch'. 
") + parser.add_argument('--att-rate', + type=float, + default=0.0, + help="Attention loss rate.") + parser.add_argument('--nhead', + type=int, + default=4, + help="Number of attention heads in transformer.") parser.add_argument( '--attention-dim', type=int, default=256, help="Number of units in transformer attention layers.") + parser.add_argument( + '--output-beam-size', + type=int, + default=8, + help='Output beam size. Used in k2.intersect_dense_pruned.'\ + 'Choose a large value (e.g., 20), for 1-best decoding '\ + 'and n-best rescoring. Choose a small value (e.g., 8) for ' \ + 'rescoring with the whole lattice') + parser.add_argument('--use-lm-rescoring', + type=str2bool, + default=True, + help='When enabled, it uses LM for rescoring') + + parser.add_argument('--use-nnlm-rescoring', + type=str2bool, + default=True, + help='When enabled, it uses LM for rescoring') + parser.add_argument( + '--num-paths', + type=int, + default=-1, + help='Number of paths for rescoring using n-best list.' \ + 'If it is negative, then rescore with the whole lattice.'\ + 'CAUTION: You have to reduce max_duration in case of CUDA OOM' + ) return parser def main(): - args = get_parser().parse_args() + + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() model_type = args.model_type epoch = args.epoch - max_duration = args.max_duration avg = args.avg att_rate = args.att_rate - - exp_dir = Path('exp-' + model_type + '-noam-mmi-att-musan') + num_paths = args.num_paths + use_lm_rescoring = args.use_lm_rescoring + use_nnlm_rescoring = args.use_nnlm_rescoring + use_whole_lattice = False + if use_lm_rescoring and num_paths < 1: + # It doesn't make sense to use n-best list for rescoring + # when n is less than 1 + use_whole_lattice = True + + exp_dir = Path('exp-' + model_type + '-noam-mmi-att-musan-sa') setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug') # load L, G, symbol_table @@ -225,6 +276,23 @@ def main(): # device = torch.device('cuda', 1) device = torch.device('cuda') + if use_nnlm_rescoring: + # now only support n-best rescoring with nnlm + use_whole_lattice = False + # TODO: make following paths configurable + model_path = '../nnlm/exp-nnlm/models/epoch_30.pt' + config_file = '../nnlm/conf/lm_small_transformer.yaml' + tokenizer_path = '../nnlm/exp-nnlm/tokenizer-librispeech.json' + words_txt = './data/lang_nosp/words.txt' + + evaluator = Evaluator(device=device, + words_txt=words_txt, + model_path=model_path, + config_file=config_file, + tokenizer_path=tokenizer_path) + else: + evaluator = None + if att_rate != 0.0: num_decoder_layers = 6 else: @@ -232,7 +300,7 @@ def main(): if model_type == "transformer": model = Transformer( - num_features=40, + num_features=80, nhead=args.nhead, d_model=args.attention_dim, num_classes=len(phone_ids) + 1, # +1 for the blank symbol @@ -240,7 +308,7 @@ def main(): num_decoder_layers=num_decoder_layers) else: model = Conformer( - num_features=40, + num_features=80, nhead=args.nhead, d_model=args.attention_dim, num_classes=len(phone_ids) + 1, # +1 for the blank symbol @@ -253,8 +321,10 @@ def main(): checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt') load_checkpoint(checkpoint, model) else: - checkpoints = [os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt') for avg_epoch in - range(epoch - avg, epoch)] + checkpoints = [ + os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt') + for avg_epoch in range(epoch - avg, epoch) + ] average_checkpoint(checkpoints, model) model.to(device) @@ 
-262,10 +332,16 @@ def main(): assert P.requires_grad is False P.scores = model.P_scores.cpu() - print_transition_probabilities(P, phone_symbol_table, phone_ids, filename='model_P_scores.txt') + print_transition_probabilities(P, + phone_symbol_table, + phone_ids, + filename='model_P_scores.txt') P.set_scores_stochastic_(model.P_scores) - print_transition_probabilities(P, phone_symbol_table, phone_ids, filename='P_scores.txt') + print_transition_probabilities(P, + phone_symbol_table, + phone_ids, + filename='P_scores.txt') if not os.path.exists(lang_dir / 'HLG.pt'): logging.debug("Loading L_disambig.fst.txt") @@ -274,61 +350,90 @@ def main(): logging.debug("Loading G.fst.txt") with open(lang_dir / 'G.fst.txt') as f: G = k2.Fsa.from_openfst(f.read(), acceptor=False) - first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table) + first_phone_disambig_id = find_first_disambig_symbol( + phone_symbol_table) first_word_disambig_id = find_first_disambig_symbol(symbol_table) HLG = compile_HLG(L=L, - G=G, - H=ctc_topo, - labels_disambig_id_start=first_phone_disambig_id, - aux_labels_disambig_id_start=first_word_disambig_id) + G=G, + H=ctc_topo, + labels_disambig_id_start=first_phone_disambig_id, + aux_labels_disambig_id_start=first_word_disambig_id) torch.save(HLG.as_dict(), lang_dir / 'HLG.pt') else: logging.debug("Loading pre-compiled HLG") d = torch.load(lang_dir / 'HLG.pt') HLG = k2.Fsa.from_dict(d) + if use_lm_rescoring: + if use_whole_lattice: + logging.info('Rescoring with the whole lattice') + else: + logging.info(f'Rescoring with n-best list, n is {num_paths}') + first_word_disambig_id = find_first_disambig_symbol(symbol_table) + if not os.path.exists(lang_dir / 'G_4_gram.pt'): + logging.debug('Loading G_4_gram.fst.txt') + with open(lang_dir / 'G_4_gram.fst.txt') as f: + G = k2.Fsa.from_openfst(f.read(), acceptor=False) + # G.aux_labels is not needed in later computations, so + # remove it here. + del G.aux_labels + # CAUTION(fangjun): The following line is crucial. + # Arcs entering the back-off state have label equal to #0. + # We have to change it to 0 here. 
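                # (Label 0 is epsilon in k2, so the assignment below maps #0,
                # and any other disambiguation symbol with an id greater than
                # or equal to first_word_disambig_id, to epsilon; those arcs
                # then no longer stand for real words during rescoring.)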
+ G.labels[G.labels >= first_word_disambig_id] = 0 + G = k2.create_fsa_vec([G]).to(device) + G = k2.arc_sort(G) + torch.save(G.as_dict(), lang_dir / 'G_4_gram.pt') + else: + logging.debug('Loading pre-compiled G_4_gram.pt') + d = torch.load(lang_dir / 'G_4_gram.pt') + G = k2.Fsa.from_dict(d).to(device) + + if use_whole_lattice: + # Add epsilon self-loops to G as we will compose + # it with the whole lattice later + G = k2.add_epsilon_self_loops(G) + G = k2.arc_sort(G) + G = G.to(device) + else: + logging.debug('Decoding without LM rescoring') + G = None + logging.debug("convert HLG to device") HLG = HLG.to(device) HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0) HLG.requires_grad_(False) + if not hasattr(HLG, 'lm_scores'): + HLG.lm_scores = HLG.scores.clone() + # load dataset - feature_dir = Path('exp/data') + librispeech = LibriSpeechAsrDataModule(args) test_sets = ['test-clean', 'test-other'] - for test_set in test_sets: + # test_sets = ['test-other'] + for test_set, test_dl in zip(test_sets, librispeech.test_dataloaders()): logging.info(f'* DECODING: {test_set}') - logging.debug("About to get test cuts") - cuts_test = load_manifest(feature_dir / f'cuts_{test_set}.json.gz') - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset(cuts_test) - sampler = SingleCutSampler(cuts_test, max_duration=max_duration) - logging.debug("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) - - logging.debug("About to decode") results = decode(dataloader=test_dl, model=model, device=device, HLG=HLG, - symbols=symbol_table) + symbols=symbol_table, + num_paths=num_paths, + G=G, + evaluator=evaluator, + use_whole_lattice=use_whole_lattice) recog_path = exp_dir / f'recogs-{test_set}.txt' store_transcripts(path=recog_path, texts=results) logging.info(f'The transcripts are stored in {recog_path}') - # compute WER - dists = [edit_distance(r, h) for r, h in results] - errors = { - key: sum(dist[key] for dist in dists) - for key in ['sub', 'ins', 'del', 'total'] - } - total_words = sum(len(ref) for ref, _ in results) - # Print Kaldi-like message: - # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ] - logging.info( - f'[{test_set}] %WER {errors["total"] / total_words:.2%} ' - f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]' - ) + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = exp_dir / f'errs-{test_set}.txt' + with open(errs_filename, 'w') as f: + write_error_stats(f, test_set, results) + logging.info('Wrote detailed error stats to {}'.format(errs_filename)) torch.set_num_threads(1) diff --git a/snowfall/decoding/lm_rescore.py b/snowfall/decoding/lm_rescore.py new file mode 100644 index 00000000..be4cf2f8 --- /dev/null +++ b/snowfall/decoding/lm_rescore.py @@ -0,0 +1,306 @@ +# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) + +# modified from: +# https://github.com/k2-fsa/snowfall/blob/16e9f5949be9db99730d65335adbf27d2729424d/snowfall/decoding/lm_rescore.py +from typing import Optional + +import k2 +import torch + + +def compute_am_scores(lats: k2.Fsa, word_fsas_with_epsilon_loops: k2.Fsa, + path_to_seq_map: torch.Tensor) -> torch.Tensor: + '''Compute AM scores of n-best lists (represented as word_fsas). + + Args: + lats: + An FsaVec, which is the output of `k2.intersect_dense_pruned`. + It must have the attribute `lm_scores`. 
+ word_fsas_with_epsilon_loops: + An FsaVec representing a n-best list. Note that it has been processed + by `k2.add_epsilon_self_loops`. + path_to_seq_map: + A 1-D torch.Tensor with dtype torch.int32. path_to_seq_map[i] indicates + which sequence the i-th Fsa in word_fsas_with_epsilon_loops belongs to. + path_to_seq_map.numel() == word_fsas_with_epsilon_loops.arcs.dim0(). + Returns: + Return a 1-D torch.Tensor containing the AM scores of each path. + `ans.numel() == word_fsas_with_epsilon_loops.shape[0]` + ''' + device = lats.device + assert len(lats.shape) == 3 + assert hasattr(lats, 'lm_scores') + + # k2.compose() currently does not support b_to_a_map. To void + # replicating `lats`, we use k2.intersect_device here. + # + # lats has phone IDs as `labels` and word IDs as aux_labels, so we + # need to invert it here. + inverted_lats = k2.invert(lats) + + # Now the `labels` of inverted_lats are word IDs (a 1-D torch.Tensor) + # and its `aux_labels` are phone IDs ( a k2.RaggedInt with 2 axes) + + # Remove its `aux_labels` since it is not needed in the + # following computation + del inverted_lats.aux_labels + inverted_lats = k2.arc_sort(inverted_lats) + + am_path_lats = k2.intersect_device(inverted_lats, + word_fsas_with_epsilon_loops, + b_to_a_map=path_to_seq_map, + sorted_match_a=True) + + # NOTE: `k2.connect` and `k2.top_sort` support only CPU at present + am_path_lats = k2.top_sort(k2.connect(am_path_lats.to('cpu'))).to(device) + + # The `scores` of every arc consists of `am_scores` and `lm_scores` + am_path_lats.scores = am_path_lats.scores - am_path_lats.lm_scores + + am_scores = am_path_lats.get_tot_scores(True, True) + + return am_scores + + +@torch.no_grad() +def rescore_with_n_best_list(lats: k2.Fsa, + G: k2.Fsa, + num_paths: int, + evaluator=None) -> k2.Fsa: + '''Decode using n-best list with LM rescoring. + + `lats` is a decoding lattice, which has 3 axes. This function first + extracts `num_paths` paths from `lats` for each sequence using + `k2.random_paths`. The `am_scores` of these paths are computed. + For each path, its `lm_scores` is computed using `G` (which is an LM). + The final `tot_scores` is the sum of `am_scores` and `lm_scores`. + The path with the greatest `tot_scores` within a sequence is used + as the decoding output. + + Args: + lats: + An FsaVec. It can be the output of `k2.intersect_dense_pruned`. + G: + An FsaVec representing the language model (LM). Note that it + is an FsaVec, but it contains only one Fsa. + num_paths: + It is the size `n` in `n-best` list. + Returns: + An FsaVec representing the best decoding path for each sequence + in the lattice. + ''' + device = lats.device + + assert len(lats.shape) == 3 + assert hasattr(lats, 'aux_labels') + assert hasattr(lats, 'lm_scores') + + if evaluator is None: + assert G.shape == (1, None, None) + assert G.device == device + assert hasattr(G, 'aux_labels') is False + + # First, extract `num_paths` paths for each sequence. + # paths is a k2.RaggedInt with axes [seq][path][arc_pos] + paths = k2.random_paths(lats, num_paths=num_paths, use_double_scores=True) + + # word_seqs is a k2.RaggedInt sharing the same shape as `paths` + # but it contains word IDs. Note that it also contains 0s and -1s. + # The last entry in each sublist is -1. + word_seqs = k2.index(lats.aux_labels, paths) + + # Remove epsilons and -1 from word_seqs + word_seqs = k2.ragged.remove_values_leq(word_seqs, 0) + + # Remove repeated sequences to avoid redundant computation later. 
+ # + # unique_word_seqs is still a k2.RaggedInt with 3 axes [seq][path][word] + # except that there are no repeated paths with the same word_seq + # within a seq. + # + # num_repeats is also a k2.RaggedInt with 2 axes containing the + # multiplicities of each path. + # num_repeats.num_elements() == unique_word_seqs.num_elements() + # + # Since k2.ragged.unique_sequences will reorder paths within a seq, + # `new2old` is a 1-D torch.Tensor mapping from the output path index + # to the input path index. + # new2old.numel() == unique_word_seqs.num_elements() + unique_word_seqs, num_repeats, new2old = k2.ragged.unique_sequences( + word_seqs, need_num_repeats=True, need_new2old_indexes=True) + + seq_to_path_shape = k2.ragged.get_layer(unique_word_seqs.shape(), 0) + + # path_to_seq_map is a 1-D torch.Tensor. + # path_to_seq_map[i] is the seq to which the i-th path + # belongs. + path_to_seq_map = seq_to_path_shape.row_ids(1) + + # Remove the seq axis. + # Now unique_word_seqs has only two axes [path][word] + unique_word_seqs = k2.ragged.remove_axis(unique_word_seqs, 0) + + # word_fsas is an FsaVec with axes [path][state][arc] + word_fsas = k2.linear_fsa(unique_word_seqs) + + word_fsas_with_epsilon_loops = k2.add_epsilon_self_loops(word_fsas) + + am_scores = compute_am_scores(lats, word_fsas_with_epsilon_loops, + path_to_seq_map) + + # Now compute lm_scores + b_to_a_map = torch.zeros_like(path_to_seq_map) + if evaluator is None: + lm_path_lats = k2.intersect_device(G, + word_fsas_with_epsilon_loops, + b_to_a_map=b_to_a_map, + sorted_match_a=True) + lm_path_lats = k2.top_sort(k2.connect( + lm_path_lats.to('cpu'))).to(device) + lm_scores = lm_path_lats.get_tot_scores(True, True) + else: + lm_scores = -evaluator.score_word_seqs(unique_word_seqs) + + # import pdb + # pdb.set_trace() + tot_scores = am_scores + lm_scores + # tot_scores = lm_scores + + # Remember that we used `k2.ragged.unique_sequences` to remove repeated + # paths to avoid redundant computation in `k2.intersect_device`. + # Now we use `num_repeats` to correct the scores for each path. + # + # NOTE(fangjun): It is commented out as it leads to a worse WER + # tot_scores = tot_scores * num_repeats.values() + + # TODO(fangjun): We may need to add `k2.RaggedDouble` + ragged_tot_scores = k2.RaggedFloat(seq_to_path_shape, + tot_scores.to(torch.float32)) + argmax_indexes = k2.ragged.argmax_per_sublist(ragged_tot_scores) + + # Use k2.index here since argmax_indexes' dtype is torch.int32 + best_path_indexes = k2.index(new2old, argmax_indexes) + + paths = k2.ragged.remove_axis(paths, 0) + + # best_path is a k2.RaggedInt with 2 axes [path][arc_pos] + best_paths = k2.index(paths, best_path_indexes) + + # labels is a k2.RaggedInt with 2 axes [path][phone_id] + # Note that it contains -1s. + labels = k2.index(lats.labels.contiguous(), best_paths) + + labels = k2.ragged.remove_values_eq(labels, -1) + + # lats.aux_labels is a k2.RaggedInt tensor with 2 axes, so + # aux_labels is also a k2.RaggedInt with 2 axes + aux_labels = k2.index(lats.aux_labels, best_paths.values()) + + best_path_fsas = k2.linear_fsa(labels) + best_path_fsas.aux_labels = aux_labels + + return best_path_fsas + + +@torch.no_grad() +def rescore_with_whole_lattice(lats: k2.Fsa, + G_with_epsilon_loops: k2.Fsa) -> k2.Fsa: + '''Use whole lattice to rescore. + + Args: + lats: + An FsaVec It can be the output of `k2.intersect_dense_pruned`. + G_with_epsilon_loops: + An FsaVec representing the language model (LM). Note that it + is an FsaVec, but it contains only one Fsa. 
+ ''' + assert len(lats.shape) == 3 + assert hasattr(lats, 'lm_scores') + assert G_with_epsilon_loops.shape == (1, None, None) + + device = lats.device + lats.scores = lats.scores - lats.lm_scores + # Now, lats.scores contains only am_scores + + # inverted_lats has word IDs as labels. + # Its aux_labels are phone IDs, which is a ragged tensor k2.RaggedInt + inverted_lats = k2.invert(lats) + num_seqs = lats.shape[0] + inverted_lats_with_epsilon_loops = k2.add_epsilon_self_loops(inverted_lats) + + b_to_a_map = torch.zeros(num_seqs, device=device, dtype=torch.int32) + try: + rescoring_lats = k2.intersect_device(G_with_epsilon_loops, + inverted_lats_with_epsilon_loops, + b_to_a_map, + sorted_match_a=True) + except RuntimeError as e: + print(f'Caught exception:\n{e}\n') + print(f'Number of FSAs: {inverted_lats.shape[0]}') + print('num_arcs before pruning: ', + inverted_lats_with_epsilon_loops.arcs.num_elements()) + + # NOTE(fangjun): The choice of the threshold 0.01 is arbitrary here + # to avoid OOM. We may need to fine tune it. + inverted_lats = k2.prune_on_arc_post(inverted_lats, 0.001, True) + inverted_lats_with_epsilon_loops = k2.add_epsilon_self_loops( + inverted_lats) + print('num_arcs after pruning: ', + inverted_lats_with_epsilon_loops.arcs.num_elements()) + + rescoring_lats = k2.intersect_device(G_with_epsilon_loops, + inverted_lats_with_epsilon_loops, + b_to_a_map, + sorted_match_a=True) + + rescoring_lats = k2.top_sort(k2.connect( + rescoring_lats.to('cpu'))).to(device) + inverted_rescoring_lats = k2.invert(rescoring_lats) + # inverted rescoring_lats has phone IDs as labels + # and word IDs as aux_labels. + + inverted_rescoring_lats = k2.remove_epsilon_self_loops( + inverted_rescoring_lats) + best_paths = k2.shortest_path(inverted_rescoring_lats, + use_double_scores=True) + return best_paths + + +@torch.no_grad() +def decode_with_lm_rescoring(lats: k2.Fsa, + G: k2.Fsa, + num_paths: int, + use_whole_lattice: bool, + evaluator=None) -> k2.Fsa: + '''Decode using n-best list with LM rescoring. + + `lats` is a decoding lattice, which has 3 axes. This function first + extracts `num_paths` paths from `lats` for each sequence using + `k2.random_paths`. The `am_scores` of these paths are computed. + For each path, its `lm_scores` is computed using `G` (which is an LM). + The final `tot_scores` is the sum of `am_scores` and `lm_scores`. + The path with the greatest `tot_scores` within a sequence is used + as the decoding output. + + Args: + lats: + An FsaVec It can be the output of `k2.intersect_dense_pruned`. + G: + An FsaVec representing the language model (LM). Note that it + is an FsaVec, but it contains only one Fsa. + num_paths: + It is the size `n` in `n-best` list. + Used only if use_whole_lattice is False. + use_whole_lattice: + True to use whole lattice for rescoring. False to use n-best list + for rescoring. + Returns: + An FsaVec representing the best decoding path for each sequence + in the lattice. 
+ ''' + if use_whole_lattice: + return rescore_with_whole_lattice(lats, G) + elif evaluator is not None: + return rescore_with_n_best_list(lats, None, num_paths, evaluator) + else: + return rescore_with_n_best_list(lats, G, num_paths) From d847b2862dde178872e4789971f3a7fd37e60303 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 20 Apr 2021 17:08:51 +0800 Subject: [PATCH 23/25] filter train data by length to increase batch_size --- egs/librispeech/asr/nnlm/run.sh | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index 343ef473..e18c6769 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -83,12 +83,28 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then + # TODO:Move following flollowig filtered by length module in Dataset + # The longest sample has 1344 tokens. Batchsize is quite small if training data contains these Long samples. + # Only 1.31% = 529,260/40,198,051 samples are filtered out by length 90. + maximum_length=90 + echo "filter out sampels which longher than "$maximum_length" tokens" + data_dir=./data/nnlm/text + train_data_filtered_by_length=${data_dir}/length_${maximum_length}_librispeech.txt.tokens + train_data=${data_dir}/librispeech.txt.tokens + ori_train_data=${data_dir}/ori_librispeech.txt.tokens + if [ ! -f $ori_train_data ]; then + mv ${train_data} ${ori_train_data} + fi + + if [ ! -f $train_data_filtered_by_length ]; then + awk -v maximum_length=$maximum_length 'NF $train_data_filtered_by_length + ln -sf `realpath $train_data_filtered_by_length` ${train_data} + fi + echo "start to train" # resume_model_iter is for resume training # -1 means train from scratch - # python main.py \ export CUDA_VISIBLE_DEVICES=0,1,2,3 - # python -m torch.distributed.launch --nproc_per_node=4 test.py \ python -m torch.distributed.launch --nproc_per_node=4 main.py \ --config $lm_config \ --vocab_size $vocab_size \ From 52300df56f0ab1591661775b2c63e382ffce97e2 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 20 Apr 2021 17:32:45 +0800 Subject: [PATCH 24/25] use Noam optimizer --- .../asr/nnlm/conf/lm_small_transformer.yaml | 17 +++++++++-------- egs/librispeech/asr/nnlm/main.py | 16 +++++++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 4ee16290..3cbe8596 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -2,7 +2,7 @@ gpu: 1 tensorboard_dir: 'exp-nnlm/tensorobard' -# network architecture equivalent configuration to +# network architecture equivalent configuration to # https://github.com/pytorch/examples/blob/master/word_language_model/main.py model_module: transformer transformer_conf: @@ -15,13 +15,14 @@ transformer_conf: shared_conf: ntoken: 5003 -optimizer_conf: - # for Adam - lr: 0.0003 - weight_decay: 0.001 - # for SGD - # lr: 0.01 - # weight_decay: 0.001 +# Now using Noam optimizer and tuning configuration +# optimizer_conf: +# # for Adam +# lr: 0.0003 +# weight_decay: 0.001 +# # for SGD +# # lr: 0.01 +# # weight_decay: 0.001 trainer_conf: num_epochs: 60 diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 7f7b9ae8..ed640a27 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -28,6 +28,8 @@ from torch.utils.data import DataLoader from typing 
import List, Dict +from snowfall.models.transformer import Noam + def get_args(): parser = argparse.ArgumentParser( @@ -59,7 +61,7 @@ def validate_configs(configs: Dict, required_fields: List) -> bool: def extract_configs(args) -> Dict: assert os.path.exists(args.config), '{} does not exist'.format(args.cofnig) required_fields = [ - 'model_module', 'shared_conf', 'optimizer_conf', 'trainer_conf', + 'model_module', 'shared_conf', 'trainer_conf', 'dataset_conf' ] with open(args.config, 'r') as f: @@ -124,17 +126,21 @@ def main(): model = TransformerModel(**configs['transformer_conf']) if args.resume_model_iter > 0: model_dir = configs['trainer_conf']['model_dir'] - model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) + model_path = '{}/epoch_{}.pt'.format(model_dir, + args.resume_model_iter) assert os.path.exists(model_path) load_checkpoint(model_path, model) model = torch.nn.parallel.DistributedDataParallel( model.to(device), [args.local_rank]) - - optimizer = optim.AdamW(model.parameters(), **configs['optimizer_conf']) + optimizer = Noam(model.parameters(), + model.module.embed_unit, + factor=1.0, + warm_step=5000) criterion = nn.NLLLoss(ignore_index=pad_index) - writer = SummaryWriter(log_dir=configs['tensorboard_dir']) + writer = SummaryWriter(log_dir=configs['tensorboard_dir'] + + str(args.local_rank)) log_interval = max(100, len(train_data_loader) // 20) trainer = Trainer(device=device, From e61a9d157c60b450d2a1cbc2e49e6ed0a3687588 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 20 Apr 2021 19:21:37 +0800 Subject: [PATCH 25/25] add rescore scripts --- .../asr/nnlm/conf/lm_small_transformer.yaml | 2 +- egs/librispeech/asr/nnlm/run.sh | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 3cbe8596..34cc8ec8 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -8,7 +8,7 @@ model_module: transformer transformer_conf: embed_unit: 200 attention_heads: 8 - nlayers: 8 + nlayers: 16 linear_units: 2048 dropout: 0.2 diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index e18c6769..86fe6c4f 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -117,8 +117,24 @@ if [ $stage -le 4 ]; then python compute_word_ppl.py fi - if [ $stage -le 5 ]; then + # this stage requires trained mmi models + export PYTHONPATH=$PWD/local:$PYTHONPATH + + cd ../simple_v1 + + # TODO: Remove hard-code Transformer language mode path + ./mmi_att_transformer_decode.py \ + --use-nnlm-rescoring=1 \ + --num-path=100 \ + --max-duration=500 \ + --output-beam-size=20 + + cd ../nnlm + +fi + +if [ $stage -le 6 ]; then # generate words.txt tokens.txt and lexicion.txt # which is used in future rescore process lexicon_path=./data/nnlm/lexicon @@ -130,6 +146,7 @@ if [ $stage -le 5 ]; then echo "please set words_txt path of your previous experiment" echo "the NN-LM trained LM is used as a rescore module, \ currently the same words.txt with previous experiment is prefered" + exit 0 fi echo "generate lexicon" python local/generate_lexicon.py \