From f038e6003c1fc5fa7bcc9873b1492d9c73515e66 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Thu, 25 Mar 2021 19:34:08 +0800 Subject: [PATCH 01/25] hugginface tokenizer and Neural LM training pipeline. This commit is mainly about hugginface tokenizer and a draft transformer/RNN based LM training pipeline. --- egs/librispeech/asr/nnlm/local/data.py | 54 +++ .../asr/nnlm/local/download_lm_train_data.py | 42 +++ .../asr/nnlm/local/huggingface_tokenizer.py | 98 ++++++ egs/librispeech/asr/nnlm/local/model.py | 154 ++++++++ egs/librispeech/asr/nnlm/main.py | 331 ++++++++++++++++++ egs/librispeech/asr/nnlm/run.sh | 61 ++++ 6 files changed, 740 insertions(+) create mode 100644 egs/librispeech/asr/nnlm/local/data.py create mode 100644 egs/librispeech/asr/nnlm/local/download_lm_train_data.py create mode 100644 egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py create mode 100644 egs/librispeech/asr/nnlm/local/model.py create mode 100644 egs/librispeech/asr/nnlm/main.py create mode 100644 egs/librispeech/asr/nnlm/run.sh diff --git a/egs/librispeech/asr/nnlm/local/data.py b/egs/librispeech/asr/nnlm/local/data.py new file mode 100644 index 00000000..7cdb4c2d --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/data.py @@ -0,0 +1,54 @@ +import os +from io import open +import torch + + +class Dictionary(object): + + def __init__(self): + self.word2idx = {} + self.idx2word = [] + self.idx2word.append('') + self.word2idx[''] = 0 + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + # self.word2idx[word] = len(self.idx2word) + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + + +class Corpus(object): + + def __init__(self, path): + self.dictionary = Dictionary() + self.train = self.tokenize(os.path.join(path, 'train.tokens')) + self.valid = self.tokenize(os.path.join(path, 'valid.tokens')) + self.test = self.tokenize(os.path.join(path, 'test.tokens')) + + def tokenize(self, path): + """Tokenizes a text file.""" + assert os.path.exists(path) + # Add words to the dictionary + with open(path, 'r', encoding="utf8") as f: + for line in f: + words = line.split() + [''] + for word in words: + self.dictionary.add_word(word) + + # Tokenize file content + with open(path, 'r', encoding="utf8") as f: + idss = [] + for line in f: + words = line.split() + [''] + ids = [] + for word in words: + ids.append(self.dictionary.word2idx[word]) + idss.append(torch.tensor(ids).type(torch.int64)) + # ids = torch.cat(idss) + + return idss diff --git a/egs/librispeech/asr/nnlm/local/download_lm_train_data.py b/egs/librispeech/asr/nnlm/local/download_lm_train_data.py new file mode 100644 index 00000000..d9ff066a --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/download_lm_train_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import os +import logging +from google_drive_downloader import GoogleDriveDownloader as gdd +from pathlib import Path + +# librispeech-lm-norm.txt is 4G +# train_960_text is 48M, which is stands for the sum of {train_clean_360, train_clean_100, train_other_500} +# here only train_960_text used to verify the whole pipeline +# A copy of train_960_text: "htts://drive.google.com/file/d/1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A/view?usp=sharing" +# local_path: "/ceph-ly/open-source/snowfall/egs/librispeech/asr/simple_v1/data/local/lm_train/train_960_text" + + +def download_librispeech_train_960_text(): + train_960_text = 
"./data/lm_train/librispeech_train_960_text" + if not os.path.exists(train_960_text): + Path(os.path.dirname(train_960_text)).mkdir(parents=True, + exist_ok=True) + + logging.info("downloading train_960_text of librispeech.") + gdd.download_file_from_google_drive( + file_id='1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A', + dest_path=train_960_text, + unzip=False) + else: + logging.info( + "train_960_text of librispeech is already downloaded. You may should check that" + ) + + +def main(): + logging.getLogger().setLevel(logging.INFO) + + download_librispeech_train_960_text() + + +if __name__ == '__main__': + main() diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py new file mode 100644 index 00000000..8f2cff43 --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# reference: https://huggingface.co/docs/tokenizers/python/latest/quicktour.html +import argparse +import logging +import os +import shutil +from pathlib import Path +from tokenizers import Tokenizer +from tokenizers.models import WordPiece +from tokenizers import normalizers +from tokenizers.normalizers import Lowercase, NFD, StripAccents +from tokenizers.pre_tokenizers import Whitespace +from tokenizers.trainers import WordPieceTrainer +from tokenizers import decoders + + +def get_args(): + parser = argparse.ArgumentParser( + description='train and tokenize with huggingface tokenizer') + parser.add_argument('--train-file', + type=str, + help="""file to train tokenizer""") + parser.add_argument('--vocab-size', + type=int, + default=1000, + help="""number of tokens of the tokenizer""") + parser.add_argument('--tokenizer-path', + type=str, + help="path to save or load tokenizer") + parser.add_argument('--test-file', + type=str, + help="""file to be tokenized""") + args = parser.parse_args() + return args + + +def train_tokenizer(train_files, save_path, vocab_size): + if os.path.exists(save_path): + logging.warning( + "{} already exists. Please check that.".format(save_path)) + return + else: + Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True) + + tokenizer = Tokenizer(WordPiece(unk_token='[UNK]')) + tokenizer.normalizer = normalizers.Sequence( + [NFD(), Lowercase(), StripAccents()]) + tokenizer.pre_tokenizer = Whitespace() + + # default vocab_size=30000 + # here set vocab_size=1000 for accelerating + trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]']) + tokenizer.train(train_files, trainer) + tokenizer.save(save_path) + + +def tokenize_text(test_file, tokenizer_path): + if not os.path.exists(tokenizer_path): + logging.warning( + "Tokenizer {} does not exist. Please check that.".format( + tokenizer_path)) + return + tokenizer = Tokenizer.from_file(tokenizer_path) + tokenizer.decoder = decoders.WordPiece() + tokenized_file = "{}.tokens".format(test_file) + # tokenized_ids = "{}.ids".format(test_file) + if os.path.exists(tokenized_file): + logging.warning( + "The input file seems already tokenized. 
Buckupping previous result" + ) + shutil.copyfile(tokenized_file, "{}.bk".format(tokenized_file)) + logging.warning("Tokenizing {}.".format(test_file)) + fout = open(tokenized_file, 'w') + with open(test_file) as f: + for line in f: + line = line.strip() + output = tokenizer.encode(line) + fout.write(" ".join(output.tokens) + '\n') + + fout.close() + + +def main(): + args = get_args() + if args.train_file is not None: + train_files = [args.train_file] + train_tokenizer(train_files, args.tokenizer_path, args.vocab_size) + + if args.test_file is not None: + tokenize_text(args.test_file, args.tokenizer_path) + + +if __name__ == '__main__': + main() diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py new file mode 100644 index 00000000..3767302f --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -0,0 +1,154 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +class RNNModel(nn.Module): + """Container module with an encoder, a recurrent module, and a decoder.""" + + def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): + super(RNNModel, self).__init__() + self.ntoken = ntoken + self.drop = nn.Dropout(dropout) + # import pdb; pdb.set_trace() + self.encoder = nn.Embedding(ntoken, ninp, padding_idx=0) + if rnn_type in ['LSTM', 'GRU']: + self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) + else: + try: + nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] + except KeyError: + raise ValueError( """An invalid option for `--model` was supplied, + options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") + self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + + # Optionally tie weights as in: + # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) + # https://arxiv.org/abs/1608.05859 + # and + # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016) + # https://arxiv.org/abs/1611.01462 + if tie_weights: + if nhid != ninp: + raise ValueError('When using the tied flag, nhid must be equal to emsize') + self.decoder.weight = self.encoder.weight + + self.init_weights() + + self.rnn_type = rnn_type + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + nn.init.uniform_(self.encoder.weight, -initrange, initrange) + nn.init.zeros_(self.decoder.weight) + nn.init.uniform_(self.decoder.weight, -initrange, initrange) + + def forward(self, input, hidden): + # import pdb; pdb.set_trace() + emb = self.drop(self.encoder(input)) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + decoded = decoded.view(-1, self.ntoken) + return F.log_softmax(decoded, dim=1), hidden + + def init_hidden(self, bsz): + weight = next(self.parameters()) + if self.rnn_type == 'LSTM': + return (weight.new_zeros(self.nlayers, bsz, self.nhid), + weight.new_zeros(self.nlayers, bsz, self.nhid)) + else: + return weight.new_zeros(self.nlayers, bsz, self.nhid) + +# Temporarily leave PositionalEncoding module here. Will be moved somewhere else. +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. 
math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + +class TransformerModel(nn.Module): + """Container module with an encoder, a recurrent or transformer module, and a decoder.""" + + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): + super(TransformerModel, self).__init__() + try: + from torch.nn import TransformerEncoder, TransformerEncoderLayer + except: + raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + self.model_type = 'Transformer' + self.src_mask = None + self.pos_encoder = PositionalEncoding(ninp, dropout) + encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.decoder = nn.Linear(ninp, ntoken) + + self.init_weights() + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def init_weights(self): + initrange = 0.1 + nn.init.uniform_(self.encoder.weight, -initrange, initrange) + nn.init.zeros_(self.decoder.weight) + nn.init.uniform_(self.decoder.weight, -initrange, initrange) + + def forward(self, src, has_mask=True): + if has_mask: + device = src.device + if self.src_mask is None or self.src_mask.size(0) != len(src): + mask = self._generate_square_subsequent_mask(len(src)).to(device) + self.src_mask = mask + else: + self.src_mask = None + + src = self.encoder(src) * math.sqrt(self.ninp) + src = self.pos_encoder(src) + output = self.transformer_encoder(src, self.src_mask) + output = self.decoder(output) + return F.log_softmax(output, dim=-1) diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py new file mode 100644 index 00000000..6d81ecde --- /dev/null +++ b/egs/librispeech/asr/nnlm/main.py @@ -0,0 +1,331 @@ +# coding: utf-8 +import argparse +import time +import math +import os +import sys +import torch +import torch.nn as nn +import torch.onnx + +sys.path.insert(0, './local/') + +import data +import model + +parser = argparse.ArgumentParser( + description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') +parser.add_argument('--data', + type=str, + 
default='./data/lm_train/', + help='location of the data corpus') +parser.add_argument( + '--model', + type=str, + default='LSTM', + help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') +parser.add_argument('--emsize', + type=int, + default=200, + help='size of word embeddings') +parser.add_argument('--nhid', + type=int, + default=200, + help='number of hidden units per layer') +parser.add_argument('--nlayers', type=int, default=2, help='number of layers') +parser.add_argument('--lr', + type=float, + default=20, + help='initial learning rate') +parser.add_argument('--clip', + type=float, + default=0.25, + help='gradient clipping') +parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit') +parser.add_argument('--batch_size', + type=int, + default=30, + metavar='N', + help='batch size') +parser.add_argument('--bptt', type=int, default=35, help='sequence length') +parser.add_argument('--dropout', + type=float, + default=0.2, + help='dropout applied to layers (0 = no dropout)') +parser.add_argument('--tied', + action='store_true', + help='tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, help='random seed') +parser.add_argument('--cuda', action='store_true', help='use CUDA') +parser.add_argument('--log-interval', + type=int, + default=200, + metavar='N', + help='report interval') +parser.add_argument('--save', + type=str, + default='model.pt', + help='path to save the final model') +parser.add_argument('--onnx-export', + type=str, + default='', + help='path to export the final model in onnx format') + +parser.add_argument( + '--nhead', + type=int, + default=2, + help='the number of heads in the encoder/decoder of the transformer model') +parser.add_argument('--dry-run', + action='store_true', + help='verify the code and the model') + +args = parser.parse_args() + +# Set the random seed manually for reproducibility. +torch.manual_seed(args.seed) +if torch.cuda.is_available(): + if not args.cuda: + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda" + ) + +device = torch.device("cuda" if args.cuda else "cpu") + +############################################################################### +# Load data +############################################################################### + +corpus = data.Corpus(args.data) + +# Starting from sequential data, batchify arranges the dataset into columns. +# For instance, with the alphabet as the sequence and batch size 4, we'd get +# ┌ a g m s ┐ +# │ b h n t │ +# │ c i o u │ +# │ d j p v │ +# │ e k q w │ +# └ f l r x ┘. +# These columns are treated as independent by the model, which means that the +# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient +# batch processing. + + +def batchify(data, bsz): + # Work out how cleanly we can divide the dataset into bsz parts. + nbatch = len(data) // bsz + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * bsz) + # Evenly divide the data across the bsz batches. 
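+    # For instance (purely illustrative, using the alphabet example above):
+    # with 26 token ids and bsz = 4, nbatch = 26 // 4 = 6, the trailing
+    # 'y' and 'z' are trimmed, and data.view(4, -1).t() has shape (6, 4),
+    # i.e. 6 time steps for each of the 4 independent columns.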
+ data = data.view(bsz, -1).t().contiguous() + return data.to(device) + + +eval_batch_size = args.batch_size +# train_data = batchify(corpus.train, args.batch_size) +# val_data = batchify(corpus.valid, eval_batch_size) +# test_data = batchify(corpus.test, eval_batch_size) + +train_data = corpus.train +val_data = corpus.valid +test_data = corpus.test +############################################################################### +# Build the model +############################################################################### + +ntokens = len(corpus.dictionary) +if args.model == 'Transformer': + model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout).to(device) +else: + model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, + args.nlayers, args.dropout, args.tied).to(device) + +criterion = nn.NLLLoss(ignore_index=0) + +############################################################################### +# Training code +############################################################################### + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +# get_batch subdivides the source data into chunks of length args.bptt. +# If source is equal to the example output of the batchify function, with +# a bptt-limit of 2, we'd get the following two Variables for i = 0: +# ┌ a g m s ┐ ┌ b h n t ┐ +# └ b h n t ┘ └ c i o u ┘ +# Note that despite the name of the function, the subdivison of data is not +# done along the batch dimension (i.e. dimension 1), since that was handled +# by the batchify function. The chunks are along dimension 0, corresponding +# to the seq_len dimension in the LSTM. + +# def get_batch(source, i): +# seq_len = min(args.bptt, len(source) - 1 - i) +# data = source[i:i+seq_len] +# target = source[i+1:i+1+seq_len].view(-1) +# return data, target + + +def get_batch(source, i, pad_index=0, batch_size=args.batch_size): + batch = source[i * batch_size:(i + 1) * batch_size] + # import pdb; pdb.set_trace() + seq_lens = [len(batch[i]) for i in range(batch_size)] + seq_len = max(seq_lens) + 1 + data_padded = [] + target = [] + for data in batch: + # import pdb; pdb.set_trace() + # print("{} {}".format(seq_len,len(data))) + padding = torch.tensor([pad_index for _ in range(seq_len - len(data))]) + data = torch.cat((data, padding), dim=0) + data_padded.append(data[:-1]) + target.append(data[1:]) + + # import pdb; pdb.set_trace() + # data_padded: (Length, Batch_size) + # target: (Batch_size, Batch_size, Batch_size, ***) + try: + data_padded = torch.stack(data_padded).to(device).transpose(0, 1) + target = torch.flatten(torch.stack(target).to(device).transpose(0, 1)) + return data_padded, target + except: + import pdb + pdb.set_trace() + + #return torch.stack(data_padded).to(device), torch.stack(target).to(device) + + +def evaluate(data_source): + # Turn on evaluation mode which disables dropout. + model.eval() + total_loss = 0. 
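+    # Note: batches below come from the utterance-level get_batch() above;
+    # each batch loss is weighted by len(data), the padded sequence length.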
+ ntokens = len(corpus.dictionary) + if args.model != 'Transformer': + hidden = model.init_hidden(eval_batch_size) + with torch.no_grad(): + for i in range(0, len(data_source) // eval_batch_size - 1): + data, targets = get_batch(data_source, i) + if args.model == 'Transformer': + output = model(data) + output = output.view(-1, ntokens) + else: + output, hidden = model(data, hidden) + hidden = repackage_hidden(hidden) + total_loss += len(data) * criterion(output, targets).item() + return total_loss / (len(data_source) - 1) + + +def train(): + # Turn on training mode which enables dropout. + batch_size = args.batch_size + model.train() + total_loss = 0. + start_time = time.time() + ntokens = len(corpus.dictionary) + if args.model != 'Transformer': + hidden = model.init_hidden(batch_size) + for batch_idx in range(0, len(train_data) // batch_size - 1): + data, targets = get_batch(train_data, batch_idx) + # Starting each batch, we detach the hidden state from how it was previously produced. + # If we didn't, the model would try backpropagating all the way to start of the dataset. + model.zero_grad() + if args.model == 'Transformer': + output = model(data) + output = output.view(-1, ntokens) + else: + hidden = repackage_hidden(hidden) + output, hidden = model(data, hidden) + + # import pdb; pdb.set_trace() + loss = criterion(output, targets) + loss.backward() + + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) + for p in model.parameters(): + p.data.add_(p.grad, alpha=-lr) + + #import pdb; pdb.set_trace() + total_loss += loss.item() + + if batch_idx % args.log_interval == 0 and batch_idx > 0: + cur_loss = total_loss / args.log_interval + elapsed = time.time() - start_time + print( + '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' + 'loss {:5.2f} | ppl {:8.2f}'.format( + epoch, batch_idx, + len(train_data) // batch_size, lr, + elapsed * 1000 / args.log_interval, cur_loss, + math.exp(cur_loss))) + total_loss = 0 + start_time = time.time() + if args.dry_run: + break + + +def export_onnx(path, batch_size, seq_len): + print('The model is also exported in ONNX format at {}'.format( + os.path.realpath(args.onnx_export))) + model.eval() + dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view( + -1, batch_size).to(device) + hidden = model.init_hidden(batch_size) + torch.onnx.export(model, (dummy_input, hidden), path) + + +# Loop over epochs. +lr = args.lr +best_val_loss = None + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in range(1, args.epochs + 1): + epoch_start_time = time.time() + train() + val_loss = evaluate(val_data) + print('-' * 89) + print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' + 'valid ppl {:8.2f}'.format(epoch, + (time.time() - epoch_start_time), + val_loss, math.exp(val_loss))) + print('-' * 89) + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + with open(args.save, 'wb') as f: + torch.save(model, f) + best_val_loss = val_loss + else: + # Anneal the learning rate if no improvement has been seen in the validation dataset. + lr /= 4.0 +except KeyboardInterrupt: + print('-' * 89) + print('Exiting from training early') + +# Load the best saved model. 
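+# Note: torch.save() above pickled the whole nn.Module (not just a state_dict),
+# so the model definitions under ./local/ must remain importable here.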
+with open(args.save, 'rb') as f: + model = torch.load(f) + # after load the rnn params are not a continuous chunk of memory + # this makes them a continuous chunk, and will speed up forward pass + # Currently, only rnn model supports flatten_parameters function. + if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + model.rnn.flatten_parameters() + +# Run on test data. +test_loss = evaluate(test_data) +print('=' * 89) +print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( + test_loss, math.exp(test_loss))) +print('=' * 89) + +if len(args.onnx_export) > 0: + # Export the model in ONNX format. + export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh new file mode 100644 index 00000000..3cfd5e18 --- /dev/null +++ b/egs/librispeech/asr/nnlm/run.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# Copyright 2020 Xiaomi Corporation (Author: Liyong Guo) +# Apache 2.0 + +# References: +# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/train_rnnlm.sh +# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/prepare_rnnlm_dir.sh +# https://github.com/pytorch/examples/tree/master/word_language_model +# https://huggingface.co/docs/tokenizers/python/latest/quicktour.html + +# Example of how to use HuggingFace tokenizer and train {RNN, Transformer} based LMs + +set -e +stage=$1 + +lm_train=data/lm_train/ +full_text=$lm_train/librispeech_train_960_text +tokenizer=$lm_train/tokenizer-librispeech_train_960.json +if [ $stage -eq 1 ]; then + python3 ./local/download_lm_train_data.py +fi +if [ $stage -eq 2 ]; then + echo "training tokenizer" + python3 local/huggingface_tokenizer.py \ + --train-file=$full_text \ + --tokenizer-path=$tokenizer +fi + +if [ $stage -eq 3 ]; then + echo "tokenize a file" + python3 local/huggingface_tokenizer.py \ + --test-file=$full_text \ + --tokenizer-path=$tokenizer +fi + +if [ $stage -eq 4 ]; then + echo "split all data into train/valid/test" + + full_tokens=${full_text}.tokens + valid_test_fraction=10 # currently 5 percent for valid and 5 percent for test + valid_test_tokens=$lm_train/valid_test.tokens + train_tokens=$lm_train/train.tokens + + num_utts_total=$(wc -l <$full_tokens ) + num_valid_test=$(($num_utts_total/${valid_test_fraction})) + set +x + shuf -n $num_valid_test $full_tokens > $valid_test_tokens + + comm -3 <(sort $valid_test_tokens) <(sort $full_tokens) > $train_tokens + shuf -n $(($num_valid_test/2)) $valid_test_tokens > $lm_train/valid.tokens + comm -3 <(sort $lm_train/valid.tokens) <(sort $valid_test_tokens) > $lm_train/test.tokens + +fi + + +if [ $stage -eq 5 ]; then + python main.py \ + --cuda \ + --model Transformer +fi From e9482d25349a0367775e3b0470b67b3099362ce8 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Mon, 29 Mar 2021 21:52:18 +0800 Subject: [PATCH 02/25] draft of class LMDataset --- egs/librispeech/asr/nnlm/local/dataset.py | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 egs/librispeech/asr/nnlm/local/dataset.py diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py new file mode 100644 index 00000000..6792b4d8 --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence + + +class CollateFunc(object): + '''Collate function for 
LMDataset + ''' + + def __init__(self, pad_index): + self.pad_index = pad_index + + def __call__(self, batch): + # xs: input sequence + # ys: label sequence + xs = batch + # ys = batch + xs_pad = pad_sequence( + [torch.from_numpy(x).int() for x in xs, True, self.pad_index]) + ys_pad = xs_pad + return xs_pad, ys_pad + + +class LMDataset(Dataset): + + def __init__(self, text_file: str): + '''Dataset to load Language Model train/dev text data + + Args: + text_file: text file, one utt per line. + ''' + assert os.path.exists( + text_file), "text_file: {} does not exist, please check that." + self.data = [] + with open(text_file, 'r') as f: + for line in f: + text = line.strip().split() + assert len(text) > 0 + text_id = text2id(text) + token_id = text_id2token_id(text_id) + self.data.append(token_id) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + def text2id(text: list[str]) -> list[int]: + pass + + def text_id2token_id(text_id: list[int]) -> list[int]: + pass + + +if __name__ == '__main__': + train_file = "./data/local/lm/train.txt" + # dev_file = "./data/local/lm/dev.txt" + dataset = LMDataset(train_file) + data_loader = DataLoader(dataset, + batch_size=1, + shuffle=True, + num_workers=0, + collaate_fn=collate_func) + for i, batch in enumerate(data_loader): + print(i) + print(batch) From 135bfdb7985318e3ab77319ac8cc2de239dc2a9e Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Mon, 29 Mar 2021 22:07:43 +0800 Subject: [PATCH 03/25] a dummy implementation of LMDataset --- egs/librispeech/asr/nnlm/local/dataset.py | 36 ++++++++++++++--------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 6792b4d8..8fb98fa7 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -5,22 +5,27 @@ from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pad_sequence +from typing import List + +import os class CollateFunc(object): '''Collate function for LMDataset ''' - def __init__(self, pad_index): + def __init__(self, pad_index=0): self.pad_index = pad_index def __call__(self, batch): + import pdb + pdb.set_trace() # xs: input sequence # ys: label sequence xs = batch # ys = batch - xs_pad = pad_sequence( - [torch.from_numpy(x).int() for x in xs, True, self.pad_index]) + xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs], True, + self.pad_index) ys_pad = xs_pad return xs_pad, ys_pad @@ -31,7 +36,7 @@ def __init__(self, text_file: str): '''Dataset to load Language Model train/dev text data Args: - text_file: text file, one utt per line. + text_file: text file, text for one utt per line. ''' assert os.path.exists( text_file), "text_file: {} does not exist, please check that." 
@@ -40,8 +45,8 @@ def __init__(self, text_file: str): for line in f: text = line.strip().split() assert len(text) > 0 - text_id = text2id(text) - token_id = text_id2token_id(text_id) + text_id = self.text2id(text) + token_id = self.text_id2token_id(text_id) self.data.append(token_id) def __len__(self): @@ -50,22 +55,25 @@ def __len__(self): def __getitem__(self, idx): return self.data[idx] - def text2id(text: list[str]) -> list[int]: - pass + def text2id(self, text: List[str]) -> List[int]: + # A dumpy implementation + return [i for i in range(len(text))] - def text_id2token_id(text_id: list[int]) -> list[int]: - pass + def text_id2token_id(self, text_id: List[int]) -> List[int]: + # A dumpy implementation + return [i for i in range(len(text_id))] if __name__ == '__main__': - train_file = "./data/local/lm/train.txt" - # dev_file = "./data/local/lm/dev.txt" - dataset = LMDataset(train_file) + # train_file = "./data/nnlm/text/librispeech.txt" + dev_file = "./data/nnlm/text/dev.txt" + dataset = LMDataset(dev_file) + collate_func = CollateFunc() data_loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=0, - collaate_fn=collate_func) + collate_fn=collate_func) for i, batch in enumerate(data_loader): print(i) print(batch) From 88e0d49d559860134bfdf244b38bf25c84fa2c56 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 11:11:40 +0800 Subject: [PATCH 04/25] collate function of NNLM --- egs/librispeech/asr/nnlm/local/dataset.py | 34 +++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 8fb98fa7..50cc8f73 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -7,7 +7,9 @@ from torch.nn.utils.rnn import pad_sequence from typing import List +import numpy as np import os +import torch class CollateFunc(object): @@ -15,18 +17,20 @@ class CollateFunc(object): ''' def __init__(self, pad_index=0): + # pad_index should be identical to ignore_index of torch.nn.NLLLoss self.pad_index = pad_index - def __call__(self, batch): - import pdb - pdb.set_trace() - # xs: input sequence - # ys: label sequence - xs = batch - # ys = batch - xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs], True, - self.pad_index) - ys_pad = xs_pad + def __call__(self, batch: List[List[int]]): + '''batch contains token_id. + batch can be viewd as a ragged 2-d array, with a row represent a token_id. + token_id reprents a tokenized text, whose format is: + token_id token_id token_id *** + ''' + data_pad = pad_sequence( + [torch.from_numpy(np.array(x)).float() for x in batch], True, + self.pad_index) + xs_pad = data_pad[:, :-1] + ys_pad = data_pad[:, 1:] return xs_pad, ys_pad @@ -42,10 +46,14 @@ def __init__(self, text_file: str): text_file), "text_file: {} does not exist, please check that." self.data = [] with open(text_file, 'r') as f: + # a line represent a piece of text, e.g. 
+ # DELAWARE IS NOT AFRAID OF DOGS for line in f: text = line.strip().split() assert len(text) > 0 text_id = self.text2id(text) + # token_id format: + # token_id token_id token_id *** token_id = self.text_id2token_id(text_id) self.data.append(token_id) @@ -70,10 +78,12 @@ def text_id2token_id(self, text_id: List[int]) -> List[int]: dataset = LMDataset(dev_file) collate_func = CollateFunc() data_loader = DataLoader(dataset, - batch_size=1, + batch_size=2, shuffle=True, num_workers=0, collate_fn=collate_func) for i, batch in enumerate(data_loader): - print(i) + xs, ys = batch + print(xs) + print(ys) print(batch) From 27b1863fbf5e5d7b99910960a58fd4834e1e2d26 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 30 Mar 2021 16:02:53 +0800 Subject: [PATCH 05/25] add scripts to process word piece lexicons. --- .flake8 | 14 ++ egs/librispeech/asr/nnlm/scripts/lexicon.py | 53 +++++ egs/librispeech/asr/nnlm/scripts/util.py | 207 ++++++++++++++++++ egs/librispeech/asr/nnlm/scripts/util_test.py | 153 +++++++++++++ 4 files changed, 427 insertions(+) create mode 100644 .flake8 create mode 100644 egs/librispeech/asr/nnlm/scripts/lexicon.py create mode 100644 egs/librispeech/asr/nnlm/scripts/util.py create mode 100755 egs/librispeech/asr/nnlm/scripts/util_test.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..8cebb5dd --- /dev/null +++ b/.flake8 @@ -0,0 +1,14 @@ +[flake8] +show-source=true +statistics=true +max-line-length=80 +exclude = + .git, + +ignore = + # E127 continuation line over-indented for visual indent + E127, + # F401, import but not used + F401, + # W504, line break after binary operator + W504, diff --git a/egs/librispeech/asr/nnlm/scripts/lexicon.py b/egs/librispeech/asr/nnlm/scripts/lexicon.py new file mode 100644 index 00000000..859f1b48 --- /dev/null +++ b/egs/librispeech/asr/nnlm/scripts/lexicon.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# + +from pathlib import Path +from typing import Union + +from util import create_ragged_lexicon +from util import read_lexicon +from util import read_mapping + +import torch +import k2 + + +class Lexicon(object): + + def __init__(self, lexicon_filename: Union[Path, str], + word2id_filename: Union[Path, str], + piece2id_filename: Union[Path, str]) -> None: + ''' + Args: + lexicon_filename: + Path to the lexicon file. Each line in it consists of + spaces separated columns. The first column is a word + and the remaining columns are the word pieces of this word. + word2id_filename: + Path to the file that maps a word to an ID. + piece2id_filename: + Path to the file that maps a word piece to an ID. + ''' + lexicon = read_lexicon(lexicon_filename) + word2id = read_mapping(word2id_filename) + piece2id = read_mapping(piece2id_filename) + + self.lexicon = create_ragged_lexicon(lexicon=lexicon, + word2id=word2id, + piece2id=piece2id) + + def word_seq_to_word_piece_seq(self, words: torch.Tensor) -> torch.Tensor: + '''Convert a word sequence to a word piece seq. + + Args: + words: + A 1-D torch.Tensor of dtype torch.int32 containing word IDs. + Returns: + Return a 1-D torch.Tensor containing the IDs of the + corresponding word pieces. 
+ ''' + assert words.ndim == 1 + assert words.dtype == torch.int32 + + ans = k2.index(self.lexicon, words) + return ans.values() diff --git a/egs/librispeech/asr/nnlm/scripts/util.py b/egs/librispeech/asr/nnlm/scripts/util.py new file mode 100644 index 00000000..7c053e60 --- /dev/null +++ b/egs/librispeech/asr/nnlm/scripts/util.py @@ -0,0 +1,207 @@ +# Copyright (c) 2021 Xiaomi Corp. (authors: Fangjun Kuang) + +from pathlib import Path +from typing import Dict +from typing import List +from typing import Set +from typing import Tuple +from typing import Union + +import k2 + + +def read_mapping(filename: Union[str, Path]) -> Dict[str, int]: + '''Read a file that contains ID mappings. + + Each line in the file contains two fields separated by spaces. + The first field is a token and the second is its integer ID. + + An example file may look like the following:: + + a 1 + b 2 + hello 3 + + Args: + filename: + Filename containing the mapping. + Returns: + Return a dict that maps a token to an integer. + ''' + filename = Path(filename) + assert filename.is_file(), f'{filename} is not a file' + + ans: Dict[str, int] = dict() + seen: Set[int] = set() + + with open(filename) as f: + for line in f: + line = line.strip() + if len(line) == 0: + continue # skip empty lines + + splits = line.split() + assert len(splits) == 2, \ + f"Invalid line '{line}'.\n" \ + 'Each line should contain exactly two columns' + + key = splits[0] + value = int(splits[1]) + assert key not in ans, \ + f"Duplicate key '{key}' in line '{line}'" + + assert value not in seen, \ + f"Duplicate ID '{value}' in line '{line}'" + ans[key] = value + seen.add(value) + return ans + + +def convert_tokens_to_ids(tokens: List[str], + mapping: Dict[str, int]) -> List[int]: + '''Convert a list of tokens to its corresponding IDs. + + Caution: + We require that there are no OOVs. That is, every token + present in `tokens` has a corresponding ID in `mapping`. + + Args: + tokens: + A list of str representing tokens. + mapping: + A map that maps a token to an integer. + Returns: + A list of integers that are the IDs of the input tokens. + ''' + ans = [] + for t in tokens: + assert t in mapping, f"token '{t}' does not have an ID" + ans.append(mapping[t]) + return ans + + +def convert_lexicon_to_mappings( + filename: Union[str, Path] +) -> Tuple[Dict[str, int], Dict[str, int]]: # noqa + '''Generate IDs for tokens from a lexicon file. + + Each line in the lexicon consists of spaces separated columns. + The first column is the word and the remaining columns are its + word pieces. We require that each word has a unique decomposition + into word pieces. + + Args: + filename: + The lexicon file. + Returns: + Return a tuple containing two mappings: + - The first dict maps a word to an ID + - The second dict maps a word piece to an ID + ''' + filename = Path(filename) + assert filename.is_file(), f'File {filename} is not a file' + + words: Set[str] = set() + pieces: Set[str] = set() + + with open(filename) as f: + for line in f: + line = line.strip() + if len(line) == 0: + continue # skip empty lines + splits = line.split() + assert len(splits) >= 2, \ + f"Invalid line '{line}'.' 
\ + 'Expecting at least two columns" + + assert splits[0] not in words, "'Duplicate word '{splits[0]}'" + words.add(splits[0]) + + for p in splits[1:]: + pieces.add(p) + + words = list(words) + pieces = list(pieces) + words.sort() + pieces.sort() + + word2id: Dict[str, int] = dict() + piece2id: Dict[str, int] = dict() + + for i, w in enumerate(words): + word2id[w] = i + + for i, p in enumerate(pieces): + piece2id[p] = i + + return word2id, piece2id + + +def read_lexicon(lexicon_filename: Union[Path, str]) -> Dict[str, List[str]]: + '''Read a lexicon file. + + Each line in the lexicon consists of spaces separated columns. + The first column is the word and the remaining columns are the + corresponding word pieces. + + Args: + lexicon_filename: + Path to the lexicon. + Returns: + Return a dict mapping a word to its word pieces. + ''' + lexicon_filename = Path(lexicon_filename) + assert lexicon_filename.is_file(), f'File {lexicon_filename} is not a file' + + ans: Dict[str, List[str]] = dict() + + with open(lexicon_filename) as f: + for line in f: + line = line.strip() + if len(line) == 0: + continue # skip empty lines + + splits = line.split() + assert len(splits) >= 2, \ + f"Invalid line '{line}'" \ + 'Expected a line with at least two fields' + word = splits[0] + + assert word not in ans, \ + f"Duplicate word '{word}' in line '{line}'" + ans[word] = splits[1:] + return ans + + +def create_ragged_lexicon(lexicon: Dict[str, List[str]], + word2id: Dict[str, int], + piece2id: Dict[str, int]) -> k2.RaggedInt: + ''' + Args: + lexicon: + A dict that maps a word to word pieces. + word2id: + A dict that maps a word to an ID. + + CAUTION: + We require that word IDs are contiguous. For instance, if + there are 3 words, then the word IDs are 0, 1, and 2. + piece2id: + A dict that maps a word piece to an ID. + ''' + # First, check that word IDs are contiguous + id2word = {i: w for w, i in word2id.items()} + ids = list(id2word.keys()) + ids.sort() + # we assume that word IDs are contiguous + expected_ids = list(range(ids[-1] + 1)) + assert ids == expected_ids + + values = [] + for i in ids: + word = id2word[i] + pieces = lexicon[word] + pieces_id = convert_tokens_to_ids(pieces, piece2id) + values.append(pieces_id) + + return k2.create_ragged2(values) diff --git a/egs/librispeech/asr/nnlm/scripts/util_test.py b/egs/librispeech/asr/nnlm/scripts/util_test.py new file mode 100755 index 00000000..273be1f1 --- /dev/null +++ b/egs/librispeech/asr/nnlm/scripts/util_test.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +from pathlib import Path + +import os +import tempfile + +from lexicon import Lexicon +from util import convert_lexicon_to_mappings +from util import convert_tokens_to_ids +from util import read_lexicon +from util import read_mapping + +import torch + + +def get_temp_filename() -> str: + '''Return a temporary file. + + The caller is expected to remove the returned file. + ''' + with tempfile.NamedTemporaryFile(delete=False) as tmp: + name = tmp.name + tmp.close() + return name + + +def generate_mapping_file() -> str: + '''Generate a temporary mapping file for testing. + + Caution: + The caller is responsible to delete the returned file after using it. + + Returns: + A temporary file that contains an example mapping. + ''' + s = ''' + a 1 + b 2 + hello 3 + ''' + filename = get_temp_filename() + with open(filename, 'w') as f: + f.write(s) + return filename + + +def generate_lexicon_file() -> str: + '''Generate a temporary lexicon file for testing. 
+ + Caution: + The caller is responsible to delete the returned file after using it. + + Returns: + A temporary file that contains an example lexicon. + ''' + s = ''' + tom to m + the the + piper p ip er + son so n + ''' + filename = get_temp_filename() + with open(filename, 'w') as f: + f.write(s) + return filename + + +def test_read_mapping_file(): + filename = generate_mapping_file() + mapping = read_mapping(filename) + os.remove(filename) + assert mapping['a'] == 1 + assert mapping['b'] == 2 + assert mapping['hello'] == 3 + + +def test_convert_tokens_to_ids(): + filename = generate_mapping_file() + mapping = read_mapping(filename) + os.remove(filename) + + tokens = ['b', 'a', 'a', 'hello', 'a', 'a', 'b'] + ids = convert_tokens_to_ids(tokens=tokens, mapping=mapping) + assert ids == [2, 1, 1, 3, 1, 1, 2] + + +def test_convert_lexicon_to_mappings(): + filename = generate_lexicon_file() + word2id, piece2id = convert_lexicon_to_mappings(filename) + print(word2id) + print(piece2id) + os.remove(filename) + + +def test_read_lexicon(): + filename = generate_lexicon_file() + lexicon = read_lexicon(filename) + os.remove(filename) + print(lexicon) + + +def test_lexicon(): + lexicon_filename = generate_lexicon_file() + + word2id, piece2id = convert_lexicon_to_mappings(lexicon_filename) + + word2id_filename = get_temp_filename() + piece2id_filename = get_temp_filename() + # piper: 0 + # son: 1 + # the 2 + # tome 3 + + # er 0 + # ip 1 + # m 2 + # n 3 + # p 4 + # so 5 + # the 6 + # to 7 + + with open(word2id_filename, 'w') as f: + for w, i in word2id.items(): + f.write(f'{w} {i}\n') + + with open(piece2id_filename, 'w') as f: + for p, i in piece2id.items(): + f.write(f'{p} {i}\n') + + lexicon = Lexicon(lexicon_filename, word2id_filename, piece2id_filename) + words = ['the', 'son', 'tom', 'piper', 'the'] + word_ids = convert_tokens_to_ids(words, word2id) + word_piece_ids = lexicon.word_seq_to_word_piece_seq( + torch.tensor(word_ids, dtype=torch.int32)) + # the so n to m p ip er the + # 6 5 3 7 2 4 1 0 6 + expected_word_piece_ids = torch.tensor([6, 5, 3, 7, 2, 4, 1, 0, 6]) + + assert torch.all(torch.eq(word_piece_ids, expected_word_piece_ids)) + + os.remove(lexicon_filename) + os.remove(word2id_filename) + os.remove(piece2id_filename) + + +if __name__ == '__main__': + test_read_mapping_file() + test_convert_tokens_to_ids() + test_convert_lexicon_to_mappings() + test_read_lexicon() + test_lexicon() From 47bf358ecac609428582a5d6793c17e5bc5a6fd6 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 16:33:03 +0800 Subject: [PATCH 06/25] trainer --- egs/librispeech/asr/nnlm/local/trainer.py | 101 ++++++ egs/librispeech/asr/nnlm/main.py | 424 +++++----------------- 2 files changed, 199 insertions(+), 326 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/local/trainer.py diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py new file mode 100644 index 00000000..50813b96 --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import logging +import math +import torch + + +# references: +# https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py +# https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py +# https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py +# https://www.jianshu.com/p/c88df856dbc8 +class Trainer(object): + + 
def __init__(self, + device, + model=None, + criterion=None, + optimizer=None, + train_data_loader=None, + dev_data_loader=None, + ntokens=None, + batch_size=1, + epoch=0, + num_epochs=10, + log_interval=10, + writer=None): + self.device = device + self.model = model + self.criterion = criterion + self.optimizer = optimizer + self.ntokens = ntokens + self.batch_size = batch_size + self.epoch = epoch + self.num_epochs = num_epochs + self.train_data_loader = train_data_loader + self.dev_data_loader = dev_data_loader + self.iterations = 0 + self.writer = writer + self.log_interval = log_interval + + def run(self): + for epoch in range(self.num_epochs): + if self.train_data_loader is not None: + self.train() + + if self.dev_data_loader is not None: + self.eval() + + self.epoch += 1 + + def train(self): + self.model.train() + num_total_batch = len(self.train_data_loader) + for batch_idx, batch in enumerate(self.train_data_loader): + batch_input, batch_target = batch + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + self.model.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + target = torch.flatten(batch_target.transpose(0, 1)) + loss = self.criterion(prediction, target) + self.optimizer.zero_grad() + self.optimizer.step() + + self.writer.add_scalar('train_loss', loss, self.iterations) + + self.iterations += 1 + if batch_idx % self.log_interval == 0: + log_str = 'TRAIN Batch {}/{} loss {:.6f} ppl {:.6f} at epoch {}'.format( + batch_idx, num_total_batch, loss.item(), + math.exp(loss.item()), self.epoch) + logging.info(log_str) + + def eval(self): + self.model.eval() + total_loss = 0.0 + num_total_batch = len(self.dev_data_loader) + for batch_idx, batch in enumerate(self.dev_data_loader): + batch_input, batch_target = batch + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + self.model.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + target = torch.flatten(batch_target.transpose(0, 1)) + loss = self.criterion(prediction, target) + total_loss += loss * self.batch_size + + loss = total_loss / (num_total_batch * self.batch_size) + ppl = math.exp(loss) + self.writer.add_scalar('dev_ppl', ppl, self.epoch) + log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( + loss.item(), ppl, self.epoch) + logging.info(log_str) diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 6d81ecde..cce38a26 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -1,331 +1,103 @@ -# coding: utf-8 +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# Reference: +# https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py import argparse -import time -import math -import os -import sys + +import logging import torch import torch.nn as nn -import torch.onnx +import torch.optim as optim +import sys sys.path.insert(0, './local/') - -import data -import model - -parser = argparse.ArgumentParser( - description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', - type=str, - default='./data/lm_train/', - help='location of the data corpus') -parser.add_argument( - '--model', - type=str, - default='LSTM', - help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', - type=int, - default=200, - help='size of 
word embeddings') -parser.add_argument('--nhid', - type=int, - default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, help='number of layers') -parser.add_argument('--lr', - type=float, - default=20, - help='initial learning rate') -parser.add_argument('--clip', - type=float, - default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit') -parser.add_argument('--batch_size', - type=int, - default=30, - metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, help='sequence length') -parser.add_argument('--dropout', - type=float, - default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', - action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, help='random seed') -parser.add_argument('--cuda', action='store_true', help='use CUDA') -parser.add_argument('--log-interval', - type=int, - default=200, - metavar='N', - help='report interval') -parser.add_argument('--save', - type=str, - default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', - type=str, - default='', - help='path to export the final model in onnx format') - -parser.add_argument( - '--nhead', - type=int, - default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', - action='store_true', - help='verify the code and the model') - -args = parser.parse_args() - -# Set the random seed manually for reproducibility. -torch.manual_seed(args.seed) -if torch.cuda.is_available(): - if not args.cuda: - print( - "WARNING: You have a CUDA device, so you should probably run with --cuda" - ) - -device = torch.device("cuda" if args.cuda else "cpu") - -############################################################################### -# Load data -############################################################################### - -corpus = data.Corpus(args.data) - -# Starting from sequential data, batchify arranges the dataset into columns. -# For instance, with the alphabet as the sequence and batch size 4, we'd get -# ┌ a g m s ┐ -# │ b h n t │ -# │ c i o u │ -# │ d j p v │ -# │ e k q w │ -# └ f l r x ┘. -# These columns are treated as independent by the model, which means that the -# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient -# batch processing. - - -def batchify(data, bsz): - # Work out how cleanly we can divide the dataset into bsz parts. - nbatch = len(data) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the bsz batches. 
- data = data.view(bsz, -1).t().contiguous() - return data.to(device) - - -eval_batch_size = args.batch_size -# train_data = batchify(corpus.train, args.batch_size) -# val_data = batchify(corpus.valid, eval_batch_size) -# test_data = batchify(corpus.test, eval_batch_size) - -train_data = corpus.train -val_data = corpus.valid -test_data = corpus.test -############################################################################### -# Build the model -############################################################################### - -ntokens = len(corpus.dictionary) -if args.model == 'Transformer': - model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout).to(device) -else: - model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, - args.nlayers, args.dropout, args.tied).to(device) - -criterion = nn.NLLLoss(ignore_index=0) - -############################################################################### -# Training code -############################################################################### - - -def repackage_hidden(h): - """Wraps hidden states in new Tensors, to detach them from their history.""" - - if isinstance(h, torch.Tensor): - return h.detach() - else: - return tuple(repackage_hidden(v) for v in h) - - -# get_batch subdivides the source data into chunks of length args.bptt. -# If source is equal to the example output of the batchify function, with -# a bptt-limit of 2, we'd get the following two Variables for i = 0: -# ┌ a g m s ┐ ┌ b h n t ┐ -# └ b h n t ┘ └ c i o u ┘ -# Note that despite the name of the function, the subdivison of data is not -# done along the batch dimension (i.e. dimension 1), since that was handled -# by the batchify function. The chunks are along dimension 0, corresponding -# to the seq_len dimension in the LSTM. - -# def get_batch(source, i): -# seq_len = min(args.bptt, len(source) - 1 - i) -# data = source[i:i+seq_len] -# target = source[i+1:i+1+seq_len].view(-1) -# return data, target - - -def get_batch(source, i, pad_index=0, batch_size=args.batch_size): - batch = source[i * batch_size:(i + 1) * batch_size] - # import pdb; pdb.set_trace() - seq_lens = [len(batch[i]) for i in range(batch_size)] - seq_len = max(seq_lens) + 1 - data_padded = [] - target = [] - for data in batch: - # import pdb; pdb.set_trace() - # print("{} {}".format(seq_len,len(data))) - padding = torch.tensor([pad_index for _ in range(seq_len - len(data))]) - data = torch.cat((data, padding), dim=0) - data_padded.append(data[:-1]) - target.append(data[1:]) - - # import pdb; pdb.set_trace() - # data_padded: (Length, Batch_size) - # target: (Batch_size, Batch_size, Batch_size, ***) - try: - data_padded = torch.stack(data_padded).to(device).transpose(0, 1) - target = torch.flatten(torch.stack(target).to(device).transpose(0, 1)) - return data_padded, target - except: - import pdb - pdb.set_trace() - - #return torch.stack(data_padded).to(device), torch.stack(target).to(device) - - -def evaluate(data_source): - # Turn on evaluation mode which disables dropout. - model.eval() - total_loss = 0. 
- ntokens = len(corpus.dictionary) - if args.model != 'Transformer': - hidden = model.init_hidden(eval_batch_size) - with torch.no_grad(): - for i in range(0, len(data_source) // eval_batch_size - 1): - data, targets = get_batch(data_source, i) - if args.model == 'Transformer': - output = model(data) - output = output.view(-1, ntokens) - else: - output, hidden = model(data, hidden) - hidden = repackage_hidden(hidden) - total_loss += len(data) * criterion(output, targets).item() - return total_loss / (len(data_source) - 1) - - -def train(): - # Turn on training mode which enables dropout. - batch_size = args.batch_size - model.train() - total_loss = 0. - start_time = time.time() - ntokens = len(corpus.dictionary) - if args.model != 'Transformer': - hidden = model.init_hidden(batch_size) - for batch_idx in range(0, len(train_data) // batch_size - 1): - data, targets = get_batch(train_data, batch_idx) - # Starting each batch, we detach the hidden state from how it was previously produced. - # If we didn't, the model would try backpropagating all the way to start of the dataset. - model.zero_grad() - if args.model == 'Transformer': - output = model(data) - output = output.view(-1, ntokens) - else: - hidden = repackage_hidden(hidden) - output, hidden = model(data, hidden) - - # import pdb; pdb.set_trace() - loss = criterion(output, targets) - loss.backward() - - # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - for p in model.parameters(): - p.data.add_(p.grad, alpha=-lr) - - #import pdb; pdb.set_trace() - total_loss += loss.item() - - if batch_idx % args.log_interval == 0 and batch_idx > 0: - cur_loss = total_loss / args.log_interval - elapsed = time.time() - start_time - print( - '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch_idx, - len(train_data) // batch_size, lr, - elapsed * 1000 / args.log_interval, cur_loss, - math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - if args.dry_run: - break - - -def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}'.format( - os.path.realpath(args.onnx_export))) - model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view( - -1, batch_size).to(device) - hidden = model.init_hidden(batch_size) - torch.onnx.export(model, (dummy_input, hidden), path) - - -# Loop over epochs. -lr = args.lr -best_val_loss = None - -# At any point you can hit Ctrl + C to break out of training early. -try: - for epoch in range(1, args.epochs + 1): - epoch_start_time = time.time() - train() - val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, - (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) - # Save the model if the validation loss is the best we've seen so far. - if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: - torch.save(model, f) - best_val_loss = val_loss - else: - # Anneal the learning rate if no improvement has been seen in the validation dataset. - lr /= 4.0 -except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') - -# Load the best saved model. 
-with open(args.save, 'rb') as f: - model = torch.load(f) - # after load the rnn params are not a continuous chunk of memory - # this makes them a continuous chunk, and will speed up forward pass - # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: - model.rnn.flatten_parameters() - -# Run on test data. -test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) - -if len(args.onnx_export) > 0: - # Export the model in ONNX format. - export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt) +from dataset import LMDataset, CollateFunc +from model import TransformerModel +from trainer import Trainer +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader + + +def get_args(): + parser = argparse.ArgumentParser( + description='training Neural Language Model') + parser.add_argument('--train_text', + default='data/nnlm/text/librispeech.txt', + help='train data file') + parser.add_argument('--dev_text', + default='data/nnlm/text/dev.txt', + help='dev data file') + parser.add_argument('--batch_size', type=int, default=4) + parser.add_argument('--ntokens', type=int, default=3000) + parser.add_argument('--emsize', type=int, default=128) + parser.add_argument('--nhead', type=int, default=4) + parser.add_argument('--nhid', type=int, default=128) + parser.add_argument('--nlayers', type=int, default=6) + parser.add_argument('--dropout', type=int, default=0.2) + parser.add_argument('--model_dir', + default='./exp/', + help='path to save model') + parser.add_argument('--tensorboard_dir', + default='tensorboard', + help='path to save tensorboard log') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this local rank, -1 for cpu') + + args = parser.parse_args() + + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + #Set random seed + torch.manual_seed(2021) + collate_func = CollateFunc() + + train_dataset = LMDataset(args.train_text) + dev_dataset = LMDataset(args.dev_text) + + train_data_loader = DataLoader(train_dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_func) + + dev_data_loader = DataLoader(dev_dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=0, + collate_fn=collate_func) + + ntokens = args.ntokens + model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout) + optimizer = optim.Adam(model.parameters()) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + criterion = nn.NLLLoss(ignore_index=0) + exp_dir = 'exp-nnlm' + writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') + trainer = Trainer(device, + model, + criterion, + optimizer, + train_data_loader=train_data_loader, + dev_data_loader=dev_data_loader, + ntokens=ntokens, + batch_size=args.batch_size, + epoch=0, + writer=writer) + trainer.run() + + +if __name__ == '__main__': + main() From d8aaabdddfcc84ed6dc27fe5bbb9c3752b54ac5a Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 17:50:13 +0800 Subject: [PATCH 07/25] generate lexicon --- .../asr/nnlm/local/generate_lexicon.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 egs/librispeech/asr/nnlm/local/generate_lexicon.py diff --git 
a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py new file mode 100644 index 00000000..337ffb4e --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import argparse +from tokenizers import Tokenizer +from tokenizers.models import WordPiece +from tokenizers import decoders + + +def get_args(): + parser = argparse.ArgumentParser( + description='generate words.txt tokens.txt and lexicon.txt') + parser.add_argument('--lexicon-path', + default='data/nnlm/lexicon', + type=str, + help="path to save lexicon files") + parser.add_argument('--tokenizer-path', + type=str, + default='./data/lm_train/tokenizer-librispeech.json', + help="path to load tokenizer") + parser.add_argument('--train-file', + default='data/nnlm/text/librispeech.txt', + type=str, + help="""file to be tokenized""") + args = parser.parse_args() + return args + + +def generate_tokens(args): + tokenizer = Tokenizer.from_file(args.tokenizer_path) + symbols = tokenizer.get_vocab() + tokens_file = '{}/tokens.txt'.format(args.lexicon_path) + tokens_f = open(tokens_file, 'w') + for idx, sym in enumerate(symbols): + tokens_f.write('{} {}\n'.format(sym, idx)) + + tokens_f.close() + + +def generate_lexicon(args, words): + lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path) + lf = open(lexicon_file, 'w') + tokenizer = Tokenizer.from_file(args.tokenizer_path) + tokenizer.decoder = decoders.WordPiece() + for word in words: + output = tokenizer.encode(word) + tokens = " ".join(output.tokens) + lf.write("{}\t{}\n".format(word, tokens)) + lf.close() + + +def load_words(args): + words = [] + tokens_file = '{}/words.txt'.format(args.lexicon_path) + special_words = [ + '', '!SIL', '', '', '', '', '#0' + ] + + with open(tokens_file) as f: + for line in f: + arr = line.strip().split() + if arr[0] not in special_words: + words.append(arr[0]) + + return words + + +def main(): + args = get_args() + generate_tokens(args) + words = load_words(args) + generate_lexicon(args, words) + + +if __name__ == '__main__': + main() From c44f99da9c0027f34981e359ff439c1f31c16b96 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 17:57:57 +0800 Subject: [PATCH 08/25] check text length in dataset.py --- egs/librispeech/asr/nnlm/local/data.py | 54 ----------------------- egs/librispeech/asr/nnlm/local/dataset.py | 10 +++-- 2 files changed, 6 insertions(+), 58 deletions(-) delete mode 100644 egs/librispeech/asr/nnlm/local/data.py diff --git a/egs/librispeech/asr/nnlm/local/data.py b/egs/librispeech/asr/nnlm/local/data.py deleted file mode 100644 index 7cdb4c2d..00000000 --- a/egs/librispeech/asr/nnlm/local/data.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from io import open -import torch - - -class Dictionary(object): - - def __init__(self): - self.word2idx = {} - self.idx2word = [] - self.idx2word.append('') - self.word2idx[''] = 0 - - def add_word(self, word): - if word not in self.word2idx: - self.idx2word.append(word) - self.word2idx[word] = len(self.idx2word) - 1 - # self.word2idx[word] = len(self.idx2word) - return self.word2idx[word] - - def __len__(self): - return len(self.idx2word) - - -class Corpus(object): - - def __init__(self, path): - self.dictionary = Dictionary() - self.train = self.tokenize(os.path.join(path, 'train.tokens')) - self.valid = self.tokenize(os.path.join(path, 'valid.tokens')) - self.test = self.tokenize(os.path.join(path, 
'test.tokens')) - - def tokenize(self, path): - """Tokenizes a text file.""" - assert os.path.exists(path) - # Add words to the dictionary - with open(path, 'r', encoding="utf8") as f: - for line in f: - words = line.split() + [''] - for word in words: - self.dictionary.add_word(word) - - # Tokenize file content - with open(path, 'r', encoding="utf8") as f: - idss = [] - for line in f: - words = line.split() + [''] - ids = [] - for word in words: - ids.append(self.dictionary.word2idx[word]) - idss.append(torch.tensor(ids).type(torch.int64)) - # ids = torch.cat(idss) - - return idss diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 50cc8f73..41b4dad0 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -22,12 +22,12 @@ def __init__(self, pad_index=0): def __call__(self, batch: List[List[int]]): '''batch contains token_id. - batch can be viewd as a ragged 2-d array, with a row represent a token_id. + batch can be viewd as a ragged 2-d array, with a row represents a token_id. token_id reprents a tokenized text, whose format is: token_id token_id token_id *** ''' data_pad = pad_sequence( - [torch.from_numpy(np.array(x)).float() for x in batch], True, + [torch.from_numpy(np.array(x)).long() for x in batch], True, self.pad_index) xs_pad = data_pad[:, :-1] ys_pad = data_pad[:, 1:] @@ -50,12 +50,14 @@ def __init__(self, text_file: str): # DELAWARE IS NOT AFRAID OF DOGS for line in f: text = line.strip().split() - assert len(text) > 0 + if len(text) == 0: + continue text_id = self.text2id(text) # token_id format: # token_id token_id token_id *** token_id = self.text_id2token_id(text_id) - self.data.append(token_id) + if len(token_id) >= 2: + self.data.append(token_id) def __len__(self): return len(self.data) From b13954d8d54e945747c0468452c9373f2f199fc2 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 18:00:18 +0800 Subject: [PATCH 09/25] remove shuf/comm commands --- egs/librispeech/asr/nnlm/run.sh | 48 +++++++++++++-------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index 3cfd5e18..c7603ac9 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -5,6 +5,7 @@ # References: # https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/train_rnnlm.sh +# https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 # https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/prepare_rnnlm_dir.sh # https://github.com/pytorch/examples/tree/master/word_language_model # https://huggingface.co/docs/tokenizers/python/latest/quicktour.html @@ -15,45 +16,34 @@ set -e stage=$1 lm_train=data/lm_train/ -full_text=$lm_train/librispeech_train_960_text -tokenizer=$lm_train/tokenizer-librispeech_train_960.json -if [ $stage -eq 1 ]; then - python3 ./local/download_lm_train_data.py +tokenizer=$lm_train/tokenizer-librispeech.json + +text=data/local/lm/librispeech-lm-norm.txt.gz +text_dir=data/nnlm/text +train_text=$text_dir/librispeech.txt +if [ $stage -eq 0 ]; then + mkdir -p $text_dir + if [ ! -f $text ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + fi + echo -n >$text_dir/dev.txt + # hold out one in every 2000 lines as dev data. 
+ gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$train_text fi + + if [ $stage -eq 2 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ - --train-file=$full_text \ + --train-file=$train_text \ --tokenizer-path=$tokenizer fi if [ $stage -eq 3 ]; then - echo "tokenize a file" - python3 local/huggingface_tokenizer.py \ - --test-file=$full_text \ - --tokenizer-path=$tokenizer + echo "generate lexicon" + python local/generate_lexicon.py fi -if [ $stage -eq 4 ]; then - echo "split all data into train/valid/test" - - full_tokens=${full_text}.tokens - valid_test_fraction=10 # currently 5 percent for valid and 5 percent for test - valid_test_tokens=$lm_train/valid_test.tokens - train_tokens=$lm_train/train.tokens - - num_utts_total=$(wc -l <$full_tokens ) - num_valid_test=$(($num_utts_total/${valid_test_fraction})) - set +x - shuf -n $num_valid_test $full_tokens > $valid_test_tokens - - comm -3 <(sort $valid_test_tokens) <(sort $full_tokens) > $train_tokens - shuf -n $(($num_valid_test/2)) $valid_test_tokens > $lm_train/valid.tokens - comm -3 <(sort $lm_train/valid.tokens) <(sort $valid_test_tokens) > $lm_train/test.tokens - -fi - - if [ $stage -eq 5 ]; then python main.py \ --cuda \ From 775d4775f9989b93680a19da35ab3dfd884fcb8c Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 20:44:59 +0800 Subject: [PATCH 10/25] beta version of training pipeline --- egs/librispeech/asr/nnlm/local/dataset.py | 17 ++++-- .../asr/nnlm/local/generate_lexicon.py | 25 +++++---- .../asr/nnlm/local/huggingface_tokenizer.py | 2 +- egs/librispeech/asr/nnlm/local/model.py | 53 +++++++++++++++---- egs/librispeech/asr/nnlm/main.py | 21 ++++++-- egs/librispeech/asr/nnlm/scripts/lexicon.py | 4 +- egs/librispeech/asr/nnlm/scripts/util.py | 21 ++++---- 7 files changed, 102 insertions(+), 41 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 41b4dad0..20fe34bd 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -6,6 +6,7 @@ from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pad_sequence from typing import List +from util import convert_tokens_to_ids import numpy as np import os @@ -36,12 +37,13 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str): + def __init__(self, text_file: str, lexicon): '''Dataset to load Language Model train/dev text data Args: text_file: text file, text for one utt per line. ''' + self.lexicon = lexicon assert os.path.exists( text_file), "text_file: {} does not exist, please check that." self.data = [] @@ -49,13 +51,20 @@ def __init__(self, text_file: str): # a line represent a piece of text, e.g. 
# DELAWARE IS NOT AFRAID OF DOGS for line in f: - text = line.strip().split() + # import pdb + # pdb.set_trace() + text = line.strip().lower().split() + # print(text) if len(text) == 0: continue - text_id = self.text2id(text) + word_id = convert_tokens_to_ids(text, self.lexicon.word2id) + if len(word_id) == 0: + continue + word_id = torch.from_numpy(np.array(word_id, dtype="int32")) + + token_id = self.lexicon.word_seq_to_word_piece_seq(word_id) # token_id format: # token_id token_id token_id *** - token_id = self.text_id2token_id(text_id) if len(token_id) >= 2: self.data.append(token_id) diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index 337ffb4e..0fa62afa 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -34,35 +34,42 @@ def generate_tokens(args): tokens_file = '{}/tokens.txt'.format(args.lexicon_path) tokens_f = open(tokens_file, 'w') for idx, sym in enumerate(symbols): - tokens_f.write('{} {}\n'.format(sym, idx)) + tokens_f.write('{} {}\n'.format(sym.lower(), idx)) tokens_f.close() def generate_lexicon(args, words): + special_words = [ + '', '!SIL', '', '', '', '', '#0' + ] lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path) lf = open(lexicon_file, 'w') tokenizer = Tokenizer.from_file(args.tokenizer_path) tokenizer.decoder = decoders.WordPiece() for word in words: - output = tokenizer.encode(word) - tokens = " ".join(output.tokens) - lf.write("{}\t{}\n".format(word, tokens)) + if word not in special_words: + output = tokenizer.encode(word) + tokens = ' '.join(output.tokens) + else: + tokens = '[unk]' + lf.write("{}\t{}\n".format(word.lower(), tokens.lower())) lf.close() def load_words(args): words = [] tokens_file = '{}/words.txt'.format(args.lexicon_path) - special_words = [ - '', '!SIL', '', '', '', '', '#0' - ] + # special_words = [ + # '', '!SIL', '', '', '', '', '#0' + # ] + # special_words = [] with open(tokens_file) as f: for line in f: arr = line.strip().split() - if arr[0] not in special_words: - words.append(arr[0]) + # if arr[0] not in special_words: + words.append(arr[0]) return words diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py index 8f2cff43..8779fb2d 100644 --- a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py +++ b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py @@ -26,7 +26,7 @@ def get_args(): help="""file to train tokenizer""") parser.add_argument('--vocab-size', type=int, - default=1000, + default=10000, help="""number of tokens of the tokenizer""") parser.add_argument('--tokenizer-path', type=str, diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index 3767302f..fcd7b8fc 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -1,26 +1,48 @@ +# reference: +# https://github.com/pytorch/examples/blob/master/word_language_model/model.py import math import torch import torch.nn as nn import torch.nn.functional as F + class RNNModel(nn.Module): """Container module with an encoder, a recurrent module, and a decoder.""" - def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): + def __init__(self, + rnn_type, + ntoken, + ninp, + nhid, + nlayers, + dropout=0.5, + tie_weights=False): super(RNNModel, self).__init__() self.ntoken = ntoken self.drop = nn.Dropout(dropout) # import pdb; pdb.set_trace() 
self.encoder = nn.Embedding(ntoken, ninp, padding_idx=0) if rnn_type in ['LSTM', 'GRU']: - self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) + self.rnn = getattr(nn, rnn_type)(ninp, + nhid, + nlayers, + dropout=dropout) else: try: - nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] + nonlinearity = { + 'RNN_TANH': 'tanh', + 'RNN_RELU': 'relu' + }[rnn_type] except KeyError: - raise ValueError( """An invalid option for `--model` was supplied, - options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") - self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) + raise ValueError( + """An invalid option for `--model` was supplied, + options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" + ) + self.rnn = nn.RNN(ninp, + nhid, + nlayers, + nonlinearity=nonlinearity, + dropout=dropout) self.decoder = nn.Linear(nhid, ntoken) # Optionally tie weights as in: @@ -31,7 +53,8 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weigh # https://arxiv.org/abs/1611.01462 if tie_weights: if nhid != ninp: - raise ValueError('When using the tied flag, nhid must be equal to emsize') + raise ValueError( + 'When using the tied flag, nhid must be equal to emsize') self.decoder.weight = self.encoder.weight self.init_weights() @@ -63,6 +86,7 @@ def init_hidden(self, bsz): else: return weight.new_zeros(self.nlayers, bsz, self.nhid) + # Temporarily leave PositionalEncoding module here. Will be moved somewhere else. class PositionalEncoding(nn.Module): r"""Inject some information about the relative or absolute position of the tokens @@ -87,7 +111,9 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): pe = torch.zeros(max_len, d_model) position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + div_term = torch.exp( + torch.arange(0, d_model, 2).float() * + (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0).transpose(0, 1) @@ -107,6 +133,7 @@ def forward(self, x): x = x + self.pe[:x.size(0), :] return self.dropout(x) + class TransformerModel(nn.Module): """Container module with an encoder, a recurrent or transformer module, and a decoder.""" @@ -115,7 +142,9 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): try: from torch.nn import TransformerEncoder, TransformerEncoderLayer except: - raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + raise ImportError( + 'TransformerEncoder module does not exist in PyTorch 1.1 or lower.' 
+ ) self.model_type = 'Transformer' self.src_mask = None self.pos_encoder = PositionalEncoding(ninp, dropout) @@ -129,7 +158,8 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): def _generate_square_subsequent_mask(self, sz): mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill( + mask == 1, float(0.0)) return mask def init_weights(self): @@ -142,7 +172,8 @@ def forward(self, src, has_mask=True): if has_mask: device = src.device if self.src_mask is None or self.src_mask.size(0) != len(src): - mask = self._generate_square_subsequent_mask(len(src)).to(device) + mask = self._generate_square_subsequent_mask( + len(src)).to(device) self.src_mask = mask else: self.src_mask = None diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index cce38a26..a3eb8c38 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -14,6 +14,9 @@ import sys sys.path.insert(0, './local/') +sys.path.insert(0, './scripts/') +from lexicon import Lexicon + from dataset import LMDataset, CollateFunc from model import TransformerModel from trainer import Trainer @@ -30,8 +33,8 @@ def get_args(): parser.add_argument('--dev_text', default='data/nnlm/text/dev.txt', help='dev data file') - parser.add_argument('--batch_size', type=int, default=4) - parser.add_argument('--ntokens', type=int, default=3000) + parser.add_argument('--batch_size', type=int, default=256) + parser.add_argument('--ntokens', type=int, default=10000) parser.add_argument('--emsize', type=int, default=128) parser.add_argument('--nhead', type=int, default=4) parser.add_argument('--nhid', type=int, default=128) @@ -45,8 +48,12 @@ def get_args(): help='path to save tensorboard log') parser.add_argument('--gpu', type=int, - default=-1, + default=1, help='gpu id for this local rank, -1 for cpu') + parser.add_argument('--lexicon-path', + default='data/nnlm/lexicon', + type=str, + help="path to save lexicon files") args = parser.parse_args() @@ -61,9 +68,13 @@ def main(): #Set random seed torch.manual_seed(2021) collate_func = CollateFunc() + lexicon_filename = '{}/lexicon.txt'.format(args.lexicon_path) + word2id_filename = '{}/words.txt'.format(args.lexicon_path) + piece2id_filename = '{}/tokens.txt'.format(args.lexicon_path) - train_dataset = LMDataset(args.train_text) - dev_dataset = LMDataset(args.dev_text) + lexicon = Lexicon(lexicon_filename, word2id_filename, piece2id_filename) + train_dataset = LMDataset(args.train_text, lexicon) + dev_dataset = LMDataset(args.dev_text, lexicon) train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, diff --git a/egs/librispeech/asr/nnlm/scripts/lexicon.py b/egs/librispeech/asr/nnlm/scripts/lexicon.py index 859f1b48..f882f06a 100644 --- a/egs/librispeech/asr/nnlm/scripts/lexicon.py +++ b/egs/librispeech/asr/nnlm/scripts/lexicon.py @@ -29,11 +29,11 @@ def __init__(self, lexicon_filename: Union[Path, str], Path to the file that maps a word piece to an ID. 
''' lexicon = read_lexicon(lexicon_filename) - word2id = read_mapping(word2id_filename) + self.word2id = read_mapping(word2id_filename) piece2id = read_mapping(piece2id_filename) self.lexicon = create_ragged_lexicon(lexicon=lexicon, - word2id=word2id, + word2id=self.word2id, piece2id=piece2id) def word_seq_to_word_piece_seq(self, words: torch.Tensor) -> torch.Tensor: diff --git a/egs/librispeech/asr/nnlm/scripts/util.py b/egs/librispeech/asr/nnlm/scripts/util.py index 7c053e60..dda33da1 100644 --- a/egs/librispeech/asr/nnlm/scripts/util.py +++ b/egs/librispeech/asr/nnlm/scripts/util.py @@ -36,7 +36,7 @@ def read_mapping(filename: Union[str, Path]) -> Dict[str, int]: with open(filename) as f: for line in f: - line = line.strip() + line = line.strip().lower() if len(line) == 0: continue # skip empty lines @@ -57,8 +57,8 @@ def read_mapping(filename: Union[str, Path]) -> Dict[str, int]: return ans -def convert_tokens_to_ids(tokens: List[str], - mapping: Dict[str, int]) -> List[int]: +def convert_tokens_to_ids(tokens: List[str], mapping: Dict[str, + int]) -> List[int]: '''Convert a list of tokens to its corresponding IDs. Caution: @@ -75,14 +75,17 @@ def convert_tokens_to_ids(tokens: List[str], ''' ans = [] for t in tokens: - assert t in mapping, f"token '{t}' does not have an ID" - ans.append(mapping[t]) + # assert t in mapping, f"token '{t}' does not have an ID" + if t in mapping: + ans.append(mapping[t]) + else: + ans.append(mapping['']) return ans def convert_lexicon_to_mappings( - filename: Union[str, Path] -) -> Tuple[Dict[str, int], Dict[str, int]]: # noqa + filename: Union[str, + Path]) -> Tuple[Dict[str, int], Dict[str, int]]: # noqa '''Generate IDs for tokens from a lexicon file. Each line in the lexicon consists of spaces separated columns. @@ -173,8 +176,8 @@ def read_lexicon(lexicon_filename: Union[Path, str]) -> Dict[str, List[str]]: return ans -def create_ragged_lexicon(lexicon: Dict[str, List[str]], - word2id: Dict[str, int], +def create_ragged_lexicon(lexicon: Dict[str, List[str]], word2id: Dict[str, + int], piece2id: Dict[str, int]) -> k2.RaggedInt: ''' Args: From d415ed0773c8d76adde59900c7daf944c50db6fd Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 30 Mar 2021 20:49:56 +0800 Subject: [PATCH 11/25] remove unused file --- .../asr/nnlm/local/download_lm_train_data.py | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 egs/librispeech/asr/nnlm/local/download_lm_train_data.py diff --git a/egs/librispeech/asr/nnlm/local/download_lm_train_data.py b/egs/librispeech/asr/nnlm/local/download_lm_train_data.py deleted file mode 100644 index d9ff066a..00000000 --- a/egs/librispeech/asr/nnlm/local/download_lm_train_data.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) -# Apache 2.0 - -import os -import logging -from google_drive_downloader import GoogleDriveDownloader as gdd -from pathlib import Path - -# librispeech-lm-norm.txt is 4G -# train_960_text is 48M, which is stands for the sum of {train_clean_360, train_clean_100, train_other_500} -# here only train_960_text used to verify the whole pipeline -# A copy of train_960_text: "htts://drive.google.com/file/d/1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A/view?usp=sharing" -# local_path: "/ceph-ly/open-source/snowfall/egs/librispeech/asr/simple_v1/data/local/lm_train/train_960_text" - - -def download_librispeech_train_960_text(): - train_960_text = "./data/lm_train/librispeech_train_960_text" - if not os.path.exists(train_960_text): - 
Path(os.path.dirname(train_960_text)).mkdir(parents=True, - exist_ok=True) - - logging.info("downloading train_960_text of librispeech.") - gdd.download_file_from_google_drive( - file_id='1AgP4wTqbfp12dv4fJmjKXHdOf8eOtp_A', - dest_path=train_960_text, - unzip=False) - else: - logging.info( - "train_960_text of librispeech is already downloaded. You may should check that" - ) - - -def main(): - logging.getLogger().setLevel(logging.INFO) - - download_librispeech_train_960_text() - - -if __name__ == '__main__': - main() From 4937232c65a1a2e607825887fec567497c2419a2 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Thu, 1 Apr 2021 19:26:13 +0800 Subject: [PATCH 12/25] add dependency and fix known bugs scripts to install tokenizers fix training bugs port online tokenization to offline tokenization load/save checkpoint --- egs/librispeech/asr/nnlm/local/common.py | 42 +++++++++++ egs/librispeech/asr/nnlm/local/dataset.py | 42 +++-------- .../asr/nnlm/local/generate_lexicon.py | 39 +++++++--- .../asr/nnlm/local/huggingface_tokenizer.py | 26 ++++--- egs/librispeech/asr/nnlm/local/trainer.py | 34 +++++++-- egs/librispeech/asr/nnlm/main.py | 57 +++++++------- egs/librispeech/asr/nnlm/requirements.txt | 1 + egs/librispeech/asr/nnlm/run.sh | 75 ++++++++++++++++--- 8 files changed, 222 insertions(+), 94 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/local/common.py create mode 100644 egs/librispeech/asr/nnlm/requirements.txt diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py new file mode 100644 index 00000000..f561cf1d --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/common.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# modified from https://github.com/k2-fsa/snowfall/blob/master/snowfall/common.py to save/load non-Acoustic Model +import logging +import os +import torch + +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +Pathlike = Union[str, Path] +Info = Union[dict, None] + + +def load_checkpoint(filename: Pathlike, + model: torch.nn.Module, + info: Info = None) -> Dict[str, Any]: + logging.info('load checkpoint from {}'.format(filename)) + + checkpoint = torch.load(filename, map_location='cpu') + + model.load_state_dict(checkpoint['state_dict']) + + return checkpoint + + +def save_checkpoint(filename: Pathlike, + model: torch.nn.Module, + info: Info = None) -> None: + if not os.path.exists(os.path.dirname(filename)): + Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True) + logging.info(f'Save checkpoint to {filename}') + checkpoint = { + 'state_dict': model.state_dict(), + } + if info is not None: + checkpoint.update(info) + + torch.save(checkpoint, filename) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index 20fe34bd..eb17491b 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -3,10 +3,10 @@ # Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) # Apache 2.0 +import time from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pad_sequence from typing import List -from util import convert_tokens_to_ids import numpy as np import os @@ -37,35 +37,22 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, lexicon): + def __init__(self, text_file: str): '''Dataset to load Language Model train/dev text data Args: 
text_file: text file, text for one utt per line. ''' - self.lexicon = lexicon assert os.path.exists( - text_file), "text_file: {} does not exist, please check that." + text_file + ), "text_file: {} does not exist, please check that.".format(text_file) self.data = [] with open(text_file, 'r') as f: - # a line represent a piece of text, e.g. - # DELAWARE IS NOT AFRAID OF DOGS - for line in f: - # import pdb - # pdb.set_trace() - text = line.strip().lower().split() - # print(text) - if len(text) == 0: - continue - word_id = convert_tokens_to_ids(text, self.lexicon.word2id) - if len(word_id) == 0: - continue - word_id = torch.from_numpy(np.array(word_id, dtype="int32")) - - token_id = self.lexicon.word_seq_to_word_piece_seq(word_id) - # token_id format: - # token_id token_id token_id *** - if len(token_id) >= 2: + for idx, line in enumerate(f): + token_id = [int(i) for i in line.strip().split()] + # TODO(Liyong Guo): add bos_id and eos_id to each piece of example + # then each valid example should be longer than 2 + if len(token_id) > 2: self.data.append(token_id) def __len__(self): @@ -74,18 +61,9 @@ def __len__(self): def __getitem__(self, idx): return self.data[idx] - def text2id(self, text: List[str]) -> List[int]: - # A dumpy implementation - return [i for i in range(len(text))] - - def text_id2token_id(self, text_id: List[int]) -> List[int]: - # A dumpy implementation - return [i for i in range(len(text_id))] - if __name__ == '__main__': - # train_file = "./data/nnlm/text/librispeech.txt" - dev_file = "./data/nnlm/text/dev.txt" + dev_file = "./data/nnlm/text/dev.txt.tokens" dataset = LMDataset(dev_file) collate_func = CollateFunc() data_loader = DataLoader(dataset, diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index 0fa62afa..3b50ecf5 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -4,6 +4,7 @@ # Apache 2.0 import argparse +import collections from tokenizers import Tokenizer from tokenizers.models import WordPiece from tokenizers import decoders @@ -29,17 +30,41 @@ def get_args(): def generate_tokens(args): + ''' Extract symbols and there corresponding ids from a tokenizer, + and save as tokens.txt. + An example file looks like: + a 1 + b 2 + c 3 + ... + it 100 + sh 101 + + ''' + tokenizer = Tokenizer.from_file(args.tokenizer_path) symbols = tokenizer.get_vocab() tokens_file = '{}/tokens.txt'.format(args.lexicon_path) tokens_f = open(tokens_file, 'w') - for idx, sym in enumerate(symbols): - tokens_f.write('{} {}\n'.format(sym.lower(), idx)) + id2sym = dict((v, k.lower()) for k, v in symbols.items()) + for idx in range(len(symbols)): + assert idx in id2sym + tokens_f.write('{} {}\n'.format(id2sym[idx], idx)) tokens_f.close() def generate_lexicon(args, words): + ''' Tokenize every word in words.txt and save as lexicont.txt. + Each line represents a word and its tokenized representation, i.e. a sequence of tokens. a word and its tokens are seprated by a table. 
+ + An example file looks like: + + abbreviating abb ##re ##via ##ting + abbreviation abb ##re ##via ##t ##ion + abbreviations abb ##re ##via ##t ##ions + + ''' special_words = [ '', '!SIL', '', '', '', '', '#0' ] @@ -48,7 +73,8 @@ def generate_lexicon(args, words): tokenizer = Tokenizer.from_file(args.tokenizer_path) tokenizer.decoder = decoders.WordPiece() for word in words: - if word not in special_words: + if not (word.upper() in special_words or + word.lower() in special_words): output = tokenizer.encode(word) tokens = ' '.join(output.tokens) else: @@ -60,16 +86,11 @@ def generate_lexicon(args, words): def load_words(args): words = [] tokens_file = '{}/words.txt'.format(args.lexicon_path) - # special_words = [ - # '', '!SIL', '', '', '', '', '#0' - # ] - # special_words = [] with open(tokens_file) as f: for line in f: arr = line.strip().split() - # if arr[0] not in special_words: - words.append(arr[0]) + words.append(arr[0].lower()) return words diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py index 8779fb2d..5a260b9f 100644 --- a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py +++ b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py @@ -41,8 +41,8 @@ def get_args(): def train_tokenizer(train_files, save_path, vocab_size): if os.path.exists(save_path): logging.warning( - "{} already exists. Please check that.".format(save_path)) - return + "{} already exists. Backing up that.".format(save_path)) + shutil.move(save_path, '{}'.format(save_path)) else: Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True) @@ -52,34 +52,42 @@ def train_tokenizer(train_files, save_path, vocab_size): tokenizer.pre_tokenizer = Whitespace() # default vocab_size=30000 - # here set vocab_size=1000 for accelerating trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]']) tokenizer.train(train_files, trainer) tokenizer.save(save_path) def tokenize_text(test_file, tokenizer_path): + ''' + tokenize text + input format looks like: + BOY IS BETTER UNBORN THAN + BRAVE OFFICER + + + output format looks like: + 355 127 794 4824 346 370 + 1330 1898 + ''' if not os.path.exists(tokenizer_path): - logging.warning( - "Tokenizer {} does not exist. Please check that.".format( - tokenizer_path)) + logging.warning("Tokenizer {} does not exist.".format(tokenizer_path)) return tokenizer = Tokenizer.from_file(tokenizer_path) tokenizer.decoder = decoders.WordPiece() tokenized_file = "{}.tokens".format(test_file) - # tokenized_ids = "{}.ids".format(test_file) if os.path.exists(tokenized_file): logging.warning( "The input file seems already tokenized. 
Buckupping previous result" ) - shutil.copyfile(tokenized_file, "{}.bk".format(tokenized_file)) + shutil.move(tokenized_file, "{}.bk".format(tokenized_file)) logging.warning("Tokenizing {}.".format(test_file)) fout = open(tokenized_file, 'w') with open(test_file) as f: for line in f: line = line.strip() output = tokenizer.encode(line) - fout.write(" ".join(output.tokens) + '\n') + if len(output.ids) > 0: + fout.write(' '.join([str(i) for i in output.ids]) + '\n') fout.close() diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 50813b96..03b9f95b 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -7,6 +7,8 @@ import math import torch +from common import load_checkpoint, save_checkpoint + # references: # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py @@ -26,7 +28,9 @@ def __init__(self, batch_size=1, epoch=0, num_epochs=10, - log_interval=10, + clip=0.25, + log_interval=100, + model_dir="exp-nnlm/models/", writer=None): self.device = device self.model = model @@ -41,6 +45,8 @@ def __init__(self, self.iterations = 0 self.writer = writer self.log_interval = log_interval + self.clip = clip + self.model_dir = model_dir def run(self): for epoch in range(self.num_epochs): @@ -49,13 +55,17 @@ def run(self): if self.dev_data_loader is not None: self.eval() + save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, epoch), + self.model) self.epoch += 1 def train(self): self.model.train() + total_loss = 0 num_total_batch = len(self.train_data_loader) for batch_idx, batch in enumerate(self.train_data_loader): + self.optimizer.zero_grad() batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) @@ -65,18 +75,28 @@ def train(self): prediction = batch_output.view(-1, self.ntokens) target = torch.flatten(batch_target.transpose(0, 1)) loss = self.criterion(prediction, target) - self.optimizer.zero_grad() + loss.backward() + + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) self.optimizer.step() self.writer.add_scalar('train_loss', loss, self.iterations) self.iterations += 1 - if batch_idx % self.log_interval == 0: + total_loss += loss.item() + if batch_idx % self.log_interval == 0 and batch_idx > 0: + cur_loss = total_loss / self.log_interval log_str = 'TRAIN Batch {}/{} loss {:.6f} ppl {:.6f} at epoch {}'.format( - batch_idx, num_total_batch, loss.item(), - math.exp(loss.item()), self.epoch) + batch_idx, num_total_batch, cur_loss, math.exp(cur_loss), + self.epoch) logging.info(log_str) + total_loss = 0.0 + if batch_idx % 10000 == 0 and batch_idx > 0: + save_checkpoint( + "./exp/nn-lm/models/epoch_{}-batch_{}.pt".format( + self.epoch, batch_idx), self.model) + @torch.no_grad() def eval(self): self.model.eval() total_loss = 0.0 @@ -91,9 +111,9 @@ def eval(self): prediction = batch_output.view(-1, self.ntokens) target = torch.flatten(batch_target.transpose(0, 1)) loss = self.criterion(prediction, target) - total_loss += loss * self.batch_size + total_loss += loss - loss = total_loss / (num_total_batch * self.batch_size) + loss = total_loss / num_total_batch ppl = math.exp(loss) self.writer.add_scalar('dev_ppl', ppl, self.epoch) log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index a3eb8c38..358c7592 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ 
b/egs/librispeech/asr/nnlm/main.py @@ -8,17 +8,17 @@ import argparse import logging +import os import torch import torch.nn as nn import torch.optim as optim import sys sys.path.insert(0, './local/') -sys.path.insert(0, './scripts/') -from lexicon import Lexicon from dataset import LMDataset, CollateFunc from model import TransformerModel +from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter from torch.utils.data import DataLoader @@ -27,21 +27,30 @@ def get_args(): parser = argparse.ArgumentParser( description='training Neural Language Model') - parser.add_argument('--train_text', - default='data/nnlm/text/librispeech.txt', + parser.add_argument('--train_token', + default='data/nnlm/text/librispeech.txt.tokens', help='train data file') - parser.add_argument('--dev_text', - default='data/nnlm/text/dev.txt', + parser.add_argument('--dev_token', + default='data/nnlm/text/dev.txt.tokens', help='dev data file') - parser.add_argument('--batch_size', type=int, default=256) + parser.add_argument('--batch_size', type=int, default=60) parser.add_argument('--ntokens', type=int, default=10000) - parser.add_argument('--emsize', type=int, default=128) - parser.add_argument('--nhead', type=int, default=4) - parser.add_argument('--nhid', type=int, default=128) - parser.add_argument('--nlayers', type=int, default=6) + parser.add_argument('--emsize', type=int, default=200) + parser.add_argument('--nhead', type=int, default=2) + parser.add_argument('--nhid', type=int, default=200) + parser.add_argument('--nlayers', type=int, default=2) + parser.add_argument('--num_epochs', type=int, default=50) parser.add_argument('--dropout', type=int, default=0.2) + parser.add_argument('--lr', + type=float, + default=1e-2, + help='initial learning rate') + parser.add_argument('--clip', + type=float, + default=50.0, + help='gradient clipping') parser.add_argument('--model_dir', - default='./exp/', + default='./exp-nnlm/models/', help='path to save model') parser.add_argument('--tensorboard_dir', default='tensorboard', @@ -50,10 +59,6 @@ def get_args(): type=int, default=1, help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--lexicon-path', - default='data/nnlm/lexicon', - type=str, - help="path to save lexicon files") args = parser.parse_args() @@ -68,22 +73,18 @@ def main(): #Set random seed torch.manual_seed(2021) collate_func = CollateFunc() - lexicon_filename = '{}/lexicon.txt'.format(args.lexicon_path) - word2id_filename = '{}/words.txt'.format(args.lexicon_path) - piece2id_filename = '{}/tokens.txt'.format(args.lexicon_path) - lexicon = Lexicon(lexicon_filename, word2id_filename, piece2id_filename) - train_dataset = LMDataset(args.train_text, lexicon) - dev_dataset = LMDataset(args.dev_text, lexicon) + train_dataset = LMDataset(args.train_token) + dev_dataset = LMDataset(args.dev_token) train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False, - num_workers=0, + num_workers=10, collate_fn=collate_func) dev_data_loader = DataLoader(dev_dataset, - batch_size=args.batch_size, + batch_size=20, shuffle=False, num_workers=0, collate_fn=collate_func) @@ -91,12 +92,15 @@ def main(): ntokens = args.ntokens model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout) - optimizer = optim.Adam(model.parameters()) + optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) use_cuda = args.gpu >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') 
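# Why NLLLoss(ignore_index=...) below: the criterion consumes log-probabilities
# and skips target positions carrying the ignored id.  A small self-contained
# illustration with invented toy tensors and pad id 0:
import torch
import torch.nn as nn

pad_id = 0
criterion = nn.NLLLoss(ignore_index=pad_id)
log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)   # 4 positions, 10 classes
targets = torch.tensor([3, 5, pad_id, pad_id])              # last two positions are padding
loss = criterion(log_probs, targets)                        # padded positions add nothing
# Note: any real token that happens to share the ignored id is also skipped,
# which is why a dedicated pad index at the end of the vocabulary is safer.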
+ print(device) criterion = nn.NLLLoss(ignore_index=0) exp_dir = 'exp-nnlm' writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') + + Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) trainer = Trainer(device, model, criterion, @@ -106,6 +110,9 @@ def main(): ntokens=ntokens, batch_size=args.batch_size, epoch=0, + num_epochs=args.num_epochs, + clip=args.clip, + model_dir=args.model_dir, writer=writer) trainer.run() diff --git a/egs/librispeech/asr/nnlm/requirements.txt b/egs/librispeech/asr/nnlm/requirements.txt new file mode 100644 index 00000000..fb4a0dd1 --- /dev/null +++ b/egs/librispeech/asr/nnlm/requirements.txt @@ -0,0 +1 @@ +tokenizers==0.10.0 diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index c7603ac9..9953080b 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -15,37 +15,88 @@ set -e stage=$1 -lm_train=data/lm_train/ -tokenizer=$lm_train/tokenizer-librispeech.json +exp=exp-nnlm +tokenizer=$exp/tokenizer-librispeech.json text=data/local/lm/librispeech-lm-norm.txt.gz text_dir=data/nnlm/text -train_text=$text_dir/librispeech.txt +all_train_text=$text_dir/librispeech.txt +# there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. +# Now only $train_pieces data is used for debugging pipeline +train_pieces=100000 # 5 times of dev.txt +# uncomment follwoing line to use all_train_text +# train_pieces= +dev_text=$text_dir/dev.txt + + +mkdir -p $text_dir + +if [ $stage -eq -1 ]; then + # env for experiment ../simple_v1 is expected to have been built. + echo "Install extra dependencies" + pip install -r requirements.txt +fi + if [ $stage -eq 0 ]; then - mkdir -p $text_dir + # reference: + # https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 + # use the same data seperation method to kaldi whose result can be used as a baseline if [ ! -f $text ]; then - wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm fi echo -n >$text_dir/dev.txt # hold out one in every 2000 lines as dev data. - gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$train_text + gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$all_train_text fi +if [ ! -z "$train_pieces" ]; then + train_text=$text_dir/${train_pieces}_librispeech.txt + if [ $train_text -ot $all_train_text ] || [ ! -f $train_text ]; then + # if [ ! 
-f $train_text) || $train_text -ot $all_train_text ]; then + head -n $train_pieces $all_train_text > $train_text + fi +else + train_text=$all_train_text +fi -if [ $stage -eq 2 ]; then + +if [ $stage -eq 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ --train-file=$train_text \ --tokenizer-path=$tokenizer fi -if [ $stage -eq 3 ]; then - echo "generate lexicon" - python local/generate_lexicon.py + +if [ $stage -eq 2 ]; then + echo "tokenize train and dev files" + for text in $dev_text $train_text; do + python3 local/huggingface_tokenizer.py \ + --test-file=$text \ + --tokenizer-path=$tokenizer + done fi -if [ $stage -eq 5 ]; then +if [ $stage -eq 3 ]; then + echo "start to train" python main.py \ - --cuda \ + --train_token ${train_text}.tokens \ --model Transformer fi + +if [ $stage -eq 4 ]; then + # generate words.txt tokens.txt and lexicion.txt + # which is used in future rescore process + lexicon_path=./data/nnlm/lexicon + mkdir -p $lexicon_path + words_txt=../simple_v1/data/lang_nosp/words.txt + if [ -f $words_txt ]; then + cp $words_txt $lexicon_path + else + echo "please set words_txt path of your previous experiment" + echo "the NN-LM trained LM is used as a rescore module, \ + currently the same words.txt with previous experiment is prefered" + fi + echo "generate lexicon" + python local/generate_lexicon.py +fi From 61863dbabffaaf710f60c6ea403efdf43e387424 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 2 Apr 2021 23:14:49 +0800 Subject: [PATCH 13/25] fix various bugs with vocab_size=2000, epochs=50 tokens ppl of train: around 80 of dev: 119 --- egs/librispeech/asr/nnlm/local/dataset.py | 21 +++++-- egs/librispeech/asr/nnlm/local/model.py | 17 +++++- egs/librispeech/asr/nnlm/local/trainer.py | 73 +++++++++++++++++------ egs/librispeech/asr/nnlm/main.py | 60 ++++++++++++++----- egs/librispeech/asr/nnlm/run.sh | 29 ++++++--- 5 files changed, 153 insertions(+), 47 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index eb17491b..be7904d7 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -17,8 +17,9 @@ class CollateFunc(object): '''Collate function for LMDataset ''' - def __init__(self, pad_index=0): + def __init__(self, pad_index=None): # pad_index should be identical to ignore_index of torch.nn.NLLLoss + # and padding_idx in torch.nn.Embedding self.pad_index = pad_index def __call__(self, batch: List[List[int]]): @@ -27,22 +28,30 @@ def __call__(self, batch: List[List[int]]): token_id reprents a tokenized text, whose format is: token_id token_id token_id *** ''' + # data_pad: [batch_size, seq_len] + # each seq_len always different data_pad = pad_sequence( [torch.from_numpy(np.array(x)).long() for x in batch], True, self.pad_index) - xs_pad = data_pad[:, :-1] - ys_pad = data_pad[:, 1:] + data_pad = data_pad.t().contiguous() + # xs_pad, ys_pad: [max_seq_len, batch_size] + # max_seq_len is the maximum lenght in current batch + xs_pad = data_pad[:-1, :] + ys_pad = data_pad[1:, :] return xs_pad, ys_pad class LMDataset(Dataset): - def __init__(self, text_file: str): + def __init__(self, text_file: str, ntokens=None): '''Dataset to load Language Model train/dev text data Args: text_file: text file, text for one utt per line. 
''' + self.bos_id = ntokens - 3 + self.eos_id = ntokens - 2 + self.pad_index = ntokens - 1 assert os.path.exists( text_file ), "text_file: {} does not exist, please check that.".format(text_file) @@ -50,8 +59,10 @@ def __init__(self, text_file: str): with open(text_file, 'r') as f: for idx, line in enumerate(f): token_id = [int(i) for i in line.strip().split()] - # TODO(Liyong Guo): add bos_id and eos_id to each piece of example + # add bos_id and eos_id to each piece of example # then each valid example should be longer than 2 + token_id.insert(0, self.bos_id) + token_id.append(self.eos_id) if len(token_id) > 2: self.data.append(token_id) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index fcd7b8fc..bc3833dc 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -20,8 +20,7 @@ def __init__(self, super(RNNModel, self).__init__() self.ntoken = ntoken self.drop = nn.Dropout(dropout) - # import pdb; pdb.set_trace() - self.encoder = nn.Embedding(ntoken, ninp, padding_idx=0) + self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) if rnn_type in ['LSTM', 'GRU']: self.rnn = getattr(nn, rnn_type)(ninp, nhid, @@ -150,7 +149,7 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): self.pos_encoder = PositionalEncoding(ninp, dropout) encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.encoder = nn.Embedding(ntoken, ninp) + self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) self.ninp = ninp self.decoder = nn.Linear(ninp, ntoken) @@ -169,6 +168,8 @@ def init_weights(self): nn.init.uniform_(self.decoder.weight, -initrange, initrange) def forward(self, src, has_mask=True): + # src: [seq—len, batch_size] + # len(src) is seq_len if has_mask: device = src.device if self.src_mask is None or self.src_mask.size(0) != len(src): @@ -178,6 +179,16 @@ def forward(self, src, has_mask=True): else: self.src_mask = None + # mask: [seq_len, seq_len] + # looks like: + # tensor([[0., -inf, -inf, ..., -inf, -inf, -inf], + # [0., 0., -inf, ..., -inf, -inf, -inf], + # [0., 0., 0., ..., -inf, -inf, -inf], + # ..., + # [0., 0., 0., ..., 0., -inf, -inf], + # [0., 0., 0., ..., 0., 0., -inf], + # [0., 0., 0., ..., 0., 0., 0.]], device='cuda:0') + src = self.encoder(src) * math.sqrt(self.ninp) src = self.pos_encoder(src) output = self.transformer_encoder(src, self.src_mask) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 03b9f95b..e99cfb9c 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -8,13 +8,24 @@ import torch from common import load_checkpoint, save_checkpoint - +from model import TransformerModel, RNNModel # references: # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py # https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py # https://www.jianshu.com/p/c88df856dbc8 + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + class Trainer(object): def __init__(self, @@ -47,6 +58,7 @@ def __init__(self, self.log_interval = log_interval self.clip = clip 
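# Toy walk-through of how the special ids introduced above fit together
# (vocab_size here is illustrative; the real value comes from the tokenizer):
vocab_size = 5                                   # tokenizer ids 0 .. 4
ntokens = vocab_size + 3                         # 8 output classes
bos_id, eos_id, pad_index = ntokens - 3, ntokens - 2, ntokens - 1   # 5, 6, 7

line = "2 4 1"                                   # one line of a *.tokens file
token_id = [int(i) for i in line.split()]
token_id = [bos_id] + token_id + [eos_id]        # [5, 2, 4, 1, 6]
# After padding, inputs are token_id[:-1] and targets token_id[1:]; pad_index
# doubles as the Embedding padding_idx and the NLLLoss ignore_index.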
self.model_dir = model_dir + self.num_infinite_grad_norm = 0 def run(self): for epoch in range(self.num_epochs): @@ -65,20 +77,37 @@ def train(self): total_loss = 0 num_total_batch = len(self.train_data_loader) for batch_idx, batch in enumerate(self.train_data_loader): - self.optimizer.zero_grad() + # batch_input, batch_target: [max_seq_len, batch_size] + # max_seq_len is the maximum lenght in current batch batch_input, batch_target = batch + assert batch_input.shape[1] == self.batch_size + assert batch_target.shape[1] == self.batch_size batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) self.model.to(self.device) - batch_output = self.model(batch_input) - - prediction = batch_output.view(-1, self.ntokens) - target = torch.flatten(batch_target.transpose(0, 1)) + if isinstance(self.model, TransformerModel): + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + else: + # reinitiate hidden for everch batch + # as batches are independent on each other + hidden = self.model.init_hidden(batch_input.shape[1]) + prediction, _ = self.model(batch_input, hidden) + + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... + target = batch_target.view(-1) loss = self.criterion(prediction, target) loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) - self.optimizer.step() + grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), + self.clip) + if torch.isfinite(grad_norm): + self.optimizer.step() + else: + self.num_infinite_grad_norm += 1 + self.optimizer.zero_grad() self.writer.add_scalar('train_loss', loss, self.iterations) @@ -90,30 +119,40 @@ def train(self): batch_idx, num_total_batch, cur_loss, math.exp(cur_loss), self.epoch) logging.info(log_str) + logging.info('infinite grad_norm detected {} times'.format( + self.num_infinite_grad_norm)) total_loss = 0.0 if batch_idx % 10000 == 0 and batch_idx > 0: save_checkpoint( - "./exp/nn-lm/models/epoch_{}-batch_{}.pt".format( - self.epoch, batch_idx), self.model) + "{}/epoch_{}-batch_{}.pt".format(self.model_dir, + self.epoch, batch_idx), + self.model) @torch.no_grad() def eval(self): self.model.eval() total_loss = 0.0 - num_total_batch = len(self.dev_data_loader) + total_examples = 0 for batch_idx, batch in enumerate(self.dev_data_loader): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) self.model.to(self.device) - batch_output = self.model(batch_input) - - prediction = batch_output.view(-1, self.ntokens) - target = torch.flatten(batch_target.transpose(0, 1)) + if isinstance(self.model, TransformerModel): + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + else: + hidden = self.model.init_hidden(batch_input.shape[1]) + prediction, _ = self.model(batch_input, hidden) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
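# Toy illustration of the flattening order described above (values invented):
import torch
batch_target = torch.tensor([[11, 21, 31],
                             [12, 22, 32]])      # [max_seq_len=2, batch_size=3]
flat = batch_target.view(-1)                     # tensor([11, 21, 31, 12, 22, 32])
# Time step 1 of every example comes first, matching the row order produced by
# batch_output.view(-1, self.ntokens) for a [seq_len, batch, ntokens] output.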
+ target = batch_target.view(-1) loss = self.criterion(prediction, target) - total_loss += loss + total_loss += loss * batch_input.shape[1] + total_examples += batch_input.shape[1] - loss = total_loss / num_total_batch + loss = total_loss / total_examples ppl = math.exp(loss) self.writer.add_scalar('dev_ppl', ppl, self.epoch) log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 358c7592..2fb6bc6a 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -16,8 +16,9 @@ sys.path.insert(0, './local/') +from common import load_checkpoint from dataset import LMDataset, CollateFunc -from model import TransformerModel +from model import TransformerModel, RNNModel from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter @@ -34,7 +35,7 @@ def get_args(): default='data/nnlm/text/dev.txt.tokens', help='dev data file') parser.add_argument('--batch_size', type=int, default=60) - parser.add_argument('--ntokens', type=int, default=10000) + parser.add_argument('--vocab_size', type=int, default=10000) parser.add_argument('--emsize', type=int, default=200) parser.add_argument('--nhead', type=int, default=2) parser.add_argument('--nhid', type=int, default=200) @@ -59,6 +60,15 @@ def get_args(): type=int, default=1, help='gpu id for this local rank, -1 for cpu') + parser.add_argument( + '--model_iter', + type=int, + default=-1, + help='resume from trained model; if -1 training from scratch') + parser.add_argument('--model_type', + type=str, + default='Transformer', + help='model type') args = parser.parse_args() @@ -70,33 +80,53 @@ def main(): logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') - #Set random seed + # Set random seed torch.manual_seed(2021) - collate_func = CollateFunc() - - train_dataset = LMDataset(args.train_token) - dev_dataset = LMDataset(args.dev_token) - + # args.vocab_size: number of tokens in tokenizer.get_vocab + # + 2: one for eos_id, another for pad_idx + # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] + # bos_id: ntokens - 3 + # eos_id: ntokens - 2 + # pad_idx: ntokens - 1 + ntokens = args.vocab_size + 3 + pad_index = ntokens - 1 + + collate_func = CollateFunc(pad_index=pad_index) + + train_dataset = LMDataset(args.train_token, ntokens=ntokens) + dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) + + # To debug dataset.py, set shuffle=False and num_workers=0 + # then examples will be loaded as the sequence they are in {train, dev}.tokens train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, - shuffle=False, - num_workers=10, + shuffle=True, + num_workers=0, + drop_last=True, collate_fn=collate_func) dev_data_loader = DataLoader(dev_dataset, batch_size=20, shuffle=False, num_workers=0, + drop_last=True, collate_fn=collate_func) - ntokens = args.ntokens - model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout) + if 'Trasformer' == args.model_type: + model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout) + else: + model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, + args.dropout, False) + + if args.model_iter > 0: + model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) + load_checkpoint(model_path, model) optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) use_cuda = args.gpu >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') print(device) - criterion = nn.NLLLoss(ignore_index=0) + criterion = nn.NLLLoss(ignore_index=pad_index) exp_dir = 'exp-nnlm' writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') @@ -109,7 +139,7 @@ def main(): dev_data_loader=dev_data_loader, ntokens=ntokens, batch_size=args.batch_size, - epoch=0, + epoch=args.model_iter + 1, num_epochs=args.num_epochs, clip=args.clip, model_dir=args.model_dir, diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index 9953080b..dba4c443 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -28,6 +28,14 @@ train_pieces=100000 # 5 times of dev.txt # train_pieces= dev_text=$text_dir/dev.txt +# vocab_size of huggingface tokenizer +vocab_size=2000 +# for neural models, number of final classes is: +# ntokens = $vocab_size + 3 +# while: bos_id = ntokens - 3 +# eos_id = ntokens - 2 +# pad_index = ntokens - 1 + mkdir -p $text_dir @@ -59,12 +67,12 @@ else train_text=$all_train_text fi - if [ $stage -eq 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ - --train-file=$train_text \ - --tokenizer-path=$tokenizer + --train-file $train_text \ + --vocab-size $vocab_size \ + --tokenizer-path $tokenizer fi @@ -72,16 +80,21 @@ if [ $stage -eq 2 ]; then echo "tokenize train and dev files" for text in $dev_text $train_text; do python3 local/huggingface_tokenizer.py \ - --test-file=$text \ - --tokenizer-path=$tokenizer + --test-file $text \ + --tokenizer-path $tokenizer done fi if [ $stage -eq 3 ]; then echo "start to train" + # model_iter if for resume training + # -1 means train from scratch python main.py \ + --model_iter 48 \ --train_token ${train_text}.tokens \ - --model Transformer + --vocab_size $vocab_size \ + --model_type Transformer + fi if [ $stage -eq 4 ]; then @@ -98,5 +111,7 @@ if [ $stage -eq 4 ]; then currently the same words.txt with previous experiment is prefered" fi echo "generate lexicon" - python local/generate_lexicon.py + python local/generate_lexicon.py \ + 
--tokenizer-path $tokenizer + fi From d4dccae36a6db4815dc0c52b0826b8ab8f89fa30 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 2 Apr 2021 23:39:45 +0800 Subject: [PATCH 14/25] compute word_ppl from token_ppl --- egs/librispeech/asr/nnlm/compute_word_ppl.py | 145 +++++++++++++++++++ egs/librispeech/asr/nnlm/local/trainer.py | 69 +++++++++ egs/librispeech/asr/nnlm/run.sh | 29 ++-- 3 files changed, 234 insertions(+), 9 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/compute_word_ppl.py diff --git a/egs/librispeech/asr/nnlm/compute_word_ppl.py b/egs/librispeech/asr/nnlm/compute_word_ppl.py new file mode 100644 index 00000000..d7d2f47b --- /dev/null +++ b/egs/librispeech/asr/nnlm/compute_word_ppl.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +# Reference: +# https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py +import argparse + +import logging +import os +import torch +import torch.nn as nn +import torch.optim as optim +import sys + +sys.path.insert(0, './local/') + +from common import load_checkpoint +from dataset import LMDataset, CollateFunc +from model import TransformerModel, RNNModel +from pathlib import Path +from trainer import Trainer +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader + + +def get_args(): + parser = argparse.ArgumentParser( + description='training Neural Language Model') + parser.add_argument('--train_token', + default='data/nnlm/text/librispeech.txt.tokens', + help='train token file') + parser.add_argument('--dev_token', + default='data/nnlm/text/dev.txt.tokens', + help='dev token file') + parser.add_argument('--dev_txt', + default='data/nnlm/text/dev.txt', + help='dev txt file, used to compute word ppl') + parser.add_argument('--batch_size', type=int, default=60) + parser.add_argument('--vocab_size', type=int, default=2000) + parser.add_argument('--emsize', type=int, default=200) + parser.add_argument('--nhead', type=int, default=2) + parser.add_argument('--nhid', type=int, default=200) + parser.add_argument('--nlayers', type=int, default=2) + parser.add_argument('--num_epochs', type=int, default=50) + parser.add_argument('--dropout', type=int, default=0.2) + parser.add_argument('--lr', + type=float, + default=1e-2, + help='initial learning rate') + parser.add_argument('--clip', + type=float, + default=50.0, + help='gradient clipping') + parser.add_argument('--model_dir', + default='./exp-nnlm/models/', + help='path to save model') + parser.add_argument('--tensorboard_dir', + default='tensorboard', + help='path to save tensorboard log') + parser.add_argument('--gpu', + type=int, + default=1, + help='gpu id for this local rank, -1 for cpu') + parser.add_argument( + '--model_iter', + type=int, + default=19, + help='resume from trained model; if -1 training from scratch') + parser.add_argument('--model_type', + type=str, + default='Transformer', + help='model type') + + args = parser.parse_args() + + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + # Set random seed + torch.manual_seed(2021) + # args.vocab_size: number of tokens in tokenizer.get_vocab + # + 2: one for eos_id, another for pad_idx + # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] + # bos_id: ntokens - 3 + # eos_id: ntokens - 2 + # pad_idx: ntokens - 1 + ntokens = args.vocab_size + 3 + pad_index = ntokens - 1 + + collate_func = CollateFunc(pad_index=pad_index) + + dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) + + dev_data_loader = DataLoader(dev_dataset, + batch_size=1, + shuffle=False, + num_workers=0, + drop_last=False, + collate_fn=collate_func) + + if 'Trasformer' == args.model_type: + model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, + args.nlayers, args.dropout) + else: + model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, + args.dropout, False) + + if args.model_iter > 0: + model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) + load_checkpoint(model_path, model) + optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + print(device) + criterion = nn.NLLLoss(ignore_index=pad_index) + exp_dir = 'exp-nnlm' + writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') + + Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) + trainer = Trainer(device, + model, + criterion, + optimizer, + train_data_loader=None, + dev_data_loader=dev_data_loader, + ntokens=ntokens, + batch_size=args.batch_size, + epoch=args.model_iter + 1, + num_epochs=args.num_epochs, + clip=args.clip, + model_dir=args.model_dir, + writer=writer) + + trainer.get_word_ppl(args.dev_txt) + + +if __name__ == '__main__': + main() diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index e99cfb9c..24eb2546 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -5,6 +5,7 @@ import logging import math +import numpy as np import torch from common import load_checkpoint, save_checkpoint @@ -134,6 +135,11 @@ def eval(self): total_loss = 0.0 total_examples = 0 for batch_idx, batch in enumerate(self.dev_data_loader): + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... + # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) @@ -158,3 +164,66 @@ def eval(self): log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( loss.item(), ppl, self.epoch) logging.info(log_str) + + def get_word_counts(self, dev_txt: str): + word_counts = [] + with open(dev_txt, 'r') as f: + for line in f: + # +1: for append + word_counts.append(len(line.split()) + 1) + + return word_counts + + def compute_words_ppl(self, tokens_loss, tokens_counts, word_counts): + assert len(tokens_loss) == len(tokens_counts) + assert len(word_counts) == len(tokens_counts) + words_ppl = [ + math.exp(tokens_loss[i] * tokens_counts[i] / word_counts[i]) + for i in range(len(word_counts)) + ] + word_ppl = np.mean(words_ppl) + return word_ppl + + @torch.no_grad() + def get_word_ppl(self, dev_txt: str): + word_counts = self.get_word_counts(dev_txt) + tokens_ppl = [] + tokens_loss = [] + tokens_counts = [] + + self.model.eval() + for batch_idx, batch in enumerate(self.dev_data_loader): + if batch_idx % 1000 == 0 and batch_idx > 0: + logging.info('{}/{} computed'.format( + batch_idx, len(self.dev_data_loader))) + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... 
+ # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... + batch_input, batch_target = batch + # batch_size == 1 to get loss and ppl for each seq + assert batch_input.shape[1] == 1 + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + self.model.to(self.device) + if isinstance(self.model, TransformerModel): + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntokens) + else: + hidden = self.model.init_hidden(batch_input.shape[1]) + prediction, _ = self.model(batch_input, hidden) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... + target = batch_target.view(-1) + loss = self.criterion(prediction, target).item() + ppl = math.exp(loss) + tokens_ppl.append(ppl) + tokens_loss.append(loss) + tokens_counts.append(len(target)) + word_ppl = self.compute_words_ppl(tokens_loss, tokens_counts, + word_counts) + token_ppl = np.mean(tokens_ppl) + logging.info('token_ppl: {}, word_ppl: {}'.format(token_ppl, word_ppl)) + return word_ppl, token_ppl diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index dba4c443..e0c443a9 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -23,13 +23,13 @@ text_dir=data/nnlm/text all_train_text=$text_dir/librispeech.txt # there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. # Now only $train_pieces data is used for debugging pipeline -train_pieces=100000 # 5 times of dev.txt +train_pieces=300000 # 15 times of dev.txt # uncomment follwoing line to use all_train_text # train_pieces= dev_text=$text_dir/dev.txt # vocab_size of huggingface tokenizer -vocab_size=2000 +vocab_size=3000 # for neural models, number of final classes is: # ntokens = $vocab_size + 3 # while: bos_id = ntokens - 3 @@ -39,13 +39,13 @@ vocab_size=2000 mkdir -p $text_dir -if [ $stage -eq -1 ]; then +if [ $stage -le -1 ]; then # env for experiment ../simple_v1 is expected to have been built. 
echo "Install extra dependencies" pip install -r requirements.txt fi -if [ $stage -eq 0 ]; then +if [ $stage -le 0 ]; then # reference: # https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 # use the same data seperation method to kaldi whose result can be used as a baseline @@ -67,7 +67,7 @@ else train_text=$all_train_text fi -if [ $stage -eq 1 ]; then +if [ $stage -le 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ --train-file $train_text \ @@ -76,7 +76,7 @@ if [ $stage -eq 1 ]; then fi -if [ $stage -eq 2 ]; then +if [ $stage -le 2 ]; then echo "tokenize train and dev files" for text in $dev_text $train_text; do python3 local/huggingface_tokenizer.py \ @@ -85,19 +85,30 @@ if [ $stage -eq 2 ]; then done fi -if [ $stage -eq 3 ]; then +if [ $stage -le 3 ]; then echo "start to train" # model_iter if for resume training # -1 means train from scratch python main.py \ - --model_iter 48 \ + --model_iter -1 \ --train_token ${train_text}.tokens \ --vocab_size $vocab_size \ --model_type Transformer fi -if [ $stage -eq 4 ]; then +if [ $stage -le 4 ]; then + echo "start to train" + # model_iter if for resume training + # -1 means train from scratch + python compute_word_ppl.py \ + --model_iter 40 \ + --vocab_size $vocab_size \ + --model_type Transformer + +fi + +if [ $stage -le 5 ]; then # generate words.txt tokens.txt and lexicion.txt # which is used in future rescore process lexicon_path=./data/nnlm/lexicon From a4d5f1b2051397073a3720f8f62112afd838e247 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 3 Apr 2021 08:50:25 +0800 Subject: [PATCH 15/25] add results.md --- egs/librispeech/asr/nnlm/RESULTS.md | 11 +++++++++++ egs/librispeech/asr/nnlm/local/trainer.py | 11 ++++++++--- egs/librispeech/asr/nnlm/main.py | 4 +++- egs/librispeech/asr/nnlm/run.sh | 3 ++- 4 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/RESULTS.md diff --git a/egs/librispeech/asr/nnlm/RESULTS.md b/egs/librispeech/asr/nnlm/RESULTS.md new file mode 100644 index 00000000..1db693ed --- /dev/null +++ b/egs/librispeech/asr/nnlm/RESULTS.md @@ -0,0 +1,11 @@ +##tokens ppl train_pieces=300000 # 15 times of dev.txt + +###vocab_size=2000 + epochs=50 train/dev perplexity was 80.0 / 119.0 + +###vocab_size=3000 + dev perplexity of random initialized model is around 2998.13 + epochs=1 train/dev perplexity was around 120 / 137.67 + epochs=2 train/dev perplexity was around 113 / 132.51 + epochs=3 train/dev perplexity was around 111 / 130.09 + epochs=4 train/dev perplexity was around 109 / 130.29 diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 24eb2546..dc9b6e1e 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -62,17 +62,22 @@ def __init__(self, self.num_infinite_grad_norm = 0 def run(self): - for epoch in range(self.num_epochs): + # save and eval initialized moel + if 0 == self.epoch: + save_checkpoint("{}/epoch_0.pt".format(self.model_dir), self.model) + self.eval() + + for epoch in range(self.epoch, self.num_epochs): if self.train_data_loader is not None: self.train() + self.epoch += 1 if self.dev_data_loader is not None: self.eval() + save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, epoch), self.model) - self.epoch += 1 - def train(self): self.model.train() total_loss = 0 diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 2fb6bc6a..6fed85e6 100644 --- 
a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -101,7 +101,7 @@ def main(): train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, - num_workers=0, + num_workers=10, drop_last=True, collate_fn=collate_func) @@ -131,6 +131,7 @@ def main(): writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) + log_interval = max(100, len(train_data_loader) // 20) trainer = Trainer(device, model, criterion, @@ -142,6 +143,7 @@ def main(): epoch=args.model_iter + 1, num_epochs=args.num_epochs, clip=args.clip, + log_interval=log_interval, model_dir=args.model_dir, writer=writer) trainer.run() diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index e0c443a9..bf86de2f 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -98,7 +98,8 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - echo "start to train" + # TODO: this module is in developing + echo "compute word ppl from token ppl" # model_iter if for resume training # -1 means train from scratch python compute_word_ppl.py \ From 53e2d1e330da2cb0fef910c6e7389f97654f4640 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 3 Apr 2021 23:57:26 +0800 Subject: [PATCH 16/25] compute word_ppl from token_ppl --- egs/librispeech/asr/nnlm/local/trainer.py | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index dc9b6e1e..9520ef9f 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -179,16 +179,6 @@ def get_word_counts(self, dev_txt: str): return word_counts - def compute_words_ppl(self, tokens_loss, tokens_counts, word_counts): - assert len(tokens_loss) == len(tokens_counts) - assert len(word_counts) == len(tokens_counts) - words_ppl = [ - math.exp(tokens_loss[i] * tokens_counts[i] / word_counts[i]) - for i in range(len(word_counts)) - ] - word_ppl = np.mean(words_ppl) - return word_ppl - @torch.no_grad() def get_word_ppl(self, dev_txt: str): word_counts = self.get_word_counts(dev_txt) @@ -227,8 +217,18 @@ def get_word_ppl(self, dev_txt: str): tokens_ppl.append(ppl) tokens_loss.append(loss) tokens_counts.append(len(target)) - word_ppl = self.compute_words_ppl(tokens_loss, tokens_counts, - word_counts) - token_ppl = np.mean(tokens_ppl) + + assert len(tokens_loss) == len(tokens_counts) + assert len(word_counts) == len(tokens_counts) + sentence_log_prob = [ + tokens_loss[i] * tokens_counts[i] + for i in range(len(tokens_counts)) + ] + total_log_prob = np.sum(sentence_log_prob) + total_words = np.sum(word_counts) + total_tokens = np.sum(tokens_counts) + + word_ppl = math.exp(total_log_prob / total_words) + token_ppl = math.exp(total_log_prob / total_tokens) logging.info('token_ppl: {}, word_ppl: {}'.format(token_ppl, word_ppl)) return word_ppl, token_ppl From b226a3a94a131f4af87460f96f5bc3b3ffd792b7 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 11:44:31 +0800 Subject: [PATCH 17/25] support yaml configuration --- .../asr/nnlm/conf/lm_small_transformer.yaml | 43 +++++ .../asr/nnlm/conf/lm_transformer.yaml | 44 +++++ egs/librispeech/asr/nnlm/local/dataset.py | 9 +- egs/librispeech/asr/nnlm/local/model.py | 123 ++++--------- egs/librispeech/asr/nnlm/local/trainer.py | 28 ++- egs/librispeech/asr/nnlm/main.py | 168 ++++++++---------- egs/librispeech/asr/nnlm/run.sh | 26 ++- 7 
files changed, 229 insertions(+), 212 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml create mode 100644 egs/librispeech/asr/nnlm/conf/lm_transformer.yaml diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml new file mode 100644 index 00000000..01bd5d05 --- /dev/null +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -0,0 +1,43 @@ + +gpu: 1 +tensorboard_dir: 'exp-nnlm/tensorobard' + +# network architecture equivalent configuration to +# https://github.com/pytorch/examples/blob/master/word_language_model/main.py +model_module: transformer +transformer_conf: + embed_unit: 200 + attention_heads: 2 + nlayers: 2 + linear_units: 200 + dropout: 0.2 + +shared_conf: + ntoken: 5003 + batch_size: 30 + +optimizer_conf: + lr: 0.02 + weight_decay: 0.005 + +trainer_conf: + num_epochs: 50 + clip: 0.25 + model_dir: './exp-nnlm/models/' + + +dataset_conf: + train_token: 'data/nnlm/text/300000_librispeech.txt.tokens' + dev_token: 'data/nnlm/text/dev.txt.tokens' + +dataloader_conf: + train: + batch_size: 20 + shuffle: True + num_workers: 10 + drop_last: True + dev: + batch_size: 20 + shuffle: False + num_workers: 10 + drop_last: False diff --git a/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml new file mode 100644 index 00000000..21851ded --- /dev/null +++ b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml @@ -0,0 +1,44 @@ +# modified from: +# https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/conf/tuning/lm_transformer.yaml + +gpu: 1 +tensorboard_dir: 'exp-nnlm/tensorobard' + +# network architecture +model_module: transformer +transformer_conf: + embed_unit: 128 + attention_heads: 8 + nlayers: 16 + linear_units: 2048 + dropout: 0.2 + +shared_conf: + ntoken: 5003 + batch_size: 30 + +optimizer_conf: + lr: 0.02 + weight_decay: 0.005 + +trainer_conf: + num_epochs: 50 + clip: 0.25 + model_dir: './exp-nnlm/models/' + + +dataset_conf: + train_token: 'data/nnlm/text/librispeech.txt.tokens' + dev_token: 'data/nnlm/text/dev.txt.tokens' + +dataloader_conf: + train: + batch_size: 20 + shuffle: True + num_workers: 10 + drop_last: True + dev: + batch_size: 20 + shuffle: False + num_workers: 10 + drop_last: False diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index be7904d7..e32db6fc 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -43,15 +43,15 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, ntokens=None): + def __init__(self, text_file: str, ntoken:int): '''Dataset to load Language Model train/dev text data Args: text_file: text file, text for one utt per line. 
''' - self.bos_id = ntokens - 3 - self.eos_id = ntokens - 2 - self.pad_index = ntokens - 1 + self.bos_id = ntoken - 3 + self.eos_id = ntoken - 2 + self.pad_index = ntoken - 1 assert os.path.exists( text_file ), "text_file: {} does not exist, please check that.".format(text_file) @@ -59,6 +59,7 @@ def __init__(self, text_file: str, ntokens=None): with open(text_file, 'r') as f: for idx, line in enumerate(f): token_id = [int(i) for i in line.strip().split()] + # https://github.com/espnet/espnet/blob/master/espnet/lm/lm_utils.py#L179 # add bos_id and eos_id to each piece of example # then each valid example should be longer than 2 token_id.insert(0, self.bos_id) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index bc3833dc..e9daeb52 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -6,86 +6,6 @@ import torch.nn.functional as F -class RNNModel(nn.Module): - """Container module with an encoder, a recurrent module, and a decoder.""" - - def __init__(self, - rnn_type, - ntoken, - ninp, - nhid, - nlayers, - dropout=0.5, - tie_weights=False): - super(RNNModel, self).__init__() - self.ntoken = ntoken - self.drop = nn.Dropout(dropout) - self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) - if rnn_type in ['LSTM', 'GRU']: - self.rnn = getattr(nn, rnn_type)(ninp, - nhid, - nlayers, - dropout=dropout) - else: - try: - nonlinearity = { - 'RNN_TANH': 'tanh', - 'RNN_RELU': 'relu' - }[rnn_type] - except KeyError: - raise ValueError( - """An invalid option for `--model` was supplied, - options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" - ) - self.rnn = nn.RNN(ninp, - nhid, - nlayers, - nonlinearity=nonlinearity, - dropout=dropout) - self.decoder = nn.Linear(nhid, ntoken) - - # Optionally tie weights as in: - # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) - # https://arxiv.org/abs/1608.05859 - # and - # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016) - # https://arxiv.org/abs/1611.01462 - if tie_weights: - if nhid != ninp: - raise ValueError( - 'When using the tied flag, nhid must be equal to emsize') - self.decoder.weight = self.encoder.weight - - self.init_weights() - - self.rnn_type = rnn_type - self.nhid = nhid - self.nlayers = nlayers - - def init_weights(self): - initrange = 0.1 - nn.init.uniform_(self.encoder.weight, -initrange, initrange) - nn.init.zeros_(self.decoder.weight) - nn.init.uniform_(self.decoder.weight, -initrange, initrange) - - def forward(self, input, hidden): - # import pdb; pdb.set_trace() - emb = self.drop(self.encoder(input)) - output, hidden = self.rnn(emb, hidden) - output = self.drop(output) - decoded = self.decoder(output) - decoded = decoded.view(-1, self.ntoken) - return F.log_softmax(decoded, dim=1), hidden - - def init_hidden(self, bsz): - weight = next(self.parameters()) - if self.rnn_type == 'LSTM': - return (weight.new_zeros(self.nlayers, bsz, self.nhid), - weight.new_zeros(self.nlayers, bsz, self.nhid)) - else: - return weight.new_zeros(self.nlayers, bsz, self.nhid) - - # Temporarily leave PositionalEncoding module here. Will be moved somewhere else. 
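For reference, the `PositionalEncoding` class kept below precomputes the standard sinusoidal table, PE(pos, 2i) = sin(pos / 10000^(2i / d_model)) and PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model)), and the forward pass adds it to the input by broadcasting over the batch dimension, as the new shape comments note. A small standalone shape check that mirrors that buffer construction, with toy values for `d_model` and `max_len`:

```python
import math
import torch

d_model, max_len = 8, 16                          # toy values
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)   # [max_len, 1]
div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                     (-math.log(10000.0) / d_model))                  # [d_model / 2]
pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)      # even channels
pe[:, 1::2] = torch.cos(position * div_term)      # odd channels
pe = pe.unsqueeze(0).transpose(0, 1)              # [max_len, 1, d_model]

x = torch.zeros(10, 4, d_model)                   # [seq_len, batch_size, d_model]
y = x + pe[:x.size(0), :]                         # broadcasts over the batch dimension
print(y.shape)                                    # torch.Size([10, 4, 8])
```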
class PositionalEncoding(nn.Module): r"""Inject some information about the relative or absolute position of the tokens @@ -115,6 +35,7 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) + # pe: [max_len, 1, d_model] pe = pe.unsqueeze(0).transpose(0, 1) self.register_buffer('pe', pe) @@ -129,6 +50,9 @@ def forward(self, x): >>> output = pos_encoder(x) """ + # x: [seq_len, batch_size, d_model] + # self.pe: [max_len, 1, d_model] + # add with broadcasting x = x + self.pe[:x.size(0), :] return self.dropout(x) @@ -136,7 +60,24 @@ def forward(self, x): class TransformerModel(nn.Module): """Container module with an encoder, a recurrent or transformer module, and a decoder.""" - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): + def __init__(self, + ntoken: int, + embed_unit: int, + attention_heads: int, + linear_units: int, + nlayers: int, + dropout: float = 0.5): + ''' + ntoken: usually vocab_size + 3; 1 for , 1 for , 1 for + embed_unit: the number of input channels + attention_heads: parallel attention attention_headss + linear_units: the dimension of the feedforward network model. + feedforward contains two Linear modules. + self.linear1 = Linear(d_model, dim_feedforward) + self.linear2 = Linear(dim_feedforward, d_model) + so for a torch.nn.TransformerEncoder layer, the output dimension equals to input_dimension. + + ''' super(TransformerModel, self).__init__() try: from torch.nn import TransformerEncoder, TransformerEncoderLayer @@ -144,14 +85,18 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): raise ImportError( 'TransformerEncoder module does not exist in PyTorch 1.1 or lower.' ) + attention_head_dim = embed_unit / attention_heads + assert attention_head_dim * attention_heads == embed_unit, "embed_dim must be divisible by num_attention_headss" + self.model_type = 'Transformer' self.src_mask = None - self.pos_encoder = PositionalEncoding(ninp, dropout) - encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.pos_encoder = PositionalEncoding(embed_unit, dropout) + encoder_layers = TransformerEncoderLayer(embed_unit, attention_heads, + linear_units, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.encoder = nn.Embedding(ntoken, ninp, padding_idx=ntoken - 1) - self.ninp = ninp - self.decoder = nn.Linear(ninp, ntoken) + self.encoder = nn.Embedding(ntoken, embed_unit, padding_idx=ntoken - 1) + self.embed_unit = embed_unit + self.decoder = nn.Linear(embed_unit, ntoken) self.init_weights() @@ -168,7 +113,7 @@ def init_weights(self): nn.init.uniform_(self.decoder.weight, -initrange, initrange) def forward(self, src, has_mask=True): - # src: [seq—len, batch_size] + # src: [seq_len, batch_size] # len(src) is seq_len if has_mask: device = src.device @@ -189,7 +134,9 @@ def forward(self, src, has_mask=True): # [0., 0., 0., ..., 0., 0., -inf], # [0., 0., 0., ..., 0., 0., 0.]], device='cuda:0') - src = self.encoder(src) * math.sqrt(self.ninp) + # after self.encoder + # src: [seq_len, batch_size, channel] + src = self.encoder(src) * math.sqrt(self.embed_unit) src = self.pos_encoder(src) output = self.transformer_encoder(src, self.src_mask) output = self.decoder(output) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 9520ef9f..d25bbe10 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ 
b/egs/librispeech/asr/nnlm/local/trainer.py @@ -9,7 +9,7 @@ import torch from common import load_checkpoint, save_checkpoint -from model import TransformerModel, RNNModel +from model import TransformerModel # references: # https://github.com/Hiroshiba/pytorch-trainer/blob/master/pytorch_trainer/training/trainer.py @@ -36,8 +36,7 @@ def __init__(self, optimizer=None, train_data_loader=None, dev_data_loader=None, - ntokens=None, - batch_size=1, + ntoken=None, epoch=0, num_epochs=10, clip=0.25, @@ -48,8 +47,7 @@ def __init__(self, self.model = model self.criterion = criterion self.optimizer = optimizer - self.ntokens = ntokens - self.batch_size = batch_size + self.ntoken = ntoken self.epoch = epoch self.num_epochs = num_epochs self.train_data_loader = train_data_loader @@ -67,16 +65,13 @@ def run(self): save_checkpoint("{}/epoch_0.pt".format(self.model_dir), self.model) self.eval() - for epoch in range(self.epoch, self.num_epochs): + for _ in range(self.epoch, self.num_epochs): if self.train_data_loader is not None: self.train() - self.epoch += 1 if self.dev_data_loader is not None: self.eval() - save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, epoch), - self.model) def train(self): self.model.train() @@ -86,15 +81,13 @@ def train(self): # batch_input, batch_target: [max_seq_len, batch_size] # max_seq_len is the maximum lenght in current batch batch_input, batch_target = batch - assert batch_input.shape[1] == self.batch_size - assert batch_target.shape[1] == self.batch_size batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) self.model.to(self.device) if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntokens) + prediction = batch_output.view(-1, self.ntoken) else: # reinitiate hidden for everch batch # as batches are independent on each other @@ -128,12 +121,17 @@ def train(self): logging.info('infinite grad_norm detected {} times'.format( self.num_infinite_grad_norm)) total_loss = 0.0 - if batch_idx % 10000 == 0 and batch_idx > 0: save_checkpoint( "{}/epoch_{}-batch_{}.pt".format(self.model_dir, self.epoch, batch_idx), self.model) + save_checkpoint( + "{}/epoch_{}.pt".format(self.model_dir, self.epoch), + self.model) + + self.epoch += 1 + @torch.no_grad() def eval(self): self.model.eval() @@ -152,7 +150,7 @@ def eval(self): if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntokens) + prediction = batch_output.view(-1, self.ntoken) else: hidden = self.model.init_hidden(batch_input.shape[1]) prediction, _ = self.model(batch_input, hidden) @@ -205,7 +203,7 @@ def get_word_ppl(self, dev_txt: str): if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntokens) + prediction = batch_output.view(-1, self.ntoken) else: hidden = self.model.init_hidden(batch_input.shape[1]) prediction, _ = self.model(batch_input, hidden) diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 6fed85e6..710b8898 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -4,6 +4,7 @@ # Apache 2.0 # Reference: +# https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py # https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py import argparse @@ -13,139 +14,126 @@ import torch.nn as nn import torch.optim as optim import sys +import yaml sys.path.insert(0, './local/') from common 
import load_checkpoint from dataset import LMDataset, CollateFunc -from model import TransformerModel, RNNModel +from model import TransformerModel from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter from torch.utils.data import DataLoader +from typing import List, Dict def get_args(): parser = argparse.ArgumentParser( description='training Neural Language Model') - parser.add_argument('--train_token', - default='data/nnlm/text/librispeech.txt.tokens', - help='train data file') - parser.add_argument('--dev_token', - default='data/nnlm/text/dev.txt.tokens', - help='dev data file') - parser.add_argument('--batch_size', type=int, default=60) - parser.add_argument('--vocab_size', type=int, default=10000) - parser.add_argument('--emsize', type=int, default=200) - parser.add_argument('--nhead', type=int, default=2) - parser.add_argument('--nhid', type=int, default=200) - parser.add_argument('--nlayers', type=int, default=2) - parser.add_argument('--num_epochs', type=int, default=50) - parser.add_argument('--dropout', type=int, default=0.2) - parser.add_argument('--lr', - type=float, - default=1e-2, - help='initial learning rate') - parser.add_argument('--clip', - type=float, - default=50.0, - help='gradient clipping') - parser.add_argument('--model_dir', - default='./exp-nnlm/models/', - help='path to save model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='path to save tensorboard log') - parser.add_argument('--gpu', + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--vocab_size', type=int, default=3000) + parser.add_argument('--resume_model_iter', type=int, - default=1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument( - '--model_iter', - type=int, - default=-1, - help='resume from trained model; if -1 training from scratch') - parser.add_argument('--model_type', - type=str, - default='Transformer', - help='model type') + default=-1, + help='resume from trained model;') args = parser.parse_args() return args +def validate_configs(configs: Dict, required_fields: List) -> bool: + not_exist_fields = [] + for field in required_fields: + if field not in configs or configs[field] is None: + not_exist_fields.append(field) + if len(not_exist_fields) > 0: + assert False, 'set following required fields {}'.format( + ' '.join(not_exist_fields)) + return True + + +def extract_configs(args) -> Dict: + assert os.path.exists(args.config), '{} does not exist'.format(args.cofnig) + required_fields = [ + 'model_module', 'shared_conf', 'optimizer_conf', 'trainer_conf', + 'dataset_conf' + ] + with open(args.config, 'r') as f: + configs = yaml.load(f, Loader=yaml.FullLoader) + validate_configs(configs, required_fields) + + model_conf = '{}_conf'.format(configs['model_module']) + ntoken = configs['shared_conf']['ntoken'] + + configs[model_conf]['ntoken'] = ntoken + configs['trainer_conf']['ntoken'] = ntoken + + assert 'model_dir' in configs['trainer_conf'] + model_dir = configs['trainer_conf']['model_dir'] + Path(os.path.dirname(model_dir)).mkdir(parents=True, exist_ok=True) + + return configs + + def main(): args = get_args() + configs = extract_configs(args) logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') # Set random seed torch.manual_seed(2021) - # args.vocab_size: number of tokens in tokenizer.get_vocab - # + 2: one for eos_id, another for pad_idx - # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] - # bos_id: ntokens - 3 - # eos_id: ntokens - 2 - # pad_idx: ntokens - 1 - ntokens = args.vocab_size + 3 - pad_index = ntokens - 1 + ntoken = args.vocab_size + 3 + assert ntoken == configs['shared_conf']['ntoken'] + + # Data + pad_index = ntoken - 1 collate_func = CollateFunc(pad_index=pad_index) - train_dataset = LMDataset(args.train_token, ntokens=ntokens) - dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) + train_dataset = LMDataset(configs['dataset_conf']['train_token'], + ntoken=ntoken) + dev_dataset = LMDataset(configs['dataset_conf']['dev_token'], + ntoken=ntoken) - # To debug dataset.py, set shuffle=False and num_workers=0 - # then examples will be loaded as the sequence they are in {train, dev}.tokens train_data_loader = DataLoader(train_dataset, - batch_size=args.batch_size, - shuffle=True, - num_workers=10, - drop_last=True, - collate_fn=collate_func) + collate_fn=collate_func, + **configs['dataloader_conf']['train']) dev_data_loader = DataLoader(dev_dataset, - batch_size=20, - shuffle=False, - num_workers=0, - drop_last=True, - collate_fn=collate_func) - - if 'Trasformer' == args.model_type: - model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout) - else: - model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, - args.dropout, False) - - if args.model_iter > 0: - model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) + collate_fn=collate_func, + **configs['dataloader_conf']['dev']) + + # initialize or resume model + if configs['model_module'] == 'transformer': + model = TransformerModel(**configs['transformer_conf']) + + if args.resume_model_iter > 0: + model_dir = configs['trainer_conf']['model_dir'] + model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) + assert os.path.exists(model_path) load_checkpoint(model_path, model) - optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() + + optimizer = optim.AdamW(model.parameters(), **configs['optimizer_conf']) + use_cuda = configs['gpu'] >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') - print(device) criterion = nn.NLLLoss(ignore_index=pad_index) - exp_dir = 'exp-nnlm' - writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') - Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) + writer = SummaryWriter(log_dir=configs['tensorboard_dir']) + log_interval = max(100, len(train_data_loader) // 20) - trainer = Trainer(device, - model, - criterion, - optimizer, + trainer = Trainer(device=device, + model=model, + criterion=criterion, + optimizer=optimizer, train_data_loader=train_data_loader, dev_data_loader=dev_data_loader, - ntokens=ntokens, - batch_size=args.batch_size, - epoch=args.model_iter + 1, - num_epochs=args.num_epochs, - clip=args.clip, + epoch=args.resume_model_iter + 1, log_interval=log_interval, - model_dir=args.model_dir, - writer=writer) + writer=writer, + **configs['trainer_conf']) trainer.run() diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index bf86de2f..ffa82e8a 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -3,14 +3,8 @@ # Copyright 2020 Xiaomi Corporation (Author: Liyong Guo) # Apache 2.0 -# References: -# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/train_rnnlm.sh -# 
https://github.com/kaldi-asr/kaldi/blob/pybind11/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh#L75 -# https://github.com/kaldi-asr/kaldi/blob/master/scripts/rnnlm/prepare_rnnlm_dir.sh -# https://github.com/pytorch/examples/tree/master/word_language_model -# https://huggingface.co/docs/tokenizers/python/latest/quicktour.html -# Example of how to use HuggingFace tokenizer and train {RNN, Transformer} based LMs +# Example of how to use HuggingFace tokenizer and train Transformer based LMs set -e stage=$1 @@ -22,20 +16,22 @@ text=data/local/lm/librispeech-lm-norm.txt.gz text_dir=data/nnlm/text all_train_text=$text_dir/librispeech.txt # there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. -# Now only $train_pieces data is used for debugging pipeline +# use $train_pieces data to validate pipeline train_pieces=300000 # 15 times of dev.txt # uncomment follwoing line to use all_train_text # train_pieces= dev_text=$text_dir/dev.txt # vocab_size of huggingface tokenizer -vocab_size=3000 +vocab_size=5000 # for neural models, number of final classes is: # ntokens = $vocab_size + 3 # while: bos_id = ntokens - 3 # eos_id = ntokens - 2 # pad_index = ntokens - 1 +# lm_config=conf/lm_transformer.yaml +lm_config=conf/lm_small_transformer.yaml mkdir -p $text_dir @@ -60,13 +56,14 @@ fi if [ ! -z "$train_pieces" ]; then train_text=$text_dir/${train_pieces}_librispeech.txt if [ $train_text -ot $all_train_text ] || [ ! -f $train_text ]; then - # if [ ! -f $train_text) || $train_text -ot $all_train_text ]; then head -n $train_pieces $all_train_text > $train_text fi else train_text=$all_train_text fi +# Reference: huggingface tokenizer +# https://huggingface.co/docs/tokenizers/python/latest/quicktour.html if [ $stage -le 1 ]; then echo "training tokenizer" python3 local/huggingface_tokenizer.py \ @@ -87,13 +84,12 @@ fi if [ $stage -le 3 ]; then echo "start to train" - # model_iter if for resume training + # resume_model_iter is for resume training # -1 means train from scratch python main.py \ - --model_iter -1 \ - --train_token ${train_text}.tokens \ - --vocab_size $vocab_size \ - --model_type Transformer + --config $lm_config \ + --vocab_size $vocab_size + --resume_model_iter -1 fi From 89ece61a617d4ae2b326ed8327c8182ecd7677ca Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 11:57:19 +0800 Subject: [PATCH 18/25] update results with nvocab=5000 --- egs/librispeech/asr/nnlm/RESULTS.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/egs/librispeech/asr/nnlm/RESULTS.md b/egs/librispeech/asr/nnlm/RESULTS.md index 1db693ed..0018015f 100644 --- a/egs/librispeech/asr/nnlm/RESULTS.md +++ b/egs/librispeech/asr/nnlm/RESULTS.md @@ -1,4 +1,4 @@ -##tokens ppl train_pieces=300000 # 15 times of dev.txt +##tokens ppl with train_pieces=300000 # 15 times of dev.txt ###vocab_size=2000 epochs=50 train/dev perplexity was 80.0 / 119.0 @@ -9,3 +9,11 @@ epochs=2 train/dev perplexity was around 113 / 132.51 epochs=3 train/dev perplexity was around 111 / 130.09 epochs=4 train/dev perplexity was around 109 / 130.29 + +###vocab_size=5000 + dev perplexity of random initialized model is around 6844.12 + epochs=1 train/dev perplexity was around 898 / 984.12 + epochs=2 train/dev perplexity was around 964 / 982.52 + epochs=3 train/dev perplexity was around 908 / 1020.44 + epochs=4 train/dev perplexity was around 914 / 1030.31 + epochs=4 train/dev perplexity was around 916 / 975.74 From c3f88116e0bf3c25b13b920672dc6b75b1c64cb9 Mon 
Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 17:53:13 +0800 Subject: [PATCH 19/25] fix reviews --- egs/librispeech/asr/nnlm/local/common.py | 2 +- egs/librispeech/asr/nnlm/local/dataset.py | 11 +++++----- .../asr/nnlm/local/generate_lexicon.py | 21 ++++++++++++------- egs/librispeech/asr/nnlm/local/model.py | 5 +++-- egs/librispeech/asr/nnlm/local/trainer.py | 8 +++---- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py index f561cf1d..770ae6e1 100644 --- a/egs/librispeech/asr/nnlm/local/common.py +++ b/egs/librispeech/asr/nnlm/local/common.py @@ -12,7 +12,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union Pathlike = Union[str, Path] -Info = Union[dict, None] +Info = Optional[dict] def load_checkpoint(filename: Pathlike, diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index e32db6fc..f932b197 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -43,7 +43,7 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, ntoken:int): + def __init__(self, text_file: str, ntoken: int): '''Dataset to load Language Model train/dev text data Args: @@ -57,15 +57,16 @@ def __init__(self, text_file: str, ntoken:int): ), "text_file: {} does not exist, please check that.".format(text_file) self.data = [] with open(text_file, 'r') as f: - for idx, line in enumerate(f): + for line in f: token_id = [int(i) for i in line.strip().split()] + # Empty line exists in librispeech.txt. Disregrad that. + if len(token_id) == 0: + continue # https://github.com/espnet/espnet/blob/master/espnet/lm/lm_utils.py#L179 # add bos_id and eos_id to each piece of example - # then each valid example should be longer than 2 token_id.insert(0, self.bos_id) token_id.append(self.eos_id) - if len(token_id) > 2: - self.data.append(token_id) + self.data.append(token_id) def __len__(self): return len(self.data) diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index 3b50ecf5..e542b00d 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -30,16 +30,21 @@ def get_args(): def generate_tokens(args): - ''' Extract symbols and there corresponding ids from a tokenizer, + ''' Extract symbols and the corresponding ids from a tokenizer, and save as tokens.txt. - An example file looks like: - a 1 - b 2 - c 3 + A real token.txt with nvocab=10000 is: + [unk] 0 + ' 1 + a 2 + b 3 + c 4 ... 
- it 100 - sh 101 - + patty 9994 + neatly 9995 + stormy 9996 + daddy 9997 + ##enon 9998 + remarkably 9999 ''' tokenizer = Tokenizer.from_file(args.tokenizer_path) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index e9daeb52..fd37601f 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -88,7 +88,6 @@ def __init__(self, attention_head_dim = embed_unit / attention_heads assert attention_head_dim * attention_heads == embed_unit, "embed_dim must be divisible by num_attention_headss" - self.model_type = 'Transformer' self.src_mask = None self.pos_encoder = PositionalEncoding(embed_unit, dropout) encoder_layers = TransformerEncoderLayer(embed_unit, attention_heads, @@ -135,9 +134,11 @@ def forward(self, src, has_mask=True): # [0., 0., 0., ..., 0., 0., 0.]], device='cuda:0') # after self.encoder - # src: [seq_len, batch_size, channel] + # src: [seq_len, batch_size, embed_unit] src = self.encoder(src) * math.sqrt(self.embed_unit) src = self.pos_encoder(src) + + # output: [seq_len, batch_size, ntoken] output = self.transformer_encoder(src, self.src_mask) output = self.decoder(output) return F.log_softmax(output, dim=-1) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index d25bbe10..6269db7e 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -58,6 +58,7 @@ def __init__(self, self.clip = clip self.model_dir = model_dir self.num_infinite_grad_norm = 0 + self.model.to(device) def run(self): # save and eval initialized moel @@ -72,7 +73,6 @@ def run(self): if self.dev_data_loader is not None: self.eval() - def train(self): self.model.train() total_loss = 0 @@ -83,7 +83,6 @@ def train(self): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) - self.model.to(self.device) if isinstance(self.model, TransformerModel): batch_output = self.model(batch_input) @@ -126,9 +125,8 @@ def train(self): self.epoch, batch_idx), self.model) - save_checkpoint( - "{}/epoch_{}.pt".format(self.model_dir, self.epoch), - self.model) + save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, self.epoch), + self.model) self.epoch += 1 From d1b803b24f28340e79cc2a09cde140958d795668 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Fri, 9 Apr 2021 18:28:35 +0800 Subject: [PATCH 20/25] fixed reviews --- egs/librispeech/asr/nnlm/local/dataset.py | 22 ++++++++++++++----- .../asr/nnlm/local/generate_lexicon.py | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py index f932b197..e4f59616 100644 --- a/egs/librispeech/asr/nnlm/local/dataset.py +++ b/egs/librispeech/asr/nnlm/local/dataset.py @@ -35,7 +35,7 @@ def __call__(self, batch: List[List[int]]): self.pad_index) data_pad = data_pad.t().contiguous() # xs_pad, ys_pad: [max_seq_len, batch_size] - # max_seq_len is the maximum lenght in current batch + # max_seq_len is the maximum length in current batch xs_pad = data_pad[:-1, :] ys_pad = data_pad[1:, :] return xs_pad, ys_pad @@ -43,20 +43,30 @@ def __call__(self, batch: List[List[int]]): class LMDataset(Dataset): - def __init__(self, text_file: str, ntoken: int): + def __init__(self, token_file: str, ntoken: int): '''Dataset to load Language Model train/dev text data Args: - text_file: text file, text for one utt per line. 
+ token_file: each line is a tokenized text, looks like: + token_id token_id *** token_id token_id + + A real example is: + + 485 135 974 255 1220 33 35 377 + 2130 1960 + + when loaded, / is added to compose input/target + ''' self.bos_id = ntoken - 3 self.eos_id = ntoken - 2 self.pad_index = ntoken - 1 assert os.path.exists( - text_file - ), "text_file: {} does not exist, please check that.".format(text_file) + token_file + ), "token_file: {} does not exist, please check that.".format( + token_file) self.data = [] - with open(text_file, 'r') as f: + with open(token_file, 'r') as f: for line in f: token_id = [int(i) for i in line.strip().split()] # Empty line exists in librispeech.txt. Disregrad that. diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py index e542b00d..30249453 100644 --- a/egs/librispeech/asr/nnlm/local/generate_lexicon.py +++ b/egs/librispeech/asr/nnlm/local/generate_lexicon.py @@ -51,7 +51,7 @@ def generate_tokens(args): symbols = tokenizer.get_vocab() tokens_file = '{}/tokens.txt'.format(args.lexicon_path) tokens_f = open(tokens_file, 'w') - id2sym = dict((v, k.lower()) for k, v in symbols.items()) + id2sym = {idx: sym.lower() for sym, idx in symbols.items()} for idx in range(len(symbols)): assert idx in id2sym tokens_f.write('{} {}\n'.format(id2sym[idx], idx)) From c45d31fe1a7fa2c5aa9b47966b6032fe12c02f20 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 10 Apr 2021 12:45:17 +0800 Subject: [PATCH 21/25] support multi-gpu training with ddp --- egs/librispeech/asr/nnlm/compute_word_ppl.py | 2 +- .../asr/nnlm/conf/lm_small_transformer.yaml | 7 +- .../asr/nnlm/conf/lm_transformer.yaml | 7 +- egs/librispeech/asr/nnlm/local/common.py | 2 +- egs/librispeech/asr/nnlm/local/trainer.py | 81 ++++++++++--------- egs/librispeech/asr/nnlm/main.py | 27 +++++-- egs/librispeech/asr/nnlm/run.sh | 6 +- 7 files changed, 75 insertions(+), 57 deletions(-) diff --git a/egs/librispeech/asr/nnlm/compute_word_ppl.py b/egs/librispeech/asr/nnlm/compute_word_ppl.py index d7d2f47b..89f4d899 100644 --- a/egs/librispeech/asr/nnlm/compute_word_ppl.py +++ b/egs/librispeech/asr/nnlm/compute_word_ppl.py @@ -18,7 +18,7 @@ from common import load_checkpoint from dataset import LMDataset, CollateFunc -from model import TransformerModel, RNNModel +from model import TransformerModel from pathlib import Path from trainer import Trainer from torch.utils.tensorboard import SummaryWriter diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 01bd5d05..903b5302 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -14,7 +14,6 @@ transformer_conf: shared_conf: ntoken: 5003 - batch_size: 30 optimizer_conf: lr: 0.02 @@ -32,12 +31,10 @@ dataset_conf: dataloader_conf: train: - batch_size: 20 - shuffle: True + batch_size: 60 num_workers: 10 drop_last: True dev: - batch_size: 20 - shuffle: False + batch_size: 60 num_workers: 10 drop_last: False diff --git a/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml index 21851ded..eaeb28b6 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_transformer.yaml @@ -15,7 +15,6 @@ transformer_conf: shared_conf: ntoken: 5003 - batch_size: 30 optimizer_conf: lr: 0.02 @@ -33,12 +32,10 @@ dataset_conf: dataloader_conf: train: - batch_size: 20 - shuffle: True + 
batch_size: 60 num_workers: 10 drop_last: True dev: - batch_size: 20 - shuffle: False + batch_size: 60 num_workers: 10 drop_last: False diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py index 770ae6e1..365ab964 100644 --- a/egs/librispeech/asr/nnlm/local/common.py +++ b/egs/librispeech/asr/nnlm/local/common.py @@ -34,7 +34,7 @@ def save_checkpoint(filename: Pathlike, Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True) logging.info(f'Save checkpoint to {filename}') checkpoint = { - 'state_dict': model.state_dict(), + 'state_dict': model.module.state_dict(), } if info is not None: checkpoint.update(info) diff --git a/egs/librispeech/asr/nnlm/local/trainer.py b/egs/librispeech/asr/nnlm/local/trainer.py index 6269db7e..f03bb1f5 100644 --- a/egs/librispeech/asr/nnlm/local/trainer.py +++ b/egs/librispeech/asr/nnlm/local/trainer.py @@ -7,6 +7,7 @@ import math import numpy as np import torch +import torch.distributed as dist from common import load_checkpoint, save_checkpoint from model import TransformerModel @@ -44,7 +45,6 @@ def __init__(self, model_dir="exp-nnlm/models/", writer=None): self.device = device - self.model = model self.criterion = criterion self.optimizer = optimizer self.ntoken = ntoken @@ -58,12 +58,16 @@ def __init__(self, self.clip = clip self.model_dir = model_dir self.num_infinite_grad_norm = 0 - self.model.to(device) + self.model = model + self.world_size = dist.get_world_size() + self.local_rank = dist.get_rank() def run(self): # save and eval initialized moel if 0 == self.epoch: - save_checkpoint("{}/epoch_0.pt".format(self.model_dir), self.model) + if self.local_rank == 0: + save_checkpoint("{}/epoch_0.pt".format(self.model_dir), + self.model) self.eval() for _ in range(self.epoch, self.num_epochs): @@ -83,15 +87,8 @@ def train(self): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) - if isinstance(self.model, TransformerModel): - batch_output = self.model(batch_input) - - prediction = batch_output.view(-1, self.ntoken) - else: - # reinitiate hidden for everch batch - # as batches are independent on each other - hidden = self.model.init_hidden(batch_input.shape[1]) - prediction, _ = self.model(batch_input, hidden) + batch_output = self.model(batch_input) + prediction = batch_output.view(-1, self.ntoken) # target: [max_seq_len * batch_size] # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
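The trainer hunks that follow save checkpoints only on rank 0 (note the switch to `model.module.state_dict()` in `common.py` above, which unwraps the DDP wrapper) and combine the per-rank dev statistics with `dist.all_gather` before computing perplexity. A minimal single-process sketch of that aggregation pattern; the gloo backend, the address/port settings, and the numbers are assumptions for illustration only:

```python
import os
import torch
import torch.distributed as dist

# a single-process group, just to exercise the collective API
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

world_size = dist.get_world_size()
total_loss = torch.tensor([123.4])       # per-rank sum of loss * num_examples (made up)
total_examples = torch.tensor([32.0])    # per-rank number of dev examples (made up)

loss_list = [torch.zeros_like(total_loss) for _ in range(world_size)]
examples_list = [torch.zeros_like(total_examples) for _ in range(world_size)]
dist.all_gather(loss_list, total_loss)
dist.all_gather(examples_list, total_examples)

global_loss = sum(t.item() for t in loss_list)
global_examples = sum(t.item() for t in examples_list)
print(global_loss / global_examples)     # length-weighted dev loss over all ranks
dist.destroy_process_group()
```

A `dist.all_reduce` with `op=dist.ReduceOp.SUM` on each tensor would produce the same totals with less bookkeeping; the sketch keeps the `all_gather` form to match the code in the patch.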
@@ -117,24 +114,26 @@ def train(self): batch_idx, num_total_batch, cur_loss, math.exp(cur_loss), self.epoch) logging.info(log_str) - logging.info('infinite grad_norm detected {} times'.format( - self.num_infinite_grad_norm)) + if self.num_infinite_grad_norm > 0: + logging.info('infinite grad_norm detected {} times'.format( + self.num_infinite_grad_norm)) total_loss = 0.0 - save_checkpoint( - "{}/epoch_{}-batch_{}.pt".format(self.model_dir, - self.epoch, batch_idx), - self.model) - - save_checkpoint("{}/epoch_{}.pt".format(self.model_dir, self.epoch), - self.model) + if self.local_rank == 0: + save_checkpoint( + "{}/epoch_{}-batch_{}.pt".format( + self.model_dir, self.epoch, batch_idx), self.model) self.epoch += 1 + if self.local_rank == 0: + save_checkpoint( + "{}/epoch_{}.pt".format(self.model_dir, self.epoch), + self.model) @torch.no_grad() def eval(self): self.model.eval() - total_loss = 0.0 - total_examples = 0 + total_loss = torch.tensor([0.0]).to(self.device) + total_examples = torch.tensor([0.0]).to(self.device) for batch_idx, batch in enumerate(self.dev_data_loader): # batch_input: [seq_len, batch_size] # with contents: token_id token_id .... @@ -144,14 +143,9 @@ def eval(self): batch_input, batch_target = batch batch_input = batch_input.to(self.device) batch_target = batch_target.to(self.device) - self.model.to(self.device) - if isinstance(self.model, TransformerModel): - batch_output = self.model(batch_input) + batch_output = self.model(batch_input) - prediction = batch_output.view(-1, self.ntoken) - else: - hidden = self.model.init_hidden(batch_input.shape[1]) - prediction, _ = self.model(batch_input, hidden) + prediction = batch_output.view(-1, self.ntoken) # target: [max_seq_len * batch_size] # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
target = batch_target.view(-1) @@ -159,12 +153,27 @@ def eval(self): total_loss += loss * batch_input.shape[1] total_examples += batch_input.shape[1] - loss = total_loss / total_examples - ppl = math.exp(loss) - self.writer.add_scalar('dev_ppl', ppl, self.epoch) - log_str = 'dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( - loss.item(), ppl, self.epoch) - logging.info(log_str) + total_loss_list = [ + torch.zeros_like(total_loss) for _ in range(self.world_size) + ] + total_examples_list = [ + torch.zeros_like(total_examples) for _ in range(self.world_size) + ] + dist.all_gather(total_loss_list, total_loss) + dist.all_gather(total_examples_list, total_examples) + total_loss = 0 + total_examples = 0 + for loss, examples in zip(total_loss_list, total_examples_list): + total_loss += loss + total_examples += examples + + if self.local_rank == 0: + loss = total_loss / total_examples + ppl = math.exp(loss) + self.writer.add_scalar('dev_ppl', ppl, self.epoch) + log_str = 'dev examples: {} dev loss is {:.6f} and ppl {:.6f} at epoch {}'.format( + int(total_examples.item()), loss.item(), ppl, self.epoch) + logging.info(log_str) def get_word_counts(self, dev_txt: str): word_counts = [] diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 710b8898..7f7b9ae8 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -11,6 +11,7 @@ import logging import os import torch +import torch.distributed as dist import torch.nn as nn import torch.optim as optim import sys @@ -33,6 +34,7 @@ def get_args(): description='training Neural Language Model') parser.add_argument('--config', required=True, help='config file') parser.add_argument('--vocab_size', type=int, default=3000) + parser.add_argument('--local_rank', type=int, default=0) parser.add_argument('--resume_model_iter', type=int, default=-1, @@ -89,6 +91,11 @@ def main(): ntoken = args.vocab_size + 3 assert ntoken == configs['shared_conf']['ntoken'] + dist.init_process_group('nccl') + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + print(device) + # Data pad_index = ntoken - 1 collate_func = CollateFunc(pad_index=pad_index) @@ -98,27 +105,33 @@ def main(): dev_dataset = LMDataset(configs['dataset_conf']['dev_token'], ntoken=ntoken) + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, shuffle=True) train_data_loader = DataLoader(train_dataset, + sampler=train_sampler, collate_fn=collate_func, **configs['dataloader_conf']['train']) + dev_sampler = torch.utils.data.distributed.DistributedSampler( + dev_dataset, shuffle=False) dev_data_loader = DataLoader(dev_dataset, + sampler=dev_sampler, collate_fn=collate_func, **configs['dataloader_conf']['dev']) # initialize or resume model if configs['model_module'] == 'transformer': model = TransformerModel(**configs['transformer_conf']) + if args.resume_model_iter > 0: + model_dir = configs['trainer_conf']['model_dir'] + model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) + assert os.path.exists(model_path) + load_checkpoint(model_path, model) + model = torch.nn.parallel.DistributedDataParallel( + model.to(device), [args.local_rank]) - if args.resume_model_iter > 0: - model_dir = configs['trainer_conf']['model_dir'] - model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) - assert os.path.exists(model_path) - load_checkpoint(model_path, model) optimizer = optim.AdamW(model.parameters(), **configs['optimizer_conf']) - use_cuda = configs['gpu'] >= 
0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') criterion = nn.NLLLoss(ignore_index=pad_index) writer = SummaryWriter(log_dir=configs['tensorboard_dir']) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index ffa82e8a..b9cdaec5 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -86,9 +86,11 @@ if [ $stage -le 3 ]; then echo "start to train" # resume_model_iter is for resume training # -1 means train from scratch - python main.py \ + # python main.py \ + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python -m torch.distributed.launch --nproc_per_node=4 main.py \ --config $lm_config \ - --vocab_size $vocab_size + --vocab_size $vocab_size \ --resume_model_iter -1 fi From 1d38c218042fbdbfd40c9a7fd1036b5cb0396c71 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Wed, 14 Apr 2021 21:48:02 +0800 Subject: [PATCH 22/25] n-best rescoring result with 8-layer transformer lm --- egs/librispeech/asr/nnlm/RESULTS.md | 19 -- egs/librispeech/asr/nnlm/compute_word_ppl.py | 131 ++------ .../asr/nnlm/conf/lm_small_transformer.yaml | 26 +- egs/librispeech/asr/nnlm/local/evaluator.py | 246 ++++++++++++++ egs/librispeech/asr/nnlm/local/model.py | 3 + egs/librispeech/asr/nnlm/run.sh | 15 +- .../simple_v1/mmi_att_transformer_decode.py | 255 ++++++++++----- snowfall/decoding/lm_rescore.py | 306 ++++++++++++++++++ 8 files changed, 785 insertions(+), 216 deletions(-) create mode 100644 egs/librispeech/asr/nnlm/local/evaluator.py create mode 100644 snowfall/decoding/lm_rescore.py diff --git a/egs/librispeech/asr/nnlm/RESULTS.md b/egs/librispeech/asr/nnlm/RESULTS.md index 0018015f..e69de29b 100644 --- a/egs/librispeech/asr/nnlm/RESULTS.md +++ b/egs/librispeech/asr/nnlm/RESULTS.md @@ -1,19 +0,0 @@ -##tokens ppl with train_pieces=300000 # 15 times of dev.txt - -###vocab_size=2000 - epochs=50 train/dev perplexity was 80.0 / 119.0 - -###vocab_size=3000 - dev perplexity of random initialized model is around 2998.13 - epochs=1 train/dev perplexity was around 120 / 137.67 - epochs=2 train/dev perplexity was around 113 / 132.51 - epochs=3 train/dev perplexity was around 111 / 130.09 - epochs=4 train/dev perplexity was around 109 / 130.29 - -###vocab_size=5000 - dev perplexity of random initialized model is around 6844.12 - epochs=1 train/dev perplexity was around 898 / 984.12 - epochs=2 train/dev perplexity was around 964 / 982.52 - epochs=3 train/dev perplexity was around 908 / 1020.44 - epochs=4 train/dev perplexity was around 914 / 1030.31 - epochs=4 train/dev perplexity was around 916 / 975.74 diff --git a/egs/librispeech/asr/nnlm/compute_word_ppl.py b/egs/librispeech/asr/nnlm/compute_word_ppl.py index 89f4d899..5453395a 100644 --- a/egs/librispeech/asr/nnlm/compute_word_ppl.py +++ b/egs/librispeech/asr/nnlm/compute_word_ppl.py @@ -4,74 +4,45 @@ # Apache 2.0 # Reference: +# https://github.com/espnet/espnet/blob/master/espnet/lm/pytorch_backend/lm.py # https://github.com/mobvoi/wenet/blob/main/wenet/bin/train.py import argparse import logging import os import torch +import torch.distributed as dist import torch.nn as nn import torch.optim as optim import sys +import yaml sys.path.insert(0, './local/') from common import load_checkpoint -from dataset import LMDataset, CollateFunc -from model import TransformerModel +from evaluator import Evaluator +# from model import TransformerModel from pathlib import Path -from trainer import Trainer -from torch.utils.tensorboard import SummaryWriter -from torch.utils.data import DataLoader 
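# How "word ppl from token ppl" is obtained in this script: the LM is trained
# on subword tokens, so the accumulated negative log-likelihood of a text is
# normalized either by the token count or by the word count, i.e.
#   token_ppl = exp(total_loss / token_count)
#   word_ppl  = exp(total_loss / word_count)
# Since every word maps to at least one token (token_count >= word_count),
# word-level perplexity is always >= token-level perplexity.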
+from typing import List, Dict def get_args(): parser = argparse.ArgumentParser( - description='training Neural Language Model') - parser.add_argument('--train_token', - default='data/nnlm/text/librispeech.txt.tokens', - help='train token file') - parser.add_argument('--dev_token', - default='data/nnlm/text/dev.txt.tokens', - help='dev token file') - parser.add_argument('--dev_txt', - default='data/nnlm/text/dev.txt', - help='dev txt file, used to compute word ppl') - parser.add_argument('--batch_size', type=int, default=60) - parser.add_argument('--vocab_size', type=int, default=2000) - parser.add_argument('--emsize', type=int, default=200) - parser.add_argument('--nhead', type=int, default=2) - parser.add_argument('--nhid', type=int, default=200) - parser.add_argument('--nlayers', type=int, default=2) - parser.add_argument('--num_epochs', type=int, default=50) - parser.add_argument('--dropout', type=int, default=0.2) - parser.add_argument('--lr', - type=float, - default=1e-2, - help='initial learning rate') - parser.add_argument('--clip', - type=float, - default=50.0, - help='gradient clipping') - parser.add_argument('--model_dir', - default='./exp-nnlm/models/', - help='path to save model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='path to save tensorboard log') - parser.add_argument('--gpu', - type=int, - default=1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument( - '--model_iter', - type=int, - default=19, - help='resume from trained model; if -1 training from scratch') - parser.add_argument('--model_type', + description='compute token/word ppl of txt') + parser.add_argument('--config', + help='config file', + default='conf/lm_small_transformer.yaml') + parser.add_argument('--vocab_size', type=int, default=5000) + parser.add_argument('--model', type=str, - default='Transformer', - help='model type') + default='exp-nnlm/models/epoch_30.pt', + help='full path of loaded model') + parser.add_argument('--tokenizer_path', + type=str, + default='exp-nnlm/tokenizer-librispeech.json') + parser.add_argument('--txt_file', + type=str, + default='data/nnlm/text/dev.txt') args = parser.parse_args() @@ -85,60 +56,16 @@ def main(): # Set random seed torch.manual_seed(2021) - # args.vocab_size: number of tokens in tokenizer.get_vocab - # + 2: one for eos_id, another for pad_idx - # i.e. 
token_idxs[0, 1, 2, ...., ntokens -3, ntokens - 2, ntokens - 1] - # bos_id: ntokens - 3 - # eos_id: ntokens - 2 - # pad_idx: ntokens - 1 - ntokens = args.vocab_size + 3 - pad_index = ntokens - 1 - - collate_func = CollateFunc(pad_index=pad_index) - - dev_dataset = LMDataset(args.dev_token, ntokens=ntokens) - - dev_data_loader = DataLoader(dev_dataset, - batch_size=1, - shuffle=False, - num_workers=0, - drop_last=False, - collate_fn=collate_func) - - if 'Trasformer' == args.model_type: - model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, - args.nlayers, args.dropout) - else: - model = RNNModel('LSTM', ntokens, args.emsize, args.nhid, args.nlayers, - args.dropout, False) - - if args.model_iter > 0: - model_path = '{}/epoch_{}.pt'.format(args.model_dir, args.model_iter) - load_checkpoint(model_path, model) - optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') + + # device = torch.device("cuda", args.local_rank) + device = torch.device('cpu') print(device) - criterion = nn.NLLLoss(ignore_index=pad_index) - exp_dir = 'exp-nnlm' - writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') - - Path(os.path.dirname(args.model_dir)).mkdir(parents=True, exist_ok=True) - trainer = Trainer(device, - model, - criterion, - optimizer, - train_data_loader=None, - dev_data_loader=dev_data_loader, - ntokens=ntokens, - batch_size=args.batch_size, - epoch=args.model_iter + 1, - num_epochs=args.num_epochs, - clip=args.clip, - model_dir=args.model_dir, - writer=writer) - - trainer.get_word_ppl(args.dev_txt) + + evaluator = Evaluator(device=device, + model_path=args.model, + config_file=args.config, + tokenizer_path=args.tokenizer_path) + evaluator.compute_ppl(txt_file=args.txt_file) if __name__ == '__main__': diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 903b5302..4ee16290 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -7,34 +7,38 @@ tensorboard_dir: 'exp-nnlm/tensorobard' model_module: transformer transformer_conf: embed_unit: 200 - attention_heads: 2 - nlayers: 2 - linear_units: 200 + attention_heads: 8 + nlayers: 8 + linear_units: 2048 dropout: 0.2 shared_conf: ntoken: 5003 optimizer_conf: - lr: 0.02 - weight_decay: 0.005 + # for Adam + lr: 0.0003 + weight_decay: 0.001 + # for SGD + # lr: 0.01 + # weight_decay: 0.001 trainer_conf: - num_epochs: 50 + num_epochs: 60 clip: 0.25 model_dir: './exp-nnlm/models/' dataset_conf: - train_token: 'data/nnlm/text/300000_librispeech.txt.tokens' + train_token: 'data/nnlm/text/librispeech.txt.tokens' dev_token: 'data/nnlm/text/dev.txt.tokens' dataloader_conf: train: - batch_size: 60 - num_workers: 10 + batch_size: 256 + num_workers: 0 drop_last: True dev: - batch_size: 60 - num_workers: 10 + batch_size: 20 + num_workers: 0 drop_last: False diff --git a/egs/librispeech/asr/nnlm/local/evaluator.py b/egs/librispeech/asr/nnlm/local/evaluator.py new file mode 100644 index 00000000..7eff021d --- /dev/null +++ b/egs/librispeech/asr/nnlm/local/evaluator.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo) +# Apache 2.0 + +import logging +import os +import yaml +import math +import numpy as np +import torch +import torch.distributed as dist +from torch.nn.utils.rnn import pad_sequence +import torch.nn as nn 
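# Sketch of the id layout and batching convention the Evaluator below relies
# on: with ntoken = vocab_size + 3, the last three ids are reserved for
# bos (ntoken - 3), eos (ntoken - 2) and padding (ntoken - 1); padded batches
# are transposed to [max_seq_len, batch_size], and input/target are the same
# tensor shifted by one step for next-token prediction. Standalone,
# illustrative example only (the toy token ids are made up):
import torch
from torch.nn.utils.rnn import pad_sequence


def example_shifted_batch():
    vocab_size = 5000
    ntoken = vocab_size + 3
    bos_id, eos_id, pad_id = ntoken - 3, ntoken - 2, ntoken - 1
    seqs = [[7, 8, 9], [3, 4]]  # toy tokenizer output for two sentences
    seqs = [torch.tensor([bos_id] + s + [eos_id]) for s in seqs]
    data = pad_sequence(seqs, batch_first=True, padding_value=pad_id)
    data = data.t().contiguous()        # [batch, len] -> [max_seq_len, batch]
    xs, ys = data[:-1, :], data[1:, :]  # predict token t+1 from tokens <= t
    return xs, ys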
+ +from model import TransformerModel + +from tokenizers import Tokenizer +from tokenizers.models import WordPiece +from tokenizers import decoders +from common import load_checkpoint +from model import TransformerModel +from typing import Dict, List + +import k2 + + +def word_seqs_to_list_str(word_seqs: k2.RaggedInt, + symbol_table: k2.SymbolTable) -> List[str]: + ''' + Args: + word_seqs:[path][word] + ''' + word_ids = word_seqs.values() + words = [symbol_table.get(word_idx.item()) for word_idx in word_ids] + ragged_shape = word_seqs.row_splits(1) + sentences = [] + for idx, start_idx in enumerate(ragged_shape[:-1]): + sentences.append(' '.join(words[start_idx:ragged_shape[idx + 1]])) + return sentences + + +def validate_configs(configs: Dict, required_fields: List) -> bool: + not_exist_fields = [] + for field in required_fields: + if field not in configs or configs[field] is None: + not_exist_fields.append(field) + if len(not_exist_fields) > 0: + assert False, 'set following required fields {}'.format( + ' '.join(not_exist_fields)) + return True + + +def extract_configs(config_file) -> Dict: + assert os.path.exists(config_file), '{} does not exist'.format(cofnig_file) + required_fields = [ + 'model_module', + 'shared_conf', + ] + with open(config_file, 'r') as f: + configs = yaml.load(f, Loader=yaml.FullLoader) + validate_configs(configs, required_fields) + + model_conf = '{}_conf'.format(configs['model_module']) + ntoken = configs['shared_conf']['ntoken'] + + assert 'model_dir' in configs['trainer_conf'] + configs[model_conf]['ntoken'] = ntoken + + return configs + + +class Evaluator(object): + + def __init__(self, + device, + model_path, + config_file=None, + tokenizer_path=None, + words_txt=None, + batch_size=1): + self.device = device + configs = extract_configs(config_file) + if configs['model_module'] == 'transformer': + model = TransformerModel(**configs['transformer_conf']) + if model_path is not None: + assert os.path.exists(model_path) + load_checkpoint(model_path, model) + self.model = model + self.ntoken = model.ntoken + self.batch_size = batch_size + self.word_count = 0 + self.token_count = 0 + self.total_examples = 0 + self.model.to(self.device) + self.model.eval() + + self.tokenizer = Tokenizer.from_file(tokenizer_path) + self.tokenizer.decoder = decoders.WordPiece() + self.bos_id = self.ntoken - 3 + self.eos_id = self.ntoken - 2 + self.pad_index = self.ntoken - 1 + if words_txt is not None: + self.symbol_table = k2.SymbolTable.from_file(words_txt) + + self.criterion = nn.NLLLoss(ignore_index=self.pad_index, + reduction='mean') + + def set_criterion(self, doing_rescore: bool): + if doing_rescore: + self.criterion = nn.NLLLoss(ignore_index=self.pad_index, + reduction='sum') + else: + self.criterion = nn.NLLLoss(ignore_index=self.pad_index, + reduction='mean') + def reset_count_variables(self): + self.word_count = 0 + self.token_count = 0 + self.total_examples = 0 + + def batchify(self, txt_f): + batch = [] + + for line in txt_f: + self.total_examples += 1 + line = line.strip().lower() + + token_id = self.tokenizer.encode(line).ids + # +1 for + self.word_count += len(line.split()) + 1 + # +1 for + self.token_count += len(token_id) + 1 + token_id.insert(0, self.bos_id) + token_id.append(self.eos_id) + batch.append(token_id) + if len(batch) == self.batch_size: + # data_pad: [batch_size, seq_len] + # each seq_len always different + data_pad = pad_sequence( + [torch.from_numpy(np.array(x)).long() for x in batch], + True, self.pad_index) + data_pad = data_pad.t().contiguous() + # 
xs_pad, ys_pad: [max_seq_len, batch_size] + # max_seq_len is the maximum length in current batch + xs_pad = data_pad[:-1, :] + ys_pad = data_pad[1:, :] + yield xs_pad, ys_pad + batch = [] + + @torch.no_grad() + def compute_ppl(self, txt_file: str): + self.set_criterion(doing_rescore=False) + # total_loss = torch.tensor([0.0]).to(self.device) + # total_examples = torch.tensor([0.0]).to(self.device) + # for batch_idx, batch in enumerate(self.dev_data_loader): + total_loss = 0.0 + txt_f = open(txt_file, 'r') + for batch_input, batch_target in self.batchify(txt_f): + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... + # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntoken) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... + target = batch_target.view(-1) + loss = self.criterion(prediction, target) + total_loss += loss * batch_input.shape[0] + + loss = total_loss / self.token_count + token_ppl = math.exp(total_loss / self.token_count) + word_ppl = math.exp(total_loss / self.word_count) + log_str = 'dev examples: {} dev loss is {:.6f} and token_ppl {:.6f} word_ppl {}'.format( + int(self.total_examples), loss.item(), token_ppl, word_ppl) + logging.info(log_str) + txt_f.close() + self.reset_count_variables() + + def batchify_sentences(self, sentences: List[str]): + batch = [] + for line in sentences: + self.total_examples += 1 + token_id = self.tokenizer.encode(line).ids + # print('token_id: ', token_id) + # +1 for + self.word_count += len(line.split()) + 1 + # +1 for + self.token_count += len(token_id) + 1 + + token_id.insert(0, self.bos_id) + token_id.append(self.eos_id) + batch.append(token_id) + if len(batch) == self.batch_size: + # data_pad: [batch_size, seq_len] + # each seq_len always different + data_pad = pad_sequence( + [torch.from_numpy(np.array(x)).long() for x in batch], + True, self.pad_index) + data_pad = data_pad.t().contiguous() + # xs_pad, ys_pad: [max_seq_len, batch_size] + # max_seq_len is the maximum length in current batch + xs_pad = data_pad[:-1, :] + ys_pad = data_pad[1:, :] + yield xs_pad, ys_pad + batch = [] + + @torch.no_grad() + def score_sentences(self, sentences: List[str]) -> torch.tensor: + ''' + Args: + sentences: each element is a sentence, words seperated by whitespace + ''' + total_loss = 0.0 + average_negative_logp = [] + for batch_input, batch_target in self.batchify_sentences(sentences): + # batch_input: [seq_len, batch_size] + # with contents: token_id token_id .... + # + # batch_target: [seq_len, batch_size] + # with contensts: token_id token_id ... + batch_input = batch_input.to(self.device) + batch_target = batch_target.to(self.device) + batch_output = self.model(batch_input) + + prediction = batch_output.view(-1, self.ntoken) + # target: [max_seq_len * batch_size] + # example_1_token_1 example_2_token_1 example_3_token_1 ..... 
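            # Note: when called from score_word_seqs() the criterion uses
            # reduction='sum', so with the default batch_size of 1 the loss
            # below is the total negative log-likelihood of one sentence
            # (padding ignored); the returned values are later negated in
            # lm_rescore.py and used as LM scores for n-best rescoring.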
+ target = batch_target.view(-1) + loss = self.criterion(prediction, target) + average_negative_logp.append(loss.item()) + self.reset_count_variables() + return torch.tensor(average_negative_logp).to(self.device) + + @torch.no_grad() + def score_word_seqs(self, word_seqs: k2.RaggedInt, doing_rescore:bool = True) -> torch.tensor: + ''' + used when rescoring + ''' + self.set_criterion(doing_rescore=True) + sentences = word_seqs_to_list_str(word_seqs, self.symbol_table) + return self.score_sentences(sentences) diff --git a/egs/librispeech/asr/nnlm/local/model.py b/egs/librispeech/asr/nnlm/local/model.py index fd37601f..71c2feb5 100644 --- a/egs/librispeech/asr/nnlm/local/model.py +++ b/egs/librispeech/asr/nnlm/local/model.py @@ -99,6 +99,9 @@ def __init__(self, self.init_weights() + # used by evaluator + self.ntoken = ntoken + def _generate_square_subsequent_mask(self, sz): mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill( diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index b9cdaec5..343ef473 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -17,9 +17,9 @@ text_dir=data/nnlm/text all_train_text=$text_dir/librispeech.txt # there are 40,398,052 pieces in all_train_text, which will take 50 MINUTES to be tokenized, with a single process. # use $train_pieces data to validate pipeline -train_pieces=300000 # 15 times of dev.txt +# train_pieces=300000 # 15 times of dev.txt # uncomment follwoing line to use all_train_text -# train_pieces= +train_pieces= dev_text=$text_dir/dev.txt # vocab_size of huggingface tokenizer @@ -88,6 +88,7 @@ if [ $stage -le 3 ]; then # -1 means train from scratch # python main.py \ export CUDA_VISIBLE_DEVICES=0,1,2,3 + # python -m torch.distributed.launch --nproc_per_node=4 test.py \ python -m torch.distributed.launch --nproc_per_node=4 main.py \ --config $lm_config \ --vocab_size $vocab_size \ @@ -96,14 +97,8 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - # TODO: this module is in developing echo "compute word ppl from token ppl" - # model_iter if for resume training - # -1 means train from scratch - python compute_word_ppl.py \ - --model_iter 40 \ - --vocab_size $vocab_size \ - --model_type Transformer + python compute_word_ppl.py fi @@ -125,3 +120,5 @@ if [ $stage -le 5 ]; then --tokenizer-path $tokenizer fi + +# cut -f 2- -d" " /home/storage15/huangying/tools/espnet/egs/librispeech/asr1/data/dev/text > data/dev/text diff --git a/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py b/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py index cfe25c9e..879a9fe7 100755 --- a/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py +++ b/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py @@ -11,19 +11,20 @@ import os import torch from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Union -from lhotse import CutSet, load_manifest -from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from snowfall.common import average_checkpoint, store_transcripts from snowfall.common import find_first_disambig_symbol from snowfall.common import get_texts +from snowfall.common import write_error_stats from snowfall.common import load_checkpoint from snowfall.common import setup_logger +from snowfall.common import str2bool +from snowfall.data import LibriSpeechAsrDataModule from snowfall.decoding.graph 
import compile_HLG +from snowfall.decoding.lm_rescore import decode_with_lm_rescoring from snowfall.models import AcousticModel from snowfall.models.transformer import Transformer from snowfall.models.conformer import Conformer @@ -31,9 +32,18 @@ from snowfall.training.mmi_graph import create_bigram_phone_lm from snowfall.training.mmi_graph import get_phone_symbols +from evaluator import Evaluator -def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, - device: Union[str, torch.device], HLG: Fsa, symbols: SymbolTable): + +def decode(dataloader: torch.utils.data.DataLoader, + model: AcousticModel, + device: Union[str, torch.device], + HLG: Fsa, + symbols: SymbolTable, + num_paths: int, + G: k2.Fsa, + use_whole_lattice: bool, + evaluator=None): tot_num_cuts = len(dataloader.dataset.cuts) num_cuts = 0 results = [] # a list of pair (ref_words, hyp_words) @@ -43,7 +53,8 @@ def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, supervision_segments = torch.stack( (supervisions['sequence_idx'], (((supervisions['start_frame'] - 1) // 2 - 1) // 2), - (((supervisions['num_frames'] - 1) // 2 - 1) // 2)), 1).to(torch.int32) + (((supervisions['num_frames'] - 1) // 2 - 1) // 2)), + 1).to(torch.int32) supervision_segments = torch.clamp(supervision_segments, min=0) indices = torch.argsort(supervision_segments[:, 2], descending=True) supervision_segments = supervision_segments[indices] @@ -71,8 +82,22 @@ def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, lattices = k2.intersect_dense_pruned(HLG, dense_fsa_vec, 20.0, 7.0, 30, 10000) - # lattices = k2.intersect_dense(HLG, dense_fsa_vec, 10.0) - best_paths = k2.shortest_path(lattices, use_double_scores=True) + if G is None: + best_paths = k2.shortest_path(lattices, use_double_scores=True) + elif evaluator is not None: + best_paths = decode_with_lm_rescoring( + lattices, + evaluator=evaluator, + G=None, + num_paths=num_paths, + use_whole_lattice=use_whole_lattice) + else: + best_paths = decode_with_lm_rescoring( + lattices, + G, + num_paths=num_paths, + use_whole_lattice=use_whole_lattice) + assert best_paths.shape[0] == len(texts) hyps = get_texts(best_paths, indices) assert len(hyps) == len(texts) @@ -157,56 +182,82 @@ def print_transition_probabilities(P: k2.Fsa, phone_symbol_table: SymbolTable, def get_parser(): parser = argparse.ArgumentParser() - parser.add_argument( - '--model-type', - type=str, - default="conformer", - choices=["transformer", "conformer"], - help="Model type.") - parser.add_argument( - '--epoch', - type=int, - default=10, - help="Decoding epoch.") - parser.add_argument( - '--max-duration', - type=int, - default=1000.0, - help="Maximum pooled recordings duration (seconds) in a single batch.") + parser.add_argument('--model-type', + type=str, + default="conformer", + choices=["transformer", "conformer"], + help="Model type.") + parser.add_argument('--epoch', + type=int, + default=10, + help="Decoding epoch.") parser.add_argument( '--avg', type=int, default=5, help="Number of checkpionts to average. Automaticly select " - "consecutive checkpoints before checkpoint specified by'--epoch'. ") - parser.add_argument( - '--att-rate', - type=float, - default=0.0, - help="Attention loss rate.") - parser.add_argument( - '--nhead', - type=int, - default=4, - help="Number of attention heads in transformer.") + "consecutive checkpoints before checkpoint specified by'--epoch'. 
") + parser.add_argument('--att-rate', + type=float, + default=0.0, + help="Attention loss rate.") + parser.add_argument('--nhead', + type=int, + default=4, + help="Number of attention heads in transformer.") parser.add_argument( '--attention-dim', type=int, default=256, help="Number of units in transformer attention layers.") + parser.add_argument( + '--output-beam-size', + type=int, + default=8, + help='Output beam size. Used in k2.intersect_dense_pruned.'\ + 'Choose a large value (e.g., 20), for 1-best decoding '\ + 'and n-best rescoring. Choose a small value (e.g., 8) for ' \ + 'rescoring with the whole lattice') + parser.add_argument('--use-lm-rescoring', + type=str2bool, + default=True, + help='When enabled, it uses LM for rescoring') + + parser.add_argument('--use-nnlm-rescoring', + type=str2bool, + default=True, + help='When enabled, it uses LM for rescoring') + parser.add_argument( + '--num-paths', + type=int, + default=-1, + help='Number of paths for rescoring using n-best list.' \ + 'If it is negative, then rescore with the whole lattice.'\ + 'CAUTION: You have to reduce max_duration in case of CUDA OOM' + ) return parser def main(): - args = get_parser().parse_args() + + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() model_type = args.model_type epoch = args.epoch - max_duration = args.max_duration avg = args.avg att_rate = args.att_rate - - exp_dir = Path('exp-' + model_type + '-noam-mmi-att-musan') + num_paths = args.num_paths + use_lm_rescoring = args.use_lm_rescoring + use_nnlm_rescoring = args.use_nnlm_rescoring + use_whole_lattice = False + if use_lm_rescoring and num_paths < 1: + # It doesn't make sense to use n-best list for rescoring + # when n is less than 1 + use_whole_lattice = True + + exp_dir = Path('exp-' + model_type + '-noam-mmi-att-musan-sa') setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug') # load L, G, symbol_table @@ -225,6 +276,23 @@ def main(): # device = torch.device('cuda', 1) device = torch.device('cuda') + if use_nnlm_rescoring: + # now only support n-best rescoring with nnlm + use_whole_lattice = False + # TODO: make following paths configurable + model_path = '../nnlm/exp-nnlm/models/epoch_30.pt' + config_file = '../nnlm/conf/lm_small_transformer.yaml' + tokenizer_path = '../nnlm/exp-nnlm/tokenizer-librispeech.json' + words_txt = './data/lang_nosp/words.txt' + + evaluator = Evaluator(device=device, + words_txt=words_txt, + model_path=model_path, + config_file=config_file, + tokenizer_path=tokenizer_path) + else: + evaluator = None + if att_rate != 0.0: num_decoder_layers = 6 else: @@ -232,7 +300,7 @@ def main(): if model_type == "transformer": model = Transformer( - num_features=40, + num_features=80, nhead=args.nhead, d_model=args.attention_dim, num_classes=len(phone_ids) + 1, # +1 for the blank symbol @@ -240,7 +308,7 @@ def main(): num_decoder_layers=num_decoder_layers) else: model = Conformer( - num_features=40, + num_features=80, nhead=args.nhead, d_model=args.attention_dim, num_classes=len(phone_ids) + 1, # +1 for the blank symbol @@ -253,8 +321,10 @@ def main(): checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt') load_checkpoint(checkpoint, model) else: - checkpoints = [os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt') for avg_epoch in - range(epoch - avg, epoch)] + checkpoints = [ + os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt') + for avg_epoch in range(epoch - avg, epoch) + ] average_checkpoint(checkpoints, model) model.to(device) @@ 
-262,10 +332,16 @@ def main(): assert P.requires_grad is False P.scores = model.P_scores.cpu() - print_transition_probabilities(P, phone_symbol_table, phone_ids, filename='model_P_scores.txt') + print_transition_probabilities(P, + phone_symbol_table, + phone_ids, + filename='model_P_scores.txt') P.set_scores_stochastic_(model.P_scores) - print_transition_probabilities(P, phone_symbol_table, phone_ids, filename='P_scores.txt') + print_transition_probabilities(P, + phone_symbol_table, + phone_ids, + filename='P_scores.txt') if not os.path.exists(lang_dir / 'HLG.pt'): logging.debug("Loading L_disambig.fst.txt") @@ -274,61 +350,90 @@ def main(): logging.debug("Loading G.fst.txt") with open(lang_dir / 'G.fst.txt') as f: G = k2.Fsa.from_openfst(f.read(), acceptor=False) - first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table) + first_phone_disambig_id = find_first_disambig_symbol( + phone_symbol_table) first_word_disambig_id = find_first_disambig_symbol(symbol_table) HLG = compile_HLG(L=L, - G=G, - H=ctc_topo, - labels_disambig_id_start=first_phone_disambig_id, - aux_labels_disambig_id_start=first_word_disambig_id) + G=G, + H=ctc_topo, + labels_disambig_id_start=first_phone_disambig_id, + aux_labels_disambig_id_start=first_word_disambig_id) torch.save(HLG.as_dict(), lang_dir / 'HLG.pt') else: logging.debug("Loading pre-compiled HLG") d = torch.load(lang_dir / 'HLG.pt') HLG = k2.Fsa.from_dict(d) + if use_lm_rescoring: + if use_whole_lattice: + logging.info('Rescoring with the whole lattice') + else: + logging.info(f'Rescoring with n-best list, n is {num_paths}') + first_word_disambig_id = find_first_disambig_symbol(symbol_table) + if not os.path.exists(lang_dir / 'G_4_gram.pt'): + logging.debug('Loading G_4_gram.fst.txt') + with open(lang_dir / 'G_4_gram.fst.txt') as f: + G = k2.Fsa.from_openfst(f.read(), acceptor=False) + # G.aux_labels is not needed in later computations, so + # remove it here. + del G.aux_labels + # CAUTION(fangjun): The following line is crucial. + # Arcs entering the back-off state have label equal to #0. + # We have to change it to 0 here. 
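                # (Label 0 is epsilon in k2, so the assignment below maps #0,
                # and any other disambiguation symbol with an id greater than
                # or equal to first_word_disambig_id, to epsilon; those arcs
                # then no longer stand for real words during rescoring.)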
+ G.labels[G.labels >= first_word_disambig_id] = 0 + G = k2.create_fsa_vec([G]).to(device) + G = k2.arc_sort(G) + torch.save(G.as_dict(), lang_dir / 'G_4_gram.pt') + else: + logging.debug('Loading pre-compiled G_4_gram.pt') + d = torch.load(lang_dir / 'G_4_gram.pt') + G = k2.Fsa.from_dict(d).to(device) + + if use_whole_lattice: + # Add epsilon self-loops to G as we will compose + # it with the whole lattice later + G = k2.add_epsilon_self_loops(G) + G = k2.arc_sort(G) + G = G.to(device) + else: + logging.debug('Decoding without LM rescoring') + G = None + logging.debug("convert HLG to device") HLG = HLG.to(device) HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0) HLG.requires_grad_(False) + if not hasattr(HLG, 'lm_scores'): + HLG.lm_scores = HLG.scores.clone() + # load dataset - feature_dir = Path('exp/data') + librispeech = LibriSpeechAsrDataModule(args) test_sets = ['test-clean', 'test-other'] - for test_set in test_sets: + # test_sets = ['test-other'] + for test_set, test_dl in zip(test_sets, librispeech.test_dataloaders()): logging.info(f'* DECODING: {test_set}') - logging.debug("About to get test cuts") - cuts_test = load_manifest(feature_dir / f'cuts_{test_set}.json.gz') - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset(cuts_test) - sampler = SingleCutSampler(cuts_test, max_duration=max_duration) - logging.debug("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) - - logging.debug("About to decode") results = decode(dataloader=test_dl, model=model, device=device, HLG=HLG, - symbols=symbol_table) + symbols=symbol_table, + num_paths=num_paths, + G=G, + evaluator=evaluator, + use_whole_lattice=use_whole_lattice) recog_path = exp_dir / f'recogs-{test_set}.txt' store_transcripts(path=recog_path, texts=results) logging.info(f'The transcripts are stored in {recog_path}') - # compute WER - dists = [edit_distance(r, h) for r, h in results] - errors = { - key: sum(dist[key] for dist in dists) - for key in ['sub', 'ins', 'del', 'total'] - } - total_words = sum(len(ref) for ref, _ in results) - # Print Kaldi-like message: - # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ] - logging.info( - f'[{test_set}] %WER {errors["total"] / total_words:.2%} ' - f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]' - ) + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = exp_dir / f'errs-{test_set}.txt' + with open(errs_filename, 'w') as f: + write_error_stats(f, test_set, results) + logging.info('Wrote detailed error stats to {}'.format(errs_filename)) torch.set_num_threads(1) diff --git a/snowfall/decoding/lm_rescore.py b/snowfall/decoding/lm_rescore.py new file mode 100644 index 00000000..be4cf2f8 --- /dev/null +++ b/snowfall/decoding/lm_rescore.py @@ -0,0 +1,306 @@ +# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) + +# modified from: +# https://github.com/k2-fsa/snowfall/blob/16e9f5949be9db99730d65335adbf27d2729424d/snowfall/decoding/lm_rescore.py +from typing import Optional + +import k2 +import torch + + +def compute_am_scores(lats: k2.Fsa, word_fsas_with_epsilon_loops: k2.Fsa, + path_to_seq_map: torch.Tensor) -> torch.Tensor: + '''Compute AM scores of n-best lists (represented as word_fsas). + + Args: + lats: + An FsaVec, which is the output of `k2.intersect_dense_pruned`. + It must have the attribute `lm_scores`. 
+ word_fsas_with_epsilon_loops: + An FsaVec representing a n-best list. Note that it has been processed + by `k2.add_epsilon_self_loops`. + path_to_seq_map: + A 1-D torch.Tensor with dtype torch.int32. path_to_seq_map[i] indicates + which sequence the i-th Fsa in word_fsas_with_epsilon_loops belongs to. + path_to_seq_map.numel() == word_fsas_with_epsilon_loops.arcs.dim0(). + Returns: + Return a 1-D torch.Tensor containing the AM scores of each path. + `ans.numel() == word_fsas_with_epsilon_loops.shape[0]` + ''' + device = lats.device + assert len(lats.shape) == 3 + assert hasattr(lats, 'lm_scores') + + # k2.compose() currently does not support b_to_a_map. To void + # replicating `lats`, we use k2.intersect_device here. + # + # lats has phone IDs as `labels` and word IDs as aux_labels, so we + # need to invert it here. + inverted_lats = k2.invert(lats) + + # Now the `labels` of inverted_lats are word IDs (a 1-D torch.Tensor) + # and its `aux_labels` are phone IDs ( a k2.RaggedInt with 2 axes) + + # Remove its `aux_labels` since it is not needed in the + # following computation + del inverted_lats.aux_labels + inverted_lats = k2.arc_sort(inverted_lats) + + am_path_lats = k2.intersect_device(inverted_lats, + word_fsas_with_epsilon_loops, + b_to_a_map=path_to_seq_map, + sorted_match_a=True) + + # NOTE: `k2.connect` and `k2.top_sort` support only CPU at present + am_path_lats = k2.top_sort(k2.connect(am_path_lats.to('cpu'))).to(device) + + # The `scores` of every arc consists of `am_scores` and `lm_scores` + am_path_lats.scores = am_path_lats.scores - am_path_lats.lm_scores + + am_scores = am_path_lats.get_tot_scores(True, True) + + return am_scores + + +@torch.no_grad() +def rescore_with_n_best_list(lats: k2.Fsa, + G: k2.Fsa, + num_paths: int, + evaluator=None) -> k2.Fsa: + '''Decode using n-best list with LM rescoring. + + `lats` is a decoding lattice, which has 3 axes. This function first + extracts `num_paths` paths from `lats` for each sequence using + `k2.random_paths`. The `am_scores` of these paths are computed. + For each path, its `lm_scores` is computed using `G` (which is an LM). + The final `tot_scores` is the sum of `am_scores` and `lm_scores`. + The path with the greatest `tot_scores` within a sequence is used + as the decoding output. + + Args: + lats: + An FsaVec. It can be the output of `k2.intersect_dense_pruned`. + G: + An FsaVec representing the language model (LM). Note that it + is an FsaVec, but it contains only one Fsa. + num_paths: + It is the size `n` in `n-best` list. + Returns: + An FsaVec representing the best decoding path for each sequence + in the lattice. + ''' + device = lats.device + + assert len(lats.shape) == 3 + assert hasattr(lats, 'aux_labels') + assert hasattr(lats, 'lm_scores') + + if evaluator is None: + assert G.shape == (1, None, None) + assert G.device == device + assert hasattr(G, 'aux_labels') is False + + # First, extract `num_paths` paths for each sequence. + # paths is a k2.RaggedInt with axes [seq][path][arc_pos] + paths = k2.random_paths(lats, num_paths=num_paths, use_double_scores=True) + + # word_seqs is a k2.RaggedInt sharing the same shape as `paths` + # but it contains word IDs. Note that it also contains 0s and -1s. + # The last entry in each sublist is -1. + word_seqs = k2.index(lats.aux_labels, paths) + + # Remove epsilons and -1 from word_seqs + word_seqs = k2.ragged.remove_values_leq(word_seqs, 0) + + # Remove repeated sequences to avoid redundant computation later. 
+ # + # unique_word_seqs is still a k2.RaggedInt with 3 axes [seq][path][word] + # except that there are no repeated paths with the same word_seq + # within a seq. + # + # num_repeats is also a k2.RaggedInt with 2 axes containing the + # multiplicities of each path. + # num_repeats.num_elements() == unique_word_seqs.num_elements() + # + # Since k2.ragged.unique_sequences will reorder paths within a seq, + # `new2old` is a 1-D torch.Tensor mapping from the output path index + # to the input path index. + # new2old.numel() == unique_word_seqs.num_elements() + unique_word_seqs, num_repeats, new2old = k2.ragged.unique_sequences( + word_seqs, need_num_repeats=True, need_new2old_indexes=True) + + seq_to_path_shape = k2.ragged.get_layer(unique_word_seqs.shape(), 0) + + # path_to_seq_map is a 1-D torch.Tensor. + # path_to_seq_map[i] is the seq to which the i-th path + # belongs. + path_to_seq_map = seq_to_path_shape.row_ids(1) + + # Remove the seq axis. + # Now unique_word_seqs has only two axes [path][word] + unique_word_seqs = k2.ragged.remove_axis(unique_word_seqs, 0) + + # word_fsas is an FsaVec with axes [path][state][arc] + word_fsas = k2.linear_fsa(unique_word_seqs) + + word_fsas_with_epsilon_loops = k2.add_epsilon_self_loops(word_fsas) + + am_scores = compute_am_scores(lats, word_fsas_with_epsilon_loops, + path_to_seq_map) + + # Now compute lm_scores + b_to_a_map = torch.zeros_like(path_to_seq_map) + if evaluator is None: + lm_path_lats = k2.intersect_device(G, + word_fsas_with_epsilon_loops, + b_to_a_map=b_to_a_map, + sorted_match_a=True) + lm_path_lats = k2.top_sort(k2.connect( + lm_path_lats.to('cpu'))).to(device) + lm_scores = lm_path_lats.get_tot_scores(True, True) + else: + lm_scores = -evaluator.score_word_seqs(unique_word_seqs) + + # import pdb + # pdb.set_trace() + tot_scores = am_scores + lm_scores + # tot_scores = lm_scores + + # Remember that we used `k2.ragged.unique_sequences` to remove repeated + # paths to avoid redundant computation in `k2.intersect_device`. + # Now we use `num_repeats` to correct the scores for each path. + # + # NOTE(fangjun): It is commented out as it leads to a worse WER + # tot_scores = tot_scores * num_repeats.values() + + # TODO(fangjun): We may need to add `k2.RaggedDouble` + ragged_tot_scores = k2.RaggedFloat(seq_to_path_shape, + tot_scores.to(torch.float32)) + argmax_indexes = k2.ragged.argmax_per_sublist(ragged_tot_scores) + + # Use k2.index here since argmax_indexes' dtype is torch.int32 + best_path_indexes = k2.index(new2old, argmax_indexes) + + paths = k2.ragged.remove_axis(paths, 0) + + # best_path is a k2.RaggedInt with 2 axes [path][arc_pos] + best_paths = k2.index(paths, best_path_indexes) + + # labels is a k2.RaggedInt with 2 axes [path][phone_id] + # Note that it contains -1s. + labels = k2.index(lats.labels.contiguous(), best_paths) + + labels = k2.ragged.remove_values_eq(labels, -1) + + # lats.aux_labels is a k2.RaggedInt tensor with 2 axes, so + # aux_labels is also a k2.RaggedInt with 2 axes + aux_labels = k2.index(lats.aux_labels, best_paths.values()) + + best_path_fsas = k2.linear_fsa(labels) + best_path_fsas.aux_labels = aux_labels + + return best_path_fsas + + +@torch.no_grad() +def rescore_with_whole_lattice(lats: k2.Fsa, + G_with_epsilon_loops: k2.Fsa) -> k2.Fsa: + '''Use whole lattice to rescore. + + Args: + lats: + An FsaVec It can be the output of `k2.intersect_dense_pruned`. + G_with_epsilon_loops: + An FsaVec representing the language model (LM). Note that it + is an FsaVec, but it contains only one Fsa. 
+ ''' + assert len(lats.shape) == 3 + assert hasattr(lats, 'lm_scores') + assert G_with_epsilon_loops.shape == (1, None, None) + + device = lats.device + lats.scores = lats.scores - lats.lm_scores + # Now, lats.scores contains only am_scores + + # inverted_lats has word IDs as labels. + # Its aux_labels are phone IDs, which is a ragged tensor k2.RaggedInt + inverted_lats = k2.invert(lats) + num_seqs = lats.shape[0] + inverted_lats_with_epsilon_loops = k2.add_epsilon_self_loops(inverted_lats) + + b_to_a_map = torch.zeros(num_seqs, device=device, dtype=torch.int32) + try: + rescoring_lats = k2.intersect_device(G_with_epsilon_loops, + inverted_lats_with_epsilon_loops, + b_to_a_map, + sorted_match_a=True) + except RuntimeError as e: + print(f'Caught exception:\n{e}\n') + print(f'Number of FSAs: {inverted_lats.shape[0]}') + print('num_arcs before pruning: ', + inverted_lats_with_epsilon_loops.arcs.num_elements()) + + # NOTE(fangjun): The choice of the threshold 0.01 is arbitrary here + # to avoid OOM. We may need to fine tune it. + inverted_lats = k2.prune_on_arc_post(inverted_lats, 0.001, True) + inverted_lats_with_epsilon_loops = k2.add_epsilon_self_loops( + inverted_lats) + print('num_arcs after pruning: ', + inverted_lats_with_epsilon_loops.arcs.num_elements()) + + rescoring_lats = k2.intersect_device(G_with_epsilon_loops, + inverted_lats_with_epsilon_loops, + b_to_a_map, + sorted_match_a=True) + + rescoring_lats = k2.top_sort(k2.connect( + rescoring_lats.to('cpu'))).to(device) + inverted_rescoring_lats = k2.invert(rescoring_lats) + # inverted rescoring_lats has phone IDs as labels + # and word IDs as aux_labels. + + inverted_rescoring_lats = k2.remove_epsilon_self_loops( + inverted_rescoring_lats) + best_paths = k2.shortest_path(inverted_rescoring_lats, + use_double_scores=True) + return best_paths + + +@torch.no_grad() +def decode_with_lm_rescoring(lats: k2.Fsa, + G: k2.Fsa, + num_paths: int, + use_whole_lattice: bool, + evaluator=None) -> k2.Fsa: + '''Decode using n-best list with LM rescoring. + + `lats` is a decoding lattice, which has 3 axes. This function first + extracts `num_paths` paths from `lats` for each sequence using + `k2.random_paths`. The `am_scores` of these paths are computed. + For each path, its `lm_scores` is computed using `G` (which is an LM). + The final `tot_scores` is the sum of `am_scores` and `lm_scores`. + The path with the greatest `tot_scores` within a sequence is used + as the decoding output. + + Args: + lats: + An FsaVec It can be the output of `k2.intersect_dense_pruned`. + G: + An FsaVec representing the language model (LM). Note that it + is an FsaVec, but it contains only one Fsa. + num_paths: + It is the size `n` in `n-best` list. + Used only if use_whole_lattice is False. + use_whole_lattice: + True to use whole lattice for rescoring. False to use n-best list + for rescoring. + Returns: + An FsaVec representing the best decoding path for each sequence + in the lattice. 
+ ''' + if use_whole_lattice: + return rescore_with_whole_lattice(lats, G) + elif evaluator is not None: + return rescore_with_n_best_list(lats, None, num_paths, evaluator) + else: + return rescore_with_n_best_list(lats, G, num_paths) From d847b2862dde178872e4789971f3a7fd37e60303 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 20 Apr 2021 17:08:51 +0800 Subject: [PATCH 23/25] filter train data by length to increase batch_size --- egs/librispeech/asr/nnlm/run.sh | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index 343ef473..e18c6769 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -83,12 +83,28 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then + # TODO:Move following flollowig filtered by length module in Dataset + # The longest sample has 1344 tokens. Batchsize is quite small if training data contains these Long samples. + # Only 1.31% = 529,260/40,198,051 samples are filtered out by length 90. + maximum_length=90 + echo "filter out sampels which longher than "$maximum_length" tokens" + data_dir=./data/nnlm/text + train_data_filtered_by_length=${data_dir}/length_${maximum_length}_librispeech.txt.tokens + train_data=${data_dir}/librispeech.txt.tokens + ori_train_data=${data_dir}/ori_librispeech.txt.tokens + if [ ! -f $ori_train_data ]; then + mv ${train_data} ${ori_train_data} + fi + + if [ ! -f $train_data_filtered_by_length ]; then + awk -v maximum_length=$maximum_length 'NF $train_data_filtered_by_length + ln -sf `realpath $train_data_filtered_by_length` ${train_data} + fi + echo "start to train" # resume_model_iter is for resume training # -1 means train from scratch - # python main.py \ export CUDA_VISIBLE_DEVICES=0,1,2,3 - # python -m torch.distributed.launch --nproc_per_node=4 test.py \ python -m torch.distributed.launch --nproc_per_node=4 main.py \ --config $lm_config \ --vocab_size $vocab_size \ From 52300df56f0ab1591661775b2c63e382ffce97e2 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 20 Apr 2021 17:32:45 +0800 Subject: [PATCH 24/25] use Noam optimizer --- .../asr/nnlm/conf/lm_small_transformer.yaml | 17 +++++++++-------- egs/librispeech/asr/nnlm/main.py | 16 +++++++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 4ee16290..3cbe8596 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -2,7 +2,7 @@ gpu: 1 tensorboard_dir: 'exp-nnlm/tensorobard' -# network architecture equivalent configuration to +# network architecture equivalent configuration to # https://github.com/pytorch/examples/blob/master/word_language_model/main.py model_module: transformer transformer_conf: @@ -15,13 +15,14 @@ transformer_conf: shared_conf: ntoken: 5003 -optimizer_conf: - # for Adam - lr: 0.0003 - weight_decay: 0.001 - # for SGD - # lr: 0.01 - # weight_decay: 0.001 +# Now using Noam optimizer and tuning configuration +# optimizer_conf: +# # for Adam +# lr: 0.0003 +# weight_decay: 0.001 +# # for SGD +# # lr: 0.01 +# # weight_decay: 0.001 trainer_conf: num_epochs: 60 diff --git a/egs/librispeech/asr/nnlm/main.py b/egs/librispeech/asr/nnlm/main.py index 7f7b9ae8..ed640a27 100644 --- a/egs/librispeech/asr/nnlm/main.py +++ b/egs/librispeech/asr/nnlm/main.py @@ -28,6 +28,8 @@ from torch.utils.data import DataLoader from typing 
import List, Dict +from snowfall.models.transformer import Noam + def get_args(): parser = argparse.ArgumentParser( @@ -59,7 +61,7 @@ def validate_configs(configs: Dict, required_fields: List) -> bool: def extract_configs(args) -> Dict: assert os.path.exists(args.config), '{} does not exist'.format(args.cofnig) required_fields = [ - 'model_module', 'shared_conf', 'optimizer_conf', 'trainer_conf', + 'model_module', 'shared_conf', 'trainer_conf', 'dataset_conf' ] with open(args.config, 'r') as f: @@ -124,17 +126,21 @@ def main(): model = TransformerModel(**configs['transformer_conf']) if args.resume_model_iter > 0: model_dir = configs['trainer_conf']['model_dir'] - model_path = '{}/epoch_{}.pt'.format(model_dir, args.resume_model_iter) + model_path = '{}/epoch_{}.pt'.format(model_dir, + args.resume_model_iter) assert os.path.exists(model_path) load_checkpoint(model_path, model) model = torch.nn.parallel.DistributedDataParallel( model.to(device), [args.local_rank]) - - optimizer = optim.AdamW(model.parameters(), **configs['optimizer_conf']) + optimizer = Noam(model.parameters(), + model.module.embed_unit, + factor=1.0, + warm_step=5000) criterion = nn.NLLLoss(ignore_index=pad_index) - writer = SummaryWriter(log_dir=configs['tensorboard_dir']) + writer = SummaryWriter(log_dir=configs['tensorboard_dir'] + + str(args.local_rank)) log_interval = max(100, len(train_data_loader) // 20) trainer = Trainer(device=device, From e61a9d157c60b450d2a1cbc2e49e6ed0a3687588 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Tue, 20 Apr 2021 19:21:37 +0800 Subject: [PATCH 25/25] add rescore scripts --- .../asr/nnlm/conf/lm_small_transformer.yaml | 2 +- egs/librispeech/asr/nnlm/run.sh | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml index 3cbe8596..34cc8ec8 100644 --- a/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml +++ b/egs/librispeech/asr/nnlm/conf/lm_small_transformer.yaml @@ -8,7 +8,7 @@ model_module: transformer transformer_conf: embed_unit: 200 attention_heads: 8 - nlayers: 8 + nlayers: 16 linear_units: 2048 dropout: 0.2 diff --git a/egs/librispeech/asr/nnlm/run.sh b/egs/librispeech/asr/nnlm/run.sh index e18c6769..86fe6c4f 100644 --- a/egs/librispeech/asr/nnlm/run.sh +++ b/egs/librispeech/asr/nnlm/run.sh @@ -117,8 +117,24 @@ if [ $stage -le 4 ]; then python compute_word_ppl.py fi - if [ $stage -le 5 ]; then + # this stage requires trained mmi models + export PYTHONPATH=$PWD/local:$PYTHONPATH + + cd ../simple_v1 + + # TODO: Remove hard-code Transformer language mode path + ./mmi_att_transformer_decode.py \ + --use-nnlm-rescoring=1 \ + --num-path=100 \ + --max-duration=500 \ + --output-beam-size=20 + + cd ../nnlm + +fi + +if [ $stage -le 6 ]; then # generate words.txt tokens.txt and lexicion.txt # which is used in future rescore process lexicon_path=./data/nnlm/lexicon @@ -130,6 +146,7 @@ if [ $stage -le 5 ]; then echo "please set words_txt path of your previous experiment" echo "the NN-LM trained LM is used as a rescore module, \ currently the same words.txt with previous experiment is prefered" + exit 0 fi echo "generate lexicon" python local/generate_lexicon.py \