WIP: huggingface tokenizer and Neural LM training pipeline. #139
base: master
Changes from 13 commits
@@ -0,0 +1,14 @@
[flake8]
show-source=true
statistics=true
max-line-length=80
exclude =
  .git,

ignore =
  # E127 continuation line over-indented for visual indent
  E127,
  # F401, import but not used
  F401,
  # W504, line break after binary operator
  W504,
@@ -0,0 +1,100 @@
#!/usr/bin/env python3

# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo)
# Apache 2.0

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from typing import List
from util import convert_tokens_to_ids

import numpy as np
import os
import torch


class CollateFunc(object):
    '''Collate function for LMDataset
    '''

    def __init__(self, pad_index=0):
        # pad_index should be identical to ignore_index of torch.nn.NLLLoss
        self.pad_index = pad_index

    def __call__(self, batch: List[List[int]]):
        '''batch contains lists of token ids.

        batch can be viewed as a ragged 2-D array, where each row is a
        tokenized text of the form:
            <bos_id> token_id token_id ... <eos_id>
        '''
        data_pad = pad_sequence(
            [torch.from_numpy(np.array(x)).long() for x in batch], True,
            self.pad_index)
        xs_pad = data_pad[:, :-1]
        ys_pad = data_pad[:, 1:]
        return xs_pad, ys_pad


class LMDataset(Dataset):

    def __init__(self, text_file: str, lexicon):
        '''Dataset to load Language Model train/dev text data

        Args:
            text_file: text file with one utterance's text per line.
        '''
        self.lexicon = lexicon
        assert os.path.exists(text_file), \
            "text_file: {} does not exist, please check that.".format(
                text_file)
        self.data = []
        with open(text_file, 'r') as f:
            # each line is a piece of text, e.g.
            # DELAWARE IS NOT AFRAID OF DOGS
            for line in f:
                text = line.strip().lower().split()
                if len(text) == 0:
                    continue
                word_id = convert_tokens_to_ids(text, self.lexicon.word2id)
                if len(word_id) == 0:
                    continue
                word_id = torch.from_numpy(np.array(word_id, dtype="int32"))

                token_id = self.lexicon.word_seq_to_word_piece_seq(word_id)
                # token_id format:
                # <bos_id> token_id token_id ... <eos_id>
                if len(token_id) >= 2:
                    self.data.append(token_id)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def text2id(self, text: List[str]) -> List[int]:
        # A dummy implementation
        return [i for i in range(len(text))]

    def text_id2token_id(self, text_id: List[int]) -> List[int]:
        # A dummy implementation
        return [i for i in range(len(text_id))]


if __name__ == '__main__':
    # train_file = "./data/nnlm/text/librispeech.txt"
    dev_file = "./data/nnlm/text/dev.txt"
    dataset = LMDataset(dev_file)
    collate_func = CollateFunc()
    data_loader = DataLoader(dataset,
                             batch_size=2,
                             shuffle=True,
                             num_workers=0,
                             collate_fn=collate_func)
    for i, batch in enumerate(data_loader):
        xs, ys = batch
        print(xs)
        print(ys)
        print(batch)
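
For reference, the padding and input/target shift implemented by CollateFunc can be checked in isolation. The snippet below is a minimal sketch, not part of the PR; it assumes the file above is saved as dataset.py (the file name is not shown in this diff view) and uses made-up token ids with the default pad_index=0.

# Minimal sketch; assumes dataset.py and its util dependency are importable.
from dataset import CollateFunc

collate = CollateFunc(pad_index=0)
# Two hypothetical tokenized texts: 1 = <bos_id>, 2 = <eos_id>, other ids are word pieces.
batch = [[1, 11, 12, 13, 2],
         [1, 21, 2]]
xs, ys = collate(batch)
print(xs)  # [[1, 11, 12, 13], [1, 21, 2, 0]]   inputs: every position except the last
print(ys)  # [[11, 12, 13, 2], [21, 2, 0, 0]]   targets: the inputs shifted left by one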
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo)
# Apache 2.0

import argparse
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import decoders


def get_args():
    parser = argparse.ArgumentParser(
        description='generate words.txt tokens.txt and lexicon.txt')
    parser.add_argument('--lexicon-path',
                        default='data/nnlm/lexicon',
                        type=str,
                        help="path to save lexicon files")
    parser.add_argument('--tokenizer-path',
                        type=str,
                        default='./data/lm_train/tokenizer-librispeech.json',
                        help="path to load tokenizer")
    parser.add_argument('--train-file',
                        default='data/nnlm/text/librispeech.txt',
                        type=str,
                        help="""file to be tokenized""")
    args = parser.parse_args()
    return args


def generate_tokens(args):
    tokenizer = Tokenizer.from_file(args.tokenizer_path)
    symbols = tokenizer.get_vocab()
    tokens_file = '{}/tokens.txt'.format(args.lexicon_path)
    tokens_f = open(tokens_file, 'w')
    for idx, sym in enumerate(symbols):
        tokens_f.write('{} {}\n'.format(sym.lower(), idx))

    tokens_f.close()


def generate_lexicon(args, words):
    special_words = [
        '<eps>', '!SIL', '<SPOKEN_NOISE>', '<UNK>', '<s>', '</s>', '#0'
    ]
    lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path)
    lf = open(lexicon_file, 'w')
    tokenizer = Tokenizer.from_file(args.tokenizer_path)
    tokenizer.decoder = decoders.WordPiece()
    for word in words:
        if word not in special_words:
            output = tokenizer.encode(word)
            tokens = ' '.join(output.tokens)
        else:
            tokens = '[unk]'
[Review comment] Is there a difference between [UNK] and [unk]? BTW: what are these special words?

[Reply] The special tokens are a heritage of words.txt (simple_v1/data/lang_nosp/words.txt), whose head lists special symbols such as <eps>, !SIL, <SPOKEN_NOISE> and <UNK>. I just want to make sure every word in words.txt can be tokenized. As those special words are not "real" words, I think mapping them to [unk] is better than tokenizing them with a trained tokenizer. In short, [UNK], together with the other special words, is a heritage from the upstream ASR pipeline, while [unk] is a token produced by the huggingface tokenizer.
        lf.write("{}\t{}\n".format(word.lower(), tokens.lower()))
    lf.close()


def load_words(args):
    words = []
    tokens_file = '{}/words.txt'.format(args.lexicon_path)
    # special_words = [
    #     '<eps>', '!SIL', '<SPOKEN_NOISE>', '<UNK>', '<s>', '</s>', '#0'
    # ]
    # special_words = []

    with open(tokens_file) as f:
        for line in f:
            arr = line.strip().split()
            # if arr[0] not in special_words:
            words.append(arr[0])

    return words


def main():
    args = get_args()
    generate_tokens(args)
    words = load_words(args)
    generate_lexicon(args, words)


if __name__ == '__main__':
    main()
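
To make the output formats concrete: tokens.txt gets one "piece id" pair per line, and lexicon.txt gets one entry per line consisting of a word, a tab, and its space-separated word pieces. The sketch below is an illustration, not part of the PR; it shows one way those files could be read back into a word-to-piece-id mapping, using the default paths from get_args().

# Minimal sketch of reading the files written by generate_tokens() / generate_lexicon().
piece2id = {}
with open('data/nnlm/lexicon/tokens.txt') as f:
    for line in f:
        piece, idx = line.strip().split()
        piece2id[piece] = int(idx)

word2pieces = {}
with open('data/nnlm/lexicon/lexicon.txt') as f:
    for line in f:
        word, pieces = line.strip().split('\t')
        word2pieces[word] = pieces.split()


def word_to_piece_ids(word):
    # Fall back to [unk], mirroring how generate_lexicon() handles special words.
    pieces = word2pieces.get(word.lower(), ['[unk]'])
    return [piece2id[p] for p in pieces]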
@@ -0,0 +1,98 @@
#!/usr/bin/env python3

# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo)
# Apache 2.0

# reference: https://huggingface.co/docs/tokenizers/python/latest/quicktour.html
import argparse
import logging
import os
import shutil
from pathlib import Path
from tokenizers import Tokenizer
[Review comment] Could you add some documentation describing how the environment is set up?

[Reply] No problem. A Readme.md will be added.
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders


def get_args():
    parser = argparse.ArgumentParser(
        description='train and tokenize with huggingface tokenizer')
    parser.add_argument('--train-file',
                        type=str,
                        help="""file to train tokenizer""")
    parser.add_argument('--vocab-size',
                        type=int,
                        default=10000,
                        help="""number of tokens of the tokenizer""")
    parser.add_argument('--tokenizer-path',
                        type=str,
                        help="path to save or load tokenizer")
    parser.add_argument('--test-file',
                        type=str,
                        help="""file to be tokenized""")
    args = parser.parse_args()
    return args


def train_tokenizer(train_files, save_path, vocab_size):
    if os.path.exists(save_path):
        logging.warning(
            "{} already exists. Please check that.".format(save_path))
        return
    else:
        Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
    tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = Whitespace()

    # WordPieceTrainer's default vocab_size is 30000;
    # a smaller vocab_size is passed in here to speed up training
    trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]'])
    tokenizer.train(train_files, trainer)
    tokenizer.save(save_path)


def tokenize_text(test_file, tokenizer_path):
    if not os.path.exists(tokenizer_path):
        logging.warning(
            "Tokenizer {} does not exist. Please check that.".format(
                tokenizer_path))
        return
    tokenizer = Tokenizer.from_file(tokenizer_path)
    tokenizer.decoder = decoders.WordPiece()
    tokenized_file = "{}.tokens".format(test_file)
    # tokenized_ids = "{}.ids".format(test_file)
    if os.path.exists(tokenized_file):
        logging.warning(
            "The input file seems to be tokenized already. "
            "Backing up the previous result.")
        shutil.copyfile(tokenized_file, "{}.bk".format(tokenized_file))
    logging.warning("Tokenizing {}.".format(test_file))
    fout = open(tokenized_file, 'w')
    with open(test_file) as f:
        for line in f:
            line = line.strip()
            output = tokenizer.encode(line)
            fout.write(" ".join(output.tokens) + '\n')

    fout.close()


def main():
    args = get_args()
    if args.train_file is not None:
        train_files = [args.train_file]
        train_tokenizer(train_files, args.tokenizer_path, args.vocab_size)
[Review comment] Methods like these are a candidate for future work in snowfall: actually this whole script could easily be re-used across recipes, had we added a mechanism for auto-registering scripts in PATH (can be done via setup.py).
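
To illustrate the reviewer's suggestion, here is a minimal sketch of what such a setup.py entry-point registration could look like. The package name and module paths below are hypothetical, not something that exists in snowfall today.

# Hypothetical setup.py fragment; names and module paths are made up for illustration.
from setuptools import setup, find_packages

setup(
    name='snowfall',
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            # each entry exposes a script's main() on PATH after installation
            'snowfall-train-tokenizer=snowfall.bin.huggingface_tokenizer:main',
            'snowfall-generate-lexicon=snowfall.bin.generate_lexicon:main',
        ],
    },
)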
    if args.test_file is not None:
        tokenize_text(args.test_file, args.tokenizer_path)


if __name__ == '__main__':
    main()
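
Once this script has produced a tokenizer JSON file, it can be loaded and applied directly from Python. A minimal sketch, assuming the default tokenizer path used elsewhere in this PR and the example sentence from dataset.py:

from tokenizers import Tokenizer
from tokenizers import decoders

tokenizer = Tokenizer.from_file('./data/lm_train/tokenizer-librispeech.json')
tokenizer.decoder = decoders.WordPiece()

output = tokenizer.encode('delaware is not afraid of dogs')
print(output.tokens)                 # word pieces produced by the WordPiece model
print(output.ids)                    # the corresponding integer ids
print(tokenizer.decode(output.ids))  # reconstructs the (normalized) text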
[Review comment] The following two methods (presumably text2id and text_id2token_id above) can be removed.

[Reply] Fixed.