WIP: huggingface tokenizer and Neural LM training pipeline. #139
base: master
Changes from 13 commits
@@ -0,0 +1,14 @@
[flake8]
show-source=true
statistics=true
max-line-length=80
exclude =
  .git,

ignore =
  # E127 continuation line over-indented for visual indent
  E127,
  # F401, import but not used
  F401,
  # W504, line break after binary operator
  W504,
@@ -0,0 +1,100 @@
#!/usr/bin/env python3

# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo)
# Apache 2.0

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from typing import List
from util import convert_tokens_to_ids

import numpy as np
import os
import torch


class CollateFunc(object):
    '''Collate function for LMDataset
    '''

    def __init__(self, pad_index=0):
        # pad_index should be identical to ignore_index of torch.nn.NLLLoss
        self.pad_index = pad_index

    def __call__(self, batch: List[List[int]]):
        '''batch contains lists of token ids.

        batch can be viewed as a ragged 2-D array, where each row is a
        tokenized text of the form:
            <bos_id> token_id token_id ... <eos_id>
        '''
        data_pad = pad_sequence(
            [torch.from_numpy(np.array(x)).long() for x in batch], True,
            self.pad_index)
        xs_pad = data_pad[:, :-1]
        ys_pad = data_pad[:, 1:]
        return xs_pad, ys_pad


class LMDataset(Dataset):

    def __init__(self, text_file: str, lexicon):
        '''Dataset to load Language Model train/dev text data

        Args:
            text_file: text file with one utterance's text per line.
        '''
        self.lexicon = lexicon
        assert os.path.exists(text_file), \
            "text_file: {} does not exist, please check that.".format(
                text_file)
        self.data = []
        with open(text_file, 'r') as f:
            # each line is a piece of text, e.g.
            # DELAWARE IS NOT AFRAID OF DOGS
            for line in f:
                text = line.strip().lower().split()
                if len(text) == 0:
                    continue
                word_id = convert_tokens_to_ids(text, self.lexicon.word2id)
                if len(word_id) == 0:
                    continue
                word_id = torch.from_numpy(np.array(word_id, dtype="int32"))

                token_id = self.lexicon.word_seq_to_word_piece_seq(word_id)
                # token_id format:
                # <bos_id> token_id token_id ... <eos_id>
                if len(token_id) >= 2:
                    self.data.append(token_id)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def text2id(self, text: List[str]) -> List[int]:
        # A dummy implementation
        return [i for i in range(len(text))]

    def text_id2token_id(self, text_id: List[int]) -> List[int]:
        # A dummy implementation
        return [i for i in range(len(text_id))]


if __name__ == '__main__':
    # train_file = "./data/nnlm/text/librispeech.txt"
    dev_file = "./data/nnlm/text/dev.txt"
    dataset = LMDataset(dev_file)
    collate_func = CollateFunc()
    data_loader = DataLoader(dataset,
                             batch_size=2,
                             shuffle=True,
                             num_workers=0,
                             collate_fn=collate_func)
    for i, batch in enumerate(data_loader):
        xs, ys = batch
        print(xs)
        print(ys)
        print(batch)
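
For reference, the padding and input/target shift implemented by CollateFunc can be checked in isolation. The snippet below is a minimal sketch, not part of the PR; it assumes the file above is saved as dataset.py (the file name is not shown in this diff view) and uses made-up token ids with the default pad_index=0.

# Minimal sketch; assumes dataset.py and its util dependency are importable.
from dataset import CollateFunc

collate = CollateFunc(pad_index=0)
# Two hypothetical tokenized texts: 1 = <bos_id>, 2 = <eos_id>, other ids are word pieces.
batch = [[1, 11, 12, 13, 2],
         [1, 21, 2]]
xs, ys = collate(batch)
print(xs)  # [[1, 11, 12, 13], [1, 21, 2, 0]]   inputs: every position except the last
print(ys)  # [[11, 12, 13, 2], [21, 2, 0, 0]]   targets: the inputs shifted left by one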
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo)
# Apache 2.0

import argparse
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import decoders


def get_args():
    parser = argparse.ArgumentParser(
        description='generate words.txt tokens.txt and lexicon.txt')
    parser.add_argument('--lexicon-path',
                        default='data/nnlm/lexicon',
                        type=str,
                        help="path to save lexicon files")
    parser.add_argument('--tokenizer-path',
                        type=str,
                        default='./data/lm_train/tokenizer-librispeech.json',
                        help="path to load tokenizer")
    parser.add_argument('--train-file',
                        default='data/nnlm/text/librispeech.txt',
                        type=str,
                        help="""file to be tokenized""")
    args = parser.parse_args()
    return args


def generate_tokens(args):
    tokenizer = Tokenizer.from_file(args.tokenizer_path)
    symbols = tokenizer.get_vocab()
    tokens_file = '{}/tokens.txt'.format(args.lexicon_path)
    tokens_f = open(tokens_file, 'w')
    for idx, sym in enumerate(symbols):
        tokens_f.write('{} {}\n'.format(sym.lower(), idx))

    tokens_f.close()


def generate_lexicon(args, words):
    special_words = [
        '<eps>', '!SIL', '<SPOKEN_NOISE>', '<UNK>', '<s>', '</s>', '#0'
    ]
    lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path)
    lf = open(lexicon_file, 'w')
    tokenizer = Tokenizer.from_file(args.tokenizer_path)
    tokenizer.decoder = decoders.WordPiece()
    for word in words:
        if word not in special_words:
            output = tokenizer.encode(word)
            tokens = ' '.join(output.tokens)
        else:
            tokens = '[unk]'
[Review comment] Is there a difference between [UNK] and [unk]? BTW: what are these special words?

[Reply] The special tokens are a heritage of words.txt (simple_v1/data/lang_nosp/words.txt), whose head lists special symbols such as <eps>, !SIL, <SPOKEN_NOISE> and <UNK>. I just want to make sure every word in words.txt can be tokenized. As those special words are not "real" words, I think mapping them to [unk] is better than tokenizing them with a trained tokenizer. In short, [UNK], together with the other special words, is a heritage from the upstream ASR pipeline, while [unk] is a token produced by the huggingface tokenizer.
        lf.write("{}\t{}\n".format(word.lower(), tokens.lower()))
    lf.close()


def load_words(args):
    words = []
    tokens_file = '{}/words.txt'.format(args.lexicon_path)
    # special_words = [
    #     '<eps>', '!SIL', '<SPOKEN_NOISE>', '<UNK>', '<s>', '</s>', '#0'
    # ]
    # special_words = []

    with open(tokens_file) as f:
        for line in f:
            arr = line.strip().split()
            # if arr[0] not in special_words:
            words.append(arr[0])

    return words


def main():
    args = get_args()
    generate_tokens(args)
    words = load_words(args)
    generate_lexicon(args, words)


if __name__ == '__main__':
    main()
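
To make the output formats concrete: tokens.txt gets one "piece id" pair per line, and lexicon.txt gets one entry per line consisting of a word, a tab, and its space-separated word pieces. The sketch below is an illustration, not part of the PR; it shows one way those files could be read back into a word-to-piece-id mapping, using the default paths from get_args().

# Minimal sketch of reading the files written by generate_tokens() / generate_lexicon().
piece2id = {}
with open('data/nnlm/lexicon/tokens.txt') as f:
    for line in f:
        piece, idx = line.strip().split()
        piece2id[piece] = int(idx)

word2pieces = {}
with open('data/nnlm/lexicon/lexicon.txt') as f:
    for line in f:
        word, pieces = line.strip().split('\t')
        word2pieces[word] = pieces.split()


def word_to_piece_ids(word):
    # Fall back to [unk], mirroring how generate_lexicon() handles special words.
    pieces = word2pieces.get(word.lower(), ['[unk]'])
    return [piece2id[p] for p in pieces]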
@@ -0,0 +1,98 @@
#!/usr/bin/env python3

# Copyright (c) 2020 Xiaomi Corporation (author: Liyong Guo)
# Apache 2.0

# reference: https://huggingface.co/docs/tokenizers/python/latest/quicktour.html
import argparse
import logging
import os
import shutil
from pathlib import Path
from tokenizers import Tokenizer
[Review comment] Could you add some documentation describing how the environment is set up?

[Reply] No problem. A Readme.md will be added.
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders


def get_args():
    parser = argparse.ArgumentParser(
        description='train and tokenize with huggingface tokenizer')
    parser.add_argument('--train-file',
                        type=str,
                        help="""file to train tokenizer""")
    parser.add_argument('--vocab-size',
                        type=int,
                        default=10000,
                        help="""number of tokens of the tokenizer""")
    parser.add_argument('--tokenizer-path',
                        type=str,
                        help="path to save or load tokenizer")
    parser.add_argument('--test-file',
                        type=str,
                        help="""file to be tokenized""")
    args = parser.parse_args()
    return args


def train_tokenizer(train_files, save_path, vocab_size):
    if os.path.exists(save_path):
        logging.warning(
            "{} already exists. Please check that.".format(save_path))
        return
    else:
        Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
    tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = Whitespace()

    # WordPieceTrainer's default vocab_size is 30000;
    # a smaller vocab_size is passed in here to speed up training
    trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]'])
    tokenizer.train(train_files, trainer)
    tokenizer.save(save_path)


def tokenize_text(test_file, tokenizer_path):
    if not os.path.exists(tokenizer_path):
        logging.warning(
            "Tokenizer {} does not exist. Please check that.".format(
                tokenizer_path))
        return
    tokenizer = Tokenizer.from_file(tokenizer_path)
    tokenizer.decoder = decoders.WordPiece()
    tokenized_file = "{}.tokens".format(test_file)
    # tokenized_ids = "{}.ids".format(test_file)
    if os.path.exists(tokenized_file):
        logging.warning(
            "The input file seems to be tokenized already. "
            "Backing up the previous result.")
        shutil.copyfile(tokenized_file, "{}.bk".format(tokenized_file))
    logging.warning("Tokenizing {}.".format(test_file))
    fout = open(tokenized_file, 'w')
    with open(test_file) as f:
        for line in f:
            line = line.strip()
            output = tokenizer.encode(line)
            fout.write(" ".join(output.tokens) + '\n')

    fout.close()


def main():
    args = get_args()
    if args.train_file is not None:
        train_files = [args.train_file]
        train_tokenizer(train_files, args.tokenizer_path, args.vocab_size)
[Review comment] Methods like these are a candidate for future work in snowfall: actually this whole script could easily be re-used across recipes, had we added a mechanism for auto-registering scripts in PATH (can be done via setup.py).
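
To illustrate the reviewer's suggestion, here is a minimal sketch of what such a setup.py entry-point registration could look like. The package name and module paths below are hypothetical, not something that exists in snowfall today.

# Hypothetical setup.py fragment; names and module paths are made up for illustration.
from setuptools import setup, find_packages

setup(
    name='snowfall',
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            # each entry exposes a script's main() on PATH after installation
            'snowfall-train-tokenizer=snowfall.bin.huggingface_tokenizer:main',
            'snowfall-generate-lexicon=snowfall.bin.generate_lexicon:main',
        ],
    },
)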
    if args.test_file is not None:
        tokenize_text(args.test_file, args.tokenizer_path)


if __name__ == '__main__':
    main()
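
Once this script has produced a tokenizer JSON file, it can be loaded and applied directly from Python. A minimal sketch, assuming the default tokenizer path used elsewhere in this PR and the example sentence from dataset.py:

from tokenizers import Tokenizer
from tokenizers import decoders

tokenizer = Tokenizer.from_file('./data/lm_train/tokenizer-librispeech.json')
tokenizer.decoder = decoders.WordPiece()

output = tokenizer.encode('delaware is not afraid of dogs')
print(output.tokens)                 # word pieces produced by the WordPiece model
print(output.ids)                    # the corresponding integer ids
print(tokenizer.decode(output.ids))  # reconstructs the (normalized) text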
[Review comment] The following two methods (presumably text2id and text_id2token_id above) can be removed.

[Reply] Fixed.