forked from Morizeyao/GPT2-Chinese
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval.py
184 lines (160 loc) · 7.69 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import transformers
import torch
import os
import json
import random
import numpy as np
import argparse
from datetime import datetime
from tqdm import tqdm
from torch.nn import DataParallel
def build_files(data_path, tokenized_data_path, num_pieces, full_tokenizer, min_length):
    """Tokenize a JSON corpus and write it out as ``num_pieces`` token-id files.

    The JSON document at ``data_path`` is loaded and iterated; every entry has
    its newlines replaced by ``' [SEP] '``. The entries are split into
    ``num_pieces`` contiguous slices (the remainder goes into the last slice).
    Entries whose character length is not greater than ``min_length`` are
    skipped. Each remaining entry is tokenized, converted to ids and wrapped
    as ``[MASK] <ids> [CLS]``. Every slice is written to
    ``tokenized_data_path + 'tokenized_train_<i>.txt'`` as ids that are each
    followed by a single space.

    Args:
        data_path: Path of the UTF-8 JSON file to read (presumably a list of
            strings, since ``str.replace`` is called on every entry).
        tokenized_data_path: Output directory. It is used as a plain string
            prefix for the file names, so it must end with a path separator.
            Missing directories (including parents) are created.
        num_pieces: Number of output files to produce.
        full_tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        min_length: Entries must be strictly longer than this many characters
            (measured after the ``[SEP]`` substitution) to be kept.
    """
    # makedirs(exist_ok=True) creates missing parents as well and does not
    # fail when the directory already exists, unlike exists() + mkdir().
    os.makedirs(tokenized_data_path, exist_ok=True)

    with open(data_path, 'r', encoding='utf8') as f:
        print('reading lines')
        lines = json.load(f)
    # [SEP] stands for a line break, i.e. it marks the end of a paragraph.
    lines = [line.replace('\n', ' [SEP] ') for line in lines]

    all_len = len(lines)
    # Guarded so that num_pieces == 0 stays a no-op instead of dividing by zero.
    piece_len = all_len // num_pieces if num_pieces else 0

    # The marker ids do not depend on the article, so look them up once.
    mask_id = full_tokenizer.convert_tokens_to_ids('[MASK]')  # start of an article
    cls_id = full_tokenizer.convert_tokens_to_ids('[CLS]')  # end of an article

    for i in tqdm(range(num_pieces)):
        sublines = lines[piece_len * i: piece_len * (i + 1)]
        if i == num_pieces - 1:
            # Append the leftover entries to the last piece.
            sublines.extend(lines[piece_len * (i + 1):])
        # Only keep entries longer than min_length.
        sublines = [full_tokenizer.tokenize(line) for line in sublines
                    if len(line) > min_length]
        sublines = [full_tokenizer.convert_tokens_to_ids(line) for line in sublines]

        full_line = []
        for subline in sublines:
            full_line.append(mask_id)
            full_line.extend(subline)
            full_line.append(cls_id)

        # The name is built by plain concatenation on purpose: the evaluation
        # loop reads the pieces back using the very same concatenation.
        piece_path = tokenized_data_path + 'tokenized_train_{}.txt'.format(i)
        with open(piece_path, 'w') as piece_file:
            # Same bytes as before (every id followed by one space), one write.
            piece_file.write(''.join('{} '.format(token_id) for token_id in full_line))
    print('finish')
def _build_arg_parser():
    """Create the command-line parser for the evaluation script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/eval.json', type=str, required=False, help='原始语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized_eval/', type=str, required=False,
                        help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='batch size')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次')
    parser.add_argument('--stride', default=768, type=int, required=False, help='取数据的窗口步长')
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型起点路径')
    parser.add_argument('--output_dir', default='eval_result/', type=str, required=False, help='结果输出路径')
    return parser


def _sliding_windows(tokens, n_ctx, stride):
    """Return the ``n_ctx``-token windows of ``tokens`` taken every ``stride`` tokens.

    Only windows starting strictly before ``len(tokens) - n_ctx`` are produced,
    so the tokens after the last window are not evaluated.
    """
    return [tokens[start: start + n_ctx]
            for start in range(0, len(tokens) - n_ctx, stride)]


def main():
    """Evaluate a trained GPT-2 language model and write its perplexity.

    Steps:
      1. Parse the command line and load the model config and the tokenizer.
      2. With ``--raw``, tokenize the raw corpus through ``build_files``.
      3. Load the model given by ``--pretrained_model`` (exit status 1 if it
         is missing) and wrap it in ``DataParallel`` when several GPUs are
         visible.
      4. For every tokenized piece, cut the token stream into windows of
         ``n_ctx`` tokens, batch them (dropping the last incomplete batch) and
         accumulate the language-model loss.
      5. Write ``exp(mean batch loss)`` to ``result.txt`` in ``--output_dir``.
    """
    args = _build_arg_parser().parse_args()
    print('args:\n' + repr(args))

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # Create the result directory up front (parents included, no error if present).
    os.makedirs(args.output_dir, exist_ok=True)

    if args.raw:  # build the tokenized dataset from scratch
        print('building files')
        build_files(data_path=args.raw_data_path, tokenized_data_path=args.tokenized_data_path,
                    num_pieces=args.num_pieces, full_tokenizer=full_tokenizer,
                    min_length=args.min_length)
        print('files built')

    if not args.pretrained_model:
        print('you need to specify a trained model.')
        # SystemExit is what exit() raises, without relying on the site module.
        raise SystemExit(1)
    model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.eval()
    model.to(device)

    num_parameters = sum(parameter.numel() for parameter in model.parameters())
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = torch.cuda.device_count() > 1
    if multi_gpu:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)

    # NOTE(review): the message says "training" although this script only evaluates.
    print('starting training')
    overall_step = 0
    total_loss = 0.0  # plain float: sum of the per-batch mean losses
    total_steps = 0

    print('time: {}'.format(datetime.now()))
    for piece_num in range(args.num_pieces):
        # Same naming scheme (plain concatenation) that build_files writes with.
        piece_path = args.tokenized_data_path + 'tokenized_train_{}.txt'.format(piece_num)
        with open(piece_path, 'r') as f:
            tokens = [int(token) for token in f.read().split()]

        samples = _sliding_windows(tokens, n_ctx, args.stride)
        random.shuffle(samples)
        for step in range(len(samples) // args.batch_size):  # drop the last incomplete batch
            batch = samples[step * args.batch_size: (step + 1) * args.batch_size]
            # Inputs and labels are the same ids, so one tensor serves as both.
            batch_inputs = torch.tensor(batch, dtype=torch.long, device=device)

            # Evaluation needs no gradients; no_grad avoids building (and, through
            # the running total, retaining) an autograd graph for every batch.
            with torch.no_grad():
                outputs = model(input_ids=batch_inputs, labels=batch_inputs)
            loss = outputs[0]
            if multi_gpu:
                loss = loss.mean()  # DataParallel returns one loss per replica

            total_loss += loss.item()
            total_steps += 1
            if (overall_step + 1) % args.log_step == 0:
                now = datetime.now()
                print('now time: {}:{}. Step {} of piece {}, ppl {}'.format(
                    now.hour,
                    now.minute,
                    (step + 1),
                    piece_num,
                    torch.exp(loss).item()))
            # Count every batch so that --log_step values other than 1 work.
            overall_step += 1

    if total_steps == 0:
        # Nothing was evaluated (no piece yielded a full batch); a mean loss
        # cannot be computed, so report it instead of dividing by zero.
        print('no complete batch was evaluated, result not written.')
        raise SystemExit(1)

    perplexity = float(np.exp(total_loss / total_steps))
    with open(os.path.join(args.output_dir, 'result.txt'), 'w') as f:
        f.write(str(perplexity))
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()