-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_student_model_init.py
163 lines (139 loc) · 6.25 KB
/
run_student_model_init.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
# Usage:
# python ./bin/model/distil_whisper/create_student_model.py demo_configs/model/distil_whisper/create_student_model.json
"""
Initialise a student Whisper model from a pre-trained teacher model for
teacher-student distillation.
"""
import pdb
import os
import sys
import json
import argparse
import copy
import logging
from typing import Dict
import numpy as np
import torch
from transformers import GenerationConfig, WhisperForConditionalGeneration, WhisperProcessor
logger = logging.getLogger(__name__)
def _spaced_layer_indices(teacher_layers, student_layers):
    """Return, for each student layer, the index of the teacher layer to copy.

    Indices are spread evenly over ``[0, teacher_layers - 1]`` and the last
    student layer is always mapped to the last teacher layer.

    Raises:
        ValueError: if ``student_layers`` is smaller than 1.
    """
    if student_layers < 1:
        raise ValueError(f"The student needs at least one layer, got {student_layers}.")
    indices = np.linspace(0, teacher_layers - 1, student_layers, dtype=int)
    # with a single student layer linspace only yields index 0, so pin the
    # final entry to the teacher's last layer explicitly
    indices[-1] = teacher_layers - 1
    return indices.tolist()


def _copy_layers(student_layers, teacher_layers, mapping):
    """Load ``teacher_layers[mapping[i]]`` weights into ``student_layers[i]``."""
    for student_idx, teacher_idx in enumerate(mapping):
        student_layers[student_idx].load_state_dict(teacher_layers[teacher_idx].state_dict())


def init_student_model_from_teacher(
    teacher_checkpoint,
    processor_name_or_path,
    encoder_layers=None,
    decoder_layers=2,
    save_dir=None,
):
    """Initialise a student Whisper model by copying layers from a teacher.

    The student re-uses the teacher's config with a reduced number of encoder
    and/or decoder layers. Its layers are initialised from evenly spaced
    teacher layers, always including the teacher's last layer. A dummy forward
    pass is run at the end to check that the resulting model is usable.

    Args:
        teacher_checkpoint: name or path passed to ``from_pretrained`` to load
            the teacher model and its generation config.
        processor_name_or_path: name or path passed to
            ``WhisperProcessor.from_pretrained``.
        encoder_layers: number of encoder layers for the student. ``None`` or a
            non-positive value keeps all of the teacher's encoder layers.
        decoder_layers: number of decoder layers for the student (at least 1).
        save_dir: if not ``None``, the student model, processor and generation
            config are saved there and the model is re-loaded from that
            directory before the forward-pass check.

    Raises:
        ValueError: if ``decoder_layers`` is ``None`` or smaller than 1.
        RuntimeError: if the student has parameters missing from the teacher's
            state dict, or if layer keys are unexpectedly left over although
            the student has as many layers as the teacher.
    """
    if decoder_layers is None or decoder_layers < 1:
        raise ValueError(f"`decoder_layers` must be a positive integer, got {decoder_layers}.")

    teacher_model = WhisperForConditionalGeneration.from_pretrained(
        teacher_checkpoint,
        low_cpu_mem_usage=True,
    )
    processor = WhisperProcessor.from_pretrained(processor_name_or_path)
    generation_config = GenerationConfig.from_pretrained(teacher_checkpoint)

    teacher_config = teacher_model.config
    teacher_encoder_layers = teacher_config.encoder_layers
    teacher_decoder_layers = teacher_config.decoder_layers

    # `None` (the default) and non-positive values both mean "keep the full encoder"
    keep_full_encoder = encoder_layers is None or encoder_layers <= 0
    student_encoder_layers = teacher_encoder_layers if keep_full_encoder else encoder_layers

    student_config = copy.deepcopy(teacher_config)
    student_config.update(
        {
            "encoder_layers": student_encoder_layers,
            "decoder_layers": decoder_layers,
        }
    )

    encoder_mapping = _spaced_layer_indices(teacher_encoder_layers, student_encoder_layers)
    decoder_mapping = _spaced_layer_indices(teacher_decoder_layers, decoder_layers)

    # init the student params from the teacher model; strict=False because the
    # teacher carries weights for layers the (smaller) student does not have
    student_model = WhisperForConditionalGeneration(student_config)
    missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
    if missing_keys:
        raise RuntimeError(
            "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
            f"Missing key(s) in state_dict: {missing_keys}"
        )
    # when a stack has the same depth as the teacher's, no layer keys may be left over
    if decoder_layers == teacher_decoder_layers:
        decoder_keys = [key for key in unexpected_keys if "model.decoder.layers" in key]
        if decoder_keys:
            raise RuntimeError(
                "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
                f"Unexpected key(s) in state_dict: {decoder_keys}"
            )
    if student_encoder_layers == teacher_encoder_layers:
        encoder_keys = [key for key in unexpected_keys if "model.encoder.layers" in key]
        if encoder_keys:
            raise RuntimeError(
                "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
                f"Unexpected key(s) in state_dict: {encoder_keys}"
            )

    # re-introduce the pre-defined layers from the teacher: the non-strict load
    # above matched layers by position, not by the spaced mapping
    _copy_layers(student_model.model.decoder.layers, teacher_model.model.decoder.layers, decoder_mapping)
    if not keep_full_encoder:
        _copy_layers(student_model.model.encoder.layers, teacher_model.model.encoder.layers, encoder_mapping)

    # remove the teacher params and model
    del teacher_model

    # save the converted weights and model
    if save_dir is not None:
        student_model.save_pretrained(save_dir)
        # we also need to correctly save the processor and generation config
        processor.save_pretrained(save_dir)
        generation_config.save_pretrained(save_dir)

        # check we can load what was just written - only possible when something was saved
        logger.info("Checking we can load the saved model...")
        student_model = WhisperForConditionalGeneration.from_pretrained(
            save_dir,
            low_cpu_mem_usage=True,
        )
        processor = WhisperProcessor.from_pretrained(save_dir)

    # define some dummy inputs: 16000 samples of ones at a 16 kHz sampling rate
    input_features = processor(np.ones(16000), sampling_rate=16000, return_tensors="pt").input_features
    decoder_start_token_id = student_model.config.decoder_start_token_id
    decoder_input_ids = torch.ones((input_features.shape[0], 1), dtype=torch.long) * decoder_start_token_id

    # do a forward pass - outputs will be gibberish for the initialised model so we can't check them
    # but we can make sure the model runs as expected
    logger.info("Checking we can run the converted model forward...")
    with torch.no_grad():
        _ = student_model(input_features, decoder_input_ids=decoder_input_ids).logits
    logger.info("Conversion successful!")
if __name__ == "__main__":
    # Script entry point: read a JSON config whose path is the first CLI
    # argument, build the student model from it, then copy the config file
    # into the student model directory.
    import shutil

    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {sys.argv[0]} <config.json>")
    config_path = sys.argv[1]

    # context manager so the file handle is closed deterministically
    with open(config_path, "r", encoding="utf-8") as config_file:
        configs: Dict = json.load(config_file)
    print(configs)

    init_student_model_from_teacher(
        teacher_checkpoint=configs["teacher_model_name_or_path"],
        processor_name_or_path=configs["processor_name_or_path"],
        encoder_layers=configs["student_model_encoder_layers"],
        decoder_layers=configs["student_model_decoder_layers"],
        save_dir=configs["student_model_dir"],
    )

    # shutil.copy instead of a shell `cp`: no shell quoting issues with paths
    # containing spaces or metacharacters, portable, and failures raise
    shutil.copy(config_path, configs["student_model_dir"])