-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
127 lines (110 loc) · 3.98 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
from dotenv import load_dotenv
import torch
import torch.nn as nn
import bitsandbytes as bnb
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from datasets import load_dataset
from peft import (
LoraConfig,
PeftConfig,
get_peft_model,
prepare_model_for_kbit_training,
)
import wandb
import time
# Pull secrets (e.g. WANDB_API_KEY) from a local .env file into os.environ.
load_dotenv()
# --- Model / tokenizer selection ---
# OpenELM has no tokenizer of its own on the Hub; fall back to the Llama-2
# tokenizer for it (presumably the tokenizer OpenELM was trained with — TODO
# confirm), otherwise use the model's own tokenizer.
model_name = "apple/OpenELM-270M"
tokenizer_name = "meta-llama/Llama-2-7b-hf" if model_name == "apple/OpenELM-270M" else model_name
# Max token length used when truncating tokenized examples.
# NOTE(review): 8096 looks like a typo for 8192 — confirm the intended context size.
max_context_length = 8096
# --- Dataset selection ---
dataset_name = 'HuggingFaceTB/cosmopedia'
dataset_subset = "openstax"
# Fraction of the train split held out for evaluation.
dataset_test_split_ratio = 0.05
# Checkpoint output directory; {timestamp} is filled in per run.
output_dir_tmpl = "/root/openelm/cosmopedia-openstax-{timestamp}"
# --- LoRA configuration ---
lora_r = 16  # LoRA rank
lora_alpha = 32  # LoRA scaling factor
lora_targets = ["qkv_proj", "out_proj", "proj_1", "proj_2"]  # OpenELM module names to adapt
# W&B run name; {timestamp} is filled in per run.
run_name_tmpl = "openelm-cosmopedia-openstax-{timestamp}"
# Hyperparameters
num_epochs = 2
lr = 2e-4
logging_steps = 50  # log training metrics every N steps
eval_steps = 1000  # run evaluation every N steps
save_steps = 250  # save a checkpoint every N steps
# Modify for prompt templates
def generate_prompt(system: str, user: str, assistant: str) -> str:
    """Format a (system, user, assistant) triple as one Llama-2 chat training example."""
    rendered = (
        f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n"
        f"{user} [/INST]\n\n{assistant}</s>"
    )
    return rendered.strip()
def setup_logging():
    """Authenticate with Weights & Biases using the WANDB_API_KEY env var."""
    api_key = os.getenv("WANDB_API_KEY")
    wandb.login(key=api_key)
def train():
    """Fine-tune OpenELM-270M on the cosmopedia/openstax subset with 4-bit QLoRA.

    Side effects: downloads the model, tokenizer and dataset from the
    Hugging Face Hub, logs the run to Weights & Biases, and writes
    checkpoints under the timestamped output directory. Returns None.
    """
    # Per-run metadata: a unix timestamp keys both the W&B run name and the
    # checkpoint directory so repeated runs never collide.
    timestamp = str(int(time.time()))
    run_name = run_name_tmpl.format(timestamp=timestamp)
    output_dir = output_dir_tmpl.format(timestamp=timestamp)
    # --- Load the model in 4-bit NF4 and convert it to a PEFT model ---
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        # BUG FIX: the original passed `load_4bit_use_double_quant`, which is
        # not a BitsAndBytesConfig parameter, so nested quantization was never
        # actually enabled; the correct keyword is `bnb_4bit_use_double_quant`.
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
    # Llama tokenizers define no pad token; reuse EOS so padding/batching works.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map='auto',
        # Use bfloat16 consistently: it matches bnb_4bit_compute_dtype above
        # and the bf16=True training flag below (the original mixed in float16).
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
        trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_targets,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    # Function for tokenizing the dataset: wrap each (prompt, text) pair in the
    # Llama-2 chat template, then tokenize with truncation. Padding is left to
    # the data collator (dynamic, per batch); the original's `padding=True` was
    # a no-op on single examples anyway.
    def generate_and_tokenize_prompt(example):
        # TYPO FIX: "helful" -> "helpful" in the system prompt.
        full_prompt = generate_prompt(
            "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
            example["prompt"],
            example["text"],
        )
        return tokenizer(full_prompt, truncation=True, max_length=max_context_length)

    # --- Load and split the dataset ---
    dataset = load_dataset(dataset_name, dataset_subset, split="train")
    split_dataset = dataset.train_test_split(test_size=dataset_test_split_ratio)
    train_dataset, test_dataset = split_dataset["train"], split_dataset["test"]
    train_dataset = train_dataset.shuffle().map(generate_and_tokenize_prompt, num_proc=16)
    test_dataset = test_dataset.map(generate_and_tokenize_prompt, num_proc=16)
    # --- Logging ---
    setup_logging()
    # --- Training setup ---
    training_args = transformers.TrainingArguments(
        auto_find_batch_size=True,
        num_train_epochs=num_epochs,
        learning_rate=lr,
        bf16=True,
        eval_strategy="steps",
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        output_dir=output_dir,
        save_strategy='steps',
        save_steps=save_steps,
        report_to="wandb",
        run_name=run_name,
    )
    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        args=training_args,
        # mlm=False -> plain causal-LM labels (inputs shifted), not masked LM.
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    # The KV cache is useless during training and conflicts with gradient
    # checkpointing (presumably enabled by prepare_model_for_kbit_training —
    # TODO confirm); disable it to silence warnings.
    model.config.use_cache = False
    # --- Training ---
    trainer.train()
    # Post-processing: flush and close the W&B run.
    wandb.finish()