forked from NExT-GPT/NExT-GPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
base.yaml
45 lines (36 loc) · 1.36 KB
/
base.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Base configuration: global settings plus the encoder, alignment, and
# diffusion-model choices listed in the sections below.

# ========= system global ========== #
# Per-model entries keyed by model id; each entry names its model and agent.
# NOTE(review): nesting of model_name/agent_name under models.nextgpt was
# reconstructed from key order — confirm against the config loader.
models:
  nextgpt:
    model_name: NextGPTModel
    agent_name: DeepSpeedAgent

seed: 13
max_length: 512  # max length of the user input prompt
logging_step: 5
num_clip_tokens: 77
gen_emb_dim: 768
# Relative path; presumably resolved against the launch directory — confirm.
pretrained_ckpt_path: ../ckpt/pretrained_ckpt/

# ========= LLM ========== #
vicuna_version: 7b_v0  # [7b_v0, ]

# ========= multimodal encoder ========== #
imagebind_version: huge

# ========= text-to-image alignment tuning ========== #
# n_img_tokens and num_gen_img_tokens carry the same value here;
# presumably they must be changed together — TODO confirm.
n_img_tokens: 4
text_emb_to_img_layers: [-1]
num_gen_img_tokens: 4
text_fc_to_img_mode: transformer  # [qformer, transformer]

# ========= text-to-video alignment tuning ========== #
# Same layout as the image section above, with video-specific token counts.
n_video_tokens: 24
text_emb_to_video_layers: [-1]
num_gen_video_tokens: 24
text_fc_to_video_mode: transformer  # [qformer, transformer]

# ========= text-to-audio alignment tuning ========== #
# Same layout as the image section above, with audio-specific token counts.
n_audio_tokens: 8
text_emb_to_audio_layers: [-1]
num_gen_audio_tokens: 8
text_fc_to_audio_mode: transformer  # [qformer, transformer]

# ========= image diffusion model ========== #
image_diffusion: runwayml/stable-diffusion-v1-5  # [runwayml/stable-diffusion-v1-5, stabilityai/stable-diffusion-2]

# ========= video diffusion model ========== #
video_diffusion: cerspense/zeroscope_v2_576w

# ========= audio diffusion model ========== #
audio_diffusion: cvssp/audioldm-l-full  # [cvssp/audioldm-l-full, cvssp/audioldm-s-full-v2]