Rename Olmo1124 to Olmo2 #10500

Merged: 1 commit, Nov 25, 2024
convert_hf_to_gguf.py: 6 changes (3 additions, 3 deletions)
@@ -3040,9 +3040,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(self.map_tensor_name(name), data_torch)]


@Model.register("Olmo1124ForCausalLM")
class Olmo1124Model(Model):
model_arch = gguf.MODEL_ARCH.OLMO_1124
@Model.register("Olmo2ForCausalLM")
class Olmo2Model(Model):
model_arch = gguf.MODEL_ARCH.OLMO2


@Model.register("OlmoeForCausalLM")
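With the class renamed, the converter picks up Olmo 2 checkpoints through the same registration mechanism as before: `Model.register` keys the class by the architecture string the checkpoint declares (here `Olmo2ForCausalLM`). A minimal sketch of that registry pattern, simplified and not the script's actual implementation:

```python
# Simplified sketch of the string-keyed registry used by convert_hf_to_gguf.py.
# The real Model.register does more bookkeeping; only the lookup idea is shown.
_model_classes: dict[str, type] = {}

def register(*names: str):
    def wrapper(cls: type) -> type:
        for name in names:
            _model_classes[name] = cls
        return cls
    return wrapper

@register("Olmo2ForCausalLM")
class Olmo2Model:
    model_arch = "olmo2"

# The converter reads the architecture name from the checkpoint's config and
# resolves it to the registered class.
print(_model_classes["Olmo2ForCausalLM"].model_arch)  # -> "olmo2"
```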
gguf-py/gguf/constants.py: 6 changes (3 additions, 3 deletions)
@@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
COMMAND_R = auto()
DBRX = auto()
OLMO = auto()
-OLMO_1124 = auto()
+OLMO2 = auto()
OLMOE = auto()
OPENELM = auto()
ARCTIC = auto()
@@ -405,7 +405,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
-MODEL_ARCH.OLMO_1124: "olmo_1124",
+MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic",
@@ -1071,7 +1071,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
-MODEL_ARCH.OLMO_1124: [
+MODEL_ARCH.OLMO2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
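For gguf-py users, the rename shows up in both the enum member and the serialized architecture name. A quick sketch, assuming the package exposes `MODEL_ARCH`, `MODEL_ARCH_NAMES`, and the per-architecture tensor table `MODEL_TENSORS` at the top level as usual:

```python
import gguf

# Enum member and on-disk name both changed from OLMO_1124 / "olmo_1124" to OLMO2 / "olmo2".
arch = gguf.MODEL_ARCH.OLMO2
print(gguf.MODEL_ARCH_NAMES[arch])  # expected: "olmo2"

# The tensor list edited above is keyed by the same enum member.
print(gguf.MODEL_TENSOR.TOKEN_EMBD in gguf.MODEL_TENSORS[arch])  # expected: True
```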
gguf-py/gguf/tensor_mapping.py: 28 changes (14 additions, 14 deletions)
@@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo_1124
"model.embed_tokens", # llama-hf nemotron olmoe olmo2
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
"output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
@@ -66,7 +66,7 @@
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2
"norm", # llama-pth
"transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2
@@ -145,7 +145,7 @@ class TensorNameMap:

# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
@@ -157,7 +157,7 @@

# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
@@ -170,7 +170,7 @@

# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
@@ -188,7 +188,7 @@
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j
@@ -215,7 +215,7 @@
),

MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
),

# Rotary embeddings
@@ -250,7 +250,7 @@

# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
),

MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,7 +273,7 @@
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
@@ -314,7 +314,7 @@

# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
"layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais
@@ -346,7 +346,7 @@
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
@@ -383,7 +383,7 @@
MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
@@ -392,7 +392,7 @@
MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
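The tensor_mapping.py edits only retag comments, so the mapping behaviour is unchanged: the same HF tensor names now advertise `olmo2` instead of `olmo_1124`. A small usage sketch, assuming gguf-py's `get_tensor_name_map` helper and the `TensorNameMap.get_name(..., try_suffixes=...)` signature used by the converter:

```python
import gguf

# Build the HF-name -> GGUF-name map for the renamed architecture.
# 32 is an illustrative block count; use the model's real layer count.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.OLMO2, 32)

# "model.layers.{bid}.self_attn.q_proj" is listed for olmo2 above, so layer 0's
# query projection should resolve to the generic GGUF tensor name.
name = tmap.get_name("model.layers.0.self_attn.q_proj.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # expected: "blk.0.attn_q.weight"
```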
src/llama.cpp: 18 changes (9 additions, 9 deletions)
@@ -179,7 +179,7 @@ enum llm_arch {
LLM_ARCH_COMMAND_R,
LLM_ARCH_DBRX,
LLM_ARCH_OLMO,
-LLM_ARCH_OLMO_1124,
+LLM_ARCH_OLMO2,
LLM_ARCH_OLMOE,
LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC,
@@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" },
-{ LLM_ARCH_OLMO_1124, "olmo_1124" },
+{ LLM_ARCH_OLMO2, "olmo2" },
{ LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" },
@@ -1210,7 +1210,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
},
},
{
-LLM_ARCH_OLMO_1124,
+LLM_ARCH_OLMO2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@@ -5900,7 +5900,7 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
-case LLM_ARCH_OLMO_1124:
+case LLM_ARCH_OLMO2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -8593,7 +8593,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
-case LLM_ARCH_OLMO_1124:
+case LLM_ARCH_OLMO2:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -14483,7 +14483,7 @@ struct llm_build_context {
return gf;
}

-struct ggml_cgraph * build_olmo_1124() {
+struct ggml_cgraph * build_olmo2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16799,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_olmo();
} break;
-case LLM_ARCH_OLMO_1124:
+case LLM_ARCH_OLMO2:
{
-result = llm.build_olmo_1124();
+result = llm.build_olmo2();
} break;
case LLM_ARCH_OLMOE:
{
@@ -20078,7 +20078,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE:
-case LLM_ARCH_OLMO_1124:
+case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE:
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
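On the C++ side the new `LLM_ARCH_OLMO2` entry is matched against the `general.architecture` string stored in the GGUF file, and graph building dispatches to `build_olmo2()`. A rough way to check that a converted file carries the new name, assuming gguf-py's `GGUFReader` and its raw field layout (the decode step below is an assumption about `ReaderField`, not a documented API; the path is hypothetical):

```python
from gguf import GGUFReader

# Hypothetical path to a model converted after this change.
reader = GGUFReader("OLMo-2-1124-7B-F16.gguf")
field = reader.fields["general.architecture"]

# ReaderField keeps raw numpy parts; the last indexed part should hold the string bytes.
arch = bytes(field.parts[field.data[-1]]).decode("utf-8")
print(arch)  # expected: "olmo2" -- the string llama.cpp looks up in LLM_ARCH_NAMES
```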