From 1f2e91ad0c8dcf2658482e29582dfa4b87a682fe Mon Sep 17 00:00:00 2001 From: xffxff <1247714429@qq.com> Date: Tue, 26 Nov 2024 05:52:33 +0000 Subject: [PATCH 1/2] remove local vllm implementation --- aria/vllm/__init__.py | 0 aria/vllm/aria.py | 1107 ----------------------------------------- docs/inference.md | 15 +- 3 files changed, 4 insertions(+), 1118 deletions(-) delete mode 100644 aria/vllm/__init__.py delete mode 100644 aria/vllm/aria.py diff --git a/aria/vllm/__init__.py b/aria/vllm/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/aria/vllm/aria.py b/aria/vllm/aria.py deleted file mode 100644 index 56407f4..0000000 --- a/aria/vllm/aria.py +++ /dev/null @@ -1,1107 +0,0 @@ -# Copyright 2024 Rhymes AI. All rights reserved. -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import math -from typing import Iterable, List, Optional, Tuple - -import numpy as np -import torch -import torch.nn as nn -from PIL import Image -from transformers import LlamaConfig -from transformers.utils import logging -from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, VllmConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, -) -from vllm.inputs import INPUT_REGISTRY, token_inputs -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput, SamplingMetadata -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.llama import ( - LlamaAttention, - LlamaDecoderLayer, - LlamaMLP, - LlamaModel, - RMSNorm, -) -from vllm.model_executor.models.utils import ( - AutoWeightsLoader, - WeightsMapper, - make_layers, - maybe_prefix, - merge_multimodal_embeddings, -) -from vllm.model_executor.utils import set_weight_attrs -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.utils import ( - cached_get_tokenizer, - repeat_and_pad_placeholder_tokens, -) -from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of - -logger = logging.get_logger(__name__) - -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from torch.nn.init import trunc_normal_ -from transformers.activations import ACT2FN -from transformers.models.idefics2.configuration_idefics2 import 
Idefics2VisionConfig -from vllm.config import QuantizationConfig -from vllm.model_executor.models.idefics2_vision_model import Idefics2VisionTransformer - - -class AriaVisionConfig(Idefics2VisionConfig): - model_type = "aria_vision_model" - - -class IdentityOp(torch.nn.Module): - """ - An identity operation that returns the input unchanged. - - This can be used as a placeholder or to maintain architectural consistency - when a specific operation is not needed. - """ - - def __init__(self, *args, **kwargs): - super().__init__() - - def forward(self, x, *args, **kwargs): - return x - - -class AriaVisionTransformer(Idefics2VisionTransformer): - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, quant_config, prefix) - self.post_layernorm = IdentityOp() - - -class AriaVisionModel(nn.Module): - config_class = AriaVisionConfig - - def __init__( - self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - *, - prefix: str = "", - ) -> None: - super().__init__() - - self.vision_model = AriaVisionTransformer( - config, - quant_config, - prefix=f"{prefix}.vision_model", - ) - - def forward( - self, - pixel_values: torch.Tensor, - pixel_mask: Optional[torch.BoolTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - image_atts = self._create_image_attention_mask(patch_attention_mask) - - return vit_oup, image_atts - - def _create_patch_attention_mask(self, pixel_mask): - if pixel_mask is None: - return None - - patches_subgrid = pixel_mask.unfold( - dimension=1, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ).unfold( - dimension=2, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - def _create_image_attention_mask(self, patch_attention_mask): - if patch_attention_mask is None: - return None - - flattened_mask = patch_attention_mask.flatten(1) - return torch.logical_not(flattened_mask) - - -class FFN(nn.Module): - """ - Feed-Forward Network module. - - Args: - embed_dim (int): Input embedding dimension. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - """ - - def __init__(self, embed_dim, ff_dim, output_dim): - super().__init__() - self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) - self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) - self.act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_states = self.act(self.linear_in(hidden_states)) - hidden_states = self.linear_out(hidden_states) - return hidden_states - - -class CrossAttention(nn.Module): - """ - Cross-Attention module. - - Args: - kv_dim (int): Dimension of key and value. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - drop_out_rate (float): Dropout rate. Default is 0. 
- """ - - def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - self.dropout = nn.Dropout(drop_out_rate) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward(self, x, hidden_states, attn_mask=None, add_residual=False): - """ - Forward pass of the CrossAttention module. - - Args: - x (torch.Tensor): Input tensor for key and value. - hidden_states (torch.Tensor): Input tensor for query. - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - add_residual (bool): Whether to add residual connection. Default is False. - - Returns: - torch.Tensor: Output tensor after cross-attention. - """ - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - if add_residual: - attn_output = hidden_states + self.dropout(self.linear(attn_output)) - else: - attn_output = self.dropout(self.linear(attn_output)) - - return attn_output - - -class AriaProjector(nn.Module): - """ - A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. - - Args: - patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, - e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - kv_dim (int): Dimension of key and value. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. - - Outputs: - A tensor with the shape of (batch_size, query_number, output_dim) - """ - - def __init__( - self, - patch_to_query_dict, - embed_dim, - num_heads, - kv_dim, - ff_dim, - output_dim, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads - - self.query = nn.Parameter( - torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) - ) - - trunc_normal_(self.query, std=0.02) - - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) - - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def forward(self, x, attn_mask=None): - """ - Forward pass of the Projector module. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - - Returns: - torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). 
- """ - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) - - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert ( - query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" - - queries = queries[:, :query_num, :] - - if attn_mask is not None: - attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) - attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) - - attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) - - out = self.ffn(self.ln_ffn(attention_out)) - - return out - - -class FFN(nn.Module): - """ - Feed-Forward Network module. - - Args: - embed_dim (int): Input embedding dimension. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - """ - - def __init__(self, embed_dim, ff_dim, output_dim): - super().__init__() - self.linear_in = nn.Linear(embed_dim, ff_dim, bias=False) - self.linear_out = nn.Linear(ff_dim, output_dim, bias=False) - self.act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_states = self.act(self.linear_in(hidden_states)) - hidden_states = self.linear_out(hidden_states) - return hidden_states - - -class CrossAttention(nn.Module): - """ - Cross-Attention module. - - Args: - kv_dim (int): Dimension of key and value. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - drop_out_rate (float): Dropout rate. Default is 0. - """ - - def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - self.dropout = nn.Dropout(drop_out_rate) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward(self, x, hidden_states, attn_mask=None, add_residual=False): - """ - Forward pass of the CrossAttention module. - - Args: - x (torch.Tensor): Input tensor for key and value. - hidden_states (torch.Tensor): Input tensor for query. - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - add_residual (bool): Whether to add residual connection. Default is False. - - Returns: - torch.Tensor: Output tensor after cross-attention. - """ - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - if add_residual: - attn_output = hidden_states + self.dropout(self.linear(attn_output)) - else: - attn_output = self.dropout(self.linear(attn_output)) - - return attn_output - - -class AriaProjector(nn.Module): - """ - A projection module with one cross attention layer and one FFN layer, which projects ViT's outputs into MoE's inputs. - - Args: - patch_to_query_dict (dict): Maps patch numbers to their corresponding query numbers, - e.g., {1225: 128, 4900: 256}. This allows for different query sizes based on image resolution. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - kv_dim (int): Dimension of key and value. - ff_dim (int): Hidden dimension of the feed-forward network. 
- output_dim (int): Output dimension. - norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. - - Outputs: - A tensor with the shape of (batch_size, query_number, output_dim) - """ - - def __init__( - self, - patch_to_query_dict, - embed_dim, - num_heads, - kv_dim, - ff_dim, - output_dim, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads - - self.query = nn.Parameter( - torch.zeros(max(patch_to_query_dict.values()), self.embed_dim) - ) - - trunc_normal_(self.query, std=0.02) - - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) - - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def forward(self, x, attn_mask=None): - """ - Forward pass of the Projector module. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, num_patches, kv_dim). - attn_mask (torch.Tensor, optional): Attention mask. Default is None. - - Returns: - torch.Tensor: Output tensor of shape (batch_size, query_number, output_dim). - """ - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) - - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert ( - query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" - - queries = queries[:, :query_num, :] - - if attn_mask is not None: - attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) - attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) - - attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) - - out = self.ffn(self.ln_ffn(attention_out)) - - return out - - -class AriaMoELMConfig(LlamaConfig): - """ - Configuration class for AriaMoE language model. - - This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. - """ - - model_type = "aria_moe_lm" - - def __init__( - self, - moe_intermediate_size: int = 4096, - moe_num_experts: int = 8, - moe_topk: int = 2, - moe_z_loss_coeff: float = 1e-5, - moe_aux_loss_coeff: float = 1e-3, - moe_num_shared_experts: int = 2, - **kwargs, - ): - """ - Initialize the AriaMoELMConfig. - - Args: - moe_intermediate_size (int): The intermediate size for MoE layers. Default is 4096. - moe_num_experts (int): The number of experts in the MoE layer. Default is 8. - moe_topk (int): The number of top experts to route to for each token. Default is 2. - moe_z_loss_coeff (float): The coefficient for the auxiliary z-loss. Default is 1e-5. - moe_aux_loss_coeff (float): The coefficient for the auxiliary load balancing loss. Default is 1e-3. - moe_num_shared_experts (int): The number of shared experts. Default is 2. - **kwargs: Additional keyword arguments to be passed to the parent LlamaConfig. 
- """ - super().__init__(**kwargs) - self.moe_intermediate_size = moe_intermediate_size - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_z_loss_coeff = moe_z_loss_coeff - self.moe_aux_loss_coeff = moe_aux_loss_coeff - self.moe_num_shared_experts = moe_num_shared_experts - - -class Experts(nn.Module): - def __init__(self, config: AriaMoELMConfig): - super().__init__() - self.config = config - - self.router_weight = nn.Parameter( - torch.empty((self.config.moe_num_experts, self.config.hidden_size)) - ) - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - if self.tp_size > config.moe_num_experts: - raise ValueError( - f"Tensor model parallel size {self.tp_size} is greater than the number of experts {config.moe_num_experts}" - ) - - self.w1 = nn.Parameter( - torch.empty( - ( - config.moe_num_experts, - config.moe_intermediate_size * 2 // self.tp_size, - config.hidden_size, - ) - ) - ) - self.w2 = nn.Parameter( - torch.empty( - ( - config.moe_num_experts, - config.hidden_size, - config.moe_intermediate_size // self.tp_size, - ) - ) - ) - set_weight_attrs( - self.router_weight, {"weight_loader": self._weight_loader_for_router} - ) - set_weight_attrs(self.w1, {"weight_loader": self._weight_loader_for_w1}) - set_weight_attrs(self.w2, {"weight_loader": self._weight_loader_for_w2}) - - def _weight_loader_for_router( - self, param: nn.Parameter, loaded_weight: torch.Tensor - ): - param.data.copy_(loaded_weight) - - def _weight_loader_for_w1(self, param: nn.Parameter, loaded_weight: torch.Tensor): - # the shape of loaded_weight is (num_experts, hidden_size, 2 * moe_intermediate_size) - if self.tp_size > 1: - up, gate = loaded_weight.chunk(2, dim=-1) - up_current_rank = up.chunk(self.tp_size, dim=-1)[self.tp_rank] - gate_current_rank = gate.chunk(self.tp_size, dim=-1)[self.tp_rank] - up_and_gate = torch.cat( - [up_current_rank, gate_current_rank], dim=-1 - ).transpose(1, 2) - param.data.copy_(up_and_gate) - else: - param.data.copy_(loaded_weight.transpose(1, 2)) - - def _weight_loader_for_w2(self, param: nn.Parameter, loaded_weight: torch.Tensor): - # the shape of loaded_weight is (num_experts, moe_intermediate_size, hidden_size) - if self.tp_size > 1: - down_current_rank = loaded_weight.chunk(self.tp_size, dim=1)[self.tp_rank] - param.data.copy_(down_current_rank.transpose(1, 2)) - else: - param.data.copy_(loaded_weight.transpose(1, 2)) - - def forward(self, hidden_states): - router_output = torch.nn.functional.linear(hidden_states, self.router_weight) - - def custom_routing_function(hidden_states, router_output, topk, renormalize): - top_logits, top_indices = torch.topk( - router_output, k=self.config.moe_topk, dim=1 - ) - scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32) - return scores, top_indices.to(torch.int32) - - hidden_states_shape = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_states.size(-1)) - final_hidden_states = fused_moe( - hidden_states, - self.w1, - self.w2, - router_output, - self.config.moe_topk, - False, - inplace=True, - custom_routing_function=custom_routing_function, - ) - final_hidden_states = final_hidden_states.view(hidden_states_shape) - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) - return final_hidden_states - - -class MoELayer(nn.Module): - """ - Mixture of Experts (MoE) Layer for the AriaMoE model. 
- - This layer implements the MoE mechanism, which routes input tokens to different experts - based on a routing algorithm, processes them through the experts, and then combines - the outputs. - """ - - def __init__( - self, - config: AriaMoELMConfig, - quant_config: Optional[QuantizationConfig], - lora_config: Optional[LoRAConfig], - ) -> None: - super().__init__() - self.config = config - - self.experts = Experts(config) - self.shared_experts = LlamaMLP( - config.hidden_size, - config.moe_intermediate_size * config.moe_num_shared_experts, - "silu", - quant_config=quant_config, - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """ - Forward pass of the MoE Layer. - - Args: - hidden_states (torch.Tensor): Input tensor of shape (batch_size, sequence_length, hidden_size). - - Returns: - torch.Tensor: Output tensor after passing through the MoE layer. - """ - - shared_expert_output = self.shared_experts(hidden_states) - sparse_expert_output = self.experts(hidden_states) - - return sparse_expert_output + shared_expert_output - - -class MoEDecoderLayer(LlamaDecoderLayer): - """ - Custom Decoder Layer for the AriaMoE model which modifies the standard `LlamaDecoderLayer` by - replacing the traditional MLP with a Mixture of Experts (MoE) Layer. - """ - - def __init__( - self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - nn.Module.__init__(self) - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - # Support abacusai/Smaug-72B-v0.1 with attention_bias - # Support internlm/internlm-7b with bias - attention_bias = getattr(config, "attention_bias", False) or getattr( - config, "bias", False - ) - self.self_attn = LlamaAttention( - config=config, - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=getattr( - config, "num_key_value_heads", config.num_attention_heads - ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - quant_config=quant_config, - bias=attention_bias, - cache_config=cache_config, - prefix=f"{prefix}.self_attn", - ) - self.mlp = MoELayer(config, quant_config=quant_config, lora_config=lora_config) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - -class AriaMoELMModel(LlamaModel): - """ - Custom LlamaModel for the AriaMoE model which modifies the standard LlamaModel by - replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - # FIXME: this is a hack to disable the compilation of the model - self.do_not_compile = True - - self.layers = None - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MoEDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - ), - prefix=f"{prefix}.layers", - ) - - -def build_mm_projector(config): - """ - Builds and returns an AriaProjector instance based on the provided configuration. - - Args: - config (AriaConfig): The configuration object containing necessary parameters. - - Returns: - AriaProjector: An instance of the AriaProjector class. - """ - return AriaProjector( - patch_to_query_dict=config.projector_patch_to_query_dict, - embed_dim=config.vision_config.hidden_size, - num_heads=config.vision_config.num_attention_heads, - kv_dim=config.vision_config.hidden_size, - ff_dim=config.text_config.hidden_size, - output_dim=config.text_config.hidden_size, - ) - - -def _select_best_resolution( - img_width: int, img_height: int, target_ratios: List[List[int]], patch_size: int -): - """ - Selects the best resolution from a list of possible resolutions based on the original size. - - Args: - img_width: the original widths of images. - img_height: the original heights of images. - target_ratios (2d numpy array): dimension size (M,2) - patch_size (int): image patch size - - Returns: - tuple: The best fit resolution in the format (width, height). - """ - - aspect_ratio = img_width / img_height - best_ratio_diff = float("inf") - best_ratio_w, best_ratio_h = 1, 1 - area = np.int32(img_width) * np.int32(img_height) - for ratio in target_ratios: - target_aspect_ratio = ratio[0] / ratio[1] - ratio_diff = abs(aspect_ratio - target_aspect_ratio) - if ratio_diff < best_ratio_diff: - best_ratio_diff = ratio_diff - best_ratio_w, best_ratio_h = ratio[0], ratio[1] - elif ( - ratio_diff == best_ratio_diff - and area > 0.5 * patch_size * patch_size * ratio[0] * ratio[1] - ): - best_ratio_w, best_ratio_h = ratio[0], ratio[1] - - return best_ratio_w, best_ratio_h - - -def split_image( - image: Image.Image, - split_image: bool, - split_ratio: List[List[int]] = [ - [1, 2], - [1, 3], - [1, 4], - [1, 5], - [1, 6], - [1, 7], - [1, 8], - [2, 4], - [2, 3], - [2, 2], - [2, 1], - [3, 1], - [3, 2], - [4, 1], - [4, 2], - [5, 1], - [6, 1], - [7, 1], - [8, 1], - ], - patch_size: int = 980, -) -> List[Image.Image]: - """ - Split image into multiple patches - - Args: - image (PIL.Image): Input image. - split_image (bool): Whether to split the image into patches. - split_ratio (2d numpy array): dimension size (M,2) - patch_size (int): image patch size - - Returns: - List[PIL.Image]: List of splitted images. 
- """ - if split_image: - ratio_width, ratio_height = _select_best_resolution( - image.width, image.height, split_ratio, patch_size - ) - resize_width = patch_size * ratio_width - resize_height = patch_size * ratio_height - blocks = ratio_width * ratio_height - resized_img = image.resize((resize_width, resize_height)) - processed_images = [] - for i in range(blocks): - box = ( - (i % (resize_width // patch_size)) * patch_size, - (i // (resize_width // patch_size)) * patch_size, - ((i % (resize_width // patch_size)) + 1) * patch_size, - ((i // (resize_width // patch_size)) + 1) * patch_size, - ) - # split the image - split_img = resized_img.crop(box) - processed_images.append(split_img) - assert len(processed_images) == blocks - if len(processed_images) != 1: - processed_images.insert(0, image) - return processed_images - else: - return [image] - - -def get_max_multimodal_tokens(ctx): - return max(ctx.model_config.hf_config.image_size2tokens.values()) - - -def input_mapper_for_aria(ctx, data): - """ - This is almost same with _default_input_mapper from vllm.multimodal.image.py. - Args: - ctx (ModelExecutorContext): The context object containing necessary parameters. - data (Union[Image.Image, torch.Tensor, List[Union[Image.Image, torch.Tensor]]]): The input data to be processed. - The only different is we would like to support runtime max_image_size adjustment. - """ - model_config = ctx.model_config - max_image_size = getattr(model_config.multimodal_config, "max_image_size", 980) - - # PIL image - if isinstance(data, Image.Image) or is_list_of(data, Image.Image): - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code - ) - if image_processor is None: - raise RuntimeError( - "No HuggingFace processor is available " "to process the image object" - ) - try: - batch_data = image_processor.preprocess( - data, max_image_size=max_image_size, return_tensors="pt" - ).data - batch_data.pop("num_crops") - except Exception: - logger.error("Failed to process image (%s)", data) - raise - - return MultiModalInputs(batch_data) - - # Image embedding - elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): - return MultiModalInputs({"image_embeds": data}) - - raise TypeError(f"Invalid image type: {type(data)}") - - -def input_processor(ctx, llm_inputs): - multi_modal_data = llm_inputs.get("multi_modal_data") - # if it is pure text input, use it as is - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs - - model_config = ctx.model_config - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - hf_config = model_config.hf_config - - # prepare image tokens, the max_image_size is used to determine the number of patch_size for every image - max_image_size = multi_modal_data.pop("max_image_size", 980) - _split_image = multi_modal_data.pop("split_image", False) - - assert isinstance(max_image_size, int) or isinstance( - max_image_size, float - ), "max_image_size should be float or int" - images = ( - multi_modal_data["image"] - if isinstance(multi_modal_data["image"], list) - else [multi_modal_data["image"]] - ) - num_crops = [] - splitted_images = [] - for image in images: - splitted_image = split_image(image, _split_image, patch_size=max_image_size) - splitted_images.extend(splitted_image) - num_crops.append(len(splitted_image)) - max_image_size = [max_image_size] * len(images) - # reassign the image because we might split them into mini-patches - multi_modal_data["image"] = splitted_images - - 
-    # Mapping the image patch size to the corresponding number of tokens for each image
-    image_feature_sizes = []
-    for image_size, num_crop in zip(max_image_size, num_crops):
-        assert (
-            image_size in hf_config.image_size2tokens
-        ), f"Invalid image size: {image_size}, available options: {list(hf_config.image_size2tokens.keys())}"
-        image_feature_sizes.append(hf_config.image_size2tokens[image_size] * num_crop)
-
-    # Set up the max_image_size and split_image in the RuntimeContext for the image processor
-    # TODO: Support dynamic image sizes
-    setattr(model_config.multimodal_config, "max_image_size", max(max_image_size))
-
-    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        llm_inputs.get("prompt"),
-        llm_inputs["prompt_token_ids"],
-        placeholder_token_id=hf_config.image_token_index,
-        repeat_count=image_feature_sizes,
-    )
-
-    return token_inputs(
-        prompt_token_ids=new_token_ids,
-        prompt=new_prompt,
-        multi_modal_data=multi_modal_data,
-        # multi_modal_placeholders={"image": ranges},
-    )
-
-
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens)
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria)
-@INPUT_REGISTRY.register_input_processor(input_processor)
-class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
-    """
-    Aria model for conditional generation tasks.
-
-    This model combines a vision tower, a multi-modal projector, and a language model
-    to perform tasks that involve both image and text inputs.
-    """
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-
-        # prepare the image_size to tokens mapping for the image preprocess, see input_processor
-        setattr(
-            config,
-            "image_size2tokens",
-            {
-                int(math.sqrt(k) * config.vision_config.patch_size): v
-                for k, v in config.projector_patch_to_query_dict.items()
-            },
-        )
-        self.config = config
-        self.vision_tower = AriaVisionModel(config.vision_config)
-        self.multi_modal_projector = build_mm_projector(config)
-        self.vocab_size = config.text_config.vocab_size
-        self.language_model = AriaMoELMModel(
-            vllm_config=vllm_config.with_hf_config(config.text_config),
-            prefix=maybe_prefix(prefix, "language_model.model"),
-        )
-        self.pad_token_id = (
-            self.config.pad_token_id if self.config.pad_token_id is not None else -1
-        )
-        self.unpadded_vocab_size = config.text_config.vocab_size
-        self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
-            config.text_config.hidden_size,
-            org_num_embeddings=self.language_model.org_vocab_size,
-            quant_config=quant_config,
-        )
-        logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, self.vocab_size, logit_scale
-        )
-        self.sampler = Sampler()
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        **kwargs: object,
-    ):
-        # 1. Extract the input embeddings
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        pixel_values = kwargs.get("pixel_values", None)
-        pixel_mask = kwargs.get("pixel_mask", None)
-
-        # 2. Merge text and images
-        if pixel_values is not None:
-            pixel_values = pixel_values.view(-1, *pixel_values.shape[-3:]).to(
-                torch.bfloat16
-            )
-            pixel_mask = pixel_mask.view(-1, *pixel_mask.shape[-2:])
-            selected_image_feature, image_attn_mask = self.vision_tower(
-                pixel_values,
-                pixel_mask=pixel_mask,
-            )
-
-            image_features = self.multi_modal_projector(
-                selected_image_feature, attn_mask=image_attn_mask
-            )
-
-            inputs_embeds = inputs_embeds.to(image_features.dtype)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, image_features, self.config.image_token_index
-            )
-
-        hidden_states = self.language_model(
-            input_ids,
-            positions,
-            kv_caches,
-            attn_metadata,
-            None,
-            inputs_embeds=inputs_embeds,
-        )
-
-        return hidden_states
-
-    def compute_logits(
-        self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata
-    ) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "language_model.model": "language_model",
-                "language_model.lm_head": "lm_head",
-            },
-            orig_to_new_suffix={
-                "experts.fc1.weight": "experts.w1",
-                "experts.fc2.weight": "experts.w2",
-                "router.weight": "experts.router_weight",
-            },
-        )
-
-        loader = AutoWeightsLoader(self)
-        loader.load_weights(weights, mapper=hf_to_vllm_mapper)
diff --git a/docs/inference.md b/docs/inference.md
index 600c5be..2954e89 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -78,8 +78,10 @@ python aria/inference.py --help
 This method leverages vLLM for high-performance inference, particularly useful for scenarios requiring parallel processing or handling multiple requests.
 
 ### Install vLLM:
+
+Install the latest version of vLLM:
 ```bash
-pip install -e .[vllm]
+pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
 ### How to Use:
@@ -92,23 +94,14 @@
 ```python
 from PIL import Image
 from transformers import AutoTokenizer
-from vllm import LLM, ModelRegistry, SamplingParams
-
-from aria.vllm.aria import AriaForConditionalGeneration
-
-ModelRegistry.register_model(
-    "AriaForConditionalGeneration", AriaForConditionalGeneration
-)
+from vllm import LLM, SamplingParams
 
 
 def main():
     llm = LLM(
         model="rhymes-ai/Aria",
-        tokenizer="rhymes-ai/Aria",
         tokenizer_mode="slow",
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 256},
-        enforce_eager=True,
         trust_remote_code=True,
     )

From c43a65f294a3070bc69b7a746a20432c5d4e4bd0 Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Tue, 26 Nov 2024 05:55:01 +0000
Subject: [PATCH 2/2] remove vllm as dependency

---
 pyproject.toml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f556c17..9180163 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,9 +31,6 @@ dev = [
     "isort==5.13.2",
     "pytest==8.3.3",
 ]
-vllm = [
-    "vllm==0.6.4.post1"
-]
 grouped_gemm = [
     "grouped_gemm==0.1.6"
 ]
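
With both patches applied, Aria is served entirely by vLLM's in-tree model implementation, so no local `ModelRegistry` registration is required. A minimal end-to-end sketch of the new flow (the `LLM(...)` arguments mirror the updated `docs/inference.md`; the chat-message structure follows the model card, and the image path is a hypothetical placeholder):

```python
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def main():
    # Arguments mirror the updated docs/inference.md; Aria's config and
    # processor are pulled from the Hub via trust_remote_code.
    llm = LLM(
        model="rhymes-ai/Aria",
        tokenizer_mode="slow",
        dtype="bfloat16",
        trust_remote_code=True,
    )

    # Build a prompt containing an image placeholder using Aria's chat
    # template (message structure assumed, following the model card).
    tokenizer = AutoTokenizer.from_pretrained(
        "rhymes-ai/Aria", trust_remote_code=True
    )
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "text": None},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    image = Image.open("example.jpg")  # hypothetical local image path

    # vLLM accepts the image alongside the prompt as multi_modal_data.
    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": image}},
        SamplingParams(max_tokens=256, temperature=0.6, top_p=0.9),
    )
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    main()
```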