diff --git a/.github/workflows/_comps-workflow.yml b/.github/workflows/_comps-workflow.yml
index 3c226d7c3..bbb4ce12d 100644
--- a/.github/workflows/_comps-workflow.yml
+++ b/.github/workflows/_comps-workflow.yml
@@ -65,7 +65,7 @@ jobs:
           fi
           if [[ $(grep -c "vllm-hpu:" ${docker_compose_yml}) != 0 ]]; then
             git clone https://github.com/HabanaAI/vllm-fork.git vllm-fork
-            cd vllm-fork && git rev-parse HEAD && cd ../
+            cd vllm-fork && git checkout 3c39626 && cd ../
           fi
       - name: Get build list
         id: get-build-list
diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md
index d3f48e768..1405273b0 100644
--- a/comps/llms/text-generation/vllm/langchain/README.md
+++ b/comps/llms/text-generation/vllm/langchain/README.md
@@ -56,12 +56,6 @@ bash ./build_docker_vllm.sh hpu

 Set `hw_mode` to `hpu`.

-Note: If you want to enable tensor parallel, please set `setuptools==69.5.1` in Dockerfile.hpu before build docker with following command.
-
-```
-sed -i "s/RUN pip install setuptools/RUN pip install setuptools==69.5.1/g" docker/Dockerfile.hpu
-```
-
 #### Launch vLLM service on single node

 For small model, we can just use single node.
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
index 2e8d4e89d..aa189df0c 100644
--- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
@@ -37,6 +37,7 @@ fi
 if [ "$hw_mode" = "hpu" ]; then
     git clone https://github.com/HabanaAI/vllm-fork.git
     cd ./vllm-fork/
+    git checkout 3c39626
     docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
     cd ..
     rm -rf vllm-fork
diff --git a/comps/llms/text-generation/vllm/llama_index/README.md b/comps/llms/text-generation/vllm/llama_index/README.md
index 960b8ac14..1cd254c88 100644
--- a/comps/llms/text-generation/vllm/llama_index/README.md
+++ b/comps/llms/text-generation/vllm/llama_index/README.md
@@ -56,12 +56,6 @@ bash ./build_docker_vllm.sh hpu

 Set `hw_mode` to `hpu`.

-Note: If you want to enable tensor parallel, please set `setuptools==69.5.1` in Dockerfile.hpu before build docker with following command.
-
-```
-sed -i "s/RUN pip install setuptools/RUN pip install setuptools==69.5.1/g" docker/Dockerfile.hpu
-```
-
 #### Launch vLLM service on single node

 For small model, we can just use single node.
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
index 302cd8ee1..7bd162954 100644
--- a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
@@ -32,6 +32,7 @@ fi
 if [ "$hw_mode" = "hpu" ]; then
     git clone https://github.com/HabanaAI/vllm-fork.git
     cd ./vllm-fork/
+    git checkout 3c39626
     docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
     cd ..
     rm -rf vllm-fork
diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
index 5024b0c93..073ee5736 100644
--- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -12,6 +12,7 @@ function build_docker_images() {
     cd $WORKPATH
     git clone https://github.com/HabanaAI/vllm-fork.git
     cd vllm-fork/
+    git checkout 3c39626
     docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
@@ -34,7 +35,7 @@ function build_docker_images() {
 }

 function start_service() {
-    export LLM_MODEL="facebook/opt-125m"
+    export LLM_MODEL="Intel/neural-chat-7b-v3-3"
     port_number=5025
     docker run -d --rm \
         --runtime=habana \
@@ -76,7 +77,7 @@ function validate_microservice() {
     result=$(http_proxy="" curl http://${ip_address}:5025/v1/completions \
         -H "Content-Type: application/json" \
         -d '{
-        "model": "facebook/opt-125m",
+        "model": "Intel/neural-chat-7b-v3-3",
         "prompt": "What is Deep Learning?",
         "max_tokens": 32,
         "temperature": 0
diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
index 724e523e7..62626508a 100644
--- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
@@ -12,6 +12,7 @@ function build_docker_images() {
     cd $WORKPATH
     git clone https://github.com/HabanaAI/vllm-fork.git
     cd vllm-fork/
+    git checkout 3c39626
     docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
@@ -34,7 +35,7 @@ function build_docker_images() {
 }

 function start_service() {
-    export LLM_MODEL="facebook/opt-125m"
+    export LLM_MODEL="Intel/neural-chat-7b-v3-3"
     port_number=5025
     docker run -d --rm \
         --runtime=habana \
@@ -76,7 +77,7 @@ function validate_microservice() {
     result=$(http_proxy="" curl http://${ip_address}:5025/v1/completions \
        -H "Content-Type: application/json" \
        -d '{
-        "model": "facebook/opt-125m",
+        "model": "Intel/neural-chat-7b-v3-3",
         "prompt": "What is Deep Learning?",
         "max_tokens": 32,
         "temperature": 0
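
For reference, every build script and test touched above converges on the same pattern: clone HabanaAI/vllm-fork, pin it to commit 3c39626, then build Dockerfile.hpu. The sketch below condenses that flow into a standalone script; it mirrors the build scripts in this patch, but the shebang and `set -e` are added assumptions rather than part of any one file.

#!/usr/bin/env bash
# Minimal sketch of the pinned vllm-fork HPU build flow; set -e is an added assumption.
set -e

git clone https://github.com/HabanaAI/vllm-fork.git
cd ./vllm-fork/
git checkout 3c39626   # pin to the commit used throughout this patch
docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . \
    --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
cd ..
rm -rf vllm-fork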