diff --git a/Dockerfile b/Dockerfile
index 9276377f3dd..2d1c31e3ca3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,7 +41,7 @@ COPY launcher launcher
 RUN cargo build --profile release-opt
 
 # Text Generation Inference base image
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest as base
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest AS base
 
 ENV ATTENTION=default
 ENV PREFIX_CACHING=0
@@ -75,7 +75,7 @@ RUN cd server && \
     make gen-server && \
     pip install --no-deps -r requirements.txt && \
     bash ./dill-0.3.8-patch.sh && \
-    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 && \
+    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 && \
     BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \
     pip install . --no-cache-dir
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 46a513117f1..c61ac030b47 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -22,7 +22,7 @@ opentelemetry-instrumentation-grpc = "^0.36b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
 peft = "^0.10"
-optimum-habana = "1.14.1"
+optimum-habana = "1.15.0"
 transformers = "4.45.2"
 numpy = "1.26.4"
 accelerate = "0.33.0"
diff --git a/server/requirements.txt b/server/requirements.txt
index a940574f989..0741449090d 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -12,7 +12,7 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p
 coloredlogs==15.0.1 ; python_version >= "3.9" and python_version < "3.13"
 datasets==3.0.1 ; python_version >= "3.9" and python_version < "3.13"
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
-diffusers==0.29.2 ; python_version >= "3.9" and python_version < "3.13"
+diffusers==0.31.0 ; python_version >= "3.9" and python_version < "3.13"
 dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13"
 frozenlist==1.4.1 ; python_version >= "3.9" and python_version < "3.13"
@@ -46,7 +46,7 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi
 opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-optimum-habana==1.14.1 ; python_version >= "3.9" and python_version < "3.13"
+optimum-habana==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 optimum==1.23.2 ; python_version >= "3.9" and python_version < "3.13"
 packaging==24.1 ; python_version >= "3.9" and python_version < "3.13"
 pandas==2.2.3 ; python_version >= "3.9" and python_version < "3.13"
@@ -67,7 +67,7 @@ requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13"
 scikit-learn==1.5.2 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
-sentence-transformers[train]==3.0.1 ; python_version >= "3.9" and python_version < "3.13"
+sentence-transformers[train]==3.2.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
 six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
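For reference, a minimal sanity-check script (not part of the diff) that can be run inside the built image to confirm the new stack lines up. The version pins are the ones introduced above; the PyTorch check assumes the `pytorch-installer-2.5.1` tag of the new base image accurately reflects the shipped `torch` version.

```python
# Sanity check for the bumped pins; hypothetical script, not shipped in this PR.
from importlib.metadata import version

import torch

# The new base image tag is pytorch-installer-2.5.1 (assumption: tag == torch version).
assert torch.__version__.startswith("2.5.1"), torch.__version__

# Pins updated in server/pyproject.toml and server/requirements.txt by this diff.
expected = {
    "optimum-habana": "1.15.0",
    "diffusers": "0.31.0",
    "sentence-transformers": "3.2.1",
}
for pkg, pin in expected.items():
    installed = version(pkg)
    assert installed == pin, f"{pkg}: expected {pin}, got {installed}"
print("dependency pins match the 1.19.0 stack")
```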
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 120b140bfed..dd52f2dba3d 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -717,11 +717,12 @@ def __init__(
         }
 
-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
-            if model.config.model_type not in ["falcon"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gpt_bigcode"]:
+            if model.config.model_type not in ["falcon", "gpt_bigcode"]:
                 self.kwargs["attn_softmax_bf16"] = True
-            self.kwargs["trim_logits"] = True
+            if model.config.model_type not in ["gpt_bigcode"]:
+                self.kwargs["trim_logits"] = True
 
         if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
             self.kwargs["use_flash_attention"] = True
diff --git a/server/text_generation_server/pb/.gitignore b/server/text_generation_server/pb/.gitignore
old mode 100644
new mode 100755
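The causal_lm.py hunk above extends the HPU generation-kwargs gating to `gpt_bigcode` while exempting it from both flags. A standalone sketch of the resulting behavior (hypothetical helper written only to illustrate the gating; model types and flag names are taken directly from the diff):

```python
# Mirrors the updated gating in CausalLM.__init__ after this change.
def hpu_generation_kwargs(model_type: str) -> dict:
    kwargs = {}
    if model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gpt_bigcode"]:
        # falcon and gpt_bigcode skip the bf16 attention-softmax flag
        if model_type not in ["falcon", "gpt_bigcode"]:
            kwargs["attn_softmax_bf16"] = True
        # gpt_bigcode additionally skips logit trimming
        if model_type not in ["gpt_bigcode"]:
            kwargs["trim_logits"] = True
    return kwargs

assert hpu_generation_kwargs("llama") == {"attn_softmax_bf16": True, "trim_logits": True}
assert hpu_generation_kwargs("falcon") == {"trim_logits": True}
assert hpu_generation_kwargs("gpt_bigcode") == {}
```

Net effect: `gpt_bigcode` models now enter the Gaudi-specific branch but receive neither `attn_softmax_bf16` nor `trim_logits`, while the behavior for the previously listed model types is unchanged.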