add gaudi2 support

tylertitsworth · tylertitsworth · commit f26ee13f0b2c · 2024-09-26T15:24:44.000-07:00
Signed-off-by: tylertitsworth &lt;tyler.titsworth@intel.com&gt;
diff --git a/workflows/charts/tgi/Chart.yaml b/workflows/charts/tgi/Chart.yaml
@@ -33,7 +33,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
+version: 0.2.0
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/workflows/charts/tgi/README.md b/workflows/charts/tgi/README.md
@@ -7,23 +7,24 @@ For more information about how to use Huggingface text-generation-inference with
 > [!TIP]
 > For Gaudi-related documentation, check out [tgi-gaudi](https://github.com/huggingface/tgi-gaudi).
 
-![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square)
+![Version: 0.2.0](https://img.shields.io/badge/Version-0.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square)
 
 ## Values
 
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| deploy.configMapName | string | `"intel-proxy-config"` | ConfigMap of Environment Variables |
+| deploy.configMap | object | `{"enabled":true,"name":"tgi-config"}` | ConfigMap of Environment Variables |
 | deploy.image | string | `"ghcr.io/huggingface/text-generation-inference:latest-intel"` | Intel TGI Image |
-| deploy.model | string | `"HuggingFaceTB/SmolLM-135M"` | Model to be loaded |
-| deploy.quantize | string | `""` | Enable Quantization (ex: bitsandbytes-nf4) |
 | deploy.replicaCount | int | `1` | Number of pods |
 | deploy.resources | object | `{"limits":{"cpu":"4000m","gpu.intel.com/i915":1},"requests":{"cpu":"1000m","memory":"1Gi"}}` | Resource configuration |
 | deploy.resources.limits."gpu.intel.com/i915" | int | `1` | Intel GPU Device Configuration |
 | fullnameOverride | string | `""` | Full qualified Domain Name |
 | ingress | object | `{"annotations":{},"className":"","enabled":false,"hosts":[{"host":"chart-example.local","paths":[{"path":"/","pathType":"ImplementationSpecific"}]}],"tls":[]}` | Ingress configuration |
 | nameOverride | string | `""` | Name of the serving service |
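+| pvc.size | string | `"15Gi"` | Storage requested for the cache PersistentVolumeClaim mounted at `/data` |
+| pvc.storageClassName | string | `"nil"` | StorageClass of the cache PersistentVolumeClaim |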
 | secret.encodedToken | string | `""` | Base64 Encoded Huggingface Hub API Token |
+| securityContext | object | `{}` | Security Context Configuration |
 | service | object | `{"port":80,"type":"NodePort"}` | Service configuration |
 
 ----------------------------------------------
diff --git a/workflows/charts/tgi/templates/NOTES.txt b/workflows/charts/tgi/templates/NOTES.txt
@@ -17,6 +17,5 @@
 {{- else if contains "ClusterIP" .Values.service.type }}
   export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "tgi.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
   export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
-  echo "Visit http://127.0.0.1:8080 to use your application"
   kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
 {{- end }}
diff --git a/workflows/charts/tgi/templates/deploy.yaml b/workflows/charts/tgi/templates/deploy.yaml
@@ -28,54 +28,62 @@ spec:
       labels:
         {{- include "tgi.selectorLabels" . | nindent 8 }}
     spec:
-      securityContext:
-        fsGroup: 1000
-        runAsUser: 1000
+      hostIPC: true
       containers:
         - name: {{ .Chart.Name }}
           args:
-            - '--model-id'
-            - {{ .Values.deploy.model | quote }}
-            {{- if index .Values.deploy.resources.limits "gpu.intel.com/i915" }}
-            - '--num-shard'
-            - {{ index .Values.deploy.resources.limits "gpu.intel.com/i915" | quote }}
-            {{- end }}
             - '-p'
             - {{ .Values.service.port | quote }}
-            {{- if .Values.quantize }}
-            - '--quantize'
-            - {{ .Values.deploy.quantize | quote }}
-            {{- end }}
             - '--cuda-graphs=0'
           envFrom:
+            {{- if eq .Values.deploy.configMap.enabled true }}
             - configMapRef:
-                name: {{ .Values.deploy.configMapName }}
+                name: {{ .Values.deploy.configMap.name }}
+            {{- end }}
             - secretRef:
                 name: {{ .Release.Name }}-hf-token
-          env:
-            - name: NUMBA_CACHE_DIR # https://github.com/huggingface/text-generation-inference/pull/2443
-              value: /data/numba_cache
+          # env:
+          #   - name: NUMBA_CACHE_DIR # https://github.com/huggingface/text-generation-inference/pull/2443
+          #     value: /data/numba_cache
           image: {{ .Values.deploy.image }}
           livenessProbe:
-            httpGet:
-              path: /health
-              port: {{ .Values.service.port }}
+            failureThreshold: 10
             initialDelaySeconds: 5
             periodSeconds: 5
+            tcpSocket:
+              port: http
+          readinessProbe:
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          startupProbe:
+            failureThreshold: 120
+            initialDelaySeconds: 20
+            periodSeconds: 5
+            tcpSocket:
+              port: http
           ports:
             - name: http
               containerPort: {{ .Values.service.port }}
               protocol: TCP
           resources:
             {{- toYaml .Values.deploy.resources | nindent 12 }}
+          securityContext:  # container-level; rendered from .Values.securityContext
+            {{- toYaml .Values.securityContext | nindent 12 }}
           volumeMounts:
             - mountPath: /dev/shm
               name: dshm
             - mountPath: /data
               name: hf-data
+            - mountPath: /tmp
+              name: tmp
       volumes:
         - name: dshm
           emptyDir:
             medium: Memory
         - name: hf-data
+          persistentVolumeClaim:
+            claimName: {{ include "tgi.fullname" . }}-cache
+        - name: tmp
           emptyDir: {}
diff --git a/workflows/charts/tgi/templates/pvc.yaml b/workflows/charts/tgi/templates/pvc.yaml
@@ -0,0 +1,32 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Cache volume claim referenced by the deployment's `hf-data` volume (mounted at /data).
+# storageClassName is emitted only when .Values.pvc.storageClassName is set; otherwise
+# the field is left out of the claim.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "tgi.fullname" . }}-cache
+  labels:
+    {{- include "tgi.labels" . | nindent 4 }}
+spec:
+  {{- if .Values.pvc.storageClassName }}
+  storageClassName: {{ .Values.pvc.storageClassName | quote }}
+  {{- end }}
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: {{ .Values.pvc.size }}
diff --git a/workflows/charts/tgi/templates/secret.yaml b/workflows/charts/tgi/templates/secret.yaml
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 {{- $name := .Values.secret.encodedToken | required ".Values.secret.encodedToken is required in Base64 Format." -}}
+---
 apiVersion: v1
 kind: Secret
 metadata:
diff --git a/workflows/charts/tgi/values.yaml b/workflows/charts/tgi/values.yaml
@@ -18,13 +18,11 @@ nameOverride: ""
 fullnameOverride: ""
 deploy:
   # -- ConfigMap of Environment Variables
-  configMapName: intel-proxy-config
+  configMap:
+    enabled: true
+    name: tgi-config
   # -- Intel TGI Image
   image: ghcr.io/huggingface/text-generation-inference:latest-intel
-  # -- Model to be loaded
-  model: HuggingFaceTB/SmolLM-135M
-  # -- Enable Quantization (ex: bitsandbytes-nf4)
-  quantize: ""
   # -- Number of pods
   replicaCount: 1
   # -- Resource configuration
@@ -39,6 +37,8 @@ deploy:
     requests:
       cpu: 1000m
       memory: "1Gi"
+# -- Security Context Configuration
+securityContext: {}
 secret:
   # -- Base64 Encoded Huggingface Hub API Token
   encodedToken: ""
@@ -62,3 +62,8 @@ ingress:
   #  - secretName: chart-example-tls
   #    hosts:
   #      - chart-example.local
+pvc:
+  # -- StorageClass of the cache PersistentVolumeClaim; leave empty to omit the field from the claim
+  storageClassName: ""
+  # -- Storage requested for the cache PersistentVolumeClaim mounted at /data
+  size: 15Gi