# KEDA Configuration Example
---
# KEDA ScaledObject that scales the workload named `llama-32-3b-instruct`
# in the `vllm` namespace between 1 and 2 replicas, driven by a Prometheus
# query on the vLLM GPU cache usage metric.
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: prometheus-scaledobject
  namespace: vllm
spec:
  # Upper and lower bounds for the number of replicas.
  maxReplicaCount: 2
  minReplicaCount: 1
  cooldownPeriod: 120  # Cooldown period in seconds before scale-in
  pollingInterval: 30  # Polling interval for metrics (in seconds)
  # Workload to scale; no kind is given here, so the KEDA default applies.
  scaleTargetRef:
    name: llama-32-3b-instruct
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://l2-jatiluhur.metric.cloudeka.ai
        # Kept as a quoted string so it is not retyped as a float.
        threshold: '0.5'  # This is equivalent to 50%
        # Folded scalar: the two lines below are joined with a single space,
        # giving one single-line PromQL expression.
        query: >-
          vllm:gpu_cache_usage_perc{job="goto-vllm-exporter",
          model_name="meta-llama/Llama-3.2-3B-Instruct"}
