# dgx-spark-inference-stack/docker-compose.yml (main branch of jdaln/dgx-spark-inference-stack on GitHub)

# Additional Compose files pulled into this project through the top-level
# `include` element. Paths are relative to this file.
# NOTE(review): going by the file names, each one presumably declares the
# services for one model family — confirm in compose/.
include:
  - compose/models-gpt.yml
  - compose/models-qwen.yml
  - compose/models-mistral.yml
  - compose/models-glm.yml
  - compose/models-gemma.yml
  - compose/models-nemotron.yml
  - compose/models-llama.yml
  # The entries below are currently disabled; uncomment a line to include it.
  # - compose/models-phi.yml
  # - compose/models-deepseek.yml
  # - compose/models-experimental.yml

services:
  # Service built from ./waker/Dockerfile and run as container "vllm-waker".
  # NOTE(review): the environment variables and the Docker socket mount suggest
  # it starts and idles-out containers whose names begin with "vllm-"; the
  # logic itself lives in ./waker and is not visible in this file — confirm.
  waker:
    container_name: vllm-waker
    build:
      context: .
      dockerfile: ./waker/Dockerfile
    restart: unless-stopped

    # Hardening: drop every capability, forbid privilege escalation, and keep
    # the root filesystem read-only with a small non-executable /tmp.
    cap_drop:
      - ALL
    security_opt:
      - "no-new-privileges:true"
    read_only: true
    tmpfs:
      - /tmp:noexec,nosuid,size=10m

    # Debug only: uncomment to publish the waker port on the host.
    # ports:
    #   - "18080:18080"

    networks:
      - vllm_internal
      # - default  # debug

    volumes:
      # Docker socket: gives this container access to the host Docker daemon.
      - /var/run/docker.sock:/var/run/docker.sock
      # Writable bind mount (the only writable path besides /tmp).
      - ./stats:/stats
      # Read-only model configuration.
      - ./models.json:/config/models.json:ro

    environment:
      PORT: "18080"
      MANAGE_PREFIX: "vllm-"
      # These are the container_name values of the three services in this file.
      IGNORE_NAMES: "vllm-gateway,vllm-waker,vllm-request-validator"
      # 1200 s = 20 min, chosen for slow models.
      IDLE_STOP_SECONDS: "1200"
      NO_STOP_BEFORE_SECONDS: "30"
      # 900000 ms = 15 min.
      HEALTH_TIMEOUT_MS: "900000"
      DOCKER_STOP_TIMEOUT_SECONDS: "5"
      # URL template with a {name} placeholder; presumably filled with the
      # target container name — confirm in ./waker.
      MODEL_HEALTH_URL_TEMPLATE: "http://{name}:8000/health"
      TICK_MS: "1000"
      BUSY_STATUS_CODE: "429"
      # Taken from WAKER_VERBOSE when set and non-empty, otherwise 0.
      VERBOSE: "${WAKER_VERBOSE:-0}"

    logging:
      # Driver comes from DOCKER_LOG_DRIVER, falling back to json-file.
      driver: "${DOCKER_LOG_DRIVER:-json-file}"
      options:
        max-size: "10m"
        max-file: "3"

    # Healthy when GET http://localhost:18080/healthz returns an OK response;
    # any non-OK response or fetch error exits 1.
    healthcheck:
      interval: 30s
      timeout: 5s
      retries: 5
      test:
        - CMD
        - node
        - "-e"
        - "fetch('http://localhost:18080/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"

  # Service built from ./request-validator/Dockerfile and run as container
  # "vllm-request-validator". It is attached only to the vllm_internal network
  # and publishes no host ports.
  request-validator:
    container_name: vllm-request-validator
    build:
      context: .
      dockerfile: ./request-validator/Dockerfile
    restart: unless-stopped

    # Hardening: drop every capability, forbid privilege escalation, and keep
    # the root filesystem read-only with a small non-executable /tmp.
    cap_drop:
      - ALL
    security_opt:
      - "no-new-privileges:true"
    read_only: true
    tmpfs:
      - /tmp:noexec,nosuid,size=10m

    networks:
      - vllm_internal

    volumes:
      # Read-only model configuration (same host file the waker mounts).
      - ./models.json:/config/models.json:ro

    environment:
      PORT: "18081"
      # NOTE(review): reuses the WAKER_VERBOSE variable, so one setting drives
      # verbosity for both services — confirm this sharing is intentional.
      VERBOSE: "${WAKER_VERBOSE:-0}"

    logging:
      # Driver comes from DOCKER_LOG_DRIVER, falling back to json-file.
      driver: "${DOCKER_LOG_DRIVER:-json-file}"
      options:
        max-size: "10m"
        max-file: "3"

    # Runs the healthcheck.js script with node; api-gateway waits for this
    # check to pass (depends_on ... condition: service_healthy).
    healthcheck:
      interval: 30s
      timeout: 5s
      retries: 3
      test:
        - CMD
        - node
        - healthcheck.js

  # nginx front door, run as container "vllm-gateway". It is the only service
  # in this file that publishes a port, and that port is bound to host
  # loopback only.
  api-gateway:
    container_name: vllm-gateway
    # Image is pinned by digest as well as by tag.
    image: nginx:1.27-alpine@sha256:63ffc0d1f14e4082b832c6a42e606e9a0384a526f16ddd720af7c1f018f2f7c4
    # Replaces the image's entrypoint and starts nginx directly in the
    # foreground.
    entrypoint:
      - nginx
      - "-g"
      - "daemon off;"
    restart: unless-stopped

    # Starts only after request-validator reports healthy.
    # NOTE(review): waker also defines a healthcheck but is not awaited here —
    # confirm that is intentional.
    depends_on:
      request-validator:
        condition: service_healthy

    # Hardening: drop every capability, then add back only CHOWN, SETGID and
    # SETUID; forbid privilege escalation; read-only root filesystem.
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - SETGID
      - SETUID
    security_opt:
      - "no-new-privileges:true"
    read_only: true
    # Writable scratch locations, all non-executable and size-capped.
    tmpfs:
      - /tmp:noexec,nosuid,size=10m
      - /var/cache/nginx:noexec,nosuid,size=10m
      - /run:noexec,nosuid,size=1m

    # Host 127.0.0.1:8009 -> container port 8080.
    ports:
      - "127.0.0.1:8009:8080"

    # Attached to both the project default network and vllm_internal.
    networks:
      - default
      - vllm_internal

    volumes:
      # Read-only nginx configuration.
      - ./gateway.conf:/etc/nginx/conf.d/gateway.conf:ro

    logging:
      # Driver comes from DOCKER_LOG_DRIVER, falling back to json-file.
      driver: "${DOCKER_LOG_DRIVER:-json-file}"
      options:
        max-size: "10m"
        max-file: "3"

# Project networks. The `default` network used by api-gateway is not declared
# here, so Compose provides it implicitly.
networks:
  # `internal: true` creates an externally isolated network. waker,
  # request-validator and api-gateway all attach to it.
  vllm_internal:
    internal: true