# syntax=docker/dockerfile:1
# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build toolchain only. uvicorn was removed from this list: it is a runtime
# Python package, not a build dependency, and is installed via pip in stage 2.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        ninja-build \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make the linker see the CUDA driver stubs and link -lcuda explicitly
# (no real NVIDIA driver is available inside the build container).
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMake args:
# - CUDA on. Current llama.cpp renamed the build flags from LLAMA_* to GGML_*;
#   since llama-cpp-python is installed unpinned, BOTH generations are passed so
#   a modern source build does not silently ignore LLAMA_CUDA and fall back to
#   a CPU-only wheel. Unknown -D vars only produce a CMake warning.
# - No AVX2 (old Xeons), but AVX/FMA/F16C enabled.
# - Architecture fixed to Pascal (61) to limit compile time.
# - Linker flags point at the driver stubs and link -lcuda explicitly.
ENV CMAKE_ARGS="-DGGML_CUDA=on -DLLAMA_CUDA=on \
    -DGGML_NATIVE=off -DGGML_AVX2=off -DGGML_AVX=on -DGGML_FMA=on -DGGML_F16C=on \
    -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source.
# NOTE(review): llama-cpp-python is unpinned, so every build may pick up a new
# release — pin a version (e.g. llama-cpp-python==X.Y.Z) for reproducibility.
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python
# ---------- Stage 2: runtime (with cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime. uvicorn was moved from apt to the pip install below
# so its version matches the pip-installed fastapi/starlette stack (the apt
# 'uvicorn' package in Ubuntu 22.04 is old and independent of pip's site-packages).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): MMQ off, cuBLAS forced on
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the wheel built in stage 1 plus the server dependencies
# (these are the extras normally pulled in by llama-cpp-python[server]).
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
        anyio fastapi pydantic_settings sse_starlette starlette starlette_context uvicorn && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# NOTE(review): 'RUN ulimit -s unlimited' was removed — ulimit inside a RUN
# only affects that one build shell, never the running container. Set it at
# run time instead:  docker run --ulimit stack=-1 ...

EXPOSE 8000
VOLUME ["/models"]

COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

# --n_ctx: values above 12412 reportedly caused problems — TODO confirm
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"]