mistral-llm/old/Dockerfile
2026-02-23 16:01:45 +01:00

60 lines
2.4 KiB
Docker

# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build toolchain only. Runtime packages (uvicorn etc.) belong in stage 2;
# this whole stage is discarded once the wheel is built.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        ninja-build \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make the linker see the CUDA driver stubs and link explicitly against
# -lcuda (no real driver is available inside the build container).
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMake args:
# - CUDA on. Current llama.cpp renamed the LLAMA_* build options to GGML_*;
#   since the wheel below is built from *unpinned latest* source, pass BOTH
#   spellings so either vintage of the build system honors them (an unknown
#   CMake cache variable only produces a warning). With only the legacy
#   LLAMA_* names, recent sources silently build a CPU-only wheel.
# - No AVX2 (old Xeons), but AVX/FMA/F16C enabled.
# - Architecture pinned to Pascal (61) to limit compile time.
# - Linker flags point at the stubs and link -lcuda explicitly.
ENV CMAKE_ARGS="-DGGML_CUDA=on -DLLAMA_CUDA=on \
    -DGGML_NATIVE=off -DLLAMA_NATIVE=off \
    -DGGML_AVX2=off -DLLAMA_AVX2=off \
    -DGGML_AVX=on -DLLAMA_AVX=on \
    -DGGML_FMA=on -DLLAMA_FMA=on \
    -DGGML_F16C=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source.
# NOTE(review): llama-cpp-python is unpinned — consider pinning a version
# (llama-cpp-python==x.y.z) so rebuilds are reproducible.
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python
# ---------- Stage 2: runtime (with cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime. uvicorn is deliberately NOT installed from apt:
# the Ubuntu package lags far behind PyPI and would mix with the
# pip-installed FastAPI stack below, so it is installed via pip instead.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): MMQ off, cuBLAS forced on.
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the wheel built in stage 1 plus the server-side dependencies
# (the set that `llama-cpp-python[server]` needs), all from pip so the
# versions are mutually consistent.
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
        anyio starlette sse_starlette starlette_context pydantic_settings fastapi uvicorn && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# NOTE(review): the original `RUN ulimit -s unlimited` was removed — ulimit
# inside a RUN only affects that build-time shell and has no effect on the
# running container. Set limits at run time instead, e.g.
# `docker run --ulimit stack=-1 ...`.
# Documentation only — the actual port publishing happens at `docker run -p`.
EXPOSE 8000
# Model weights are mounted from the host; keeps multi-GB GGUF files out of the image.
VOLUME ["/models"]
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
# Original note (translated): n_ctx 12412 — problems above that value?
# Default arguments passed to entrypoint.sh; override them per-run with
# `docker run <image> <args>`.
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"]