# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build toolchain only. NOTE: 'uvicorn' was removed from this apt list — the
# Ubuntu 22.04 package is named 'python3-uvicorn', so 'apt-get install uvicorn'
# fails the build; uvicorn is a runtime pip dependency and is not needed to
# build the wheel at all.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-dev build-essential cmake ninja-build ca-certificates && \
    rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make the linker see the CUDA driver stubs and link explicitly against -lcuda.
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMake args:
# - CUDA on. llama.cpp renamed its build options from LLAMA_* to GGML_*
#   (used by llama-cpp-python >= 0.2.79); since the package version is
#   unpinned, BOTH spellings are passed so the CUDA backend is enabled no
#   matter which version pip resolves (unknown -D cache vars only warn).
# - No AVX2 (old Xeons), but AVX/FMA/F16C on.
# - CUDA architecture pinned to Pascal (61) to limit compile time.
# - Linker flags point at the driver stubs and link -lcuda explicitly.
ENV CMAKE_ARGS="-DGGML_CUDA=on -DLLAMA_CUDA=on \
    -DGGML_NATIVE=off -DGGML_AVX2=off -DGGML_AVX=on -DGGML_FMA=on -DGGML_F16C=on \
    -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source.
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python

# ---------- Stage 2: runtime (with cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime (libgomp1 is required by the compiled extension).
# 'uvicorn' removed from apt here as well (wrong package name, see stage 1);
# it is installed via pip below together with the other server dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip ca-certificates libgomp1 && \
    rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): force cuBLAS (MMQ off).
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the freshly built wheel plus the server's runtime dependencies
# (uvicorn now correctly comes from pip).
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
        uvicorn anyio starlette sse_starlette starlette_context pydantic_settings fastapi && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# NOTE(review): the original 'RUN ulimit -s unlimited' was removed — every RUN
# executes in its own transient shell, so the limit never applied to the
# running container. Set it at run time instead, e.g.:
#   docker run --ulimit stack=-1 ...
# or raise it inside entrypoint.sh before exec'ing the server.

EXPOSE 8000
VOLUME ["/models"]

COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

# --n_ctx 12412: problems above that value? (original reviewer note; 13027 is
# currently in use below — TODO confirm the stable upper bound on this GPU)
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", \
     "--chat_format","chatml-function-calling", \
     "--host","0.0.0.0","--port","8000", \
     "--n_ctx","13027","--n_batch","16", \
     "--n_gpu_layers","-1", \
     "--use_mmap","false"]