# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build toolchain only. NOTE: 'uvicorn' was removed from this apt list — the
# Ubuntu 22.04 package is named 'python3-uvicorn', so 'apt-get install uvicorn'
# fails the build; uvicorn is a runtime pip dependency and is not needed to
# build the wheel at all.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-dev build-essential cmake ninja-build ca-certificates && \
    rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make the linker see the CUDA driver stubs and link explicitly against -lcuda.
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMake args:
# - CUDA on. llama.cpp renamed its build options from LLAMA_* to GGML_*
#   (used by llama-cpp-python >= 0.2.79); since the package version is
#   unpinned, BOTH spellings are passed so the CUDA backend is enabled no
#   matter which version pip resolves (unknown -D cache vars only warn).
# - No AVX2 (old Xeons), but AVX/FMA/F16C on.
# - CUDA architecture pinned to Pascal (61) to limit compile time.
# - Linker flags point at the driver stubs and link -lcuda explicitly.
ENV CMAKE_ARGS="-DGGML_CUDA=on -DLLAMA_CUDA=on \
    -DGGML_NATIVE=off -DGGML_AVX2=off -DGGML_AVX=on -DGGML_FMA=on -DGGML_F16C=on \
    -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source.
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python

# ---------- Stage 2: runtime (with cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime (libgomp1 is required by the compiled extension).
# 'uvicorn' removed from apt here as well (wrong package name, see stage 1);
# it is installed via pip below together with the other server dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip ca-certificates libgomp1 && \
    rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): force cuBLAS (MMQ off).
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the freshly built wheel plus the server's runtime dependencies
# (uvicorn now correctly comes from pip).
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
        uvicorn anyio starlette sse_starlette starlette_context pydantic_settings fastapi && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# NOTE(review): the original 'RUN ulimit -s unlimited' was removed — every RUN
# executes in its own transient shell, so the limit never applied to the
# running container. Set it at run time instead, e.g.:
#   docker run --ulimit stack=-1 ...
# or raise it inside entrypoint.sh before exec'ing the server.

EXPOSE 8000
VOLUME ["/models"]

COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

# --n_ctx 12412: problems above that value? (original reviewer note; 13027 is
# currently in use below — TODO confirm the stable upper bound on this GPU)
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", \
     "--chat_format","chatml-function-calling", \
     "--host","0.0.0.0","--port","8000", \
     "--n_ctx","13027","--n_batch","16", \
     "--n_gpu_layers","-1", \
     "--use_mmap","false"]