# syntax=docker/dockerfile:1
# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build toolchain only. uvicorn was removed from this list: it is a runtime
# Python package, not a build dependency, and is installed via pip in stage 2.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        ninja-build \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make the linker see the CUDA driver stubs and link -lcuda explicitly
# (no real NVIDIA driver is available inside the build container).
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMake args:
# - CUDA on. Current llama.cpp renamed the build flags from LLAMA_* to GGML_*;
#   since llama-cpp-python is installed unpinned, BOTH generations are passed so
#   a modern source build does not silently ignore LLAMA_CUDA and fall back to
#   a CPU-only wheel. Unknown -D vars only produce a CMake warning.
# - No AVX2 (old Xeons), but AVX/FMA/F16C enabled.
# - Architecture fixed to Pascal (61) to limit compile time.
# - Linker flags point at the driver stubs and link -lcuda explicitly.
ENV CMAKE_ARGS="-DGGML_CUDA=on -DLLAMA_CUDA=on \
    -DGGML_NATIVE=off -DGGML_AVX2=off -DGGML_AVX=on -DGGML_FMA=on -DGGML_F16C=on \
    -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source.
# NOTE(review): llama-cpp-python is unpinned, so every build may pick up a new
# release — pin a version (e.g. llama-cpp-python==X.Y.Z) for reproducibility.
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python
# ---------- Stage 2: runtime (with cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime. uvicorn was moved from apt to the pip install below
# so its version matches the pip-installed fastapi/starlette stack (the apt
# 'uvicorn' package in Ubuntu 22.04 is old and independent of pip's site-packages).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): MMQ off, cuBLAS forced on
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the wheel built in stage 1 plus the server dependencies
# (these are the extras normally pulled in by llama-cpp-python[server]).
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
        anyio fastapi pydantic_settings sse_starlette starlette starlette_context uvicorn && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# NOTE(review): 'RUN ulimit -s unlimited' was removed — ulimit inside a RUN
# only affects that one build shell, never the running container. Set it at
# run time instead:  docker run --ulimit stack=-1 ...

EXPOSE 8000
VOLUME ["/models"]

COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

# --n_ctx: values above 12412 reportedly caused problems — TODO confirm
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"]