commit fd7028ad62e6fe98245f2448a6d18caaf43c5d34 Author: admin Date: Mon Feb 23 16:01:45 2026 +0100 first commit diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3404fd9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,64 @@ +# ---------- Stage 1: build llama-server (CUDA, op jouw CPU) ---------- +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder + +ENV DEBIAN_FRONTEND=noninteractive + +# Basis build tools + curl dev (voor HTTP server) +RUN apt-get update && apt-get install -y --no-install-recommends \ + git build-essential cmake ninja-build ca-certificates \ + libcurl4-openssl-dev && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /src + +# Lama.cpp ophalen (HEAD, incl. nieuwe tool-calling / server) +RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp.git . + +# Belangrijk: zorg dat de linker een libcuda.so.1 ziet (stub) +# De stub zit in /usr/local/cuda/lib64/stubs/libcuda.so +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so \ + /usr/lib/x86_64-linux-gnu/libcuda.so.1 + +# CMake configure: +# - Server aan +# - CUDA backend aan +# - Native CPU-optimalisatie (op jouw T5600, dus geen AVX2) +# - Architectuur 61 (Pascal, P5000) +RUN cmake -S . 
-B build \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DGGML_CUDA=ON \ + -DGGML_NATIVE=ON \ + -DCMAKE_CUDA_ARCHITECTURES=61 + +# Build only the server target +RUN cmake --build build --config Release --target llama-server + +# ---------- Stage 2: runtime image ---------- +FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 libcurl4 \ + ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# Copy binaries + libs from the builder stage +COPY --from=builder /src/build/bin/llama-server /usr/local/bin/llama-server +COPY --from=builder /src/build/bin/libggml* /usr/local/lib/ +COPY --from=builder /src/build/bin/libllama* /usr/local/lib/ +COPY --from=builder /src/build/bin/libmtmd* /usr/local/lib/ +COPY --from=builder /src/build/bin/libg* /usr/local/lib/ + +ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} + +EXPOSE 8080 +VOLUME ["/models"] + +ENTRYPOINT ["llama-server"] +# Override CMD yourself at docker run; this is only a default +CMD ["--host", "0.0.0.0", "--port", "8080"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..528eea7 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +LLM dockers running llama.cpp for Mistral llm's on a Nvidia P5000 diff --git a/mistral-llm0.sh b/mistral-llm0.sh new file mode 100755 index 0000000..f1d24a1 --- /dev/null +++ b/mistral-llm0.sh @@ -0,0 +1,7 @@ +docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 42000 --jinja --chat-template-file /models/chat_templateX.jinja +#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart 
unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 10000 --jinja +#cur docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --split-mode layer --temp 0.05 -c 42000 --frequency_penalty 0.8 --repeat_penalty 1.2 --jinja +#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on --split-mode layer -c 42000 --jinja +#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on --split-mode layer -c 13288 --jinja +#/opt/models/mistral/Magistral-Small-2509-Q4_K_M.gguf +#/opt/models/mistral/devstral-small-2-chat-template-opencode.jinja diff --git a/mistral-llm1.sh b/mistral-llm1.sh new file mode 100755 index 0000000..d65854e --- /dev/null +++ b/mistral-llm1.sh @@ -0,0 +1,7 @@ +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --flash-attn on -c 
42000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --flash-attn on -c 10000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja +docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --temp 0.05 -c 42000 --repeat_penalty 1.5 --frequency_penalty 0.8 --jinja --chat-template-file /models/chat_templateX.jinja +# --chat-template-file /models/chat_templateX.jinja +# --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on -c 42000 --jinja +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on -c 13288 --jinja diff --git a/old/Dockerfile b/old/Dockerfile new file mode 100644 index 0000000..eed144d --- /dev/null +++ b/old/Dockerfile @@ -0,0 +1,59 @@ +# ---------- 
Stage 1: build llama-cpp-python with CUDA (no AVX2) ---------- +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder + +# Toolchain +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-dev build-essential cmake ninja-build ca-certificates uvicorn && \ + rm -rf /var/lib/apt/lists/* +RUN python3 -m pip install --no-cache-dir --upgrade pip wheel + +# Zorg dat de linker de CUDA driver stubs ziet en link expliciet tegen -lcuda +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs +ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH + +# CMAKE-args: +# - CUDA aan +# - Geen AVX2 (oude Xeons), wel AVX/FMA/F16C +# - Architectuur op Pascal (61) om compile-tijd te beperken +# - Linker flags wijzen naar stubs en linken -lcuda expliciet +ENV CMAKE_ARGS="-DLLAMA_CUDA=on \ + -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \ + -DCMAKE_CUDA_ARCHITECTURES=61 \ + -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \ + -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'" + +# Wheel bouwen (from source) +RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python + +# ---------- Stage 2: runtime (met cuBLAS aanwezig) ---------- +FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + +# Python + OpenMP runtime +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip ca-certificates libgomp1 uvicorn && \ + rm -rf /var/lib/apt/lists/* + +# Stabiele defaults op Pascal (P5000): MMQ uit, cuBLAS aan +ENV PYTHONUNBUFFERED=1 \ + PYTHONIOENCODING=UTF-8 \ + LLAMA_LOG_LEVEL=info \ + GGML_CUDA_FORCE_CUBLAS=1 + +# Installeer de zojuist gebouwde wheel +COPY --from=builder /wheels /wheels +RUN python3 -m pip install --no-cache-dir --upgrade pip && \ + python3 -m pip install --no-cache-dir anyio starlette sse_starlette starlette_context pydantic_settings fastapi && \ + python3 -m pip install 
--no-cache-dir /wheels/llama_cpp_python-*.whl + +RUN ulimit -s unlimited + +EXPOSE 8000 +VOLUME ["/models"] + +COPY entrypoint.sh /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +# --n_ctx 12412 daarboven problemen? +CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"] diff --git a/old/entrypoint.sh b/old/entrypoint.sh new file mode 100755 index 0000000..ffe42b7 --- /dev/null +++ b/old/entrypoint.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +unset GGML_CUDA_FORCE_MMQ + +echo ">> GGML_CUDA_FORCE_MMQ: (unset)" +echo ">> GGML_CUDA_FORCE_CUBLAS: ${GGML_CUDA_FORCE_CUBLAS:-}" +echo ">> LLAMA_LOG_LEVEL: ${LLAMA_LOG_LEVEL:-}" + +exec python3 -m llama_cpp.server "$@" diff --git a/old/mistral-llm0.sh b/old/mistral-llm0.sh new file mode 100755 index 0000000..b0d37a5 --- /dev/null +++ b/old/mistral-llm0.sh @@ -0,0 +1,2 @@ +docker run --rm --gpus "device=0" --name "mistral-llm0" -e CUDA_VISIBLE_DEVICES=0 -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx +#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx diff --git a/old/mistral-llm1.sh b/old/mistral-llm1.sh new file mode 100755 index 0000000..f07f22a --- /dev/null +++ b/old/mistral-llm1.sh @@ -0,0 +1,2 @@ +docker run --rm --gpus '"device=1"' --name "mistral-llm1" -e CUDA_VISIBLE_DEVICES=0 -e LLAMA_CUDA_VISIBLE_DEVICES=0 -p 8001:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx +#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx