commit fd7028ad62e6fe98245f2448a6d18caaf43c5d34 Author: admin Date: Mon Feb 23 16:01:45 2026 +0100 first commit diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3404fd9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,64 @@ +# ---------- Stage 1: build llama-server (CUDA, op jouw CPU) ---------- +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder + +ENV DEBIAN_FRONTEND=noninteractive + +# Basis build tools + curl dev (voor HTTP server) +RUN apt-get update && apt-get install -y --no-install-recommends \ + git build-essential cmake ninja-build ca-certificates \ + libcurl4-openssl-dev && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /src + +# Lama.cpp ophalen (HEAD, incl. nieuwe tool-calling / server) +RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp.git . + +# Belangrijk: zorg dat de linker een libcuda.so.1 ziet (stub) +# De stub zit in /usr/local/cuda/lib64/stubs/libcuda.so +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so \ + /usr/lib/x86_64-linux-gnu/libcuda.so.1 + +# CMake configure: +# - Server aan +# - CUDA backend aan +# - Native CPU-optimalisatie (op jouw T5600, dus geen AVX2) +# - Architectuur 61 (Pascal, P5000) +RUN cmake -S . 
-B build \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DGGML_CUDA=ON \ + -DGGML_NATIVE=ON \ + -DCMAKE_CUDA_ARCHITECTURES=61 + +# Build only the server target +RUN cmake --build build --config Release --target llama-server + +# ---------- Stage 2: runtime image ---------- +FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 libcurl4 \ + ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# Copy binaries + libs from the builder stage +COPY --from=builder /src/build/bin/llama-server /usr/local/bin/llama-server +COPY --from=builder /src/build/bin/libggml* /usr/local/lib/ +COPY --from=builder /src/build/bin/libllama* /usr/local/lib/ +COPY --from=builder /src/build/bin/libmtmd* /usr/local/lib/ +COPY --from=builder /src/build/bin/libg* /usr/local/lib/ + +ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} + +EXPOSE 8080 +VOLUME ["/models"] + +ENTRYPOINT ["llama-server"] +# Override CMD yourself at docker run; this is only a default +CMD ["--host", "0.0.0.0", "--port", "8080"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..528eea7 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +LLM dockers running llama.cpp for Mistral llm's on a Nvidia P5000 diff --git a/mistral-llm0.sh b/mistral-llm0.sh new file mode 100755 index 0000000..f1d24a1 --- /dev/null +++ b/mistral-llm0.sh @@ -0,0 +1,7 @@ +docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 42000 --jinja --chat-template-file /models/chat_templateX.jinja +#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart 
unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 10000 --jinja +#cur docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --split-mode layer --temp 0.05 -c 42000 --frequency_penalty 0.8 --repeat_penalty 1.2 --jinja +#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on --split-mode layer -c 42000 --jinja +#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on --split-mode layer -c 13288 --jinja +#/opt/models/mistral/Magistral-Small-2509-Q4_K_M.gguf +#/opt/models/mistral/devstral-small-2-chat-template-opencode.jinja diff --git a/mistral-llm1.sh b/mistral-llm1.sh new file mode 100755 index 0000000..d65854e --- /dev/null +++ b/mistral-llm1.sh @@ -0,0 +1,7 @@ +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --flash-attn on -c 
42000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --flash-attn on -c 10000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja +docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --temp 0.05 -c 42000 --repeat_penalty 1.5 --frequency_penalty 0.8 --jinja --chat-template-file /models/chat_templateX.jinja +# --chat-template-file /models/chat_templateX.jinja +# --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on -c 42000 --jinja +#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on -c 13288 --jinja diff --git a/old/Dockerfile b/old/Dockerfile new file mode 100644 index 0000000..eed144d --- /dev/null +++ b/old/Dockerfile @@ -0,0 +1,59 @@ +# ---------- 
Stage 1: build llama-cpp-python with CUDA (no AVX2) ---------- +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder + +# Toolchain +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-dev build-essential cmake ninja-build ca-certificates uvicorn && \ + rm -rf /var/lib/apt/lists/* +RUN python3 -m pip install --no-cache-dir --upgrade pip wheel + +# Zorg dat de linker de CUDA driver stubs ziet en link expliciet tegen -lcuda +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs +ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH + +# CMAKE-args: +# - CUDA aan +# - Geen AVX2 (oude Xeons), wel AVX/FMA/F16C +# - Architectuur op Pascal (61) om compile-tijd te beperken +# - Linker flags wijzen naar stubs en linken -lcuda expliciet +ENV CMAKE_ARGS="-DLLAMA_CUDA=on \ + -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \ + -DCMAKE_CUDA_ARCHITECTURES=61 \ + -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \ + -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'" + +# Wheel bouwen (from source) +RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python + +# ---------- Stage 2: runtime (met cuBLAS aanwezig) ---------- +FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 + +# Python + OpenMP runtime +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip ca-certificates libgomp1 uvicorn && \ + rm -rf /var/lib/apt/lists/* + +# Stabiele defaults op Pascal (P5000): MMQ uit, cuBLAS aan +ENV PYTHONUNBUFFERED=1 \ + PYTHONIOENCODING=UTF-8 \ + LLAMA_LOG_LEVEL=info \ + GGML_CUDA_FORCE_CUBLAS=1 + +# Installeer de zojuist gebouwde wheel +COPY --from=builder /wheels /wheels +RUN python3 -m pip install --no-cache-dir --upgrade pip && \ + python3 -m pip install --no-cache-dir anyio starlette sse_starlette starlette_context pydantic_settings fastapi && \ + python3 -m pip install 
--no-cache-dir /wheels/llama_cpp_python-*.whl + +RUN ulimit -s unlimited + +EXPOSE 8000 +VOLUME ["/models"] + +COPY entrypoint.sh /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +# --n_ctx 12412 daarboven problemen? +CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"] diff --git a/old/entrypoint.sh b/old/entrypoint.sh new file mode 100755 index 0000000..ffe42b7 --- /dev/null +++ b/old/entrypoint.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +unset GGML_CUDA_FORCE_MMQ + +echo ">> GGML_CUDA_FORCE_MMQ: (unset)" +echo ">> GGML_CUDA_FORCE_CUBLAS: ${GGML_CUDA_FORCE_CUBLAS:-}" +echo ">> LLAMA_LOG_LEVEL: ${LLAMA_LOG_LEVEL:-}" + +exec python3 -m llama_cpp.server "$@" diff --git a/old/mistral-llm0.sh b/old/mistral-llm0.sh new file mode 100755 index 0000000..b0d37a5 --- /dev/null +++ b/old/mistral-llm0.sh @@ -0,0 +1,2 @@ +docker run --rm --gpus "device=0" --name "mistral-llm0" -e CUDA_VISIBLE_DEVICES=0 -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx +#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx diff --git a/old/mistral-llm1.sh b/old/mistral-llm1.sh new file mode 100755 index 0000000..f07f22a --- /dev/null +++ b/old/mistral-llm1.sh @@ -0,0 +1,2 @@ +docker run --rm --gpus '"device=1"' --name "mistral-llm1" -e CUDA_VISIBLE_DEVICES=0 -e LLAMA_CUDA_VISIBLE_DEVICES=0 -p 8001:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx +#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx