first commit
This commit is contained in:
commit
fd7028ad62
64
Dockerfile
Normal file
64
Dockerfile
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
# syntax=docker/dockerfile:1
# ---------- Stage 1: build llama-server (CUDA, tuned for this host CPU) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build-time only: don't bake DEBIAN_FRONTEND into the image environment.
ARG DEBIAN_FRONTEND=noninteractive

# Base build tools + curl dev headers (needed by the HTTP server)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        libcurl4-openssl-dev \
        ninja-build \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# Fetch llama.cpp (HEAD, incl. the new tool-calling / server support).
# NOTE(review): an unpinned shallow clone of HEAD is not reproducible —
# consider pinning a release tag or commit hash.
RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp.git .

# Important: make sure the linker sees a libcuda.so.1 (driver stub).
# The stub lives at /usr/local/cuda/lib64/stubs/libcuda.so
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so \
        /usr/lib/x86_64-linux-gnu/libcuda.so.1

# CMake configure:
# - server enabled
# - CUDA backend enabled
# - native CPU optimisation (build host is an old Xeon T5600, i.e. no AVX2)
# - CUDA architecture 61 (Pascal, Quadro P5000)
RUN cmake -S . -B build \
        -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_CUDA=ON \
        -DGGML_NATIVE=ON \
        -DCMAKE_CUDA_ARCHITECTURES=61

# Build only the server target
RUN cmake --build build --config Release --target llama-server

# ---------- Stage 2: runtime image ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4-openssl-dev \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy the server binary + shared libraries from the builder.
# (The former extra `libg*` COPY was removed: its glob is a superset of
# `libggml*` and copied the exact same files twice.)
COPY --from=builder /src/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /src/build/bin/libggml* /usr/local/lib/
COPY --from=builder /src/build/bin/libllama* /usr/local/lib/
COPY --from=builder /src/build/bin/libmtmd* /usr/local/lib/

# Define the path outright: the base image does not set LD_LIBRARY_PATH,
# so appending ":${LD_LIBRARY_PATH}" only produced a dangling ":".
ENV LD_LIBRARY_PATH=/usr/local/lib

EXPOSE 8080
VOLUME ["/models"]

ENTRYPOINT ["llama-server"]
# CMD is only a default; override it at `docker run` time
CMD ["--host", "0.0.0.0", "--port", "8080"]
|
||||||
1
README.md
Normal file
1
README.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
LLM Docker images running llama.cpp for Mistral LLMs on an Nvidia P5000
|
||||||
7
mistral-llm0.sh
Executable file
7
mistral-llm0.sh
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm0" on GPU 0, publishing host port 8000 -> container 8080.
# Models are mounted read from /opt/models/mistral; CMD flags below override
# the image defaults. (Fix: this executable script had no shebang line.)
set -euo pipefail

docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 42000 --jinja --chat-template-file /models/chat_templateX.jinja

# Earlier variants, kept for reference:
#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 10000 --jinja
#cur docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --split-mode layer --temp 0.05 -c 42000 --frequency_penalty 0.8 --repeat_penalty 1.2 --jinja
#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on --split-mode layer -c 42000 --jinja
#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on --split-mode layer -c 13288 --jinja
#/opt/models/mistral/Magistral-Small-2509-Q4_K_M.gguf
#/opt/models/mistral/devstral-small-2-chat-template-opencode.jinja
|
||||||
7
mistral-llm1.sh
Executable file
7
mistral-llm1.sh
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm1" on GPU 1, publishing host port 8001 -> container 8080.
# Fixes: added missing shebang; the active command passed a file path to
# `--chat-template` (which expects a built-in template name or an inline
# Jinja string) — changed to `--chat-template-file`, matching every other
# invocation in this repo.
set -euo pipefail

# Earlier variants, kept for reference:
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --flash-attn on -c 42000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --flash-attn on -c 10000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja

docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --temp 0.05 -c 42000 --repeat_penalty 1.5 --frequency_penalty 0.8 --jinja --chat-template-file /models/chat_templateX.jinja

# --chat-template-file /models/chat_templateX.jinja
# --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on -c 42000 --jinja
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on -c 13288 --jinja
|
||||||
59
old/Dockerfile
Normal file
59
old/Dockerfile
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
# syntax=docker/dockerfile:1
# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Toolchain.
# Fix: the original also listed "uvicorn" here, but that is not an apt
# package on Ubuntu 22.04 (apt-get would fail with "unable to locate
# package"); uvicorn is a Python package and is installed with pip in
# stage 2, where it is actually needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        ninja-build \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make sure the linker sees the CUDA driver stubs and links -lcuda explicitly
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMAKE args:
# - CUDA on
# - no AVX2 (old Xeons), but AVX/FMA/F16C on
# - CUDA architecture 61 (Pascal) to limit compile time
# - linker flags point at the stubs and link -lcuda explicitly
ENV CMAKE_ARGS="-DLLAMA_CUDA=on \
    -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python

# ---------- Stage 2: runtime (cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime ("uvicorn" removed from apt here too — see stage 1)
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): MMQ off, cuBLAS on
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the freshly built wheel plus the server dependencies
# (uvicorn added here: llama_cpp.server runs on it).
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir anyio starlette sse_starlette starlette_context pydantic_settings fastapi uvicorn && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# (removed) `RUN ulimit -s unlimited` — a no-op: it only affected that one
# build shell, not the running container. Use `docker run --ulimit stack=-1`
# at runtime if an unlimited stack is really needed.

EXPOSE 8000
VOLUME ["/models"]

COPY entrypoint.sh /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
# --n_ctx 12412 and above caused problems?
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"]
|
||||||
10
old/entrypoint.sh
Executable file
10
old/entrypoint.sh
Executable file
@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/env bash
# Entrypoint for the llama-cpp-python OpenAI-compatible server container.
# Prints the relevant GGML/llama environment settings, then execs the
# server so it becomes PID 1 and receives `docker stop` signals directly.
set -euo pipefail

# Drop any inherited force-MMQ setting so the image's configured default
# (GGML_CUDA_FORCE_CUBLAS=1, set in the Dockerfile) takes effect.
unset GGML_CUDA_FORCE_MMQ

# Log the effective configuration for easier debugging of `docker logs`.
echo ">> GGML_CUDA_FORCE_MMQ: (unset)"
echo ">> GGML_CUDA_FORCE_CUBLAS: ${GGML_CUDA_FORCE_CUBLAS:-<not set>}"
echo ">> LLAMA_LOG_LEVEL: ${LLAMA_LOG_LEVEL:-<not set>}"

# Replace this shell with the server, forwarding all container arguments.
exec python3 -m llama_cpp.server "$@"
|
||||||
2
old/mistral-llm0.sh
Executable file
2
old/mistral-llm0.sh
Executable file
@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm0" (old llama-cpp-python image) on GPU 0,
# host port 8000 -> container 8000. (Fix: added missing shebang.)
set -euo pipefail

docker run --rm --gpus "device=0" --name "mistral-llm0" -e CUDA_VISIBLE_DEVICES=0 -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx

#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx
|
||||||
2
old/mistral-llm1.sh
Executable file
2
old/mistral-llm1.sh
Executable file
@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm1" (old llama-cpp-python image) on GPU 1,
# host port 8001 -> container 8000. CUDA_VISIBLE_DEVICES=0 is correct:
# inside the container the single exposed GPU is index 0.
# (Fix: added missing shebang.)
set -euo pipefail

docker run --rm --gpus '"device=1"' --name "mistral-llm1" -e CUDA_VISIBLE_DEVICES=0 -e LLAMA_CUDA_VISIBLE_DEVICES=0 -p 8001:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx

#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx
|
||||||
Loading…
Reference in New Issue
Block a user