first commit
This commit is contained in:
commit
fd7028ad62
64
Dockerfile
Normal file
64
Dockerfile
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
# syntax=docker/dockerfile:1
# ---------- Stage 1: build llama-server (CUDA, tuned for this host CPU) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Build-time only: don't bake DEBIAN_FRONTEND into the image environment.
ARG DEBIAN_FRONTEND=noninteractive

# Base build tools + curl dev headers (needed by the HTTP server)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        libcurl4-openssl-dev \
        ninja-build \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# Fetch llama.cpp (HEAD, incl. the new tool-calling / server support).
# NOTE(review): an unpinned shallow clone of HEAD is not reproducible —
# consider pinning a release tag or commit hash.
RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp.git .

# Important: make sure the linker sees a libcuda.so.1 (driver stub).
# The stub lives at /usr/local/cuda/lib64/stubs/libcuda.so
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so \
        /usr/lib/x86_64-linux-gnu/libcuda.so.1

# CMake configure:
# - server enabled
# - CUDA backend enabled
# - native CPU optimisation (build host is an old Xeon T5600, i.e. no AVX2)
# - CUDA architecture 61 (Pascal, Quadro P5000)
RUN cmake -S . -B build \
        -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_CUDA=ON \
        -DGGML_NATIVE=ON \
        -DCMAKE_CUDA_ARCHITECTURES=61

# Build only the server target
RUN cmake --build build --config Release --target llama-server

# ---------- Stage 2: runtime image ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libcurl4-openssl-dev \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy the server binary + shared libraries from the builder.
# (The former extra `libg*` COPY was removed: its glob is a superset of
# `libggml*` and copied the exact same files twice.)
COPY --from=builder /src/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /src/build/bin/libggml* /usr/local/lib/
COPY --from=builder /src/build/bin/libllama* /usr/local/lib/
COPY --from=builder /src/build/bin/libmtmd* /usr/local/lib/

# Define the path outright: the base image does not set LD_LIBRARY_PATH,
# so appending ":${LD_LIBRARY_PATH}" only produced a dangling ":".
ENV LD_LIBRARY_PATH=/usr/local/lib

EXPOSE 8080
VOLUME ["/models"]

ENTRYPOINT ["llama-server"]
# CMD is only a default; override it at `docker run` time
CMD ["--host", "0.0.0.0", "--port", "8080"]
|
||||||
1
README.md
Normal file
1
README.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
LLM Docker images running llama.cpp for Mistral LLMs on an Nvidia P5000
|
||||||
7
mistral-llm0.sh
Executable file
7
mistral-llm0.sh
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm0" on GPU 0, publishing host port 8000 -> container 8080.
# Models are mounted read from /opt/models/mistral; CMD flags below override
# the image defaults. (Fix: this executable script had no shebang line.)
set -euo pipefail

docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 42000 --jinja --chat-template-file /models/chat_templateX.jinja

# Earlier variants, kept for reference:
#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --flash-attn on --split-mode layer -c 10000 --jinja
#cur docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --split-mode layer --temp 0.05 -c 42000 --frequency_penalty 0.8 --repeat_penalty 1.2 --jinja
#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on --split-mode layer -c 42000 --jinja
#docker run --name "mistral-llm0" --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8000:8080 llama-server-noavx:latest -m /models/Magistral-Small-2509-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on --split-mode layer -c 13288 --jinja
#/opt/models/mistral/Magistral-Small-2509-Q4_K_M.gguf
#/opt/models/mistral/devstral-small-2-chat-template-opencode.jinja
|
||||||
7
mistral-llm1.sh
Executable file
7
mistral-llm1.sh
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm1" on GPU 1, publishing host port 8001 -> container 8080.
# Fixes: added missing shebang; the active command passed a file path to
# `--chat-template` (which expects a built-in template name or an inline
# Jinja string) — changed to `--chat-template-file`, matching every other
# invocation in this repo.
set -euo pipefail

# Earlier variants, kept for reference:
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --flash-attn on -c 42000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --flash-attn on -c 10000 --jinja --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja

docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 -d --restart unless-stopped -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Reasoning-2512.gguf --flash-attn on --temp 0.05 -c 42000 --repeat_penalty 1.5 --frequency_penalty 0.8 --jinja --chat-template-file /models/chat_templateX.jinja

# --chat-template-file /models/chat_templateX.jinja
# --chat-template-file /models/devstral-small-2-chat-template-opencode.jinja
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf --mmproj /models/Ministral/mmproj-F16-Ministral-Instruct-2512.gguf --flash-attn on -c 42000 --jinja
#docker run --name "mistral-llm1" --gpus device=1 -e CUDA_VISIBLE_DEVICES=0 --rm -v /opt/models/mistral/:/models -p 8001:8080 llama-server-noavx:latest --host 0.0.0.0 --port 8080 --n-gpu-layers -1 --split-mode layer -m /models/Magistral-Small-2509-Q4_K_M.gguf --mmproj /models/mmproj-F16.gguf --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on -c 13288 --jinja
|
||||||
59
old/Dockerfile
Normal file
59
old/Dockerfile
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
# syntax=docker/dockerfile:1
# ---------- Stage 1: build llama-cpp-python with CUDA (no AVX2) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

# Toolchain.
# Fix: the original also listed "uvicorn" here, but that is not an apt
# package on Ubuntu 22.04 (apt-get would fail with "unable to locate
# package"); uvicorn is a Python package and is installed with pip in
# stage 2, where it is actually needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        ninja-build \
        python3 \
        python3-dev \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel

# Make sure the linker sees the CUDA driver stubs and links -lcuda explicitly
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_STUBS=/usr/local/cuda/lib64/stubs
ENV LD_LIBRARY_PATH=${CUDA_STUBS}:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_STUBS}:$LIBRARY_PATH

# CMAKE args:
# - CUDA on
# - no AVX2 (old Xeons), but AVX/FMA/F16C on
# - CUDA architecture 61 (Pascal) to limit compile time
# - linker flags point at the stubs and link -lcuda explicitly
ENV CMAKE_ARGS="-DLLAMA_CUDA=on \
    -DLLAMA_NATIVE=off -DLLAMA_AVX2=off -DLLAMA_AVX=on -DLLAMA_FMA=on -DLLAMA_F16C=on \
    -DCMAKE_CUDA_ARCHITECTURES=61 \
    -DCMAKE_EXE_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda' \
    -DCMAKE_SHARED_LINKER_FLAGS='-L/usr/local/cuda/lib64/stubs -lcuda'"

# Build the wheel from source
RUN pip wheel --no-binary=:all: --no-deps -w /wheels llama-cpp-python

# ---------- Stage 2: runtime (cuBLAS present) ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Python + OpenMP runtime ("uvicorn" removed from apt here too — see stage 1)
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Stable defaults on Pascal (P5000): MMQ off, cuBLAS on
ENV PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LLAMA_LOG_LEVEL=info \
    GGML_CUDA_FORCE_CUBLAS=1

# Install the freshly built wheel plus the server dependencies
# (uvicorn added here: llama_cpp.server runs on it).
COPY --from=builder /wheels /wheels
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir anyio starlette sse_starlette starlette_context pydantic_settings fastapi uvicorn && \
    python3 -m pip install --no-cache-dir /wheels/llama_cpp_python-*.whl

# (removed) `RUN ulimit -s unlimited` — a no-op: it only affected that one
# build shell, not the running container. Use `docker run --ulimit stack=-1`
# at runtime if an unlimited stack is really needed.

EXPOSE 8000
VOLUME ["/models"]

COPY entrypoint.sh /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
# --n_ctx 12412 and above caused problems?
CMD ["--model","/models/mistral-small-3.2.Q4_K_M.gguf", "--chat_format","chatml-function-calling", "--host","0.0.0.0","--port","8000", "--n_ctx","13027","--n_batch","16", "--n_gpu_layers","-1", "--use_mmap","false"]
|
||||||
10
old/entrypoint.sh
Executable file
10
old/entrypoint.sh
Executable file
@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/env bash
# Entrypoint for the llama-cpp-python OpenAI-compatible server container.
# Prints the relevant GGML/llama environment settings, then execs the
# server so it becomes PID 1 and receives `docker stop` signals directly.
set -euo pipefail

# Drop any inherited force-MMQ setting so the image's configured default
# (GGML_CUDA_FORCE_CUBLAS=1, set in the Dockerfile) takes effect.
unset GGML_CUDA_FORCE_MMQ

# Log the effective configuration for easier debugging of `docker logs`.
echo ">> GGML_CUDA_FORCE_MMQ: (unset)"
echo ">> GGML_CUDA_FORCE_CUBLAS: ${GGML_CUDA_FORCE_CUBLAS:-<not set>}"
echo ">> LLAMA_LOG_LEVEL: ${LLAMA_LOG_LEVEL:-<not set>}"

# Replace this shell with the server, forwarding all container arguments.
exec python3 -m llama_cpp.server "$@"
|
||||||
2
old/mistral-llm0.sh
Executable file
2
old/mistral-llm0.sh
Executable file
@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm0" (old llama-cpp-python image) on GPU 0,
# host port 8000 -> container 8000. (Fix: added missing shebang.)
set -euo pipefail

docker run --rm --gpus "device=0" --name "mistral-llm0" -e CUDA_VISIBLE_DEVICES=0 -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx

#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx
|
||||||
2
old/mistral-llm1.sh
Executable file
2
old/mistral-llm1.sh
Executable file
@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env bash
# Launch "mistral-llm1" (old llama-cpp-python image) on GPU 1,
# host port 8001 -> container 8000. CUDA_VISIBLE_DEVICES=0 is correct:
# inside the container the single exposed GPU is index 0.
# (Fix: added missing shebang.)
set -euo pipefail

docker run --rm --gpus '"device=1"' --name "mistral-llm1" -e CUDA_VISIBLE_DEVICES=0 -e LLAMA_CUDA_VISIBLE_DEVICES=0 -p 8001:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx

#docker run --rm --gpus all -p 8000:8000 -v /opt/models/mistral:/models -t mistral-llm:cu122-avx
|
||||||
Loading…
Reference in New Issue
Block a user