# ---------- Stage 1: build llama-server (CUDA, tuned to your CPU) ----------
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS builder

ENV DEBIAN_FRONTEND=noninteractive

# Basic build tools + curl dev headers (for the HTTP server)
RUN apt-get update && apt-get install -y --no-install-recommends \
        git build-essential cmake ninja-build ca-certificates \
        libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /src

# Fetch llama.cpp (HEAD, incl. the new tool-calling / server code)
RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp.git .

# Important: make sure the linker sees a libcuda.so.1 (stub).
# The stub lives in /usr/local/cuda/lib64/stubs/libcuda.so
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so \
          /usr/lib/x86_64-linux-gnu/libcuda.so.1

# CMake configure:
# - server enabled
# - CUDA backend enabled
# - native CPU optimization (built on your T5600, so no AVX2)
# - architecture 61 (Pascal, P5000)
RUN cmake -S . -B build \
        -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=ON \
        -DGGML_CUDA=ON \
        -DGGML_NATIVE=ON \
        -DCMAKE_CUDA_ARCHITECTURES=61

# Build only the server target
RUN cmake --build build --config Release --target llama-server

# ---------- Stage 2: runtime image ----------
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

# Runtime only needs the shared libcurl, not the dev headers
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 libcurl4 ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Copy the binary + shared libs out of the builder
COPY --from=builder /src/build/bin/llama-server /usr/local/bin/llama-server
COPY --from=builder /src/build/bin/libggml* /usr/local/lib/
COPY --from=builder /src/build/bin/libllama* /usr/local/lib/
COPY --from=builder /src/build/bin/libmtmd* /usr/local/lib/
COPY --from=builder /src/build/bin/libg* /usr/local/lib/

ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}

EXPOSE 8080
VOLUME ["/models"]

ENTRYPOINT ["llama-server"]
# You override CMD yourself at `docker run`; this is only a default
CMD ["--host", "0.0.0.0", "--port", "8080"]
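
# ---------------------------------------------------------------------------
# Example usage (a sketch, not part of the build): the image tag, host model
# directory, and model filename below are placeholders, and `--gpus all`
# assumes the NVIDIA Container Toolkit is installed on the host.
#
#   docker build -t llama-server-cuda .
#   docker run --rm --gpus all -p 8080:8080 \
#     -v /path/to/models:/models \
#     llama-server-cuda \
#     -m /models/your-model.gguf -ngl 99 --host 0.0.0.0 --port 8080
#
# Everything after the image name replaces the default CMD and is passed
# straight to llama-server; `-ngl 99` (--n-gpu-layers) offloads all model
# layers to the P5000.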