openapi.json changes and native or bridged toolcalling changes

This commit is contained in:
admin 2026-01-08 16:10:04 +01:00
parent ddc1c56cd7
commit 4e2ffac24c
6 changed files with 425 additions and 187 deletions

View File

@ -1,9 +1,15 @@
FROM python:3.11-slim
# ===== Base met CUDA11.8 + cuDNN + conda =====
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
WORKDIR /app
# ===== Model caches op vaste paden (blijven in image) =====
# Hugging Face caches (embeddings) + XDG cache (o.a. whisper)
# Zorg dat conda libs altijd eerst gevonden worden
ENV LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH}
# P5000 = Pascal SM 6.1; handig voor (eventueel) on-the-fly builds
ENV TORCH_CUDA_ARCH_LIST="6.1"
# ===== Model caches op vaste paden =====
ENV HF_HOME=/opt/hf \
HUGGINGFACE_HUB_CACHE=/opt/hf \
TRANSFORMERS_CACHE=/opt/hf \
@ -11,83 +17,92 @@ ENV HF_HOME=/opt/hf \
XDG_CACHE_HOME=/opt/cache \
STT_MODEL=small
# Optioneel build-args om modelkeuzes te pinnen
ARG RAG_EMBEDDINGS=gte-multilingual # of: bge-small / e5-small / gte-base-en
ARG STT_MODEL_ARG=small # tiny | base | small | medium | large-v3, etc.
ARG RAG_EMBEDDINGS=gte-multilingual
ARG STT_MODEL_ARG=small
ENV RAG_EMBEDDINGS=${RAG_EMBEDDINGS}
ENV STT_MODEL=${STT_MODEL_ARG}
# maak directories nu al aan (rechten)
# directories
RUN mkdir -p /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper && \
chmod -R a+rX /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper
# ===== Alleen minimale apt utils (géén multimedia libs!) =====
RUN apt-get update && DEBIAN_FRONTEND=noninteractive \
apt-get install -y --no-install-recommends \
git curl build-essential ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# ===== Multimedia via conda-forge (alles uit één ecosysteem) =====
# - av 10 + ffmpeg<7 (past goed bij pyAV)
# - cairo/pango/gdk-pixbuf/pixman voor cairosvg stack
# VERVANG de vorige conda multimedia regel door deze:
# Tooling voor PyAV build
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
pkg-config git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential && rm -rf /var/lib/apt/lists/*
# FFmpeg via conda-forge (zodat je recente headers/libs hebt)
RUN conda config --system --set channel_priority flexible \
&& conda install -y -c conda-forge "ffmpeg>=6,<8" \
&& conda clean -afy
# Later in je pip stap:
# ... faster-whisper==1.0.0 zal av==11.* trekken en nu WEL kunnen bouwen tegen conda's FFmpeg 6
# ===== Python deps =====
COPY requirements.txt .
RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential
RUN pip install --upgrade pip
# jouw requirements
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25
#RUN pip cache purge
# losse extras (let op: av via conda, niet via pip!)
RUN pip install --no-cache-dir \
PyPDF2 python-multipart gitpython chromadb httpx meilisearch \
pandas openpyxl python-pptx faster-whisper==1.0.0 \
cairosvg sentence-transformers rank-bm25
RUN apt-get update && apt-get install -y --no-install-recommends \
wget ca-certificates libstdc++6 libatomic1 \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /opt/piper \
&& set -eux; \
URL="https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz"; \
wget -O /tmp/piper.tgz "$URL"; \
tar -xzf /tmp/piper.tgz -C /opt/piper --strip-components=1; \
ln -sf /opt/piper/piper /usr/local/bin/piper; \
rm -f /tmp/piper.tgz
# ===== Prefetch models during the build =====
# 1) SentenceTransformers (embeddings) — keep this mapping in sync with
#    _build_embedder() in app.py so runtime and prefetch agree on the model.
RUN python - <<'PY'
import os
from sentence_transformers import SentenceTransformer

mapping = {
    "gte-multilingual": "Alibaba-NLP/gte-multilingual-base",
    "bge-small": "BAAI/bge-small-en-v1.5",
    "e5-small": "intfloat/e5-small-v2",
    "gte-base-en": "thenlper/gte-base",
}
choice = os.environ.get("RAG_EMBEDDINGS", "gte-multilingual").lower()
hf_id = mapping.get(choice, "BAAI/bge-small-en-v1.5")

# cache_folder honours SENTENCE_TRANSFORMERS_HOME/HF_HOME, but we pin it explicitly
cache_root = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers")
local_dir = os.path.join(cache_root, "embedder")
os.makedirs(cache_root, exist_ok=True)

print("Downloading SentenceTransformer:", hf_id)
# Download on CPU only; the final snapshot is saved to local_dir so the
# runtime can load it without touching the network.
model = SentenceTransformer(hf_id, cache_folder=cache_root, device="cpu")  # download only
model.save(local_dir)
print("Prefetched SentenceTransformer:", hf_id)
PY
# 2) faster-whisper (STT) — cache in /opt/cache/whisper
# 2) faster-whisper (prefetch CPU-kant; runtime kan je device kiezen)
RUN python - <<'PY'
import os
from faster_whisper import WhisperModel
# Model name comes from the STT_MODEL env var (set from the STT_MODEL_ARG build-arg above)
name = os.environ.get("STT_MODEL","small")
# Cache under XDG_CACHE_HOME so the runtime finds the model at the same fixed path
cache_root = os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper")
os.makedirs(cache_root, exist_ok=True)
# Always CPU/INT8 at build time (no GPU is needed during the image build);
# the runtime can still choose its own device/compute type.
_ = WhisperModel(name, device="cpu", compute_type="int8", download_root=cache_root)
print("Prefetched faster-whisper:", name, "->", cache_root)
PY
# (optioneel) piper voice kun je hier ook voorcachen; laat ik nu achterwege omdat voice per omgeving wisselt.
# (optioneel) piper skip ik hier; kan later
# ===== App code =====
COPY app.py .
COPY queue_helper.py .
COPY agent_repo.py .
COPY windowing_utils.py .
COPY smart_rag.py .
COPY llm_client.py .
COPY web_search.py .
EXPOSE 8080
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@ -231,7 +231,7 @@ _embed_documents = None
# Non-invasief: behoudt hetzelfde response-shape als _llm_call.
# Harde cap van jouw Mistral-LLM docker (zoals je aangaf)
_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "13027"))
_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "42000"))
# Veiligheidsmarge voor headers/EOS/afwijkingen in token-raming
_BUDGET_SAFETY = int(os.getenv("LLM_BUDGET_SAFETY_TOKENS", "512"))
# Max aantal vervolgstappen als het net afgekapt lijkt
@ -3255,7 +3255,7 @@ def _prepare_contexts_under_budget(
question: str,
stack_summary_text: str,
*,
budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "7000")),
tok_len=approx_token_count
) -> List[dict]:
"""
@ -3392,7 +3392,7 @@ async def _llm_qa_answer(question: str, stack_summary_text: str, contexts: List[
# --- NIEUW: trim contexts onder tokenbudget ---
contexts = _prepare_contexts_under_budget(
contexts, question, stack_summary_text,
budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "7000")),
tok_len=approx_token_count
)
@ -3617,7 +3617,7 @@ async def propose_patches_without_apply(repo_path: str, candidates: List[str], u
# sla deze stap over; ga door naar volgende kandidaat
continue
last_err = None
for mx in [1024]:
for mx in [2048]:
try:
messages = [
{"role":"system","content":"Voer exact de gevraagde wijziging uit. GEEN extra refactors/best practices. Lever de volledige, werkende bestandinformatie als 1 codeblok."},
@ -4215,7 +4215,7 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
)
llm_resp = await _llm_call(
[{"role":"system","content":sys},{"role":"user","content":user}],
stream=False, temperature=0.2, top_p=0.9, max_tokens=1536
stream=False, temperature=0.2, top_p=0.9, max_tokens=2048
)
out = (llm_resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
if out.strip():

257
app.py
View File

@ -14,7 +14,7 @@ import contextlib
from contextlib import contextmanager
import os, re, json, time, uuid, hashlib, logging, asyncio, fnmatch, threading
from dataclasses import dataclass
from typing import List, Dict, Optional, Union, Any
from typing import List, Dict, Optional, Union, Any, Callable
from pathlib import Path
from io import BytesIO
@ -37,6 +37,12 @@ AUTO_CONTINUE_MAX_ROUNDS = int(os.getenv("AUTO_CONTINUE_MAX_ROUNDS", "6"))
AUTO_CONTINUE_TAIL_CHARS = int(os.getenv("AUTO_CONTINUE_TAIL_CHARS", "600"))
from llm_client import init_llm_client, _sync_model_infer
from web_search import Tools, HelpFunctions, EventEmitter
import subprocess
import tempfile
# Optionele libs voor tekst-extractie
try:
@ -299,7 +305,8 @@ def _unique_id(route: APIRoute):
method = list(route.methods)[0].lower() if route.methods else "get"
return f"{route.name}_{route.path.replace('/', '_')}_{method}"
app = FastAPI(title="Mistral Bridge API",generate_unique_id_function=_unique_id)
app = FastAPI(title="Mistral Bridge API",openapi_url=None,generate_unique_id_function=_unique_id)#openapi_url=None,
app.add_middleware(
CORSMiddleware,
allow_credentials=True,
@ -531,9 +538,26 @@ class _Embedder:
return self._encode(docs)
def embed_query(self, q: str) -> list[float]:
    """Embed a single query string, with a small per-process LRU cache.

    The cache avoids re-embedding identical queries when several routed
    buckets are searched with the same question. Capacity comes from the
    RAG_Q_EMBED_CACHE env var (default 2048 entries).

    NOTE: the diff residue that returned before the cache logic has been
    removed; the e5 prefix is applied before the cache key is computed so
    hits and misses agree on the encoded text.
    """
    # e5-family models expect an instruction prefix on the query side
    if self.family == "e5":
        q = f"query: {q}"
    key = (q or "").strip()
    cache = getattr(self, "_q_cache", None)
    if cache is None:
        # Lazily created so instances built elsewhere still work
        from collections import OrderedDict
        self._q_cache = OrderedDict()
        self._q_cache_cap = int(os.getenv("RAG_Q_EMBED_CACHE", "2048"))
        cache = self._q_cache
    if key in cache:
        v = cache.pop(key)
        cache[key] = v  # re-insert: mark as most recently used
        return v
    v = self._encode([q])[0]
    cache[key] = v
    if len(cache) > getattr(self, "_q_cache_cap", 2048):
        cache.popitem(last=False)  # evict least-recently-used entry
    return v
def _build_embedder() -> _Embedder:
import inspect
@ -553,7 +577,16 @@ def _build_embedder() -> _Embedder:
model_name, family, slug = mapping[choice]
cache_dir = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers")
local_dir = os.path.join(cache_dir, "embedder")
st_kwargs = {"device": "cpu"}
# --- device auto-select: cuda als beschikbaar, anders cpu; override met RAG_EMBED_DEVICE ---
dev_req = os.getenv("RAG_EMBED_DEVICE", "auto").strip().lower()
dev = "cpu"
try:
import torch
if dev_req in ("cuda", "gpu", "auto") and torch.cuda.is_available():
dev = "cuda"
except Exception:
dev = "cpu"
st_kwargs = {"device": dev}
if os.path.isdir(local_dir):
# Prefetched model in image → gebruik dat
model_source = local_dir
@ -568,7 +601,9 @@ def _build_embedder() -> _Embedder:
if "trust_remote_code" in inspect.signature(SentenceTransformer).parameters:
st_kwargs["trust_remote_code"] = True
model = SentenceTransformer(model_source, **st_kwargs)
# optioneel: CPU thread-telling forceren
# logging: inzichtelijk maken waar embeds draaien
print(f"[embeddings] model={model_name} device={dev}")
# optioneel: CPU thread-telling forceren (fallback)
try:
thr = int(os.getenv("RAG_TORCH_THREADS", "0"))
if thr > 0:
@ -576,7 +611,7 @@ def _build_embedder() -> _Embedder:
torch.set_num_threads(thr)
except Exception:
pass
return _Embedder(slug=slug, family=family, model=model, device="cpu")
return _Embedder(slug=slug, family=family, model=model, device=dev)
except Exception as e:
print("ERROR building embedder:",str(e))
@ -835,7 +870,7 @@ async def llm_call_openai_compat(
stream: bool = False,
temperature: float = 0.2,
top_p: float = 0.9,
max_tokens: int = 13027,
max_tokens: int = 42000,
extra: Optional[dict] = None,
stop: Optional[Union[str, list[str]]] = None,
**kwargs
@ -846,7 +881,8 @@ async def llm_call_openai_compat(
"temperature": temperature,
"top_p": top_p,
"max_tokens": max_tokens,
"stream": bool(stream)
"stream": bool(stream),
"repeat_penalty": 1.5,
}
# OpenAI-compat: optionele stop-sequenties doorgeven indien aanwezig
if stop is not None:
@ -1447,6 +1483,77 @@ def _stt_transcribe_path(path: str, lang: str | None):
text = "".join(seg.text for seg in segments).strip()
return text, getattr(info, "language", None)
# Initialize the shared Tools instance (holds the default "valves" configuration)
tools_instance = Tools()

@app.post("/web/search/xng")
async def web_search_xng(
    query: str = Form(...),
    SEARXNG_ENGINE_API_BASE_URL: str = tools_instance.valves.SEARXNG_ENGINE_API_BASE_URL,
    IGNORED_WEBSITES: str = tools_instance.valves.IGNORED_WEBSITES,
    RETURNED_SCRAPPED_PAGES_NO: int = tools_instance.valves.RETURNED_SCRAPPED_PAGES_NO,
    SCRAPPED_PAGES_NO: int = tools_instance.valves.SCRAPPED_PAGES_NO,
    PAGE_CONTENT_WORDS_LIMIT: int = tools_instance.valves.PAGE_CONTENT_WORDS_LIMIT,
    CITATION_LINKS: bool = tools_instance.valves.CITATION_LINKS,
) -> JSONResponse:
    """Search the web using SearXNG and return the content of the relevant pages.

    Form fields: query (required) plus optional overrides for the SearXNG
    valves (engine base URL, ignored websites, page counts, word limit,
    citation links). Returns a JSON response with the search results.
    """
    if not query:
        raise HTTPException(status_code=400, detail="Lege query")
    # Apply the per-request overrides to the shared valves.
    # NOTE(review): this mutates module-global state, so concurrent requests
    # can observe each other's settings — confirm this is acceptable.
    tools_instance.valves = tools_instance.Valves(**{
        "SEARXNG_ENGINE_API_BASE_URL": SEARXNG_ENGINE_API_BASE_URL,
        "IGNORED_WEBSITES": IGNORED_WEBSITES,
        "RETURNED_SCRAPPED_PAGES_NO": RETURNED_SCRAPPED_PAGES_NO,
        "SCRAPPED_PAGES_NO": SCRAPPED_PAGES_NO,
        "PAGE_CONTENT_WORDS_LIMIT": PAGE_CONTENT_WORDS_LIMIT,
        "CITATION_LINKS": CITATION_LINKS,
    })
    result = await tools_instance.search_web(query)
    return JSONResponse(result)
@app.post("/web/get_website/xng")
async def get_website_xng(
    url: str = Form(...),
    SEARXNG_ENGINE_API_BASE_URL: str = tools_instance.valves.SEARXNG_ENGINE_API_BASE_URL,
    IGNORED_WEBSITES: str = tools_instance.valves.IGNORED_WEBSITES,
    PAGE_CONTENT_WORDS_LIMIT: int = tools_instance.valves.PAGE_CONTENT_WORDS_LIMIT,
    CITATION_LINKS: bool = tools_instance.valves.CITATION_LINKS,
) -> JSONResponse:
    """Scrape the given website and return its content.

    Form fields: url (required) plus optional overrides for the SearXNG
    valves. Returns a JSON response with the scraped page content.
    """
    if not url:
        raise HTTPException(status_code=400, detail="Lege URL")
    # Apply the per-request overrides to the shared valves.
    # NOTE(review): mutates module-global state — same concurrency caveat as
    # web_search_xng; confirm this is acceptable.
    tools_instance.valves = tools_instance.Valves(**{
        "SEARXNG_ENGINE_API_BASE_URL": SEARXNG_ENGINE_API_BASE_URL,
        "IGNORED_WEBSITES": IGNORED_WEBSITES,
        "PAGE_CONTENT_WORDS_LIMIT": PAGE_CONTENT_WORDS_LIMIT,
        "CITATION_LINKS": CITATION_LINKS,
    })
    result = await tools_instance.get_website(url)
    return JSONResponse(result)
@app.post("/v1/audio/transcriptions")
async def audio_transcriptions(
file: UploadFile = File(...),
@ -1521,6 +1628,22 @@ async def images_generations(payload: dict = Body(...)):
out_items.append({"b64_json": b64})
return {"created": int(time.time()), "data": out_items}
@app.get("/improve_web_query")
async def improve_web_query(request: Request, format: str = "proxy"):
    """Ask the LLM to rewrite a raw search query into an optimized one.

    Query params: text (the query to improve), max_chars (truncation limit,
    default 20000), objective and style (optional prompt overrides).
    The `format` parameter is currently unused; kept for caller compatibility.
    """
    qp = request.query_params
    limit = int(qp.get("max_chars", 20000))
    text = (qp.get("text") or "")[:limit]
    objective = qp.get("objective") or "Verbeter deze search query door de kern van de tekst te identificeren en deze om te zetten in een betere query. Gebruik AND, OR, of andere logische operatoren om de query te optimaliseren."
    style = qp.get("style") or "Hanteer best practices, behoud inhoudelijke betekenis."
    prompt = (
        f"Je taak is om deze search query te verbeteren: {text}\n"
        f"Objectief: {objective}\n"
        f"Stijl: {style}\n"
        f"Verbeterde query:"
    )
    resp = await llm_call_openai_compat(
        [{"role": "user", "content": prompt}],
        stream=False, max_tokens=200,
    )
    return resp
@app.get("/v1/images/health")
def images_health():
    """Report whether SVG→PNG conversion is available (cairosvg imported)."""
    svg_ok = cairosvg is not None
    return {"svg_to_png": svg_ok}
@ -1751,10 +1874,26 @@ def _normalize_files_arg(args: dict):
@app.get("/openapi.json", include_in_schema=False)
async def openapi_endpoint():
    """Serve a reduced OpenAPI schema containing only the /openapi/* tool routes.

    The app's automatic schema is disabled (openapi_url=None at construction),
    so this endpoint publishes just the tool-server routes, keeping the schema
    small for LLM tool discovery.
    """
    # Only concrete APIRoutes under /openapi/ are exposed.
    # (The previous prefix-list loop was dead code: its result was immediately
    # overwritten by this comprehension.)
    tool_routes = [
        r for r in app.routes
        if isinstance(r, APIRoute) and r.path.startswith("/openapi/")
    ]
    logger.info("OpenAPI tool_routes=%d", len(tool_routes))
    for r in tool_routes:
        logger.info(" tool route: %s", r.path)
    return get_openapi(
        title="Tool Server",
        version="0.1.0",
        routes=tool_routes,
    )
def _openai_tools_from_registry(reg: dict):
@ -1794,6 +1933,7 @@ def _parse_validation_results(text: str) -> list[str]:
return issues
async def _execute_tool(name: str, args: dict) -> dict:
logger.info("toolcall: "+str(name)+" ("+str(args)+")")
if name == "repo_grep":
repo_url = args.get("repo_url","")
branch = args.get("branch","main")
@ -1856,6 +1996,18 @@ async def _execute_tool(name: str, args: dict) -> dict:
)
return out
# Web tools
if name == "web_search_xng":
# Search the web using SearXNG and get the content of the relevant pages.
out=await web_search_xng(query=args.get("query",""))
return out
if name == "get_website_xng":
# Web scrape the website provided and get the content of it.
out=await get_website_xng(
url=args.get("url", ""))
return out
# Tekst tools
if name == "summarize_text":
text = (args.get("text") or "")[:int(args.get("max_chars",16000))]
@ -2010,6 +2162,22 @@ TOOLS_REGISTRY = {
"required":["query"]
}
},
"web_search_xng": {
"description": "Search the web using SearXNG and get the content of the relevant pages.",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"},
},"required":["query"]}
},
"get_website_xng": {
"description": "Web scrape the website provided and get the content of it.",
"parameters": {
"type": "object",
"properties": {
"url": {"type": "string"},
},"required":["url"]}
},
"summarize_text": {
"description": "Vat tekst samen in bullets met inleiding en actiepunten.",
"parameters": {"type":"object","properties":{
@ -2252,15 +2420,54 @@ async def llm_call_autocont(
# Bouw een standaard OpenAI-chat response met het samengevoegde antwoord
return _openai_chat_response(mdl, full_text, messages)
def _normalize_tools_and_choice(body: dict) -> None:
tools = body.get("tools") or []
tc = body.get("tool_choice")
# 1) OWUI: tool_choice="required" -> maak het OpenAI-compat
if tc == "required":
names = []
for t in tools:
fn = (t.get("function") or {}) if isinstance(t, dict) else {}
n = fn.get("name")
if n:
names.append(n)
uniq = list(dict.fromkeys(names))
if len(uniq) == 1:
# force die ene tool
body["tool_choice"] = {"type": "function", "function": {"name": uniq[0]}}
else:
# meerdere tools: upstream snapt "required" niet -> kies auto
body["tool_choice"] = "auto"
# 2) Sommige clients sturen tools als [{"name":..., "parameters":...}] i.p.v. OpenAI {"type":"function","function":{...}}
if tools and isinstance(tools, list) and isinstance(tools[0], dict):
if "type" not in tools[0] and "name" in tools[0]:
body["tools"] = [{"type": "function", "function": t} for t in tools]
# 3) (optioneel) backward compat: functions -> tools
if (not body.get("tools")) and body.get("functions"):
body["tools"] = [{"type": "function", "function": f} for f in body["functions"]]
body.pop("functions", None)
if body.get("function_call") and not body.get("tool_choice"):
# grof: map function_call->tool_choice
fc = body["function_call"]
if isinstance(fc, dict) and fc.get("name"):
body["tool_choice"] = {"type": "function", "function": {"name": fc["name"]}}
body.pop("function_call", None)
@app.post("/v1/chat/completions")
async def openai_chat_completions(body: dict = Body(...), request: Request = None):
model = (body.get("model") or os.getenv("LLM_MODEL", "mistral-medium")).strip()
#logging.info(str(body))
#logging.info(str(request))
_normalize_tools_and_choice(body)
logging.info(str(body))
logging.info(str(request))
stream = bool(body.get("stream", False))
raw_messages = body.get("messages") or []
# normaliseer tool-berichten naar plain tekst voor het LLM
if False:
norm_messages = []
for m in raw_messages:
if m.get("role") == "tool":
@ -2271,16 +2478,31 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
})
else:
norm_messages.append(m)
else:
norm_messages=raw_messages
# --- minimal tool-calling glue (laat rest van je functie intact) ---
tools = body.get("tools") or []
RUN_BRIDGE = os.getenv("LLM_TOOL_RUNNER", "bridge").lower() == "bridge"
# (optioneel maar vaak nodig) forceer jouw eigen tools i.p.v. wat OWUI meestuurt
if RUN_BRIDGE and os.getenv("FORCE_ALL_TOOLS", "1").lower() not in ("0","false","no"):
body["tools"] = _openai_tools_from_registry(_visible_registry(TOOLS_REGISTRY))
tools = body["tools"]
# bridge werkt het makkelijkst non-stream
if RUN_BRIDGE and stream and (body.get("tools") or []):
body["stream"] = False
stream = False
# 'tool_choice' hier alleen lezen; later in de native branch wordt opnieuw naar body gekeken
tool_choice_req = body.get("tool_choice") # 'auto' | 'none' | 'required' | {...}
try:
logger.info("🧰 tools_count=%s, tool_choice=%s", len(tools), tool_choice_req)
except Exception:
pass
if not stream:
# OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt".
# Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool".
@ -2388,7 +2610,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
# Snelle escape: bij streaming en geen expliciete 'required' tool -> forceer directe streaming
if stream and tools and tool_choice_req in (None, "auto", "none") and \
os.getenv("STREAM_PREFER_DIRECT", "1").lower() not in ("0","false","no"):
os.getenv("STREAM_PREFER_DIRECT", "0").lower() not in ("0","false","no"):
tools = [] # bypass tool glue zodat we rechtstreeks naar de echte streaming gaan
# (2) Auto: vraag de LLM om 1+ function calls te produceren
@ -2549,7 +2771,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
if LLM_FUNCTION_CALLING_MODE in ("native","auto") and not stream:
# Relay-modus: laat LLM tools kiezen, bridge voert uit, daarna 2e run.
relay = os.getenv("LLM_TOOL_RUNNER", "passthrough").lower() == "bridge"
relay = os.getenv("LLM_TOOL_RUNNER", "bridge").lower() == "bridge" #passthrough
client = app.state.HTTPX
if not relay:
passthrough = dict(body); passthrough["messages"]=messages
@ -2781,6 +3003,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
top_p = float(body.get("top_p", 0.9))
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021"))
max_tokens = int(body.get("max_tokens", _default_max))
logger.info("UP tool_choice=%r tools0=%s", body.get("tool_choice"), (body.get("tools") or [{}])[0])
return await llm_call_openai_compat(
messages,
model=model,
@ -2795,13 +3018,13 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
# --- non-stream: windowing + auto-continue (zoals eerder gepatcht) ---
LLM_WINDOWING_ENABLE = os.getenv("LLM_WINDOWING_ENABLE", "1").lower() not in ("0","false","no")
MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "13021"))
MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "42000"))
RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024"))
MAX_AUTOCONT = int(os.getenv("LLM_AUTO_CONTINUES", "2"))
temperature = float(body.get("temperature", 0.2))
top_p = float(body.get("top_p", 0.9))
# Laat env de default bepalen, zodat OWUI niet hard op 1024 blijft hangen
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13027"))
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "42000"))
max_tokens = int(body.get("max_tokens", _default_max))
trimmed = messages
@ -3689,7 +3912,7 @@ async def rag_query_api(
resp = await llm_call_openai_compat(
[{"role":"system","content":"You are precise and return only valid JSON."},
{"role":"user","content": prompt+"\n\nOnly JSON array."}],
stream=False, temperature=0.0, top_p=1.0, max_tokens=512
stream=False, temperature=0.0, top_p=1.0, max_tokens=1024
)
try:
order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]"))

View File

@ -1 +1 @@
docker run -d --name mistral-api --network host -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=13021 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ mistral-api
docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=16288 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 mistral-api

View File

@ -213,7 +213,7 @@ async def enrich_intent(llm_call_fn, messages: List[Dict]) -> Dict:
try:
resp = await llm_call_fn(
[{"role":"system","content":sys},{"role":"user","content":usr}],
stream=False, temperature=0.1, top_p=1.0, max_tokens=300
stream=False, temperature=0.1, top_p=1.0, max_tokens=512
)
raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}")
spec = _safe_json_loads(raw) or {"task": user_text, "constraints": [], "file_hints": [], "keywords": [], "acceptance": [], "ask": None} #json.loads(raw.strip())
@ -245,7 +245,7 @@ async def expand_queries(llm_call_fn, q: str, k: int = 3) -> List[str]:
try:
resp = await llm_call_fn(
[{"role":"system","content":sys},{"role":"user","content":usr}],
stream=False, temperature=0.2, top_p=0.9, max_tokens=120
stream=False, temperature=0.2, top_p=0.9, max_tokens=240
)
raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]")
arr = _safe_json_loads(raw) or []

View File

@ -156,9 +156,9 @@ def fit_context_under_budget(
def build_repo_context(
files_ranked: List[Tuple[str, str, float]],
per_chunk_tokens: int = 1200,
per_chunk_tokens: int = 2100,
overlap_tokens: int = 60,
ctx_budget_tokens: int = 4000,
ctx_budget_tokens: int = 5000,
tok_len: Callable[[str], int] = approx_token_count
) -> str:
expanded: List[Tuple[str,str]] = []