update

2026-02-02 10:28:41 +01:00 · 2026-02-02 10:28:41 +01:00 · 1aaf0d013a
commit 1aaf0d013a
parent 932144e798
3 changed files with 321 additions and 113 deletions
--- a/agent_repo.py
+++ b/agent_repo.py
@ -173,7 +173,7 @@ except Exception:
 logger = logging.getLogger("agent_repo")
 # ---------- Omgeving / Config ----------
-GITEA_URL = os.environ.get("GITEA_URL", "http://localhost:3080").rstrip("/")
+GITEA_URL = os.environ.get("GITEA_URL", "http://10.25.138.40:30085").rstrip("/")
 GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d")
 GITEA_API = os.environ.get("GITEA_API", f"{GITEA_URL}/api/v1").rstrip("/")
 AGENT_DEFAULT_BRANCH = os.environ.get("AGENT_DEFAULT_BRANCH", "main")
@ -188,7 +188,7 @@ AGENT_CLARIFY_THRESHOLD = float(os.environ.get("AGENT_CLARIFY_THRESHOLD", "0.6")
 # Meilisearch (optioneel)
-MEILI_URL = os.environ.get("MEILI_URL", "http://localhost:7700").strip()
+MEILI_URL = os.environ.get("MEILI_URL", "http://192.168.100.1:7700").strip()
 MEILI_KEY = os.environ.get("MEILI_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ").strip()
 MEILI_INDEX_PREFIX = os.environ.get("MEILI_INDEX_PREFIX", "code").strip()
@ -377,7 +377,7 @@ def _qdrant_query(collection_name: str, query: str, n_results: int, where: Dict[
    Filter, FieldCondition, MatchValue = _qdrant_models
    # Let op: je hebt hier *ook* een embedder nodig (client-side). In dit skeleton verwachten we dat
    # je server-side search by text hebt geconfigureerd. Anders: voeg hier je embedder toe.
-    client = _qdrant(host=os.getenv("QDRANT_HOST","localhost"), port=int(os.getenv("QDRANT_PORT","6333")))
+    client = _qdrant(host=os.getenv("QDRANT_HOST","192.168.100.1"), port=int(os.getenv("QDRANT_PORT","6333")))
    # Eenvoudig: text search (als ingeschakeld). Anders: raise en laat de mock fallback pakken.
    try:
        must: List[Any] = []
@ -4245,7 +4245,7 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
                pass
        st.stage = "ASK"
        base = ("Ik verken de code en doe een voorstel. Geef de repo (bv. `admin/image-viewing-website` of "
-                "`http://localhost:3080/admin/image-viewing-website.git`). "
+                "`http://10.25.138.40:30085/admin/image-viewing-website.git`). "
                "Of zeg: **'zoek repo'** als ik zelf moet zoeken.")
        return _with_preview(base, st)
--- a/app.py
+++ b/app.py
@ -294,6 +294,87 @@ def detect_toolcalls_any(text: str) -> list[dict]:
            }]
    return []
 def _coerce_text_toolcalls_to_openai(data: dict) -> dict:
    """Als een upstream LLM tool-calls als tekst (bv. '[TOOL_CALLS] ...') teruggeeft,
    zet dit om naar OpenAI-native choices[0].message.tool_calls zodat OpenWebUI tools kan runnen.
    Laat bestaande tool_calls ongemoeid.
    """
    try:
        if not isinstance(data, dict):
            return data
        choices = data.get("choices") or []
        if not choices or not isinstance(choices, list):
            return data
        ch0 = choices[0] or {}
        if not isinstance(ch0, dict):
            return data
        msg = ch0.get("message") or {}
        if not isinstance(msg, dict):
            return data
        # native tool_calls bestaan al → niets doen
        if msg.get("tool_calls"):
            return data
        content = msg.get("content")
        if not isinstance(content, str):
            return data
        s = content.strip()
        if not s:
            return data
        # Alleen proberen als er duidelijke signalen zijn
        if ("[TOOL_CALLS]" not in s) and (not s.lstrip().startswith("[")) and ("call_tool" not in s) and ("tool_calls" not in s):
            return data
        calls = detect_toolcalls_any(s) or []
        if not calls:
            # vLLM/[TOOL_CALLS] stijl: vaak een JSON array na de tag
            s2 = re.sub(r"^\s*\[TOOL_CALLS\]\s*", "", s, flags=re.I)
            try:
                s2 = html.unescape(s2)
            except Exception:
                pass
            m = re.search(r"\[[\s\S]*\]", s2)
            arr = None
            if m:
                try:
                    arr = json.loads(m.group(0))
                except Exception:
                    arr = None
            if isinstance(arr, list):
                calls = []
                for it in arr:
                    if not isinstance(it, dict):
                        continue
                    name = it.get("name")
                    args = it.get("arguments", {})
                    if not name and isinstance(it.get("function"), dict):
                        name = it["function"].get("name")
                        args = it["function"].get("arguments", args)
                    if isinstance(args, str):
                        try:
                            args = json.loads(args)
                        except Exception:
                            args = {"input": args}
                    if name:
                        calls.append({
                            "id": f"call_{uuid.uuid4().hex[:8]}",
                            "type": "function",
                            "function": {"name": name, "arguments": json.dumps(args, ensure_ascii=False)}
                        })
        if calls:
            msg["role"] = msg.get("role") or "assistant"
            msg["content"] = None
            msg["tool_calls"] = calls
            ch0["message"] = msg
            ch0["finish_reason"] = "tool_calls"
            data["choices"][0] = ch0
        return data
    except Exception:
        return data
 # -----------------------------------------------------------------------------
 # App & logging
 # -----------------------------------------------------------------------------
@ -384,7 +465,7 @@ async def log_requests(request: Request, call_next):
 # Config
 # -----------------------------------------------------------------------------
 MISTRAL_MODE = os.getenv("MISTRAL_MODE", "v1").lower()
-LLM_URL = os.getenv("LLM_URL", "http://localhost:8000/v1/chat/completions").strip()
+LLM_URL = os.getenv("LLM_URL", "http://192.168.100.1:8000/v1/chat/completions").strip()
 RAW_URL = os.getenv("MISTRAL_URL_RAW", "http://host.docker.internal:8000/completion").strip()
 LLM_CONNECT_TIMEOUT = float(os.getenv("LLM_CONNECT_TIMEOUT", "10"))
 LLM_READ_TIMEOUT = float(os.getenv("LLM_READ_TIMEOUT", "1200"))
@ -392,7 +473,7 @@ LLM_READ_TIMEOUT = float(os.getenv("LLM_READ_TIMEOUT", "1200"))
 _UPSTREAM_URLS = [u.strip() for u in os.getenv("LLM_UPSTREAMS","").split(",") if u.strip()]
 # ==== Meilisearch (optioneel) ====
-MEILI_URL      = os.getenv("MEILI_URL", "http://localhost:7700").rstrip("/")
+MEILI_URL      = os.getenv("MEILI_URL", "http://192.168.100.1:7700").rstrip("/")
 MEILI_API_KEY  = os.getenv("MEILI_API_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ")
 MEILI_INDEX    = os.getenv("MEILI_INDEX", "code_chunks")
 MEILI_ENABLED  = bool(MEILI_URL)
@ -485,7 +566,7 @@ if CELERY_ENABLED:
        celery_app = None
 # Git / repos
-GITEA_URL = os.environ.get("GITEA_URL", "http://localhost:3080").rstrip("/")
+GITEA_URL = os.environ.get("GITEA_URL", "http://10.25.138.40:30085").rstrip("/")
 REPO_PATH = os.environ.get("REPO_PATH", "/tmp/repos")
 # -----------------------------------------------------------------------------
@ -868,7 +949,7 @@ async def llm_call_openai_compat(
    *,
    model: Optional[str] = None,
    stream: bool = False,
-    temperature: float = 0.2,
+    temperature: float = 0.02,
    top_p: float = 0.9,
    max_tokens: int = 42000,
    extra: Optional[dict] = None,
@ -1100,7 +1181,7 @@ async def _svg_from_prompt(prompt: str, w: int, h: int, background: str="white")
            f"- Thema: {prompt}\n- Gebruik eenvoudige vormen/paths/tekst.")
    resp = await llm_call_openai_compat(
        [{"role":"system","content":sys},{"role":"user","content":user}],
-        stream=False, temperature=0.35, top_p=0.9, max_tokens=2048
+        stream=False, temperature=0.035, top_p=0.9, max_tokens=2048
    )
    svg = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
    return _svg_wrap_if_needed(_sanitize_svg(svg), w, h, background)
@ -1668,7 +1749,7 @@ async def present_make(
            f"Max. {max_slides} dia's, 3–6 bullets per dia.")
    plan = await llm_call_openai_compat(
        [{"role":"system","content":sys},{"role":"user","content":user}],
-        stream=False, temperature=0.3, top_p=0.9, max_tokens=13021
+        stream=False, temperature=0.03, top_p=0.9, max_tokens=13021
    )
    raw = (plan.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}")
    try:
@ -1712,7 +1793,7 @@ async def vision_ask(
    file: UploadFile = File(...),
    prompt: str = Form("Beschrijf kort wat je ziet."),
    stream: bool = Form(False),
-    temperature: float = Form(0.2),
+    temperature: float = Form(0.02),
    top_p: float = Form(0.9),
    max_tokens: int = Form(1024),
 ):
@ -1743,7 +1824,7 @@ async def vision_and_text(
    stream: bool = Form(False),
    max_images: int = Form(6),
    max_chars: int = Form(25000),
-    temperature: float = Form(0.2),
+    temperature: float = Form(0.02),
    top_p: float = Form(0.9),
    max_tokens: int = Form(2048),
 ):
@ -1801,7 +1882,7 @@ async def vision_health():
 # -------- Tool registry (OpenAI-style) --------
 LLM_FUNCTION_CALLING_MODE = os.getenv("LLM_FUNCTION_CALLING_MODE", "auto").lower()  # "native" | "shim" | "auto"
-OWUI_BASE_URL='http://localhost:3000'
+OWUI_BASE_URL='http://192.168.100.1:8089'
 OWUI_API_TOKEN='sk-f1b7991b054442b5ae388de905019726'
 # Aliassen zodat oudere codepaths blijven werken
 OWUI_BASE = OWUI_BASE_URL
@ -1976,6 +2057,17 @@ async def t_run_shell(args: dict) -> dict:
 async def _execute_tool(name: str, args: dict) -> dict:
    logger.info("toolcall: "+str(name)+" ("+str(args)+")")
    required=[]
    if name in TOOLS_REGISTRY:
        required=TOOLS_REGISTRY[name]["parameters"]["required"]
    else:
        return {"error": f"Unknown tool '{name}'."}
    if not all(k in args and args[k] for k in required):
        return {"error": f"Missing required arguments for tool '{name}'. Required: {required}"}
    for k in args:
        if k in required:
            if args[k] in ['',None]:
                return {"error": f"Missing required arguments for tool '{name}'. Required: {required}"}
    if name == "repo_grep":
        repo_url = args.get("repo_url","")
        branch   = args.get("branch","main")
@ -2050,6 +2142,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
        })
        return out
    if name == "rag_query":
        try:
            out= await run_in_threadpool(_rag_index_repo_sync, **{
                "repo_url": args.get("repo",""),
                "branch": "main",
@ -2061,6 +2154,9 @@ async def _execute_tool(name: str, args: dict) -> dict:
                "collection_name": "code_docs",
                "force": False,
            })
        except Exception as e:
            return {"error": f"Error for functioncall '{name}', while doing repo_index. errortext: {str(e)}"}
        try:
            out = await rag_query_api(
                query=args.get("query",""),
                n_results=int(args.get("n_results",5)),
@ -2070,6 +2166,9 @@ async def _execute_tool(name: str, args: dict) -> dict:
                profile=args.get("profile")
            )
            return out
        except Exception as e:
            return {"error": f"Error for functioncall '{name}', while doing repo_query. errortext: {str(e)}"}
    # Console tools
    if name == "run_shell":
@ -2080,7 +2179,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    # Repo
    if name == "repo_qa":
        # High-level QA over een specifieke repo.
-        out=json.dumps(await repo_qa_answer(repo_hint=args.get("repo"),question=args.get("question"),branch=args.get("branch","main"),n_ctx=10), ensure_ascii=False)
+        out=json.dumps(await repo_qa_answer(repo_hint=args.get("repo").replace('"','').replace("'",""),question=args.get("question"),branch=args.get("branch","main"),n_ctx=10), ensure_ascii=False)
        return out
    # Web tools
@ -2203,7 +2302,7 @@ TOOLS_REGISTRY = {
                "repo_url":{"type":"string"},
                "branch":{"type":"string","default":"main"},
                "query":{"type":"string"},
-                "max_hits":{"type":"integer","default":200}
+                "max_hits":{"type":"integer","default":10}
            },
            "required":["repo_url","query"]
        }
@ -2250,7 +2349,7 @@ TOOLS_REGISTRY = {
                "path_contains":{"type":["string","null"]},
                "profile":{"type":["string","null"]}
            },
-            "required":["query"]
+            "required":["query","repo"]
        }
    },
    "web_search_xng": {
@ -2286,7 +2385,7 @@ TOOLS_REGISTRY = {
                "repo":{"type":"string"},
                "question":{"type":"string"},
                "branch":{"type":"string"},
-        },"required":["repo_hint","question"]}
+        },"required":["repo","question"]}
    },
    "summarize_text": {
        "description": "Vat tekst samen in bullets met inleiding en actiepunten.",
@ -2490,7 +2589,7 @@ async def llm_call_autocont(
    *,
    model: Optional[str] = None,
    stream: bool = False,
-    temperature: float = 0.2,
+    temperature: float = 0.02,
    top_p: float = 0.9,
    max_tokens: int = 1024,
    extra: Optional[dict] = None,
@ -2581,7 +2680,8 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
    stream = bool(body.get("stream", False))
    raw_messages = body.get("messages") or []
    # normaliseer tool-berichten naar plain tekst voor het LLM
-    if False:
+    NORMALIZE_TOOL_MESSAGES = os.getenv("NORMALIZE_TOOL_MESSAGES", "0").lower() not in ("0","false","no")
    if NORMALIZE_TOOL_MESSAGES:
        norm_messages = []
        for m in raw_messages:
            if m.get("role") == "tool":
@ -2616,7 +2716,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
        logger.info("🧰 tools_count=%s, tool_choice=%s", len(tools), tool_choice_req)
    except Exception:
        pass
-    if not stream:
+    if RUN_BRIDGE and not stream:
        # OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt".
        # Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool".
@ -2648,7 +2748,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                client = app.state.HTTPX
                r = await client.post(LLM_URL, json=passthrough)
                try:
-                    return JSONResponse(r.json(), status_code=r.status_code)
+                    data = r.json()
                    data = _coerce_text_toolcalls_to_openai(data)
                    return JSONResponse(data, status_code=r.status_code)
                except Exception:
                    return PlainTextResponse(r.text, status_code=r.status_code)
@ -2700,7 +2802,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                    client = app.state.HTTPX
                    r = await client.post(LLM_URL, json=passthrough)
                    try:
-                        return JSONResponse(r.json(), status_code=r.status_code)
+                        data = r.json()
                        data = _coerce_text_toolcalls_to_openai(data)
                        return JSONResponse(data, status_code=r.status_code)
                    except Exception:
                        return PlainTextResponse(r.text, status_code=r.status_code)
@ -2849,6 +2953,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
        if LLM_FUNCTION_CALLING_MODE in ("native","auto") and stream:
            passthrough = dict(body); passthrough["messages"]=messages
            if images_b64: passthrough["images"]=images_b64
            STREAM_TOOLCALL_COERCE = os.getenv("STREAM_TOOLCALL_COERCE","1").lower() not in ("0","false","no")
            async def _aiter():
                import asyncio, contextlib
                client = app.state.HTTPX
@ -2867,6 +2972,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                            await q.put(b"__EOF__")
                    reader_task = asyncio.create_task(_reader())
                    try:
                        buf = ""
                        acc = ""
                        suppress = False
                        while True:
                            try:
                                chunk = await asyncio.wait_for(q.get(), timeout=HEARTBEAT)
@ -2875,7 +2983,91 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                                continue
                            if chunk == b"__EOF__":
                                break
                            if not STREAM_TOOLCALL_COERCE:
                                yield chunk
                                continue
                            try:
                                buf += chunk.decode("utf-8", errors="ignore")
                            except Exception:
                                yield chunk
                                continue
                            # SSE events zijn gescheiden door een lege regel
                            while "\n\n" in buf:
                                event, buf = buf.split("\n\n", 1)
                                if not event:
                                    continue
                                if event.startswith(":"):
                                    yield (event + "\n\n").encode("utf-8")
                                    continue
                                lines = event.splitlines()
                                data_lines = [ln[5:].lstrip() for ln in lines if ln.startswith("data:")]
                                if not data_lines:
                                    if not suppress:
                                        yield (event + "\n\n").encode("utf-8")
                                    continue
                                data_s = "\n".join(data_lines).strip()
                                if data_s == "[DONE]":
                                    yield b"data: [DONE]\n\n"
                                    return
                                try:
                                    obj = json.loads(data_s)
                                except Exception:
                                    if not suppress:
                                        yield (event + "\n\n").encode("utf-8")
                                    continue
                                try:
                                    ch0 = (obj.get("choices") or [{}])[0] or {}
                                    delta = ch0.get("delta") or {}
                                except Exception:
                                    delta = {}
                                # Als upstream al echte tool_calls streamt: pass-through
                                if isinstance(delta, dict) and delta.get("tool_calls"):
                                    if not suppress:
                                        yield ("data: " + json.dumps(obj, ensure_ascii=False) + "\n\n").encode("utf-8")
                                    continue
                                content = (delta.get("content") if isinstance(delta, dict) else None)
                                if isinstance(content, str) and content:
                                    acc += content
                                    if "[TOOL_CALLS" in acc:
                                        suppress = True  # onderdruk de text-tag stream
                                    calls = detect_toolcalls_any(acc) or []
                                    if calls:
                                        created = int(time.time())
                                        chunk_id = obj.get("id") or f"chatcmpl-{uuid.uuid4().hex[:24]}"
                                        model_name = obj.get("model") or body.get("model") or "unknown"
                                        tc_delta = []
                                        for i, tc in enumerate(calls):
                                            tcc = dict(tc)
                                            tcc["index"] = i
                                            tc_delta.append(tcc)
                                        first = {
                                            "id": chunk_id,
                                            "object": "chat.completion.chunk",
                                            "created": created,
                                            "model": model_name,
                                            "choices": [{
                                                "index": 0,
                                                "delta": {"role":"assistant", "tool_calls": tc_delta},
                                                "finish_reason": None
                                            }]
                                        }
                                        second = {
                                            "id": chunk_id,
                                            "object": "chat.completion.chunk",
                                            "created": created,
                                            "model": model_name,
                                            "choices": [{
                                                "index": 0,
                                                "delta": {},
                                                "finish_reason": "tool_calls"
                                            }]
                                        }
                                        yield ("data: " + json.dumps(first, ensure_ascii=False) + "\n\n").encode("utf-8")
                                        yield ("data: " + json.dumps(second, ensure_ascii=False) + "\n\n").encode("utf-8")
                                        yield b"data: [DONE]\n\n"
                                        return
                                if not suppress:
                                    yield ("data: " + json.dumps(obj, ensure_ascii=False) + "\n\n").encode("utf-8")
                    finally:
                        reader_task.cancel()
                        with contextlib.suppress(Exception):
@ -2895,31 +3087,46 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                if images_b64: passthrough["images"]=images_b64
                r = await client.post(LLM_URL, json=passthrough)
                try:
-                    return JSONResponse(r.json(), status_code=r.status_code)
+                    data = r.json()
                    data = _coerce_text_toolcalls_to_openai(data)
                    return JSONResponse(data, status_code=r.status_code)
                except Exception:
                    return PlainTextResponse(r.text, status_code=r.status_code)
-            # (A) 1e call: vraag de LLM om tool_calls (geen stream)
+            # Relay-modus: iteratief tools uitvoeren totdat de LLM stopt met tool_calls
-            first_req = dict(body)
+            max_rounds = int(os.getenv("LLM_TOOL_MAX_ROUNDS", "5"))
-            first_req["messages"] = messages
+            follow_messages = messages
-            first_req["stream"] = False
+            last_status = None
-            if images_b64: first_req["images"] = images_b64
+            for _round in range(max_rounds):
-            r1 = await client.post(LLM_URL, json=first_req)
+                req_i = dict(body)
                req_i["messages"] = follow_messages
                req_i["stream"] = False
                if images_b64: req_i["images"] = images_b64
                r_i = await client.post(LLM_URL, json=req_i)
                last_status = r_i.status_code
                try:
-                data1 = r1.json()
+                    data_i = r_i.json()
                except Exception:
-                return PlainTextResponse(r1.text, status_code=r1.status_code)
+                    return PlainTextResponse(r_i.text, status_code=r_i.status_code)
-            msg1 = ((data1.get("choices") or [{}])[0].get("message") or {})
+                msg_i = ((data_i.get("choices") or [{}])[0].get("message") or {})
-            tool_calls = msg1.get("tool_calls") or []
+                tool_calls = msg_i.get("tool_calls") or []
                # Fallback: sommige backends gooien toolcalls als tekst (bv. [TOOL_CALLS])
                if not tool_calls:
                    txt = (msg_i.get("content") or "")
                    tool_calls = detect_toolcalls_any(txt) or []
                # Geen tool-calls? Geef direct door.
                if not tool_calls:
-                return JSONResponse(data1, status_code=r1.status_code)
+                    data_i = _coerce_text_toolcalls_to_openai(data_i)
                    return JSONResponse(data_i, status_code=r_i.status_code)
-            # (B) voer tool_calls lokaal uit
+                # Tools uitvoeren
                tool_msgs = []
                for tc in tool_calls:
                    # Normaliseer tc structuur
                    tc_id = (tc or {}).get("id") or f"call_{uuid.uuid4().hex[:8]}"
                    fn = ((tc or {}).get("function") or {})
                    tname = fn.get("name")
                    logger.info(f"Running tool: '{tname}'")
                    raw_args = fn.get("arguments") or "{}"
                    try:
                        args = json.loads(raw_args) if isinstance(raw_args, str) else (raw_args or {})
@ -2934,26 +3141,22 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                            out = {"error": str(e)}
                    tool_msgs.append({
                        "role": "tool",
-                    "tool_call_id": tc.get("id"),
+                        "tool_call_id": tc_id,
                        "name": tname or "unknown",
                        "content": json.dumps(out, ensure_ascii=False)
                    })
                    # Zorg dat assistant tool_calls een id heeft
                    if isinstance(tc, dict) and not tc.get("id"):
                        tc["id"] = tc_id
-            # (C) 2e call: geef tool outputs terug aan LLM voor eindantwoord
+                follow_messages = follow_messages + [
            follow_messages = messages + [
                    {"role": "assistant", "tool_calls": tool_calls},
                    *tool_msgs
                ]
-            second_req = dict(body)
+
-            second_req["messages"] = follow_messages
+            # Te veel tool-rondes → stop om loops te voorkomen
-            second_req["stream"] = False
+            safe_msg = f"Te veel tool-rondes ({max_rounds}). Stop om loop te voorkomen."
-            # images opnieuw meesturen is niet nodig, maar kan geen kwaad:
+            return JSONResponse(_openai_chat_response(model, safe_msg, follow_messages), status_code=(last_status or 200))
            if images_b64: second_req["images"] = images_b64
            r2 = await client.post(LLM_URL, json=second_req)
            try:
                return JSONResponse(r2.json(), status_code=r2.status_code)
            except Exception:
                return PlainTextResponse(r2.text, status_code=r2.status_code)
        # shim (non-stream)
        if LLM_FUNCTION_CALLING_MODE == "shim" and not stream:
@ -2967,7 +3170,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                   "Otherwise reply with ONLY: {\"final_answer\":\"...\"}\n\nTools:\n" + "\n".join(tool_lines))
            decide = await llm_call_openai_compat(
                [{"role":"system","content":sys}] + messages,
-                stream=False, temperature=float(body.get("temperature",0.2)),
+                stream=False, temperature=float(body.get("temperature",0.02)),
                top_p=float(body.get("top_p",0.9)), max_tokens=min(512, int(body.get("max_tokens",1024))),
                extra=extra_payload if extra_payload else None
            )
@ -3006,7 +3209,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
            ]
            final = await llm_call_openai_compat(
                follow, stream=False,
-                temperature=float(body.get("temperature",0.2)),
+                temperature=float(body.get("temperature",0.02)),
                top_p=float(body.get("top_p",0.9)),
                max_tokens=int(body.get("max_tokens",1024)),
                extra=extra_payload if extra_payload else None
@ -3023,7 +3226,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
            LLM_WINDOWING_ENABLE = os.getenv("LLM_WINDOWING_ENABLE", "1").lower() not in ("0","false","no")
            MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "13021"))
            RESP_RESERVE   = int(os.getenv("LLM_RESPONSE_RESERVE", "1024"))
-            temperature = float(body.get("temperature", 0.2))
+            temperature = float(body.get("temperature", 0.02))
            top_p       = float(body.get("top_p", 0.9))
            # respecteer env-override voor default
            _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "1024"))
@ -3051,7 +3254,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                            {"role":"system","content":"Je bent een bondige notulist. Vat samen in max 10 bullets (feiten/besluiten/acties)."},
                            {"role":"user","content": f"Vorige samenvatting:\n{old}\n\nNieuwe geschiedenis:\n{chunk_text}\n\nGeef geüpdatete samenvatting."}
                        ]
-                        resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.1, top_p=1.0, max_tokens=300)
+                        resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.01, top_p=1.0, max_tokens=300)
                        return (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content", old or "")
                    trimmed_stream_msgs = await win.build_within_budget(system_prompt=None, summarizer=_summarizer)
                    new_summary = getattr(win, "running_summary", running_summary)
@ -3116,7 +3319,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
    else:
        # --- ÉCHTE streaming (geen tools): direct passthrough met heartbeats ---
        if stream:
-            temperature = float(body.get("temperature", 0.2))
+            temperature = float(body.get("temperature", 0.02))
            top_p       = float(body.get("top_p", 0.9))
            _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021"))
            max_tokens  = int(body.get("max_tokens", _default_max))
@ -3138,7 +3341,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
    MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "42000"))
    RESP_RESERVE   = int(os.getenv("LLM_RESPONSE_RESERVE", "1024"))
    MAX_AUTOCONT   = int(os.getenv("LLM_AUTO_CONTINUES", "2"))
-    temperature = float(body.get("temperature", 0.2))
+    temperature = float(body.get("temperature", 0.02))
    top_p       = float(body.get("top_p", 0.9))
    # Laat env de default bepalen, zodat OWUI niet hard op 1024 blijft hangen
    _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "42000"))
@ -3165,7 +3368,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
                    {"role":"system","content":"Je bent een bondige notulist. Vat samen in max 10 bullets (feiten/besluiten/acties)."},
                    {"role":"user","content": f"Vorige samenvatting:\n{old}\n\nNieuwe geschiedenis:\n{chunk_text}\n\nGeef geüpdatete samenvatting."}
                ]
-                resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.1, top_p=1.0, max_tokens=300)
+                resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.01, top_p=1.0, max_tokens=300)
                return (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content", old or "")
            trimmed = await win.build_within_budget(system_prompt=None, summarizer=_summarizer)
            new_summary = getattr(win, "running_summary", running_summary)
@ -3309,7 +3512,7 @@ async def _summarize_files_llm(items: list[tuple[str, str]]) -> dict[str, str]:
            {"role":"user","content": f"Pad: {path}\n\nInhoud (ingekort):\n{snippet}\n\nAntwoord: "}
        ]
        try:
-            resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.1, top_p=1.0, max_tokens=64)
+            resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.01, top_p=1.0, max_tokens=64)
            summ = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content","").strip()
        except Exception:
            summ = ""
@ -3807,14 +4010,19 @@ async def rag_query_api(
    collection_name_eff = _collection_effective(collection_name)
    col = _get_collection(collection_name_eff)
    q_emb = _EMBEDDER.embed_query(query)
-    where = {}
+    # Chroma: $and/$or moet >=2 where-expressies bevatten.
    conds = []
    if repo:
-        # Accepteer zowel 'repo' (basename) als 'repo_full' (owner/repo)
+        conds.append({"repo_full": {"$eq": repo}})
-        base = repo.rsplit("/", 1)[-1]
+    if branch:
-        where = {"$and": [
+        conds.append({"branch": {"$eq": branch}})
-            {"repo_full": {"$eq": repo}}
+    if profile:
-        ]}
+        conds.append({"profile": {"$eq": profile}})
-    if profile: where["profile"] = {"$eq": profile}
+    where = None
    if len(conds) == 1:
        where = conds[0]
    elif len(conds) >= 2:
        where = {"$and": conds}
    # ---- symbol hit set (repo-scoped) ----
    sym_hit_keys: set[str] = set()
@ -4113,7 +4321,7 @@ async def rag_query_api(
        resp = await llm_call_openai_compat(
            [{"role":"system","content":"You are precise and return only valid JSON."},
             {"role":"user","content": prompt+"\n\nOnly JSON array."}],
-            stream=False, temperature=0.0, top_p=1.0, max_tokens=1024
+            stream=False, temperature=0.01, top_p=1.0, max_tokens=1024
        )
        try:
            order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]"))
--- a/mistral-api.sh
+++ b/mistral-api.sh
@ -1 +1 @@
-docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=16288 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 mistral-api
+docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=0 -e LLM_CONTEXT_TOKENS=42000 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=0 -e LLM_FUNCTION_CALLING_MODE=auto -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -e FORCE_ALL_TOOLS=0 -e AUTO_CONTINUE=0 -e STREAM_PREFER_DIRECT=1 mistral-api
`@ -1 +1 @@`
	docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=16288 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 mistral-api	docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=0 -e LLM_CONTEXT_TOKENS=42000 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=0 -e LLM_FUNCTION_CALLING_MODE=auto -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -e FORCE_ALL_TOOLS=0 -e AUTO_CONTINUE=0 -e STREAM_PREFER_DIRECT=1 mistral-api