RAG updates

admin 2025-11-20 16:16:00 +01:00
parent 8a97368577
commit a79293abbd
8 changed files with 794 additions and 77 deletions

Dockerfile

@@ -2,11 +2,31 @@ FROM python:3.11-slim
WORKDIR /app
# ===== Model caches at fixed paths (these stay in the image) =====
# Hugging Face caches (embeddings) + XDG cache (used by whisper, among others)
ENV HF_HOME=/opt/hf \
HUGGINGFACE_HUB_CACHE=/opt/hf \
TRANSFORMERS_CACHE=/opt/hf \
SENTENCE_TRANSFORMERS_HOME=/opt/sentence-transformers \
XDG_CACHE_HOME=/opt/cache \
STT_MODEL=small
# Optional build args to pin model choices
ARG RAG_EMBEDDINGS=gte-multilingual # or: bge-small / e5-small / gte-base-en
ARG STT_MODEL_ARG=small # tiny | base | small | medium | large-v3, etc.
ENV RAG_EMBEDDINGS=${RAG_EMBEDDINGS}
ENV STT_MODEL=${STT_MODEL_ARG}
# create the directories up front (permissions)
RUN mkdir -p /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper && \
chmod -R a+rX /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper
COPY requirements.txt .
RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25
#RUN pip cache purge
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget ca-certificates libstdc++6 libatomic1 \
@@ -19,12 +39,46 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    ln -sf /opt/piper/piper /usr/local/bin/piper; \
    rm -f /tmp/piper.tgz
# ===== Prefetch models during the build =====
# 1) SentenceTransformers (embeddings) — follows the mapping in app.py
RUN python - <<'PY'
import os
from sentence_transformers import SentenceTransformer
mapping = {
    "gte-multilingual": "Alibaba-NLP/gte-multilingual-base",
    "bge-small": "BAAI/bge-small-en-v1.5",
    "e5-small": "intfloat/e5-small-v2",
    "gte-base-en": "thenlper/gte-base",
}
choice = os.environ.get("RAG_EMBEDDINGS","gte-multilingual").lower()
hf_id = mapping.get(choice, "BAAI/bge-small-en-v1.5")
# cache_folder honors SENTENCE_TRANSFORMERS_HOME/HF_HOME, but we set it explicitly:
SentenceTransformer(hf_id, cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME","/opt/sentence-transformers"))
print("Prefetched SentenceTransformer:", hf_id)
PY
# 2) faster-whisper (STT) — cache in /opt/cache/whisper
RUN python - <<'PY'
import os
from faster_whisper import WhisperModel
name = os.environ.get("STT_MODEL","small")
cache_root = os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper")
os.makedirs(cache_root, exist_ok=True)
# Always CPU/INT8 at build time (no GPU needed during the build)
_ = WhisperModel(name, device="cpu", compute_type="int8", download_root=cache_root)
print("Prefetched faster-whisper:", name, "->", cache_root)
PY
# (optional) the piper voice could also be prefetched here; skipped for now because the voice differs per environment.
COPY app.py .
COPY queue_helper.py .
COPY agent_repo.py .
COPY windowing_utils.py .
COPY smart_rag.py .
COPY llm_client.py .
EXPOSE 8080

agent_repo.py

@@ -18,7 +18,7 @@ from windowing_utils import approx_token_count
from starlette.concurrency import run_in_threadpool
import asyncio
from collections import defaultdict
from llm_client import _llm_call
# --- Async I/O executors (prevent event-loop blocking) ---
from concurrent.futures import ThreadPoolExecutor
@@ -59,7 +59,59 @@ _meili_search_fn = None
_GRAPH_CACHE: dict[str, dict[str, set[str]]] = {}
_TREE_SUM_CACHE: dict[str, dict[str, str]] = {}
# ---------------------------------------------------------
# Fast-path helpers: explicit paths + replacement pair (old->new)
# ---------------------------------------------------------
_Q = r"[\"'“”‘’`]"
_PATH_PATS = [
    r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"']",
    r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
    r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"']",
    r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
]
_TRANS_WRAPPERS = [
    r"__\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
    r"@lang\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
    r"trans\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
]
def _extract_repo_branch_from_text(txt: str) -> Tuple[Optional[str], str]:
    repo_url, branch = None, "main"
    m = re.search(r"\bRepo\s*:\s*(\S+)", txt, flags=re.I)
    if m: repo_url = m.group(1).strip()
    mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", txt, flags=re.I)
    if mb: branch = mb.group(1).strip()
    return repo_url, branch
def _extract_explicit_paths(txt: str) -> List[str]:
    out = []
    for pat in _PATH_PATS:
        for m in re.finditer(pat, txt):
            p = m.group(1)
            if p and p not in out:
                out.append(p)
    return out
def _extract_replace_pair(txt: str) -> Tuple[Optional[str], Optional[str]]:
    # NL/EN variants + "smart" quotes
    pats = [
        rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}",
        rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}",
    ]
    for p in pats:
        m = re.search(p, txt, flags=re.I|re.S)
        if m:
            return m.group(1), m.group(2)
    mm = re.search(r"(Vervang|Replace)[\s\S]*?" + _Q + r"(.+?)" + _Q + r"[\s\S]*?" + _Q + r"(.+?)" + _Q, txt, flags=re.I)
    if mm:
        return mm.group(2), mm.group(3)
    return None, None
def _looks_like_unified_diff_request(txt: str) -> bool:
    if re.search(r"\bunified\s+diff\b", txt, flags=re.I): return True
    if re.search(r"\b(diff|patch)\b", txt, flags=re.I) and _extract_explicit_paths(txt):
        return True
    return False
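For reference, a minimal standalone check of the NL replace-pair pattern above; the sample prompt and expected output are illustrative, not taken from the commit:

```python
import re

_Q = r"[\"'“”‘’`]"
PAT = rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}"

prompt = 'Vervang de tekst "Welkom" in "Welkom terug" in "resources/views/home.blade.php"'
m = re.search(PAT, prompt, flags=re.I | re.S)
print(m.groups() if m else None)  # ('Welkom', 'Welkom terug')
```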
# keep this close to the other module-level consts
async def _call_get_git_repo(repo_url: str, branch: str):
@@ -2873,7 +2925,7 @@ def laravel_scan_routes(repo_root: Path) -> list[dict]:
    if not p.exists():
        continue
    txt = _read_file_safe(p)
    for m in re.finditer(r"Route::(get|post|put|patch|delete|match)\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*([^)]+)\)", txt, flags=re.I):
        verb, uri, target = m.group(1).lower(), m.group(2), m.group(3)
        ctrl = None; method = None; name = None
        # controller@method
@@ -3926,13 +3978,122 @@ async def _llm_task_route(user_goal: str, framework: str = "laravel") -> dict:
# ---------- Main handler ----------
async def handle_repo_agent(messages: List[dict], request) -> str:
"""
Uitbreiding: fast-path voor unified diffs op expliciete bestanden met tekstvervanging.
Als niet van toepassing, valt automatisch terug op de bestaande flow.
"""
# 1) Combineer user/system content om opdracht te parsen
try:
full_txt = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")])
except Exception:
full_txt = ""
# 2) Herken fast-path
try_fast = _looks_like_unified_diff_request(full_txt)
paths_fp = _extract_explicit_paths(full_txt) if try_fast else []
old_txt, new_txt = _extract_replace_pair(full_txt) if try_fast else (None, None)
# NB: we gebruiken de injecties die via initialize_agent zijn gezet:
# - get_git_repo_fn (async)
# - read_text_file_fn (sync)
# Deze symbolen worden onderin initialize_agent aan globals() gehangen.
get_git_repo_fn = globals().get("get_git_repo_fn")
read_text_file_fn = globals().get("read_text_file_fn")
if try_fast and paths_fp and old_txt and new_txt and callable(get_git_repo_fn) and callable(read_text_file_fn):
# 3) repo + branch bepalen
repo_url, branch = _extract_repo_branch_from_text(full_txt)
if not repo_url:
# fallback: probeer repo uit eerdere agent-state (optioneel), anders stop fast-path
repo_url = globals().get("_last_repo_url")
branch = globals().get("_last_branch", "main")
if repo_url:
try:
repo_root = await get_git_repo_fn(repo_url, branch or "main")
root = Path(repo_root)
lang_path = root / "resources" / "lang" / "nl.json"
lang_before = lang_path.read_text(encoding="utf-8", errors="ignore") if lang_path.exists() else "{}"
lang_data = {}
try:
lang_data = json.loads(lang_before or "{}")
except Exception:
lang_data = {}
diffs_out = []
lang_changed = False
def _make_udiff(a: str, b: str, rel: str) -> str:
return "".join(difflib.unified_diff(
a.splitlines(keepends=True),
b.splitlines(keepends=True),
fromfile=f"a/{rel}", tofile=f"b/{rel}", n=3
))
# 4) per bestand: ofwel inline replace, ofwel vertaling bijwerken
for rel in paths_fp:
p = root / rel
if not p.exists():
continue
before = read_text_file_fn(p)
if not before:
continue
# Als de 'oude' tekst voorkomt BINNEN een vertaalwrapper, dan géén blade-edit
found_in_wrapper = False
for pat in _TRANS_WRAPPERS:
for m in re.finditer(pat, before):
inner = m.group(1)
if inner == old_txt:
found_in_wrapper = True
break
if found_in_wrapper:
break
if found_in_wrapper:
# update nl.json: {"oude": "nieuwe"}
if lang_data.get(old_txt) != new_txt:
lang_data[old_txt] = new_txt
lang_changed = True
continue
# anders: directe, exacte vervanging (conservatief)
after = before.replace(old_txt, new_txt)
if after != before:
diff = _make_udiff(before, after, rel)
if diff.strip():
diffs_out.append(("blade", rel, diff))
# 5) indien vertaling gewijzigd: diff voor nl.json toevoegen
if lang_changed:
new_lang = json.dumps(lang_data, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
diff_lang = _make_udiff(lang_before if isinstance(lang_before, str) else "", new_lang, "resources/lang/nl.json")
if diff_lang.strip():
diffs_out.append(("lang", "resources/lang/nl.json", diff_lang))
if diffs_out:
parts = ["### Unified diffs"]
for kind, rel, d in diffs_out:
parts.append(f"**{rel}**")
parts.append("```diff\n" + d + "```")
return "\n\n".join(parts)
else:
return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)."
except Exception as e:
# mislukt → val terug op bestaande discover/agent flow
pass
# === GEEN fast-path → ga door met de bestaande flow hieronder ===
    sid = _get_session_id(messages, request)
    st = _app.state.AGENT_SESSIONS.get(sid) or AgentState()
    _app.state.AGENT_SESSIONS[sid] = st
    user_last = next((m["content"] for m in reversed(messages) if m.get("role")=="user"), "").strip()
    user_last_lower = user_last.lower()
    logger.info("INFO:agent_repo:[%s] stage=%s", sid, st.stage)
    from smart_rag import (
        enrich_intent,
        expand_queries,
        hybrid_retrieve,
        _laravel_pairs_from_route_text,
        _laravel_guess_view_paths_from_text,
    )
    # If the user passes a .git URL: set state and continue via the state machine
    user_txt = next((m.get("content","") for m in reversed(messages) if m.get("role")=="user"), "")
    repo_url = await _detect_repo_url(user_txt)
@@ -4287,16 +4448,13 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
    picked.append(f)
# --- VIEW/LANG bias for UI label changes ---
# Take the first quoted string from the prompt as the "old" literal
qs = extract_quotes(st.user_goal) or []
old_lit = qs[0] if qs else None
def _contains_old(rel: str) -> bool:
    if not old_lit:
        return True  # fallback: no filtering
    try:
        txt = _read_text_file(Path(st.repo_path) / rel) or ""
        return old_lit in txt
@@ -4374,22 +4532,12 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
# routes -> controllers
if rel in ("routes/web.php","routes/api.php"):
    txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
    for ctrl_path, _m in _laravel_pairs_from_route_text(txt):
        if ctrl_path and ctrl_path not in picked and ctrl_path not in add:
            add.append(ctrl_path)
# controllers -> views
if rel.startswith("app/Http/Controllers/") and rel.endswith(".php"):
    txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
    for v in _laravel_guess_view_paths_from_text(txt):
        if v and v not in picked and v not in add:
            add.append(v)
@@ -4410,12 +4558,10 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
    pass
picked = (picked + add + more)[:MAX_FILES_DRYRUN]
# 5) Literal-grep fallback: if the user implies an old->new change, search the repo for the 'old' literal
qs = extract_quotes(st.user_goal) or []
old = qs[0].strip() if qs and qs[0].strip() else None
if old:
    grep_hits = _grep_repo_for_literal(Path(st.repo_path), old, limit=16)
    for rel in grep_hits:
        if rel in all_files and rel not in picked:
            picked.append(rel)
@@ -4589,15 +4735,14 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
if best and best not in st.candidate_paths: added.append(best)
st.candidate_paths = (added + st.candidate_paths)[:MAX_FILES_DRYRUN]
# extra: grep for the 'old' literal from user_goal to enrich the candidates
qs = extract_quotes(st.user_goal) or []
old = qs[0].strip() if qs and qs[0].strip() else None
if old:
    for rel in _grep_repo_for_literal(root, old, limit=16):
        if rel in all_files and rel not in st.candidate_paths:
            st.candidate_paths.append(rel)
try:
    proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
    if not proposed:

app.py

@@ -32,7 +32,11 @@ from fastapi.routing import APIRoute
from starlette.concurrency import run_in_threadpool
from pydantic import BaseModel
AUTO_CONTINUE = os.getenv("AUTO_CONTINUE", "1").lower() not in ("0","false","no")
AUTO_CONTINUE_MAX_ROUNDS = int(os.getenv("AUTO_CONTINUE_MAX_ROUNDS", "6"))
AUTO_CONTINUE_TAIL_CHARS = int(os.getenv("AUTO_CONTINUE_TAIL_CHARS", "600"))
from llm_client import init_llm_client, _sync_model_infer
# Optional libs for text extraction
try:
@@ -71,6 +75,17 @@ except Exception:
    cairosvg = None
import tempfile, subprocess  # for audio
import html  # ← new: unescape HTML entities in tool-call JSON
# Force consistent caches both inside and outside Docker
os.environ.setdefault("HF_HOME", "/opt/hf")
os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/opt/hf")
os.environ.setdefault("TRANSFORMERS_CACHE", "/opt/hf")
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers")
os.environ.setdefault("XDG_CACHE_HOME", "/opt/cache")
# whisper cache path (used by faster-whisper)
os.environ.setdefault("WHISPER_CACHE_DIR", os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper"))
# STT (Whisper-compatible via faster-whisper) — optional
_STT_MODEL = None
@@ -148,15 +163,53 @@ def _build_tools_system_prompt(tools: list) -> str:
    return "\n".join(lines)
def _extract_tool_calls_from_text(txt: str):
    s = (txt or "").strip()
    # Strip code fences & known chat tokens
    if s.startswith("```"):
        s = extract_code_block(s)
    s = re.sub(r"^<\|im_start\|>\s*assistant\s*", "", s, flags=re.I)
    # HTML-escaped JSON (&#34; etc.)
    try:
        s = html.unescape(s)
    except Exception:
        pass
    # tolerant: grab the first JSON object
    m = re.search(r"\{[\s\S]*\}", s)
    if not m:
        return []
    try:
        obj = json.loads(m.group(0))
    except Exception:
        return []
    # === Normalize the different dialects ===
    # 1) OpenAI: {"tool_calls":[{"type":"function","function":{"name":..,"arguments":..}}]}
    if "tool_calls" in obj and isinstance(obj["tool_calls"], list):
        tc = obj["tool_calls"]
    # 2) Replit/Magistral/Mistral-like: {"call_tool":{"name":"...","arguments":{...}}} or a list of them
    elif "call_tool" in obj:
        raw = obj["call_tool"]
        if isinstance(raw, dict):
            raw = [raw]
        tc = []
        for it in (raw or []):
            name = (it or {}).get("name")
            args = (it or {}).get("arguments") or {}
            if isinstance(args, str):
                try: args = json.loads(args)
                except Exception: pass
            if name:
                tc.append({"type":"function","function":{"name": name, "arguments": json.dumps(args, ensure_ascii=False)}})
    # 3) Legacy OpenAI: {"function_call":{"name":"...","arguments":"{...}"}}
    elif "function_call" in obj and isinstance(obj["function_call"], dict):
        fc = obj["function_call"]
        name = fc.get("name")
        args = fc.get("arguments") or {}
        if isinstance(args, str):
            try: args = json.loads(args)
            except Exception: pass
        tc = [{"type":"function","function":{"name": name, "arguments": json.dumps(args, ensure_ascii=False)}}] if name else []
    else:
        tc = []
    out = []
    for c in tc:
        name = (c or {}).get("name")
@@ -166,6 +219,13 @@ def _extract_tool_calls_from_text(txt: str):
            args = json.loads(args)
        except Exception:
            args = {}
        # Support both {"name":..,"arguments":..} and {"type":"function","function":{...}}
        if not name and isinstance(c.get("function"), dict):
            name = c["function"].get("name")
            args = c["function"].get("arguments") or args
            if isinstance(args, str):
                try: args = json.loads(args)
                except Exception: args = {}
        if name:
            out.append({
                "id": f"call_{uuid.uuid4().hex[:8]}",
@@ -177,6 +237,46 @@ def _extract_tool_calls_from_text(txt: str):
            })
    return out
# --- Universal toolcall detector (JSON + ReAct + HTML-escaped) ---
_TOOL_REACT = re.compile(
r"Action\s*:\s*([A-Za-z0-9_\-\.]+)\s*[\r\n]+Action\s*Input\s*:\s*(?:```json\s*([\s\S]*?)\s*```|([\s\S]*))",
re.I
)
def detect_toolcalls_any(text: str) -> list[dict]:
    """
    Always returns OpenAI-style tool_calls items:
    [{"id":..., "type":"function", "function":{"name":..., "arguments": "{...}"}}]
    """
    calls = _extract_tool_calls_from_text(text)
    if calls:
        return calls
    s = (text or "").strip()
    try:
        s = html.unescape(s)
    except Exception:
        pass
    # ReAct fallback
    m = _TOOL_REACT.search(s)
    if m:
        name = (m.group(1) or "").strip()
        raw = (m.group(2) or m.group(3) or "").strip()
        # strip any fences
        if raw.startswith("```"):
            raw = extract_code_block(raw)
        try:
            args = json.loads(raw) if raw else {}
        except Exception:
            # last resort: wrap as {"input": "<raw>"}
            args = {"input": raw}
        if name:
            return [{
                "id": f"call_{uuid.uuid4().hex[:8]}",
                "type": "function",
                "function": {"name": name, "arguments": json.dumps(args, ensure_ascii=False)}
            }]
    return []
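As a self-contained illustration of dialect (2) above — the {"call_tool": ...} shape being normalized to OpenAI-style tool_calls (tool name and arguments are invented for the demo):

```python
import json
import uuid

def normalize_call_tool(obj: dict) -> list[dict]:
    # Mirrors dialect (2): {"call_tool": {...}} or a list of such dicts
    raw = obj.get("call_tool")
    if isinstance(raw, dict):
        raw = [raw]
    out = []
    for it in (raw or []):
        name = (it or {}).get("name")
        args = (it or {}).get("arguments") or {}
        if isinstance(args, str):
            try:
                args = json.loads(args)
            except Exception:
                pass
        if name:
            out.append({
                "id": f"call_{uuid.uuid4().hex[:8]}",
                "type": "function",
                "function": {"name": name, "arguments": json.dumps(args, ensure_ascii=False)},
            })
    return out

print(normalize_call_tool({"call_tool": {"name": "rag_query", "arguments": {"q": "routes"}}}))
# [{'id': 'call_…', 'type': 'function', 'function': {'name': 'rag_query', 'arguments': '{"q": "routes"}'}}]
```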
# -----------------------------------------------------------------------------
# App & logging
# -----------------------------------------------------------------------------
@@ -197,6 +297,21 @@ app.add_middleware(
    allow_headers=["*"],
)
# One central QueueManager for ALL LLM calls
#app.state.LLM_QUEUE = QueueManager(model_infer_fn=_sync_model_infer)
# Attach this queue to llm_client so that _llm_call uses the same queue
#init_llm_client(app.state.LLM_QUEUE)
# Note:
# - llm_call_openai_compat expects app.state.LLM_QUEUE to be a deque-like queue
#   (append/index/popleft/remove).
# - The QueueManager from queue_helper.py is thread-based and is not used here.
#   If you want to use the QueueManager, do so in llm_client.py or elsewhere,
#   but not as app.state.LLM_QUEUE, to avoid type conflicts.
@app.on_event("startup")
async def _startup():
    # Make sure local hosts never go through a proxy
@@ -697,7 +812,7 @@ async def llm_call_openai_compat(
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: int = 13027,
    extra: Optional[dict] = None,
    stop: Optional[Union[str, list[str]]] = None,
    **kwargs
@@ -926,7 +1041,7 @@ async def _svg_from_prompt(prompt: str, w: int, h: int, background: str="white")
    f"- Thema: {prompt}\n- Gebruik eenvoudige vormen/paths/tekst.")
resp = await llm_call_openai_compat(
    [{"role":"system","content":sys},{"role":"user","content":user}],
    stream=False, temperature=0.35, top_p=0.9, max_tokens=2048
)
svg = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
return _svg_wrap_if_needed(_sanitize_svg(svg), w, h, background)
@@ -1079,6 +1194,165 @@ def _parse_repo_qa_from_messages(messages: list[dict]) -> tuple[Optional[str], s
    question = full
    return repo_hint, question, branch, n_ctx
# ------------------------------
# Unified-diff "fast path" (explicit paths + replacement text)
# ------------------------------
_Q = r"[\"'“”‘’`]"
_PATH_PATS = [
    r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"']",
    r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
    r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"']",
    r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
]
def _extract_repo_branch(text: str) -> tuple[Optional[str], str]:
    repo_url = None
    branch = "main"
    m = re.search(r"\bRepo\s*:\s*(\S+)", text, flags=re.I)
    if m:
        repo_url = m.group(1).strip()
    mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", text, flags=re.I)
    if mb:
        branch = mb.group(1).strip()
    return repo_url, branch
def _extract_paths(text: str) -> list[str]:
    paths: set[str] = set()
    for pat in _PATH_PATS:
        for m in re.finditer(pat, text):
            paths.add(m.group(1))
    return list(paths)
def _extract_replace_pair(text: str) -> tuple[Optional[str], Optional[str]]:
    """
    Extracts (old, new) from NL/EN phrasings. Supports straight and smart quotes.
    Examples that match:
    - Vervang de tekst A in B
    - Vervang de tekst "A" veranderen in "B"
    - Replace the text 'A' to 'B'
    - Replace "A" with "B"
    """
    pats = [
        rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}",
        rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}",
    ]
    for p in pats:
        m = re.search(p, text, flags=re.I|re.S)
        if m:
            return m.group(1), m.group(2)
    # fallback: the first two quoted strings near 'Vervang' / 'Replace'
    mm = re.search(r"(Vervang|Replace)[\s\S]*?" + _Q + r"(.+?)" + _Q + r"[\s\S]*?" + _Q + r"(.+?)" + _Q, text, flags=re.I)
    if mm:
        return mm.group(2), mm.group(3)
    return None, None
def _looks_like_unified_diff_request(text: str) -> bool:
    if re.search(r"\bunified\s+diff\b", text, flags=re.I):
        return True
    # also accept "make a diff" phrasing with explicit files
    if re.search(r"\b(diff|patch)\b", text, flags=re.I) and any(re.search(p, text) for p in _PATH_PATS):
        return True
    return False
def _make_unified_diff(a_text: str, b_text: str, path: str) -> str:
    import difflib
    a_lines = a_text.splitlines(keepends=True)
    b_lines = b_text.splitlines(keepends=True)
    return "".join(difflib.unified_diff(a_lines, b_lines, fromfile=f"a/{path}", tofile=f"b/{path}", n=3))
_TRANS_WRAPPERS = [
    r"__\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"),
    r"@lang\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"),
    r"trans\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"),
]
def _find_in_translation_wrapper(content: str, needle: str) -> bool:
    for pat in _TRANS_WRAPPERS:
        for m in re.finditer(pat, content):
            inner = m.group(1)
            if inner == needle:
                return True
    return False
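A minimal check of the wrapper detection (the blade snippet is a sample):

```python
import re

_Q = r"[\"'“”‘’`]"
WRAP = r"__\(\s*{q}(.+?){q}\s*\)".format(q=_Q)

blade = "<h1>{{ __('Welkom') }}</h1>"
m = re.search(WRAP, blade)
print(m.group(1) if m else None)  # Welkom
```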
def _update_nl_json(root: Path, key: str, value: str) -> tuple[bool, str]:
    """
    Sets or updates resources/lang/nl.json: { key: value }.
    Returns (changed, path_as_posix).
    """
    lang_path = root / "resources" / "lang" / "nl.json"
    data = {}
    if lang_path.exists():
        try:
            data = json.loads(lang_path.read_text(encoding="utf-8", errors="ignore") or "{}")
        except Exception:
            data = {}
    # Only touch the file when needed
    prev = data.get(key)
    if prev == value:
        return False, lang_path.as_posix()
    data[key] = value
    # Pretty, stable JSON dump
    lang_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = lang_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    os.replace(tmp, lang_path)
    return True, lang_path.as_posix()
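The write-then-rename above is what keeps readers from ever seeing a half-written nl.json; a condensed standalone sketch of the same pattern:

```python
import json
import os
from pathlib import Path

def update_json_atomic(path: Path, key: str, value: str) -> bool:
    """Set/update one key; write to a temp file, then atomically swap it in."""
    data = {}
    if path.exists():
        try:
            data = json.loads(path.read_text(encoding="utf-8") or "{}")
        except Exception:
            data = {}
    if data.get(key) == value:
        return False  # already up to date
    data[key] = value
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    os.replace(tmp, path)  # atomic on POSIX: readers see old or new, never a partial file
    return True
```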
async def _fast_unified_diff_task(messages: list[dict]) -> Optional[str]:
    """
    If the prompt asks for a 'unified diff' with explicit files and replacement text,
    execute that directly (bypassing handle_repo_agent) and return the diffs.
    """
    full = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")])
    if not _looks_like_unified_diff_request(full):
        return None
    repo_url, branch = _extract_repo_branch(full)
    paths = _extract_paths(full)
    old, new = _extract_replace_pair(full)
    if not (repo_url and paths and old and new):
        return None
    # Fetch the repo
    repo_path = await _get_git_repo_async(repo_url, branch)
    root = Path(repo_path)
    changed_files: list[tuple[str, str]] = []  # (path, diff_text)
    lang_changed = False
    lang_path_text = ""
    for rel in paths:
        p = root / rel
        if not p.exists():
            continue
        before = _read_text_file_wrapper(p)
        if not before:
            continue
        # If the text sits inside a translation wrapper, do NOT change the blade file;
        # instead set/patch resources/lang/nl.json: { "old": "new" }.
        if _find_in_translation_wrapper(before, old):
            ch, lang_path_text = _update_nl_json(root, old, new)
            lang_changed = lang_changed or ch
            # no blade change in this case
            continue
        after = before.replace(old, new)
        if after == before:
            continue
        diff = _make_unified_diff(before, after, rel)
        if diff.strip():
            changed_files.append((rel, diff))
    if not changed_files and not lang_changed:
        return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)."
    out = []
    if changed_files:
        out.append("### Unified diffs")
        for rel, d in changed_files:
            out.append(f"```diff\n{d}```")
    if lang_changed:
        out.append(f"✅ Bijgewerkte vertaling in: `{lang_path_text}`: \"{old}\" → \"{new}\"")
    return "\n\n".join(out).strip()
# ------------------------------
# OpenAI-compatible endpoints
@@ -1131,15 +1405,17 @@ def _get_stt_model():
    except Exception:
        raise HTTPException(status_code=500, detail="STT niet beschikbaar: faster-whisper ontbreekt.")
    # device selection
    download_root = os.getenv("STT_MODEL_DIR", os.environ.get("WHISPER_CACHE_DIR"))
    os.makedirs(download_root, exist_ok=True)
    if STT_DEVICE == "auto":
        try:
            _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cuda", compute_type="float16", download_root=download_root)
            return _STT_MODEL
        except Exception:
            _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cpu", compute_type="int8", download_root=download_root)
            return _STT_MODEL
    dev, comp = ("cuda","float16") if STT_DEVICE=="cuda" else ("cpu","int8")
    _STT_MODEL = WhisperModel(STT_MODEL_NAME, device=dev, compute_type=comp, download_root=download_root)
    return _STT_MODEL
def _stt_transcribe_path(path: str, lang: str | None):
@@ -1246,7 +1522,7 @@ async def present_make(
    f"Max. {max_slides} dia's, 3–6 bullets per dia.")
plan = await llm_call_openai_compat(
    [{"role":"system","content":sys},{"role":"user","content":user}],
    stream=False, temperature=0.3, top_p=0.9, max_tokens=13021
)
raw = (plan.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}")
try:
@@ -1292,7 +1568,7 @@ async def vision_ask(
    stream: bool = Form(False),
    temperature: float = Form(0.2),
    top_p: float = Form(0.9),
    max_tokens: int = Form(1024),
):
    raw = await run_in_threadpool(file.file.read)
    img_b64 = base64.b64encode(raw).decode("utf-8")
@@ -1323,7 +1599,7 @@ async def vision_and_text(
    max_chars: int = Form(25000),
    temperature: float = Form(0.2),
    top_p: float = Form(0.9),
    max_tokens: int = Form(2048),
):
    images_b64: list[str] = []
    text_chunks: list[str] = []
@@ -1370,7 +1646,7 @@ async def vision_health():
    tiny_png = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAOaO5nYAAAAASUVORK5CYII="
    try:
        messages = [{"role":"user","content":"<image> Beschrijf dit in één woord."}]
        resp = await llm_call_openai_compat(messages, extra={"images":[tiny_png]}, max_tokens=128)
        txt = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","").strip()
        return {"vision": bool(txt), "sample": txt[:60]}
    except Exception as e:
@@ -1564,7 +1840,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"Je bent behulpzaam en exact."},
         {"role":"user","content": f"{instruction}\n\n--- BEGIN ---\n{text}\n--- EINDE ---"}],
        stream=False, max_tokens=7680
    )
    return resp
@@ -1574,7 +1850,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"Wees feitelijk en concreet."},
         {"role":"user","content": f"{goal}\n\n--- BEGIN ---\n{text}\n--- EINDE ---"}],
        stream=False, max_tokens=7680
    )
    return resp
@@ -1587,7 +1863,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
        {"role":"user","content": f"{objective}\nStijl/criteria: {style}\n"
         "Antwoord met eerst een korte toelichting, daarna alleen de verbeterde inhoud tussen een codeblok.\n\n"
         f"--- BEGIN ---\n{text}\n--- EINDE ---"}],
        stream=False, max_tokens=10240
    )
    return resp
@@ -1597,7 +1873,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"Wees strikt en concreet."},
         {"role":"user","content": f"{VALIDATE_PROMPT}\n\nCode om te valideren:\n```\n{code}\n```"}],
        stream=False, max_tokens=10000
    )
    txt = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
    return {"status":"issues_found" if _parse_validation_results(txt) else "valid",
@@ -1611,7 +1887,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    [{"role":"system","content": SYSTEM_PROMPT},
     {"role":"user","content": f"Verbeter deze {language}-code met focus op {focus}:\n\n"
      f"{code}\n\nGeef eerst een korte toelichting, dan alleen het verbeterde codeblok. Behoud functionaliteit."}],
    stream=False, max_tokens=10768
)
return resp
@@ -1636,7 +1912,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
if name == "vision_analyze":
    image_url = args.get("image_url","")
    prompt = args.get("prompt","Beschrijf beknopt wat je ziet en noem de belangrijkste details.")
    max_tokens = int(args.get("max_tokens",1024))
    b64 = None
    # Only accept data: URIs or raw base64; not http(s), because those are not
    # loaded into the vision call and would fail silently.
@@ -1908,6 +2184,52 @@ async def _complete_with_autocontinue(
        continues += 1
    return content, resp
async def llm_call_autocont(
    messages: list[dict],
    *,
    model: Optional[str] = None,
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: int = 1024,
    extra: Optional[dict] = None,
    stop: Optional[Union[str, list[str]]] = None,
    **kwargs
) -> dict | StreamingResponse:
    """
    OpenAI-compatible wrapper that automatically continues non-stream answers
    via _complete_with_autocontinue(...).
    - stream=True  -> passthrough to llm_call_openai_compat (unchanged)
    - stream=False -> returns one merged answer as OpenAI JSON
    """
    mdl = (model or os.getenv("LLM_MODEL", "mistral-medium")).strip()
    if stream:
        # streaming stays 1-to-1 as before
        return await llm_call_openai_compat(
            messages,
            model=mdl,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            extra=extra if extra else None,
            stop=stop
        )
    max_autocont = int(os.getenv("LLM_AUTO_CONTINUES", "2"))
    full_text, _last = await _complete_with_autocontinue(
        messages,
        model=mdl,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        extra_payload=extra if extra else None,
        max_autocont=max_autocont
    )
    # Build a standard OpenAI chat response with the merged answer
    return _openai_chat_response(mdl, full_text, messages)
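The _complete_with_autocontinue helper itself is not shown in this diff; the core idea is roughly the loop below (a sketch, assuming OpenAI-style response dicts):

```python
async def autocontinue_sketch(call, messages: list[dict], max_rounds: int = 2) -> str:
    # Keep asking the model to continue while it stops on length,
    # then concatenate the pieces into one answer.
    resp = await call(messages)
    parts = [resp["choices"][0]["message"]["content"]]
    rounds = 0
    while resp["choices"][0].get("finish_reason") == "length" and rounds < max_rounds:
        messages = messages + [
            {"role": "assistant", "content": parts[-1]},
            {"role": "user", "content": "Ga verder waar je gebleven was."},
        ]
        resp = await call(messages)
        parts.append(resp["choices"][0]["message"]["content"])
        rounds += 1
    return "".join(parts)
```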
@app.post("/v1/chat/completions")
async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    model = (body.get("model") or os.getenv("LLM_MODEL", "mistral-medium")).strip()
@@ -2053,7 +2375,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    # your existing helper; keep it as you already use it
    resp = await llm_call_openai_compat(ask, stream=False, max_tokens=512)
    txt = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content", "") or ""
    calls = detect_toolcalls_any(txt)  # was: _extract_tool_calls_from_text(txt)
    if calls:
        return {
            "id": f"chatcmpl-{uuid.uuid4().hex}",
@@ -2075,6 +2397,17 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    # Special models
    if model == "repo-agent":
        # === fast bypass for "unified diff" tasks with explicit paths ===
        try:
            fast = await _fast_unified_diff_task(messages)
        except Exception as e:
            fast = f"(Fast-diff pad mislukte: {e})"
        if fast:
            if stream:
                async def _emit():
                    yield ("data: " + json.dumps({"id": f"chatcmpl-{uuid.uuid4().hex[:12]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": "repo-agent", "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}]}) + "\n\n").encode("utf-8")
                    yield ("data: " + json.dumps({"id": f"chatcmpl-{uuid.uuid4().hex[:12]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": "repo-agent", "choices": [{"index": 0, "delta": {"content": fast}, "finish_reason": None}]}) + "\n\n").encode("utf-8")
                    yield b"data: [DONE]\n\n"
                return StreamingResponse(_emit(), media_type="text/event-stream", headers={"Cache-Control": "no-cache, no-transform", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
            return JSONResponse(_openai_chat_response("repo-agent", fast, messages))
    if stream:
        async def event_gen():
            import asyncio, time, json, contextlib
@@ -2421,7 +2754,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    if stream:
        temperature = float(body.get("temperature", 0.2))
        top_p = float(body.get("top_p", 0.9))
        _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021"))
        max_tokens = int(body.get("max_tokens", _default_max))
        return await llm_call_openai_compat(
            messages,
@@ -3331,7 +3664,7 @@ async def rag_query_api(
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"You are precise and return only valid JSON."},
         {"role":"user","content": prompt+"\n\nOnly JSON array."}],
        stream=False, temperature=0.0, top_p=1.0, max_tokens=512
    )
    try:
        order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]"))
@@ -3428,7 +3761,8 @@ initialize_agent(
    get_git_repo_fn=_get_git_repo_async,
    rag_index_repo_internal_fn=_rag_index_repo_internal,
    rag_query_internal_fn=_rag_query_internal,
    # Use the auto-continue wrapper so agent answers are not cut off
    llm_call_fn=llm_call_autocont,
    extract_code_block_fn=extract_code_block,
    read_text_file_fn=_read_text_file_wrapper,
    client_ip_fn=_client_ip,

llm_client.py (new executable file)

@@ -0,0 +1,131 @@
from __future__ import annotations
import os
import asyncio
import logging
from typing import List, Dict, Any, Optional
import httpx
from queue_helper import QueueManager
logger = logging.getLogger(__name__)
# -------------------------------------------------------------
# Config for the underlying LLM backend
# -------------------------------------------------------------
# This is NOT your own /v1/chat/completions endpoint,
# but the *real* model backend (e.g. Ollama, vLLM, a Mistral server, etc.).
LLM_API_BASE = os.getenv("LLM_API_BASE", "http://127.0.0.1:11434")
LLM_DEFAULT_MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini")
LLM_REQUEST_TIMEOUT = float(os.getenv("LLM_REQUEST_TIMEOUT", "120"))
# This is set from app.py via init_llm_client(...)
LLM_QUEUE: QueueManager | None = None
def init_llm_client(queue: QueueManager) -> None:
    """
    Attach the global LLM_QUEUE to the QueueManager from app.py.
    This MUST be called exactly once in app.py.
    """
    global LLM_QUEUE
    LLM_QUEUE = queue
    logger.info("llm_client: LLM_QUEUE gekoppeld via init_llm_client.")
def _sync_model_infer(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Synchronous call to the real LLM backend.
    This is the function app.py uses when constructing the QueueManager.
    """
    url = f"{LLM_API_BASE.rstrip('/')}/v1/chat/completions"
    try:
        with httpx.Client(timeout=LLM_REQUEST_TIMEOUT) as client:
            resp = client.post(url, json=payload)
            resp.raise_for_status()
            return resp.json()
    except Exception as exc:
        logger.exception("LLM backend call failed: %s", exc)
        return {
            "object": "chat.completion",
            "choices": [{
                "index": 0,
                "finish_reason": "error",
                "message": {
                    "role": "assistant",
                    "content": f"[LLM-fout] {exc}",
                },
            }],
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
            },
        }
async def _llm_call(
    messages: List[Dict[str, str]],
    *,
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: Optional[int] = None,
    model: Optional[str] = None,
    **extra: Any,
) -> Dict[str, Any]:
    """
    Central helper for tools/agents/smart_rag/repo-agent.
    Important:
    - Uses the *existing* QueueManager from app.py (via init_llm_client).
    - Submits jobs to the agent queue (lower priority than users).
    - NO queue notifications ("you are #...") for these internal calls.
    """
    if stream:
        # We do not use streaming in this agent.
        raise NotImplementedError("_llm_call(stream=True) wordt momenteel niet ondersteund.")
    if LLM_QUEUE is None:
        # Hard fail: this makes it immediately obvious that init_llm_client was never called.
        raise RuntimeError("LLM_QUEUE is niet geïnitialiseerd. Roep init_llm_client(...) aan in app.py")
    payload: Dict[str, Any] = {
        "model": model or LLM_DEFAULT_MODEL,
        "messages": messages,
        "stream": False,
        "temperature": float(temperature),
        "top_p": float(top_p),
    }
    if max_tokens is not None:
        payload["max_tokens"] = int(max_tokens)
    payload.update(extra)
    loop = asyncio.get_running_loop()
    try:
        # request_agent_sync blocks → run it in the threadpool
        response: Dict[str, Any] = await loop.run_in_executor(
            None, lambda: LLM_QUEUE.request_agent_sync(payload)
        )
        return response
    except Exception as exc:
        logger.exception("_llm_call via agent-queue failed: %s", exc)
        return {
            "object": "chat.completion",
            "choices": [{
                "index": 0,
                "finish_reason": "error",
                "message": {
                    "role": "assistant",
                    "content": f"[LLM-queue-fout] {exc}",
                },
            }],
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
            },
        }
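Hypothetical wiring, based on the commented-out lines in app.py above (one-time setup, then internal calls go through the agent queue):

```python
from queue_helper import QueueManager
import llm_client

# once, at startup:
queue = QueueManager(model_infer_fn=llm_client._sync_model_infer)
llm_client.init_llm_client(queue)

# later, from an agent or tool (inside an async context):
# resp = await llm_client._llm_call([{"role": "user", "content": "ping"}], max_tokens=32)
```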

mistral-api.sh (new executable file)

@@ -0,0 +1 @@
docker run -d --name mistral-api --network host -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=13021 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 mistral-api

queue_helper.py

@@ -45,11 +45,20 @@ class QueueManager:
        self._worker.start()
    # ---------- public API ----------
    def enqueue_user(
        self,
        payload: Dict,
        progress_cb: Callable[[Dict], None],
        *,
        notify_position: bool = False,
    ) -> tuple[str, int]:
        job = _Job(payload, progress_cb)
        try: self._user_q.put_nowait(job)
        except queue.Full: raise RuntimeError(f"Userqueue vol (≥{USER_MAX_QUEUE})")
        position = self._user_q.qsize()
        if notify_position:
            # start a separate notifier thread that periodically reports the queue position
            start_position_notifier(job, self._user_q)
        return job.job_id, position
@@ -58,6 +67,32 @@ class QueueManager:
        except queue.Full: raise RuntimeError(f"Agentqueue vol (≥{AGENT_MAX_QUEUE})")
        return job.job_id
    # ---------- sync helper for agents/tools ----------
    def request_agent_sync(self, payload: Dict, timeout: float = WORKER_TIMEOUT) -> Dict:
        """
        Use this for internal calls (agents/tools).
        - The job is placed in the agent queue (lower priority than users).
        - We block until the worker finishes or the timeout expires.
        - NO queue notifications ("U bent #...") are sent.
        """
        result_box: Dict[str, Any] = {}
        def _cb(msg: Dict):
            # only the final result is of interest to tools/agents
            result_box["answer"] = msg
        job = _Job(payload, _cb)
        try:
            self._agent_q.put_nowait(job)
        except queue.Full:
            raise RuntimeError(f"Agent-queue vol (≥{AGENT_MAX_QUEUE})")
        ok = job.event.wait(timeout)
        if not ok:
            raise TimeoutError(f"LLM-inference duurde langer dan {timeout} seconden.")
        if job.error:
            raise RuntimeError(job.error)
        return result_box.get("answer") or {}
    # ---------- worker ----------
    def _run_worker(self):
        while not self._shutdown.is_set():
@@ -79,18 +114,24 @@ class QueueManager:
        self._shutdown.set()
        self._worker.join(timeout=5)
def start_position_notifier(
    job: _Job,
    queue_ref: queue.Queue,
    interval: float = UPDATE_INTERVAL,
):
    """Sends a message with the current queue position every `interval` seconds."""
    def _notifier():
        # Stop as soon as the job event is set (success/error/timeout upstream)
        while not job.event.wait(interval):
            # Take a snapshot of the queue contents in a thread-safe way
            with queue_ref.mutex:
                snapshot = list(queue_ref.queue)
            try:
                pos = snapshot.index(job) + 1  # 1-based
            except ValueError:
                # Job is no longer in the queue → no further updates needed
                break
            job.callback({"info": f"U bent #{pos} in de wachtrij. Even geduld…" })
    t = threading.Thread(target=_notifier, daemon=True)
    t.start()
    return t
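The event.wait-as-sleep trick above generalizes; a self-contained version of the same notifier pattern (names are illustrative):

```python
import queue
import threading

def start_notifier(done: threading.Event, q: queue.Queue, item, interval: float = 1.0) -> threading.Thread:
    def _run():
        # event.wait doubles as the sleep AND the stop signal
        while not done.wait(interval):
            with q.mutex:  # snapshot under the queue's own lock
                snapshot = list(q.queue)
            try:
                pos = snapshot.index(item) + 1
            except ValueError:
                break  # item left the queue; stop reporting
            print(f"position in queue: #{pos}")
    t = threading.Thread(target=_run, daemon=True)
    t.start()
    return t
```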

smart_rag.py

@@ -296,7 +296,7 @@ async def hybrid_retrieve(
# Optional query routing + RRF
use_route = str(os.getenv("RAG_ROUTE", "1")).lower() not in ("0", "false")
use_rrf = str(os.getenv("RAG_RRF", "1")).lower() not in ("0", "false")
# Optional mini multi-query expansion (default on)
use_expand = str(os.getenv("RAG_MULTI_EXPAND", "1")).lower() in ("1","true","yes")
k_variants = max(1, int(os.getenv("RAG_MULTI_K", "3")))
per_query_k = max(1, int(per_query_k))
@@ -317,10 +317,14 @@ async def hybrid_retrieve(
if use_route:
    buckets = _route_query_buckets(qv)
    for b in buckets:
        # combine the global path_contains hint with the bucket-specific filter
        pc = b.get("path_contains")
        if path_contains and not pc:
            pc = path_contains
        res = await rag_query_internal_fn(
            query=b["q"], n_results=per_query_k,
            collection_name=collection_name,
            repo=repo, path_contains=pc, profile=profile
        )
        lst = []
        for item in (res or {}).get("results", []):
@@ -594,5 +598,7 @@ __all__ = [
    "expand_queries",
    "hybrid_retrieve",
    "assemble_context",
    "_laravel_pairs_from_route_text",
    "_laravel_guess_view_paths_from_text",
]

windowing_utils.py

@@ -1,7 +1,7 @@
# windowing_utils.py
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Dict, Callable, Optional, Tuple, Awaitable
import hashlib
import os
import time
@@ -64,7 +64,7 @@ class ConversationWindow:
    async def build_within_budget(
        self,
        system_prompt: Optional[str],
        summarizer: Optional[Callable[[str, List[Dict]], Awaitable[str]]] = None
    ) -> List[Dict]:
        budget = self.max_ctx_tokens - max(1, self.response_reserve)
        working = self.history[:]
@@ -106,6 +106,11 @@ class ConversationWindow:
            self.running_summary = await summarizer(self.running_summary, chunk_buf)
            chunk_buf = []
        # process any leftover buffer so that no turns are lost
        if chunk_buf:
            self.running_summary = await summarizer(self.running_summary, chunk_buf)
            chunk_buf = []
        self.history = working
        return await build_candidate(self.running_summary, working)
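A summarizer compatible with the Awaitable[str] signature above might look like this (a naive stand-in; the real one would call the LLM):

```python
async def summarizer(running_summary: str, chunk: list[dict]) -> str:
    turns = "\n".join(f"{m['role']}: {m['content']}" for m in chunk)
    # naive: append and truncate; a real implementation summarizes via the LLM
    return (running_summary + "\n" + turns)[-2000:]
```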