diff --git a/Dockerfile b/Dockerfile index b3194c0..bb7b046 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,11 +2,33 @@ FROM python:3.11-slim WORKDIR /app +# ===== Model caches op vaste paden (blijven in image) ===== +# Hugging Face caches (embeddings) + XDG cache (o.a. whisper) +ENV HF_HOME=/opt/hf \ + HUGGINGFACE_HUB_CACHE=/opt/hf \ + TRANSFORMERS_CACHE=/opt/hf \ + SENTENCE_TRANSFORMERS_HOME=/opt/sentence-transformers \ + XDG_CACHE_HOME=/opt/cache \ + STT_MODEL=small + +# Optioneel build-args om modelkeuzes te pinnen +# of: bge-small / e5-small / gte-base-en +ARG RAG_EMBEDDINGS=gte-multilingual +# tiny | base | small | medium | large-v3, etc. +ARG STT_MODEL_ARG=small +ENV RAG_EMBEDDINGS=${RAG_EMBEDDINGS} +ENV STT_MODEL=${STT_MODEL_ARG} + +# maak directories nu al aan (rechten) +RUN mkdir -p /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper && \ + chmod -R a+rX /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper + COPY requirements.txt . -RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils +RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential RUN pip install --upgrade pip RUN pip install --no-cache-dir -r requirements.txt RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25 +#RUN pip cache purge RUN apt-get update && apt-get install -y --no-install-recommends \ wget ca-certificates libstdc++6 libatomic1 \ @@ -19,12 +41,46 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ln -sf /opt/piper/piper /usr/local/bin/piper; \ rm -f /tmp/piper.tgz +# ===== Prefetch modellen tijdens de build ===== +# 1) SentenceTransformers (embeddings) — volgens je mapping in app.py +RUN python
- <<'PY' +import os +from sentence_transformers import SentenceTransformer +mapping = { + "gte-multilingual": ("Alibaba-NLP/gte-multilingual-base"), + "bge-small": ("BAAI/bge-small-en-v1.5"), + "e5-small": ("intfloat/e5-small-v2"), + "gte-base-en": ("thenlper/gte-base"), +} +choice = os.environ.get("RAG_EMBEDDINGS","gte-multilingual").lower() +hf_id = mapping.get(choice, "BAAI/bge-small-en-v1.5") +# cache_folder respecteert SENTENCE_TRANSFORMERS_HOME/HF_HOME, maar we forceren expliciet: +SentenceTransformer(hf_id, cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME","/opt/sentence-transformers")) +print("Prefetched SentenceTransformer:", hf_id) +PY + +# 2) faster-whisper (STT) — cache in /opt/cache/whisper +RUN python - <<'PY' +import os +from faster_whisper import WhisperModel +name = os.environ.get("STT_MODEL","small") +cache_root = os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper") +os.makedirs(cache_root, exist_ok=True) +# Build-time altijd CPU/INT8 (geen GPU nodig tijdens build) +_ = WhisperModel(name, device="cpu", compute_type="int8", download_root=cache_root) +print("Prefetched faster-whisper:", name, "->", cache_root) +PY + +# (optioneel) piper voice kun je hier ook voorcachen; laat ik nu achterwege omdat voice per omgeving wisselt. + + COPY app.py . COPY queue_helper.py . COPY agent_repo.py . COPY windowing_utils.py . COPY smart_rag.py . +COPY llm_client . 
EXPOSE 8080 diff --git a/agent_repo.py b/agent_repo.py index 5eeb047..14ea4ce 100644 --- a/agent_repo.py +++ b/agent_repo.py @@ -18,7 +18,7 @@ from windowing_utils import approx_token_count from starlette.concurrency import run_in_threadpool import asyncio from collections import defaultdict - +from llm_client import _llm_call # --- Async I/O executors (voorkom event-loop blocking) --- from concurrent.futures import ThreadPoolExecutor @@ -59,7 +59,59 @@ _meili_search_fn = None _GRAPH_CACHE: dict[str, dict[str, set[str]]] = {} _TREE_SUM_CACHE: dict[str, dict[str, str]] = {} +# --------------------------------------------------------- +# Fast-path helpers: expliciete paden + vervangpaar (old->new) +# --------------------------------------------------------- +_Q = r"[\"'“”‘’`]" +_PATH_PATS = [ + r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"”']", + r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)", + r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"”']", + r"(app\/[A-Za-z0-9_\/\.-]+\.php)", +] +_TRANS_WRAPPERS = [ + r"__\(\s*{q}(.+?){q}\s*\)".format(q=_Q), + r"@lang\(\s*{q}(.+?){q}\s*\)".format(q=_Q), + r"trans\(\s*{q}(.+?){q}\s*\)".format(q=_Q), +] +def _extract_repo_branch_from_text(txt: str) -> Tuple[Optional[str], str]: + repo_url, branch = None, "main" + m = re.search(r"\bRepo\s*:\s*(\S+)", txt, flags=re.I) + if m: repo_url = m.group(1).strip() + mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", txt, flags=re.I) + if mb: branch = mb.group(1).strip() + return repo_url, branch + +def _extract_explicit_paths(txt: str) -> List[str]: + out = [] + for pat in _PATH_PATS: + for m in re.finditer(pat, txt): + p = m.group(1) + if p and p not in out: + out.append(p) + return out + +def _extract_replace_pair(txt: str) -> Tuple[Optional[str], Optional[str]]: + # NL/EN varianten + “slimme” quotes + pats = [ + rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}", + 
rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}", + ] + for p in pats: + m = re.search(p, txt, flags=re.I|re.S) + if m: + return m.group(1), m.group(2) + mm = re.search(r"(Vervang|Replace)[\s\S]*?"+_Q+r"(.+?)"+_Q+r"[\s\S]*?"+_Q+r"(.+?)"+_Q, txt, flags=re.I) + if mm: + return mm.group(2), mm.group(3) + return None, None + +def _looks_like_unified_diff_request(txt: str) -> bool: + if re.search(r"\bunified\s+diff\b", txt, flags=re.I): return True + if re.search(r"\b(diff|patch)\b", txt, flags=re.I) and _extract_explicit_paths(txt): + return True + return False # zet dit dicht bij de andere module-consts async def _call_get_git_repo(repo_url: str, branch: str):
+ """ + # 1) Combineer user/system content om opdracht te parsen + try: + full_txt = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")]) + except Exception: + full_txt = "" + + # 2) Herken fast-path + try_fast = _looks_like_unified_diff_request(full_txt) + paths_fp = _extract_explicit_paths(full_txt) if try_fast else [] + old_txt, new_txt = _extract_replace_pair(full_txt) if try_fast else (None, None) + + # NB: we gebruiken de injecties die via initialize_agent zijn gezet: + # - get_git_repo_fn (async) + # - read_text_file_fn (sync) + # Deze symbolen worden onderin initialize_agent aan globals() gehangen. + get_git_repo_fn = globals().get("get_git_repo_fn") + read_text_file_fn = globals().get("read_text_file_fn") + + if try_fast and paths_fp and old_txt and new_txt and callable(get_git_repo_fn) and callable(read_text_file_fn): + # 3) repo + branch bepalen + repo_url, branch = _extract_repo_branch_from_text(full_txt) + if not repo_url: + # fallback: probeer repo uit eerdere agent-state (optioneel), anders stop fast-path + repo_url = globals().get("_last_repo_url") + branch = globals().get("_last_branch", "main") + if repo_url: + try: + repo_root = await get_git_repo_fn(repo_url, branch or "main") + root = Path(repo_root) + lang_path = root / "resources" / "lang" / "nl.json" + lang_before = lang_path.read_text(encoding="utf-8", errors="ignore") if lang_path.exists() else "{}" + lang_data = {} + try: + lang_data = json.loads(lang_before or "{}") + except Exception: + lang_data = {} + + diffs_out = [] + lang_changed = False + + def _make_udiff(a: str, b: str, rel: str) -> str: + return "".join(difflib.unified_diff( + a.splitlines(keepends=True), + b.splitlines(keepends=True), + fromfile=f"a/{rel}", tofile=f"b/{rel}", n=3 + )) + + # 4) per bestand: ofwel inline replace, ofwel vertaling bijwerken + for rel in paths_fp: + p = root / rel + if not p.exists(): + continue + before = read_text_file_fn(p) + if not before: + continue + # Als 
de 'oude' tekst voorkomt BINNEN een vertaalwrapper, dan géén blade-edit + found_in_wrapper = False + for pat in _TRANS_WRAPPERS: + for m in re.finditer(pat, before): + inner = m.group(1) + if inner == old_txt: + found_in_wrapper = True + break + if found_in_wrapper: + break + if found_in_wrapper: + # update nl.json: {"oude": "nieuwe"} + if lang_data.get(old_txt) != new_txt: + lang_data[old_txt] = new_txt + lang_changed = True + continue + + # anders: directe, exacte vervanging (conservatief) + after = before.replace(old_txt, new_txt) + if after != before: + diff = _make_udiff(before, after, rel) + if diff.strip(): + diffs_out.append(("blade", rel, diff)) + + # 5) indien vertaling gewijzigd: diff voor nl.json toevoegen + if lang_changed: + new_lang = json.dumps(lang_data, ensure_ascii=False, indent=2, sort_keys=True) + "\n" + diff_lang = _make_udiff(lang_before if isinstance(lang_before, str) else "", new_lang, "resources/lang/nl.json") + if diff_lang.strip(): + diffs_out.append(("lang", "resources/lang/nl.json", diff_lang)) + if diffs_out: + parts = ["### Unified diffs"] + for kind, rel, d in diffs_out: + parts.append(f"**{rel}**") + parts.append("```diff\n" + d + "```") + return "\n\n".join(parts) + else: + return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)." 
+ except Exception as e: + # mislukt → val terug op bestaande discover/agent flow + pass + + # === GEEN fast-path → ga door met de bestaande flow hieronder === + sid = _get_session_id(messages, request) st = _app.state.AGENT_SESSIONS.get(sid) or AgentState() _app.state.AGENT_SESSIONS[sid] = st user_last = next((m["content"] for m in reversed(messages) if m.get("role")=="user"), "").strip() user_last_lower = user_last.lower() logger.info("INFO:agent_repo:[%s] stage=%s", sid, st.stage) - from smart_rag import enrich_intent, expand_queries, hybrid_retrieve + from smart_rag import ( + enrich_intent, + expand_queries, + hybrid_retrieve, + _laravel_pairs_from_route_text, + _laravel_guess_view_paths_from_text, + ) # Als user een .git URL meegeeft: zet state en ga via de state-machine verder user_txt = next((m.get("content","") for m in reversed(messages) if m.get("role")=="user"), "") repo_url = await _detect_repo_url(user_txt) @@ -4287,21 +4448,18 @@ async def handle_repo_agent(messages: List[dict], request) -> str: picked.append(f) # --- VIEW/LANG bias voor UI-label wijzigingen --- - if task_type == "ui_label_change": - # Probeer de 'oude' literal uit de prompt te halen (voor gerichter filteren) - try: - old_lit, _new_lit, _why = deduce_old_new_literals(st.user_goal, "") - except Exception: - old_lit = None + # Pak de eerste quote uit de prompt als "oude" literal + qs = extract_quotes(st.user_goal) or [] + old_lit = qs[0] if qs else None - def _contains_old(rel: str) -> bool: - if not old_lit: - return True - try: - txt = _read_text_file(Path(st.repo_path)/rel) or "" - return old_lit in txt - except Exception: - return False + def _contains_old(rel: str) -> bool: + if not old_lit: + return True # fallback: geen filtering + try: + txt = _read_text_file(Path(st.repo_path) / rel) or "" + return old_lit in txt + except Exception: + return False view_files = [f for f in all_files if f.startswith("resources/views/") and f.endswith(".blade.php")] @@ -4374,25 +4532,15 @@ async 
def handle_repo_agent(messages: List[dict], request) -> str: # routes -> controllers if rel in ("routes/web.php","routes/api.php"): txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore") - try: - from app import _laravel_pairs_from_route_text # of waar je helper staat - except Exception: - _laravel_pairs_from_route_text = None - if _laravel_pairs_from_route_text: - for ctrl_path,_m in _laravel_pairs_from_route_text(txt): - if ctrl_path and ctrl_path not in picked and ctrl_path not in add: - add.append(ctrl_path) + for ctrl_path, _m in _laravel_pairs_from_route_text(txt): + if ctrl_path and ctrl_path not in picked and ctrl_path not in add: + add.append(ctrl_path) # controllers -> views if rel.startswith("app/Http/Controllers/") and rel.endswith(".php"): txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore") - try: - from app import _laravel_guess_view_paths_from_text - except Exception: - _laravel_guess_view_paths_from_text = None - if _laravel_guess_view_paths_from_text: - for v in _laravel_guess_view_paths_from_text(txt): - if v and v not in picked and v not in add: - add.append(v) + for v in _laravel_guess_view_paths_from_text(txt): + if v and v not in picked and v not in add: + add.append(v) # Extra: neem kleine nabije partials/layouts mee (zelfde dir, ≤40KB) more = [] for rel in (picked + add)[:8]: @@ -4410,12 +4558,10 @@ async def handle_repo_agent(messages: List[dict], request) -> str: pass picked = (picked + add + more)[:MAX_FILES_DRYRUN] # 5) Literal-grep fallback: als de user een oud->nieuw wijziging impliceert, zoek de 'old' literal repo-breed - try: - old, new, _why_pair = deduce_old_new_literals(st.user_goal, "") - except Exception: - old, new = None, None - if old and isinstance(old, str) and old.strip(): - grep_hits = _grep_repo_for_literal(Path(st.repo_path), old.strip(), limit=16) + qs = extract_quotes(st.user_goal) or [] + old = qs[0].strip() if qs and qs[0].strip() else None + if old: + grep_hits = 
_grep_repo_for_literal(Path(st.repo_path), old, limit=16) for rel in grep_hits: if rel in all_files and rel not in picked: picked.append(rel) @@ -4589,15 +4735,14 @@ async def handle_repo_agent(messages: List[dict], request) -> str: if best and best not in st.candidate_paths: added.append(best) st.candidate_paths = (added + st.candidate_paths)[:MAX_FILES_DRYRUN] # extra: grep op 'old' literal uit user_goal om kandidaten te verrijken - try: - old, new, _why_pair = deduce_old_new_literals(st.user_goal, "") - except Exception: - old = None + qs = extract_quotes(st.user_goal) or [] + old = qs[0].strip() if qs and qs[0].strip() else None if old: for rel in _grep_repo_for_literal(root, old, limit=16): if rel in all_files and rel not in st.candidate_paths: st.candidate_paths.append(rel) + try: proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal) if not proposed: diff --git a/app.py b/app.py index d44c1ef..de778ed 100644 --- a/app.py +++ b/app.py @@ -32,7 +32,11 @@ from fastapi.routing import APIRoute from starlette.concurrency import run_in_threadpool from pydantic import BaseModel +AUTO_CONTINUE = os.getenv("AUTO_CONTINUE", "1").lower() not in ("0","false","no") +AUTO_CONTINUE_MAX_ROUNDS = int(os.getenv("AUTO_CONTINUE_MAX_ROUNDS", "6")) +AUTO_CONTINUE_TAIL_CHARS = int(os.getenv("AUTO_CONTINUE_TAIL_CHARS", "600")) +from llm_client import init_llm_client, _sync_model_infer # Optionele libs voor tekst-extractie try: @@ -71,6 +75,17 @@ except Exception: cairosvg = None import tempfile, subprocess # voor audio +import html # ← nieuw: unescape van HTML-entities in tool-call JSON + +# Forceer consistente caches zowel binnen als buiten Docker +os.environ.setdefault("HF_HOME", "/opt/hf") +os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/opt/hf") +os.environ.setdefault("TRANSFORMERS_CACHE", "/opt/hf") +os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers") +os.environ.setdefault("XDG_CACHE_HOME", 
"/opt/cache") +# whisper cache pad (gebruikt door faster-whisper) +os.environ.setdefault("WHISPER_CACHE_DIR", os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper")) + # STT (Whisper-compatible via faster-whisper) — optioneel _STT_MODEL = None @@ -148,15 +163,53 @@ def _build_tools_system_prompt(tools: list) -> str: return "\n".join(lines) def _extract_tool_calls_from_text(txt: str): - # tolerant: pak eerste JSON object (ook als het in ```json staat) - m = re.search(r"\{[\s\S]*\}", txt or "") + s = (txt or "").strip() + # Strip code fences & bekende chat-tokens + if s.startswith("```"): + s = extract_code_block(s) + s = re.sub(r"^<\|im_start\|>\s*assistant\s*", "", s, flags=re.I) + # HTML-escaped JSON (" etc.) + try: + s = html.unescape(s) + except Exception: + pass + # tolerant: pak eerste JSON object + m = re.search(r"\{[\s\S]*\}", s) if not m: return [] try: obj = json.loads(m.group(0)) except Exception: return [] - tc = obj.get("tool_calls") or [] + # === Normaliseer verschillende dialecten === + # 1) OpenAI: {"tool_calls":[{"type":"function","function":{"name":..,"arguments":..}}]} + if "tool_calls" in obj and isinstance(obj["tool_calls"], list): + tc = obj["tool_calls"] + # 2) Replit/Magistral/Mistral-achtig: {"call_tool":{"name":"...","arguments":{...}}} óf lijst + elif "call_tool" in obj: + raw = obj["call_tool"] + if isinstance(raw, dict): + raw = [raw] + tc = [] + for it in (raw or []): + name = (it or {}).get("name") + args = (it or {}).get("arguments") or {} + if isinstance(args, str): + try: args = json.loads(args) + except Exception: pass + if name: + tc.append({"type":"function","function":{"name": name, "arguments": json.dumps(args, ensure_ascii=False)}}) + # 3) Legacy OpenAI: {"function_call":{"name":"...","arguments":"{...}"}} + elif "function_call" in obj and isinstance(obj["function_call"], dict): + fc = obj["function_call"] + name = fc.get("name") + args = fc.get("arguments") or {} + if isinstance(args, str): + try: args = 
json.loads(args) + except Exception: pass + tc = [{"type":"function","function":{"name": name, "arguments": json.dumps(args, ensure_ascii=False)}}] if name else [] + else: + tc = [] out = [] for c in tc: name = (c or {}).get("name") @@ -166,6 +219,13 @@ def _extract_tool_calls_from_text(txt: str): args = json.loads(args) except Exception: args = {} + # Support zowel {"name":..,"arguments":..} als {"type":"function","function":{...}} + if not name and isinstance(c.get("function"), dict): + name = c["function"].get("name") + args = c["function"].get("arguments") or args + if isinstance(args, str): + try: args = json.loads(args) + except Exception: args = {} if name: out.append({ "id": f"call_{uuid.uuid4().hex[:8]}", @@ -177,6 +237,46 @@ def _extract_tool_calls_from_text(txt: str): }) return out +# --- Universal toolcall detector (JSON + ReAct + HTML-escaped) --- +_TOOL_REACT = re.compile( + r"Action\s*:\s*([A-Za-z0-9_\-\.]+)\s*[\r\n]+Action\s*Input\s*:\s*(?:```json\s*([\s\S]*?)\s*```|([\s\S]*))", + re.I +) + +def detect_toolcalls_any(text: str) -> list[dict]: + """ + Retourneert altijd OpenAI-achtige tool_calls items: + [{"id":..., "type":"function", "function":{"name":..., "arguments": "{...}"}}] + """ + calls = _extract_tool_calls_from_text(text) + if calls: + return calls + s = (text or "").strip() + try: + s = html.unescape(s) + except Exception: + pass + # ReAct fallback + m = _TOOL_REACT.search(s) + if m: + name = (m.group(1) or "").strip() + raw = (m.group(2) or m.group(3) or "").strip() + # strip eventuele fences + if raw.startswith("```"): + raw = extract_code_block(raw) + try: + args = json.loads(raw) if raw else {} + except Exception: + # laatste redmiddel: wikkel als {"input": ""} + args = {"input": raw} + if name: + return [{ + "id": f"call_{uuid.uuid4().hex[:8]}", + "type": "function", + "function": {"name": name, "arguments": json.dumps(args, ensure_ascii=False)} + }] + return [] + # 
----------------------------------------------------------------------------- # App & logging # ----------------------------------------------------------------------------- @@ -197,6 +297,21 @@ app.add_middleware( allow_headers=["*"], ) + +# Eén centrale QueueManager voor ALLE LLM-calls +#app.state.LLM_QUEUE = QueueManager(model_infer_fn=_sync_model_infer) + +# Koppel deze queue aan llm_client zodat _llm_call dezelfde queue gebruikt +#init_llm_client(app.state.LLM_QUEUE) + +# Let op: +# - llm_call_openai_compat verwacht dat app.state.LLM_QUEUE een deque-achtige queue is +# (append/index/popleft/remove). +# - De QueueManager uit queue_helper.py is thread-based en wordt hier niet gebruikt. +# Als je QueueManager wilt inzetten, doe dat in llm_client.py of elders, +# maar niet als app.state.LLM_QUEUE, om type-conflicten te voorkomen. + + @app.on_event("startup") async def _startup(): # Zorg dat lokale hosts nooit via een proxy gaan @@ -697,7 +812,7 @@ async def llm_call_openai_compat( stream: bool = False, temperature: float = 0.2, top_p: float = 0.9, - max_tokens: int = 1024, + max_tokens: int = 13027, extra: Optional[dict] = None, stop: Optional[Union[str, list[str]]] = None, **kwargs @@ -926,7 +1041,7 @@ async def _svg_from_prompt(prompt: str, w: int, h: int, background: str="white") f"- Thema: {prompt}\n- Gebruik eenvoudige vormen/paths/tekst.") resp = await llm_call_openai_compat( [{"role":"system","content":sys},{"role":"user","content":user}], - stream=False, temperature=0.35, top_p=0.9, max_tokens=1200 + stream=False, temperature=0.35, top_p=0.9, max_tokens=2048 ) svg = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","") return _svg_wrap_if_needed(_sanitize_svg(svg), w, h, background) @@ -1079,6 +1194,165 @@ def _parse_repo_qa_from_messages(messages: list[dict]) -> tuple[Optional[str], s question = full return repo_hint, question, branch, n_ctx +# ------------------------------ +# Unified-diff "fast path" (expliciete paden + vervangtekst) 
+# ------------------------------ +_Q = r"[\"'“”‘’`]" +_PATH_PATS = [ + r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"”']", + r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)", + r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"”']", + r"(app\/[A-Za-z0-9_\/\.-]+\.php)", +] + +def _extract_repo_branch(text: str) -> tuple[Optional[str], str]: + repo_url = None + branch = "main" + m = re.search(r"\bRepo\s*:\s*(\S+)", text, flags=re.I) + if m: + repo_url = m.group(1).strip() + mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", text, flags=re.I) + if mb: + branch = mb.group(1).strip() + return repo_url, branch + +def _extract_paths(text: str) -> list[str]: + paths: set[str] = set() + for pat in _PATH_PATS: + for m in re.finditer(pat, text): + paths.add(m.group(1)) + return list(paths) + +def _extract_replace_pair(text: str) -> tuple[Optional[str], Optional[str]]: + """ + Haalt (old,new) uit NL/EN varianten. Ondersteunt rechte en ‘slimme’ quotes. + Voorbeelden die matchen: + - Vervang de tekst “A” in “B” + - Vervang de tekst "A" veranderen in "B" + - Replace the text 'A' to 'B' + - Replace "A" with "B" + """ + pats = [ + rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}", + rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}", + ] + for p in pats: + m = re.search(p, text, flags=re.I|re.S) + if m: + return m.group(1), m.group(2) + # fallback: eerste twee quoted strings in de buurt van 'Vervang' / 'Replace' + mm = re.search(r"(Vervang|Replace)[\s\S]*?"+_Q+r"(.+?)"+_Q+r"[\s\S]*?"+_Q+r"(.+?)"+_Q, text, flags=re.I) + if mm: + return mm.group(2), mm.group(3) + return None, None + +def _looks_like_unified_diff_request(text: str) -> bool: + if re.search(r"\bunified\s+diff\b", text, flags=re.I): + return True + # ook accepteren bij "maak een diff" met expliciete bestanden + if re.search(r"\b(diff|patch)\b", text, flags=re.I) and any(re.search(p, text) for p in _PATH_PATS): + return True + return False + +def
_make_unified_diff(a_text: str, b_text: str, path: str) -> str: + import difflib + a_lines = a_text.splitlines(keepends=True) + b_lines = b_text.splitlines(keepends=True) + return "".join(difflib.unified_diff(a_lines, b_lines, fromfile=f"a/{path}", tofile=f"b/{path}", n=3)) + +_TRANS_WRAPPERS = [ + r"__\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"), + r"@lang\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"), + r"trans\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"), +] + +def _find_in_translation_wrapper(content: str, needle: str) -> bool: + for pat in _TRANS_WRAPPERS: + for m in re.finditer(pat, content): + inner = m.group(1) + if inner == needle: + return True + return False + +def _update_nl_json(root: Path, key: str, value: str) -> tuple[bool, str]: + """ + Zet of update resources/lang/nl.json: { key: value } + Retourneert (gewijzigd, nieuwe_tekst). + """ + lang_path = root / "resources" / "lang" / "nl.json" + data = {} + if lang_path.exists(): + try: + data = json.loads(lang_path.read_text(encoding="utf-8", errors="ignore") or "{}") + except Exception: + data = {} + # Alleen aanpassen als nodig + prev = data.get(key) + if prev == value: + return False, lang_path.as_posix() + data[key] = value + # Mooie, stabiele JSON dump + lang_path.parent.mkdir(parents=True, exist_ok=True) + tmp = lang_path.with_suffix(".tmp.json") + tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + os.replace(tmp, lang_path) + return True, lang_path.as_posix() + +async def _fast_unified_diff_task(messages: list[dict]) -> Optional[str]: + """ + Als de prompt vraagt om 'unified diff' met expliciete bestanden en vervangtekst, + voer dat direct uit (bypass handle_repo_agent) en geef diffs terug. 
+ """ + full = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")]) + if not _looks_like_unified_diff_request(full): + return None + repo_url, branch = _extract_repo_branch(full) + paths = _extract_paths(full) + old, new = _extract_replace_pair(full) + if not (repo_url and paths and old and new): + return None + + # Repo ophalen + repo_path = await _get_git_repo_async(repo_url, branch) + root = Path(repo_path) + changed_files: list[tuple[str, str]] = [] # (path, diff_text) + lang_changed = False + lang_path_text = "" + + for rel in paths: + p = root / rel + if not p.exists(): + continue + before = _read_text_file_wrapper(p) + if not before: + continue + + # Als de tekst binnen een vertaal-wrapper staat, wijzig NIET de blade, + # maar zet/patch resources/lang/nl.json: { "oude": "nieuwe" }. + if _find_in_translation_wrapper(before, old): + ch, lang_path_text = _update_nl_json(root, old, new) + lang_changed = lang_changed or ch + # geen bladewijziging in dit geval + continue + + after = before.replace(old, new) + if after == before: + continue + diff = _make_unified_diff(before, after, rel) + if diff.strip(): + changed_files.append((rel, diff)) + + if not changed_files and not lang_changed: + return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)." 
+ + out = [] + if changed_files: + out.append("### Unified diffs") + for rel, d in changed_files: + out.append(f"```diff\n{d}```") + if lang_changed: + out.append(f"✅ Bijgewerkte vertaling in: `{lang_path_text}` → \"{old}\" → \"{new}\"") + return "\n\n".join(out).strip() # ------------------------------ # OpenAI-compatible endpoints @@ -1131,15 +1405,17 @@ def _get_stt_model(): except Exception: raise HTTPException(status_code=500, detail="STT niet beschikbaar: faster-whisper ontbreekt.") # device-selectie + download_root = os.getenv("STT_MODEL_DIR", os.environ.get("WHISPER_CACHE_DIR")) + os.makedirs(download_root, exist_ok=True) if STT_DEVICE == "auto": try: - _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cuda", compute_type="float16") + _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cuda", compute_type="float16", download_root=download_root) return _STT_MODEL except Exception: - _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cpu", compute_type="int8") + _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cpu", compute_type="int8", download_root=download_root) return _STT_MODEL dev, comp = ("cuda","float16") if STT_DEVICE=="cuda" else ("cpu","int8") - _STT_MODEL = WhisperModel(STT_MODEL_NAME, device=dev, compute_type=comp) + _STT_MODEL = WhisperModel(STT_MODEL_NAME, device=dev, compute_type=comp, download_root=download_root) return _STT_MODEL def _stt_transcribe_path(path: str, lang: str | None): @@ -1246,7 +1522,7 @@ async def present_make( f"Max. 
{max_slides} dia's, 3–6 bullets per dia.") plan = await llm_call_openai_compat( [{"role":"system","content":sys},{"role":"user","content":user}], - stream=False, temperature=0.3, top_p=0.9, max_tokens=1200 + stream=False, temperature=0.3, top_p=0.9, max_tokens=13021 ) raw = (plan.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}") try: @@ -1292,7 +1568,7 @@ async def vision_ask( stream: bool = Form(False), temperature: float = Form(0.2), top_p: float = Form(0.9), - max_tokens: int = Form(512), + max_tokens: int = Form(1024), ): raw = await run_in_threadpool(file.file.read) img_b64 = base64.b64encode(raw).decode("utf-8") @@ -1323,7 +1599,7 @@ async def vision_and_text( max_chars: int = Form(25000), temperature: float = Form(0.2), top_p: float = Form(0.9), - max_tokens: int = Form(1024), + max_tokens: int = Form(2048), ): images_b64: list[str] = [] text_chunks: list[str] = [] @@ -1370,7 +1646,7 @@ async def vision_health(): tiny_png = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAOaO5nYAAAAASUVORK5CYII=" try: messages = [{"role":"user","content":" Beschrijf dit in één woord."}] - resp = await llm_call_openai_compat(messages, extra={"images":[tiny_png]}, max_tokens=16) + resp = await llm_call_openai_compat(messages, extra={"images":[tiny_png]}, max_tokens=128) txt = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","").strip() return {"vision": bool(txt), "sample": txt[:60]} except Exception as e: @@ -1564,7 +1840,7 @@ async def _execute_tool(name: str, args: dict) -> dict: resp = await llm_call_openai_compat( [{"role":"system","content":"Je bent behulpzaam en exact."}, {"role":"user","content": f"{instruction}\n\n--- BEGIN ---\n{text}\n--- EINDE ---"}], - stream=False, max_tokens=768 + stream=False, max_tokens=7680 ) return resp @@ -1574,7 +1850,7 @@ async def _execute_tool(name: str, args: dict) -> dict: resp = await llm_call_openai_compat( [{"role":"system","content":"Wees feitelijk en concreet."}, 
{"role":"user","content": f"{goal}\n\n--- BEGIN ---\n{text}\n--- EINDE ---"}], - stream=False, max_tokens=768 + stream=False, max_tokens=7680 ) return resp @@ -1587,7 +1863,7 @@ async def _execute_tool(name: str, args: dict) -> dict: {"role":"user","content": f"{objective}\nStijl/criteria: {style}\n" "Antwoord met eerst een korte toelichting, daarna alleen de verbeterde inhoud tussen een codeblok.\n\n" f"--- BEGIN ---\n{text}\n--- EINDE ---"}], - stream=False, max_tokens=1024 + stream=False, max_tokens=10240 ) return resp @@ -1597,7 +1873,7 @@ async def _execute_tool(name: str, args: dict) -> dict: resp = await llm_call_openai_compat( [{"role":"system","content":"Wees strikt en concreet."}, {"role":"user","content": f"{VALIDATE_PROMPT}\n\nCode om te valideren:\n```\n{code}\n```"}], - stream=False, max_tokens=512 + stream=False, max_tokens=10000 ) txt = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","") return {"status":"issues_found" if _parse_validation_results(txt) else "valid", @@ -1611,7 +1887,7 @@ async def _execute_tool(name: str, args: dict) -> dict: [{"role":"system","content": SYSTEM_PROMPT}, {"role":"user","content": f"Verbeter deze {language}-code met focus op {focus}:\n\n" f"{code}\n\nGeef eerst een korte toelichting, dan alleen het verbeterde codeblok. Behoud functionaliteit."}], - stream=False, max_tokens=1536 + stream=False, max_tokens=10768 ) return resp @@ -1636,7 +1912,7 @@ async def _execute_tool(name: str, args: dict) -> dict: if name == "vision_analyze": image_url = args.get("image_url","") prompt = args.get("prompt","Beschrijf beknopt wat je ziet en noem de belangrijkste details.") - max_tokens = int(args.get("max_tokens",512)) + max_tokens = int(args.get("max_tokens",1024)) b64 = None # Alleen data: of raw base64 accepteren; http(s) niet, want die worden niet # ingeladen in de vision-call en zouden stil falen. 
@@ -1908,6 +2184,52 @@ async def _complete_with_autocontinue( continues += 1 return content, resp +async def llm_call_autocont( + messages: list[dict], + *, + model: Optional[str] = None, + stream: bool = False, + temperature: float = 0.2, + top_p: float = 0.9, + max_tokens: int = 1024, + extra: Optional[dict] = None, + stop: Optional[Union[str, list[str]]] = None, + **kwargs +) -> dict | StreamingResponse: + """ + OpenAI-compatibele wrapper die non-stream antwoorden automatisch + doorcontinueert met jouw _complete_with_autocontinue(...). + - stream=True -> passthrough naar llm_call_openai_compat (ongewijzigd) + - stream=False -> retourneert één samengevoegd antwoord als OpenAI JSON + """ + mdl = (model or os.getenv("LLM_MODEL", "mistral-medium")).strip() + if stream: + # streaming blijft 1-op-1 zoals voorheen + return await llm_call_openai_compat( + messages, + model=mdl, + stream=True, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + extra=extra if extra else None, + stop=stop + ) + + max_autocont = int(os.getenv("LLM_AUTO_CONTINUES", "2")) + full_text, _last = await _complete_with_autocontinue( + messages, + model=mdl, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + extra_payload=extra if extra else None, + max_autocont=max_autocont + ) + # Bouw een standaard OpenAI-chat response met het samengevoegde antwoord + return _openai_chat_response(mdl, full_text, messages) + + @app.post("/v1/chat/completions") async def openai_chat_completions(body: dict = Body(...), request: Request = None): model = (body.get("model") or os.getenv("LLM_MODEL", "mistral-medium")).strip() @@ -2053,7 +2375,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non # jouw bestaande helper; hou 'm zoals je al gebruikt resp = await llm_call_openai_compat(ask, stream=False, max_tokens=512) txt = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content", "") or "" - calls = _extract_tool_calls_from_text(txt) + 
calls = detect_toolcalls_any(txt) #_extract_tool_calls_from_text(txt) if calls: return { "id": f"chatcmpl-{uuid.uuid4().hex}", @@ -2075,6 +2397,17 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non # Speciale modellen if model == "repo-agent": + # === snelle bypass voor "unified diff" opdrachten met expliciete paden === + try: + fast = await _fast_unified_diff_task(messages) + except Exception as e: + fast = f"(Fast-diff pad mislukte: {e})" + if fast: + if stream: + async def _emit(): yield ("data: " + json.dumps({"id": f"chatcmpl-{uuid.uuid4().hex[:12]}", "object":"chat.completion.chunk","created": int(time.time()),"model":"repo-agent","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":None}]}) + "\n\n").encode("utf-8"); yield ("data: " + json.dumps({"id": f"chatcmpl-{uuid.uuid4().hex[:12]}","object":"chat.completion.chunk","created": int(time.time()),"model":"repo-agent","choices":[{"index":0,"delta":{"content": fast},"finish_reason":None}]}) + "\n\n").encode("utf-8"); yield b"data: [DONE]\n\n" + return StreamingResponse(_emit(), media_type="text/event-stream", headers={"Cache-Control":"no-cache, no-transform","X-Accel-Buffering":"no","Connection":"keep-alive"}) + return JSONResponse(_openai_chat_response("repo-agent", fast, messages)) + if stream: async def event_gen(): import asyncio, time, json, contextlib @@ -2421,7 +2754,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non if stream: temperature = float(body.get("temperature", 0.2)) top_p = float(body.get("top_p", 0.9)) - _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "1024")) + _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021")) max_tokens = int(body.get("max_tokens", _default_max)) return await llm_call_openai_compat( messages, @@ -3331,7 +3664,7 @@ async def rag_query_api( resp = await llm_call_openai_compat( [{"role":"system","content":"You are precise and return only valid JSON."}, 
{"role":"user","content": prompt+"\n\nOnly JSON array."}], - stream=False, temperature=0.0, top_p=1.0, max_tokens=256 + stream=False, temperature=0.0, top_p=1.0, max_tokens=512 ) try: order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]")) @@ -3428,7 +3761,8 @@ initialize_agent( get_git_repo_fn=_get_git_repo_async, rag_index_repo_internal_fn=_rag_index_repo_internal, rag_query_internal_fn=_rag_query_internal, - llm_call_fn=llm_call_openai_compat, + # Gebruik auto-continue wrapper zodat agent-antwoorden niet worden afgekapt + llm_call_fn=llm_call_autocont, extract_code_block_fn=extract_code_block, read_text_file_fn=_read_text_file_wrapper, client_ip_fn=_client_ip, diff --git a/llm_client.py b/llm_client.py new file mode 100755 index 0000000..971b8b6 --- /dev/null +++ b/llm_client.py @@ -0,0 +1,131 @@ +from __future__ import annotations +import os +import asyncio +import logging +from typing import List, Dict, Any, Optional + +import httpx + +from queue_helper import QueueManager + +logger = logging.getLogger(__name__) + +# ------------------------------------------------------------- +# Config voor onderliggende LLM-backend +# ------------------------------------------------------------- +# Dit is NIET jouw eigen /v1/chat/completions endpoint, +# maar de *echte* model-backend (bijv. Ollama, vLLM, Mistral server, etc.). +LLM_API_BASE = os.getenv("LLM_API_BASE", "http://127.0.0.1:11434") +LLM_DEFAULT_MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini") +LLM_REQUEST_TIMEOUT = float(os.getenv("LLM_REQUEST_TIMEOUT", "120")) + +# Deze wordt in app.py gezet via init_llm_client(...) +LLM_QUEUE: QueueManager | None = None + + +def init_llm_client(queue: QueueManager) -> None: + """ + Koppel de globale LLM_QUEUE aan de QueueManager uit app.py. + Deze MOET je in app.py één keer aanroepen. 
+ """ + global LLM_QUEUE + LLM_QUEUE = queue + logger.info("llm_client: LLM_QUEUE gekoppeld via init_llm_client.") + + +def _sync_model_infer(payload: Dict[str, Any]) -> Dict[str, Any]: + """ + Synchronous call naar de echte LLM-backend. + Dit is de functie die je in app.py gebruikt bij het maken van de QueueManager. + """ + url = f"{LLM_API_BASE.rstrip('/')}/v1/chat/completions" + try: + with httpx.Client(timeout=LLM_REQUEST_TIMEOUT) as client: + resp = client.post(url, json=payload) + resp.raise_for_status() + return resp.json() + except Exception as exc: + logger.exception("LLM backend call failed: %s", exc) + return { + "object": "chat.completion", + "choices": [{ + "index": 0, + "finish_reason": "error", + "message": { + "role": "assistant", + "content": f"[LLM-fout] {exc}", + }, + }], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + }, + } + + +async def _llm_call( + messages: List[Dict[str, str]], + *, + stream: bool = False, + temperature: float = 0.2, + top_p: float = 0.9, + max_tokens: Optional[int] = None, + model: Optional[str] = None, + **extra: Any, +) -> Dict[str, Any]: + """ + Centrale helper voor tools/agents/smart_rag/repo-agent. + + Belangrijk: + - Gebruikt de *bestaande* QueueManager uit app.py (via init_llm_client). + - Stuurt jobs in de agent-queue (lagere prioriteit dan users). + - GEEN wachtrij-meldingen ("u bent #...") voor deze interne calls. + """ + if stream: + # In deze agent gebruiken we geen streaming. + raise NotImplementedError("_llm_call(stream=True) wordt momenteel niet ondersteund.") + + if LLM_QUEUE is None: + # Hard fail: dan weet je meteen dat init_llm_client nog niet is aangeroepen. + raise RuntimeError("LLM_QUEUE is niet geïnitialiseerd. Roep init_llm_client(...) 
aan in app.py") + + payload: Dict[str, Any] = { + "model": model or LLM_DEFAULT_MODEL, + "messages": messages, + "stream": False, + "temperature": float(temperature), + "top_p": float(top_p), + } + if max_tokens is not None: + payload["max_tokens"] = int(max_tokens) + + payload.update(extra) + + loop = asyncio.get_running_loop() + + try: + # request_agent_sync blokkeert → naar threadpool + response: Dict[str, Any] = await loop.run_in_executor( + None, lambda: LLM_QUEUE.request_agent_sync(payload) + ) + return response + except Exception as exc: + logger.exception("_llm_call via agent-queue failed: %s", exc) + return { + "object": "chat.completion", + "choices": [{ + "index": 0, + "finish_reason": "error", + "message": { + "role": "assistant", + "content": f"[LLM-queue-fout] {exc}", + }, + }], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + }, + } + diff --git a/mistral-api.sh b/mistral-api.sh new file mode 100755 index 0000000..1291b57 --- /dev/null +++ b/mistral-api.sh @@ -0,0 +1 @@ +docker run -d --name mistral-api --network host -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=13021 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e 
NO_PROXY=localhost,127.0.0.1,::1 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ mistral-api diff --git a/queue_helper.py b/queue_helper.py index 20caf87..f871f50 100644 --- a/queue_helper.py +++ b/queue_helper.py @@ -45,11 +45,20 @@ class QueueManager: self._worker.start() # ---------- public API ---------- - def enqueue_user(self, payload: Dict, progress_cb: Callable[[Dict], None]) -> tuple[str, int]: + def enqueue_user( + self, + payload: Dict, + progress_cb: Callable[[Dict], None], + *, + notify_position: bool = False, + ) -> tuple[str, int]: job = _Job(payload, progress_cb) try: self._user_q.put_nowait(job) except queue.Full: raise RuntimeError(f"User‑queue vol (≥{USER_MAX_QUEUE})") position = self._user_q.qsize() + if notify_position: + # start een aparte notifier-thread die periodiek de wachtrijpositie meldt + start_position_notifier(job, self._user_q) return job.job_id, position def enqueue_agent(self, payload: Dict, progress_cb: Callable[[Dict], None]) -> str: @@ -58,6 +67,32 @@ class QueueManager: except queue.Full: raise RuntimeError(f"Agent‑queue vol (≥{AGENT_MAX_QUEUE})") return job.job_id + # ---------- sync helper voor agents/tools ---------- + def request_agent_sync(self, payload: Dict, timeout: float = WORKER_TIMEOUT) -> Dict: + """ + Gebruik dit voor interne calls (agents/tools). + - Job wordt in de agent-queue gezet (lagere prioriteit dan users). + - We wachten blokkerend tot de worker klaar is of tot timeout. + - Er worden GEEN wachtrij-meldingen ("U bent #...") verstuurd. 
+ """ + result_box: Dict[str, Any] = {} + + def _cb(msg: Dict): + # alleen het eindresultaat is interessant voor tools/agents + result_box["answer"] = msg + + job = _Job(payload, _cb) + try: + self._agent_q.put_nowait(job) + except queue.Full: + raise RuntimeError(f"Agent-queue vol (≥{AGENT_MAX_QUEUE})") + + ok = job.event.wait(timeout) + if not ok: + raise TimeoutError(f"LLM-inference duurde langer dan {timeout} seconden.") + if job.error: + raise RuntimeError(job.error) + return result_box.get("answer") or {} # ---------- worker ---------- def _run_worker(self): while not self._shutdown.is_set(): @@ -79,18 +114,24 @@ class QueueManager: self._shutdown.set() self._worker.join(timeout=5) -def start_position_notifier(job: _Job, - queue_ref: queue.Queue, - interval: float = UPDATE_INTERVAL): +def start_position_notifier( + job: _Job, + queue_ref: queue.Queue, + interval: float = UPDATE_INTERVAL, + ): """Stuurt elke `interval` seconden een bericht met de huidige positie.""" def _notifier(): - while not job.event.is_set(): + # Stop zodra het job-event wordt gezet (success/fout/timeout upstream) + while not job.event.wait(interval): + # Neem een snapshot van de queue-inhoud op een thread-safe manier + with queue_ref.mutex: + snapshot = list(queue_ref.queue) try: - pos = list(queue_ref.queue).index(job) + 1 # 1‑based + pos = snapshot.index(job) + 1 # 1-based except ValueError: + # Job staat niet meer in de wachtrij → geen updates meer nodig break job.callback({"info": f"U bent #{pos} in de wachtrij. 
Even geduld…" }) - time.sleep(interval) t = threading.Thread(target=_notifier, daemon=True) t.start() return t diff --git a/smart_rag.py b/smart_rag.py index 05c3759..55c1c2d 100644 --- a/smart_rag.py +++ b/smart_rag.py @@ -296,7 +296,7 @@ async def hybrid_retrieve( # Optionele query-routing + RRF use_route = str(os.getenv("RAG_ROUTE", "1")).lower() not in ("0", "false") use_rrf = str(os.getenv("RAG_RRF", "1")).lower() not in ("0", "false") - # Optionele mini multi-query expansion (default uit) + # Optionele mini multi-query expansion (default aan) use_expand = str(os.getenv("RAG_MULTI_EXPAND", "1")).lower() in ("1","true","yes") k_variants = max(1, int(os.getenv("RAG_MULTI_K", "3"))) per_query_k = max(1, int(per_query_k)) @@ -317,10 +317,14 @@ async def hybrid_retrieve( if use_route: buckets = _route_query_buckets(qv) for b in buckets: + # combineer globale path_contains-hint met bucket-specifieke filter + pc = b.get("path_contains") + if path_contains and not pc: + pc = path_contains res = await rag_query_internal_fn( query=b["q"], n_results=per_query_k, collection_name=collection_name, - repo=repo, path_contains=b["path_contains"], profile=profile + repo=repo, path_contains=pc, profile=profile ) lst = [] for item in (res or {}).get("results", []): @@ -594,5 +598,7 @@ __all__ = [ "expand_queries", "hybrid_retrieve", "assemble_context", + "_laravel_pairs_from_route_text", + "_laravel_guess_view_paths_from_text", ] diff --git a/windowing_utils.py b/windowing_utils.py index a4354e0..81a144f 100644 --- a/windowing_utils.py +++ b/windowing_utils.py @@ -1,7 +1,7 @@ # windowing_utils.py from __future__ import annotations from dataclasses import dataclass, field -from typing import List, Dict, Callable, Optional, Tuple +from typing import List, Dict, Callable, Optional, Tuple, Awaitable import hashlib import os import time @@ -64,7 +64,7 @@ class ConversationWindow: async def build_within_budget( self, system_prompt: Optional[str], - summarizer: Optional[Callable[[str, 
List[Dict]], "awaitable[str]"]] = None + summarizer: Optional[Callable[[str, List[Dict]], Awaitable[str]]] = None ) -> List[Dict]: budget = self.max_ctx_tokens - max(1, self.response_reserve) working = self.history[:] @@ -106,6 +106,11 @@ class ConversationWindow: self.running_summary = await summarizer(self.running_summary, chunk_buf) chunk_buf = [] + # verwerk eventuele overgebleven buffer zodat er geen turns verdwijnen + if chunk_buf: + self.running_summary = await summarizer(self.running_summary, chunk_buf) + chunk_buf = [] + self.history = working return await build_candidate(self.running_summary, working)