RAG updates

admin 2025-11-20 16:16:00 +01:00
parent 8a97368577
commit a79293abbd
8 changed files with 794 additions and 77 deletions

Dockerfile

@@ -2,11 +2,31 @@ FROM python:3.11-slim
WORKDIR /app
# ===== Model caches at fixed paths (these stay in the image) =====
# Hugging Face caches (embeddings) + XDG cache (used by whisper, among others)
ENV HF_HOME=/opt/hf \
HUGGINGFACE_HUB_CACHE=/opt/hf \
TRANSFORMERS_CACHE=/opt/hf \
SENTENCE_TRANSFORMERS_HOME=/opt/sentence-transformers \
XDG_CACHE_HOME=/opt/cache \
STT_MODEL=small
# Optional build args to pin model choices
ARG RAG_EMBEDDINGS=gte-multilingual # or: bge-small / e5-small / gte-base-en
ARG STT_MODEL_ARG=small # tiny | base | small | medium | large-v3, etc.
ENV RAG_EMBEDDINGS=${RAG_EMBEDDINGS}
ENV STT_MODEL=${STT_MODEL_ARG}
# create the directories up front (permissions)
RUN mkdir -p /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper && \
chmod -R a+rX /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper
COPY requirements.txt .
RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25
#RUN pip cache purge
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget ca-certificates libstdc++6 libatomic1 \
@@ -19,12 +39,46 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    ln -sf /opt/piper/piper /usr/local/bin/piper; \
    rm -f /tmp/piper.tgz
# ===== Prefetch models during the build =====
# 1) SentenceTransformers (embeddings) — follows the mapping in app.py
RUN python - <<'PY'
import os
from sentence_transformers import SentenceTransformer
mapping = {
    "gte-multilingual": "Alibaba-NLP/gte-multilingual-base",
    "bge-small": "BAAI/bge-small-en-v1.5",
    "e5-small": "intfloat/e5-small-v2",
    "gte-base-en": "thenlper/gte-base",
}
choice = os.environ.get("RAG_EMBEDDINGS","gte-multilingual").lower()
hf_id = mapping.get(choice, "BAAI/bge-small-en-v1.5")
# cache_folder honors SENTENCE_TRANSFORMERS_HOME/HF_HOME, but we set it explicitly:
SentenceTransformer(hf_id, cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME","/opt/sentence-transformers"))
print("Prefetched SentenceTransformer:", hf_id)
PY
# 2) faster-whisper (STT) — cache in /opt/cache/whisper
RUN python - <<'PY'
import os
from faster_whisper import WhisperModel
name = os.environ.get("STT_MODEL","small")
cache_root = os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper")
os.makedirs(cache_root, exist_ok=True)
# Always CPU/INT8 at build time (no GPU needed during the build)
_ = WhisperModel(name, device="cpu", compute_type="int8", download_root=cache_root)
print("Prefetched faster-whisper:", name, "->", cache_root)
PY
# (optional) the piper voice could also be prefetched here; skipped for now because the voice differs per environment.
COPY app.py .
COPY queue_helper.py .
COPY agent_repo.py .
COPY windowing_utils.py .
COPY smart_rag.py .
COPY llm_client.py .
EXPOSE 8080

agent_repo.py

@@ -18,7 +18,7 @@ from windowing_utils import approx_token_count
from starlette.concurrency import run_in_threadpool
import asyncio
from collections import defaultdict
from llm_client import _llm_call
# --- Async I/O executors (prevent event-loop blocking) ---
from concurrent.futures import ThreadPoolExecutor
@@ -59,7 +59,59 @@ _meili_search_fn = None
_GRAPH_CACHE: dict[str, dict[str, set[str]]] = {}
_TREE_SUM_CACHE: dict[str, dict[str, str]] = {}
# ---------------------------------------------------------
# Fast-path helpers: explicit paths + replacement pair (old->new)
# ---------------------------------------------------------
_Q = r"[\"'“”‘’`]"
_PATH_PATS = [
    r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"']",
    r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
    r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"']",
    r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
]
_TRANS_WRAPPERS = [
    r"__\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
    r"@lang\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
    r"trans\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
]
def _extract_repo_branch_from_text(txt: str) -> Tuple[Optional[str], str]:
    repo_url, branch = None, "main"
    m = re.search(r"\bRepo\s*:\s*(\S+)", txt, flags=re.I)
    if m: repo_url = m.group(1).strip()
    mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", txt, flags=re.I)
    if mb: branch = mb.group(1).strip()
    return repo_url, branch
def _extract_explicit_paths(txt: str) -> List[str]:
    out = []
    for pat in _PATH_PATS:
        for m in re.finditer(pat, txt):
            p = m.group(1)
            if p and p not in out:
                out.append(p)
    return out
def _extract_replace_pair(txt: str) -> Tuple[Optional[str], Optional[str]]:
    # NL/EN variants + "smart" quotes
    pats = [
        rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}",
        rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}",
    ]
    for p in pats:
        m = re.search(p, txt, flags=re.I|re.S)
        if m:
            return m.group(1), m.group(2)
    mm = re.search(r"(Vervang|Replace)[\s\S]*?" + _Q + r"(.+?)" + _Q + r"[\s\S]*?" + _Q + r"(.+?)" + _Q, txt, flags=re.I)
    if mm:
        return mm.group(2), mm.group(3)
    return None, None
def _looks_like_unified_diff_request(txt: str) -> bool:
    if re.search(r"\bunified\s+diff\b", txt, flags=re.I): return True
    if re.search(r"\b(diff|patch)\b", txt, flags=re.I) and _extract_explicit_paths(txt):
        return True
    return False
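For reference, a minimal standalone check of the NL replace-pair pattern above; the sample prompt and expected output are illustrative, not taken from the commit:

```python
import re

_Q = r"[\"'“”‘’`]"
PAT = rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}"

prompt = 'Vervang de tekst "Welkom" in "Welkom terug" in "resources/views/home.blade.php"'
m = re.search(PAT, prompt, flags=re.I | re.S)
print(m.groups() if m else None)  # ('Welkom', 'Welkom terug')
```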
# keep this close to the other module-level consts
async def _call_get_git_repo(repo_url: str, branch: str):
@@ -2873,7 +2925,7 @@ def laravel_scan_routes(repo_root: Path) -> list[dict]:
    if not p.exists():
        continue
    txt = _read_file_safe(p)
    for m in re.finditer(r"Route::(get|post|put|patch|delete|match)\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*([^)]+)\)", txt, flags=re.I):
        verb, uri, target = m.group(1).lower(), m.group(2), m.group(3)
        ctrl = None; method = None; name = None
        # controller@method
@@ -3926,13 +3978,122 @@ async def _llm_task_route(user_goal: str, framework: str = "laravel") -> dict:
# ---------- Main handler ----------
async def handle_repo_agent(messages: List[dict], request) -> str:
"""
Uitbreiding: fast-path voor unified diffs op expliciete bestanden met tekstvervanging.
Als niet van toepassing, valt automatisch terug op de bestaande flow.
"""
# 1) Combineer user/system content om opdracht te parsen
try:
full_txt = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")])
except Exception:
full_txt = ""
# 2) Herken fast-path
try_fast = _looks_like_unified_diff_request(full_txt)
paths_fp = _extract_explicit_paths(full_txt) if try_fast else []
old_txt, new_txt = _extract_replace_pair(full_txt) if try_fast else (None, None)
# NB: we gebruiken de injecties die via initialize_agent zijn gezet:
# - get_git_repo_fn (async)
# - read_text_file_fn (sync)
# Deze symbolen worden onderin initialize_agent aan globals() gehangen.
get_git_repo_fn = globals().get("get_git_repo_fn")
read_text_file_fn = globals().get("read_text_file_fn")
if try_fast and paths_fp and old_txt and new_txt and callable(get_git_repo_fn) and callable(read_text_file_fn):
# 3) repo + branch bepalen
repo_url, branch = _extract_repo_branch_from_text(full_txt)
if not repo_url:
# fallback: probeer repo uit eerdere agent-state (optioneel), anders stop fast-path
repo_url = globals().get("_last_repo_url")
branch = globals().get("_last_branch", "main")
if repo_url:
try:
repo_root = await get_git_repo_fn(repo_url, branch or "main")
root = Path(repo_root)
lang_path = root / "resources" / "lang" / "nl.json"
lang_before = lang_path.read_text(encoding="utf-8", errors="ignore") if lang_path.exists() else "{}"
lang_data = {}
try:
lang_data = json.loads(lang_before or "{}")
except Exception:
lang_data = {}
diffs_out = []
lang_changed = False
def _make_udiff(a: str, b: str, rel: str) -> str:
return "".join(difflib.unified_diff(
a.splitlines(keepends=True),
b.splitlines(keepends=True),
fromfile=f"a/{rel}", tofile=f"b/{rel}", n=3
))
# 4) per bestand: ofwel inline replace, ofwel vertaling bijwerken
for rel in paths_fp:
p = root / rel
if not p.exists():
continue
before = read_text_file_fn(p)
if not before:
continue
# Als de 'oude' tekst voorkomt BINNEN een vertaalwrapper, dan géén blade-edit
found_in_wrapper = False
for pat in _TRANS_WRAPPERS:
for m in re.finditer(pat, before):
inner = m.group(1)
if inner == old_txt:
found_in_wrapper = True
break
if found_in_wrapper:
break
if found_in_wrapper:
# update nl.json: {"oude": "nieuwe"}
if lang_data.get(old_txt) != new_txt:
lang_data[old_txt] = new_txt
lang_changed = True
continue
# anders: directe, exacte vervanging (conservatief)
after = before.replace(old_txt, new_txt)
if after != before:
diff = _make_udiff(before, after, rel)
if diff.strip():
diffs_out.append(("blade", rel, diff))
# 5) indien vertaling gewijzigd: diff voor nl.json toevoegen
if lang_changed:
new_lang = json.dumps(lang_data, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
diff_lang = _make_udiff(lang_before if isinstance(lang_before, str) else "", new_lang, "resources/lang/nl.json")
if diff_lang.strip():
diffs_out.append(("lang", "resources/lang/nl.json", diff_lang))
if diffs_out:
parts = ["### Unified diffs"]
for kind, rel, d in diffs_out:
parts.append(f"**{rel}**")
parts.append("```diff\n" + d + "```")
return "\n\n".join(parts)
else:
return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)."
except Exception as e:
# mislukt → val terug op bestaande discover/agent flow
pass
# === GEEN fast-path → ga door met de bestaande flow hieronder ===
    sid = _get_session_id(messages, request)
    st = _app.state.AGENT_SESSIONS.get(sid) or AgentState()
    _app.state.AGENT_SESSIONS[sid] = st
    user_last = next((m["content"] for m in reversed(messages) if m.get("role")=="user"), "").strip()
    user_last_lower = user_last.lower()
    logger.info("INFO:agent_repo:[%s] stage=%s", sid, st.stage)
    from smart_rag import (
        enrich_intent,
        expand_queries,
        hybrid_retrieve,
        _laravel_pairs_from_route_text,
        _laravel_guess_view_paths_from_text,
    )
    # If the user passes a .git URL: set state and continue via the state machine
    user_txt = next((m.get("content","") for m in reversed(messages) if m.get("role")=="user"), "")
    repo_url = await _detect_repo_url(user_txt)
@@ -4287,16 +4448,13 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
    picked.append(f)
# --- VIEW/LANG bias for UI label changes ---
# Take the first quoted string from the prompt as the "old" literal
qs = extract_quotes(st.user_goal) or []
old_lit = qs[0] if qs else None
def _contains_old(rel: str) -> bool:
    if not old_lit:
        return True  # fallback: no filtering
    try:
        txt = _read_text_file(Path(st.repo_path) / rel) or ""
        return old_lit in txt
@@ -4374,22 +4532,12 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
# routes -> controllers
if rel in ("routes/web.php","routes/api.php"):
    txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
    for ctrl_path, _m in _laravel_pairs_from_route_text(txt):
        if ctrl_path and ctrl_path not in picked and ctrl_path not in add:
            add.append(ctrl_path)
# controllers -> views
if rel.startswith("app/Http/Controllers/") and rel.endswith(".php"):
    txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
    for v in _laravel_guess_view_paths_from_text(txt):
        if v and v not in picked and v not in add:
            add.append(v)
@@ -4410,12 +4558,10 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
    pass
picked = (picked + add + more)[:MAX_FILES_DRYRUN]
# 5) Literal-grep fallback: if the user implies an old->new change, search the repo for the 'old' literal
qs = extract_quotes(st.user_goal) or []
old = qs[0].strip() if qs and qs[0].strip() else None
if old:
    grep_hits = _grep_repo_for_literal(Path(st.repo_path), old, limit=16)
    for rel in grep_hits:
        if rel in all_files and rel not in picked:
            picked.append(rel)
@@ -4589,15 +4735,14 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
if best and best not in st.candidate_paths: added.append(best)
st.candidate_paths = (added + st.candidate_paths)[:MAX_FILES_DRYRUN]
# extra: grep for the 'old' literal from user_goal to enrich the candidates
qs = extract_quotes(st.user_goal) or []
old = qs[0].strip() if qs and qs[0].strip() else None
if old:
    for rel in _grep_repo_for_literal(root, old, limit=16):
        if rel in all_files and rel not in st.candidate_paths:
            st.candidate_paths.append(rel)
try:
    proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
    if not proposed:

app.py

@@ -32,7 +32,11 @@ from fastapi.routing import APIRoute
from starlette.concurrency import run_in_threadpool
from pydantic import BaseModel
AUTO_CONTINUE = os.getenv("AUTO_CONTINUE", "1").lower() not in ("0","false","no")
AUTO_CONTINUE_MAX_ROUNDS = int(os.getenv("AUTO_CONTINUE_MAX_ROUNDS", "6"))
AUTO_CONTINUE_TAIL_CHARS = int(os.getenv("AUTO_CONTINUE_TAIL_CHARS", "600"))
from llm_client import init_llm_client, _sync_model_infer
# Optional libs for text extraction
try:
@@ -71,6 +75,17 @@ except Exception:
    cairosvg = None
import tempfile, subprocess  # for audio
import html  # ← new: unescape HTML entities in tool-call JSON
# Force consistent caches both inside and outside Docker
os.environ.setdefault("HF_HOME", "/opt/hf")
os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/opt/hf")
os.environ.setdefault("TRANSFORMERS_CACHE", "/opt/hf")
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers")
os.environ.setdefault("XDG_CACHE_HOME", "/opt/cache")
# whisper cache path (used by faster-whisper)
os.environ.setdefault("WHISPER_CACHE_DIR", os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper"))
# STT (Whisper-compatible via faster-whisper) — optional
_STT_MODEL = None
@@ -148,15 +163,53 @@ def _build_tools_system_prompt(tools: list) -> str:
    return "\n".join(lines)
def _extract_tool_calls_from_text(txt: str):
    s = (txt or "").strip()
    # Strip code fences & known chat tokens
    if s.startswith("```"):
        s = extract_code_block(s)
    s = re.sub(r"^<\|im_start\|>\s*assistant\s*", "", s, flags=re.I)
    # HTML-escaped JSON (&#34; etc.)
    try:
        s = html.unescape(s)
    except Exception:
        pass
    # tolerant: grab the first JSON object
    m = re.search(r"\{[\s\S]*\}", s)
    if not m:
        return []
    try:
        obj = json.loads(m.group(0))
    except Exception:
        return []
    # === Normalize the different dialects ===
    # 1) OpenAI: {"tool_calls":[{"type":"function","function":{"name":..,"arguments":..}}]}
    if "tool_calls" in obj and isinstance(obj["tool_calls"], list):
        tc = obj["tool_calls"]
    # 2) Replit/Magistral/Mistral-like: {"call_tool":{"name":"...","arguments":{...}}} or a list of them
    elif "call_tool" in obj:
        raw = obj["call_tool"]
        if isinstance(raw, dict):
            raw = [raw]
        tc = []
        for it in (raw or []):
            name = (it or {}).get("name")
            args = (it or {}).get("arguments") or {}
            if isinstance(args, str):
                try: args = json.loads(args)
                except Exception: pass
            if name:
                tc.append({"type":"function","function":{"name": name, "arguments": json.dumps(args, ensure_ascii=False)}})
    # 3) Legacy OpenAI: {"function_call":{"name":"...","arguments":"{...}"}}
    elif "function_call" in obj and isinstance(obj["function_call"], dict):
        fc = obj["function_call"]
        name = fc.get("name")
        args = fc.get("arguments") or {}
        if isinstance(args, str):
            try: args = json.loads(args)
            except Exception: pass
        tc = [{"type":"function","function":{"name": name, "arguments": json.dumps(args, ensure_ascii=False)}}] if name else []
    else:
        tc = []
    out = []
    for c in tc:
        name = (c or {}).get("name")
@@ -166,6 +219,13 @@ def _extract_tool_calls_from_text(txt: str):
            args = json.loads(args)
        except Exception:
            args = {}
        # Support both {"name":..,"arguments":..} and {"type":"function","function":{...}}
        if not name and isinstance(c.get("function"), dict):
            name = c["function"].get("name")
            args = c["function"].get("arguments") or args
            if isinstance(args, str):
                try: args = json.loads(args)
                except Exception: args = {}
        if name:
            out.append({
                "id": f"call_{uuid.uuid4().hex[:8]}",
@@ -177,6 +237,46 @@ def _extract_tool_calls_from_text(txt: str):
            })
    return out
# --- Universal toolcall detector (JSON + ReAct + HTML-escaped) ---
_TOOL_REACT = re.compile(
r"Action\s*:\s*([A-Za-z0-9_\-\.]+)\s*[\r\n]+Action\s*Input\s*:\s*(?:```json\s*([\s\S]*?)\s*```|([\s\S]*))",
re.I
)
def detect_toolcalls_any(text: str) -> list[dict]:
    """
    Always returns OpenAI-style tool_calls items:
    [{"id":..., "type":"function", "function":{"name":..., "arguments": "{...}"}}]
    """
    calls = _extract_tool_calls_from_text(text)
    if calls:
        return calls
    s = (text or "").strip()
    try:
        s = html.unescape(s)
    except Exception:
        pass
    # ReAct fallback
    m = _TOOL_REACT.search(s)
    if m:
        name = (m.group(1) or "").strip()
        raw = (m.group(2) or m.group(3) or "").strip()
        # strip any fences
        if raw.startswith("```"):
            raw = extract_code_block(raw)
        try:
            args = json.loads(raw) if raw else {}
        except Exception:
            # last resort: wrap as {"input": "<raw>"}
            args = {"input": raw}
        if name:
            return [{
                "id": f"call_{uuid.uuid4().hex[:8]}",
                "type": "function",
                "function": {"name": name, "arguments": json.dumps(args, ensure_ascii=False)}
            }]
    return []
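As a self-contained illustration of dialect (2) above — the {"call_tool": ...} shape being normalized to OpenAI-style tool_calls (tool name and arguments are invented for the demo):

```python
import json
import uuid

def normalize_call_tool(obj: dict) -> list[dict]:
    # Mirrors dialect (2): {"call_tool": {...}} or a list of such dicts
    raw = obj.get("call_tool")
    if isinstance(raw, dict):
        raw = [raw]
    out = []
    for it in (raw or []):
        name = (it or {}).get("name")
        args = (it or {}).get("arguments") or {}
        if isinstance(args, str):
            try:
                args = json.loads(args)
            except Exception:
                pass
        if name:
            out.append({
                "id": f"call_{uuid.uuid4().hex[:8]}",
                "type": "function",
                "function": {"name": name, "arguments": json.dumps(args, ensure_ascii=False)},
            })
    return out

print(normalize_call_tool({"call_tool": {"name": "rag_query", "arguments": {"q": "routes"}}}))
# [{'id': 'call_…', 'type': 'function', 'function': {'name': 'rag_query', 'arguments': '{"q": "routes"}'}}]
```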
# -----------------------------------------------------------------------------
# App & logging
# -----------------------------------------------------------------------------
@@ -197,6 +297,21 @@ app.add_middleware(
    allow_headers=["*"],
)
# One central QueueManager for ALL LLM calls
#app.state.LLM_QUEUE = QueueManager(model_infer_fn=_sync_model_infer)
# Attach this queue to llm_client so that _llm_call uses the same queue
#init_llm_client(app.state.LLM_QUEUE)
# Note:
# - llm_call_openai_compat expects app.state.LLM_QUEUE to be a deque-like queue
#   (append/index/popleft/remove).
# - The QueueManager from queue_helper.py is thread-based and is not used here.
#   If you want to use the QueueManager, do so in llm_client.py or elsewhere,
#   but not as app.state.LLM_QUEUE, to avoid type conflicts.
@app.on_event("startup")
async def _startup():
    # Make sure local hosts never go through a proxy
@@ -697,7 +812,7 @@ async def llm_call_openai_compat(
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: int = 13027,
    extra: Optional[dict] = None,
    stop: Optional[Union[str, list[str]]] = None,
    **kwargs
@@ -926,7 +1041,7 @@ async def _svg_from_prompt(prompt: str, w: int, h: int, background: str="white")
    f"- Thema: {prompt}\n- Gebruik eenvoudige vormen/paths/tekst.")
resp = await llm_call_openai_compat(
    [{"role":"system","content":sys},{"role":"user","content":user}],
    stream=False, temperature=0.35, top_p=0.9, max_tokens=2048
)
svg = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
return _svg_wrap_if_needed(_sanitize_svg(svg), w, h, background)
@@ -1079,6 +1194,165 @@ def _parse_repo_qa_from_messages(messages: list[dict]) -> tuple[Optional[str], s
    question = full
    return repo_hint, question, branch, n_ctx
# ------------------------------
# Unified-diff "fast path" (explicit paths + replacement text)
# ------------------------------
_Q = r"[\"'“”‘’`]"
_PATH_PATS = [
    r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"']",
    r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
    r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"']",
    r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
]
def _extract_repo_branch(text: str) -> tuple[Optional[str], str]:
    repo_url = None
    branch = "main"
    m = re.search(r"\bRepo\s*:\s*(\S+)", text, flags=re.I)
    if m:
        repo_url = m.group(1).strip()
    mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", text, flags=re.I)
    if mb:
        branch = mb.group(1).strip()
    return repo_url, branch
def _extract_paths(text: str) -> list[str]:
    paths: set[str] = set()
    for pat in _PATH_PATS:
        for m in re.finditer(pat, text):
            paths.add(m.group(1))
    return list(paths)
def _extract_replace_pair(text: str) -> tuple[Optional[str], Optional[str]]:
    """
    Extracts (old, new) from NL/EN phrasings. Supports straight and smart quotes.
    Examples that match:
    - Vervang de tekst A in B
    - Vervang de tekst "A" veranderen in "B"
    - Replace the text 'A' to 'B'
    - Replace "A" with "B"
    """
    pats = [
        rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}",
        rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}",
    ]
    for p in pats:
        m = re.search(p, text, flags=re.I|re.S)
        if m:
            return m.group(1), m.group(2)
    # fallback: the first two quoted strings near 'Vervang' / 'Replace'
    mm = re.search(r"(Vervang|Replace)[\s\S]*?" + _Q + r"(.+?)" + _Q + r"[\s\S]*?" + _Q + r"(.+?)" + _Q, text, flags=re.I)
    if mm:
        return mm.group(2), mm.group(3)
    return None, None
def _looks_like_unified_diff_request(text: str) -> bool:
    if re.search(r"\bunified\s+diff\b", text, flags=re.I):
        return True
    # also accept "make a diff" phrasing with explicit files
    if re.search(r"\b(diff|patch)\b", text, flags=re.I) and any(re.search(p, text) for p in _PATH_PATS):
        return True
    return False
def _make_unified_diff(a_text: str, b_text: str, path: str) -> str:
    import difflib
    a_lines = a_text.splitlines(keepends=True)
    b_lines = b_text.splitlines(keepends=True)
    return "".join(difflib.unified_diff(a_lines, b_lines, fromfile=f"a/{path}", tofile=f"b/{path}", n=3))
_TRANS_WRAPPERS = [
    r"__\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"),
    r"@lang\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"),
    r"trans\(\s*{q}({txt}){q}\s*\)".format(q=_Q, txt=r".+?"),
]
def _find_in_translation_wrapper(content: str, needle: str) -> bool:
    for pat in _TRANS_WRAPPERS:
        for m in re.finditer(pat, content):
            inner = m.group(1)
            if inner == needle:
                return True
    return False
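A minimal check of the wrapper detection (the blade snippet is a sample):

```python
import re

_Q = r"[\"'“”‘’`]"
WRAP = r"__\(\s*{q}(.+?){q}\s*\)".format(q=_Q)

blade = "<h1>{{ __('Welkom') }}</h1>"
m = re.search(WRAP, blade)
print(m.group(1) if m else None)  # Welkom
```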
def _update_nl_json(root: Path, key: str, value: str) -> tuple[bool, str]:
    """
    Sets or updates resources/lang/nl.json: { key: value }.
    Returns (changed, path_as_posix).
    """
    lang_path = root / "resources" / "lang" / "nl.json"
    data = {}
    if lang_path.exists():
        try:
            data = json.loads(lang_path.read_text(encoding="utf-8", errors="ignore") or "{}")
        except Exception:
            data = {}
    # Only touch the file when needed
    prev = data.get(key)
    if prev == value:
        return False, lang_path.as_posix()
    data[key] = value
    # Pretty, stable JSON dump
    lang_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = lang_path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    os.replace(tmp, lang_path)
    return True, lang_path.as_posix()
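The write-then-rename above is what keeps readers from ever seeing a half-written nl.json; a condensed standalone sketch of the same pattern:

```python
import json
import os
from pathlib import Path

def update_json_atomic(path: Path, key: str, value: str) -> bool:
    """Set/update one key; write to a temp file, then atomically swap it in."""
    data = {}
    if path.exists():
        try:
            data = json.loads(path.read_text(encoding="utf-8") or "{}")
        except Exception:
            data = {}
    if data.get(key) == value:
        return False  # already up to date
    data[key] = value
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    os.replace(tmp, path)  # atomic on POSIX: readers see old or new, never a partial file
    return True
```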
async def _fast_unified_diff_task(messages: list[dict]) -> Optional[str]:
    """
    If the prompt asks for a 'unified diff' with explicit files and replacement text,
    execute that directly (bypassing handle_repo_agent) and return the diffs.
    """
    full = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")])
    if not _looks_like_unified_diff_request(full):
        return None
    repo_url, branch = _extract_repo_branch(full)
    paths = _extract_paths(full)
    old, new = _extract_replace_pair(full)
    if not (repo_url and paths and old and new):
        return None
    # Fetch the repo
    repo_path = await _get_git_repo_async(repo_url, branch)
    root = Path(repo_path)
    changed_files: list[tuple[str, str]] = []  # (path, diff_text)
    lang_changed = False
    lang_path_text = ""
    for rel in paths:
        p = root / rel
        if not p.exists():
            continue
        before = _read_text_file_wrapper(p)
        if not before:
            continue
        # If the text sits inside a translation wrapper, do NOT change the blade file;
        # instead set/patch resources/lang/nl.json: { "old": "new" }.
        if _find_in_translation_wrapper(before, old):
            ch, lang_path_text = _update_nl_json(root, old, new)
            lang_changed = lang_changed or ch
            # no blade change in this case
            continue
        after = before.replace(old, new)
        if after == before:
            continue
        diff = _make_unified_diff(before, after, rel)
        if diff.strip():
            changed_files.append((rel, diff))
    if not changed_files and not lang_changed:
        return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)."
    out = []
    if changed_files:
        out.append("### Unified diffs")
        for rel, d in changed_files:
            out.append(f"```diff\n{d}```")
    if lang_changed:
        out.append(f"✅ Bijgewerkte vertaling in: `{lang_path_text}`: \"{old}\" → \"{new}\"")
    return "\n\n".join(out).strip()
# ------------------------------
# OpenAI-compatible endpoints
@@ -1131,15 +1405,17 @@ def _get_stt_model():
    except Exception:
        raise HTTPException(status_code=500, detail="STT niet beschikbaar: faster-whisper ontbreekt.")
    # device selection
    download_root = os.getenv("STT_MODEL_DIR", os.environ.get("WHISPER_CACHE_DIR"))
    os.makedirs(download_root, exist_ok=True)
    if STT_DEVICE == "auto":
        try:
            _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cuda", compute_type="float16", download_root=download_root)
            return _STT_MODEL
        except Exception:
            _STT_MODEL = WhisperModel(STT_MODEL_NAME, device="cpu", compute_type="int8", download_root=download_root)
            return _STT_MODEL
    dev, comp = ("cuda","float16") if STT_DEVICE=="cuda" else ("cpu","int8")
    _STT_MODEL = WhisperModel(STT_MODEL_NAME, device=dev, compute_type=comp, download_root=download_root)
    return _STT_MODEL
def _stt_transcribe_path(path: str, lang: str | None):
@@ -1246,7 +1522,7 @@ async def present_make(
    f"Max. {max_slides} dia's, 3–6 bullets per dia.")
plan = await llm_call_openai_compat(
    [{"role":"system","content":sys},{"role":"user","content":user}],
    stream=False, temperature=0.3, top_p=0.9, max_tokens=13021
)
raw = (plan.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}")
try:
@@ -1292,7 +1568,7 @@ async def vision_ask(
    stream: bool = Form(False),
    temperature: float = Form(0.2),
    top_p: float = Form(0.9),
    max_tokens: int = Form(1024),
):
    raw = await run_in_threadpool(file.file.read)
    img_b64 = base64.b64encode(raw).decode("utf-8")
@@ -1323,7 +1599,7 @@ async def vision_and_text(
    max_chars: int = Form(25000),
    temperature: float = Form(0.2),
    top_p: float = Form(0.9),
    max_tokens: int = Form(2048),
):
    images_b64: list[str] = []
    text_chunks: list[str] = []
@@ -1370,7 +1646,7 @@ async def vision_health():
    tiny_png = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAOaO5nYAAAAASUVORK5CYII="
    try:
        messages = [{"role":"user","content":"<image> Beschrijf dit in één woord."}]
        resp = await llm_call_openai_compat(messages, extra={"images":[tiny_png]}, max_tokens=128)
        txt = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","").strip()
        return {"vision": bool(txt), "sample": txt[:60]}
    except Exception as e:
@@ -1564,7 +1840,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"Je bent behulpzaam en exact."},
         {"role":"user","content": f"{instruction}\n\n--- BEGIN ---\n{text}\n--- EINDE ---"}],
        stream=False, max_tokens=7680
    )
    return resp
@@ -1574,7 +1850,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"Wees feitelijk en concreet."},
         {"role":"user","content": f"{goal}\n\n--- BEGIN ---\n{text}\n--- EINDE ---"}],
        stream=False, max_tokens=7680
    )
    return resp
@@ -1587,7 +1863,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
        {"role":"user","content": f"{objective}\nStijl/criteria: {style}\n"
         "Antwoord met eerst een korte toelichting, daarna alleen de verbeterde inhoud tussen een codeblok.\n\n"
         f"--- BEGIN ---\n{text}\n--- EINDE ---"}],
        stream=False, max_tokens=10240
    )
    return resp
@@ -1597,7 +1873,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"Wees strikt en concreet."},
         {"role":"user","content": f"{VALIDATE_PROMPT}\n\nCode om te valideren:\n```\n{code}\n```"}],
        stream=False, max_tokens=10000
    )
    txt = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
    return {"status":"issues_found" if _parse_validation_results(txt) else "valid",
@@ -1611,7 +1887,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
    [{"role":"system","content": SYSTEM_PROMPT},
     {"role":"user","content": f"Verbeter deze {language}-code met focus op {focus}:\n\n"
      f"{code}\n\nGeef eerst een korte toelichting, dan alleen het verbeterde codeblok. Behoud functionaliteit."}],
    stream=False, max_tokens=10768
)
return resp
@@ -1636,7 +1912,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
if name == "vision_analyze":
    image_url = args.get("image_url","")
    prompt = args.get("prompt","Beschrijf beknopt wat je ziet en noem de belangrijkste details.")
    max_tokens = int(args.get("max_tokens",1024))
    b64 = None
    # Only accept data: URIs or raw base64; not http(s), because those are not
    # loaded into the vision call and would fail silently.
@@ -1908,6 +2184,52 @@ async def _complete_with_autocontinue(
        continues += 1
    return content, resp
async def llm_call_autocont(
    messages: list[dict],
    *,
    model: Optional[str] = None,
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: int = 1024,
    extra: Optional[dict] = None,
    stop: Optional[Union[str, list[str]]] = None,
    **kwargs
) -> dict | StreamingResponse:
    """
    OpenAI-compatible wrapper that automatically continues non-stream answers
    via _complete_with_autocontinue(...).
    - stream=True  -> passthrough to llm_call_openai_compat (unchanged)
    - stream=False -> returns one merged answer as OpenAI JSON
    """
    mdl = (model or os.getenv("LLM_MODEL", "mistral-medium")).strip()
    if stream:
        # streaming stays 1-to-1 as before
        return await llm_call_openai_compat(
            messages,
            model=mdl,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            extra=extra if extra else None,
            stop=stop
        )
    max_autocont = int(os.getenv("LLM_AUTO_CONTINUES", "2"))
    full_text, _last = await _complete_with_autocontinue(
        messages,
        model=mdl,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        extra_payload=extra if extra else None,
        max_autocont=max_autocont
    )
    # Build a standard OpenAI chat response with the merged answer
    return _openai_chat_response(mdl, full_text, messages)
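The _complete_with_autocontinue helper itself is not shown in this diff; the core idea is roughly the loop below (a sketch, assuming OpenAI-style response dicts):

```python
async def autocontinue_sketch(call, messages: list[dict], max_rounds: int = 2) -> str:
    # Keep asking the model to continue while it stops on length,
    # then concatenate the pieces into one answer.
    resp = await call(messages)
    parts = [resp["choices"][0]["message"]["content"]]
    rounds = 0
    while resp["choices"][0].get("finish_reason") == "length" and rounds < max_rounds:
        messages = messages + [
            {"role": "assistant", "content": parts[-1]},
            {"role": "user", "content": "Ga verder waar je gebleven was."},
        ]
        resp = await call(messages)
        parts.append(resp["choices"][0]["message"]["content"])
        rounds += 1
    return "".join(parts)
```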
@app.post("/v1/chat/completions")
async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    model = (body.get("model") or os.getenv("LLM_MODEL", "mistral-medium")).strip()
@@ -2053,7 +2375,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    # your existing helper; keep it as you already use it
    resp = await llm_call_openai_compat(ask, stream=False, max_tokens=512)
    txt = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content", "") or ""
    calls = detect_toolcalls_any(txt)  # was: _extract_tool_calls_from_text(txt)
    if calls:
        return {
            "id": f"chatcmpl-{uuid.uuid4().hex}",
@@ -2075,6 +2397,17 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    # Special models
    if model == "repo-agent":
        # === fast bypass for "unified diff" tasks with explicit paths ===
        try:
            fast = await _fast_unified_diff_task(messages)
        except Exception as e:
            fast = f"(Fast-diff pad mislukte: {e})"
        if fast:
            if stream:
                async def _emit():
                    yield ("data: " + json.dumps({"id": f"chatcmpl-{uuid.uuid4().hex[:12]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": "repo-agent", "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}]}) + "\n\n").encode("utf-8")
                    yield ("data: " + json.dumps({"id": f"chatcmpl-{uuid.uuid4().hex[:12]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": "repo-agent", "choices": [{"index": 0, "delta": {"content": fast}, "finish_reason": None}]}) + "\n\n").encode("utf-8")
                    yield b"data: [DONE]\n\n"
                return StreamingResponse(_emit(), media_type="text/event-stream", headers={"Cache-Control": "no-cache, no-transform", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
            return JSONResponse(_openai_chat_response("repo-agent", fast, messages))
    if stream:
        async def event_gen():
            import asyncio, time, json, contextlib
@@ -2421,7 +2754,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = None):
    if stream:
        temperature = float(body.get("temperature", 0.2))
        top_p = float(body.get("top_p", 0.9))
        _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021"))
        max_tokens = int(body.get("max_tokens", _default_max))
        return await llm_call_openai_compat(
            messages,
@@ -3331,7 +3664,7 @@ async def rag_query_api(
    resp = await llm_call_openai_compat(
        [{"role":"system","content":"You are precise and return only valid JSON."},
         {"role":"user","content": prompt+"\n\nOnly JSON array."}],
        stream=False, temperature=0.0, top_p=1.0, max_tokens=512
    )
    try:
        order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]"))
@@ -3428,7 +3761,8 @@ initialize_agent(
    get_git_repo_fn=_get_git_repo_async,
    rag_index_repo_internal_fn=_rag_index_repo_internal,
    rag_query_internal_fn=_rag_query_internal,
    # Use the auto-continue wrapper so agent answers are not cut off
    llm_call_fn=llm_call_autocont,
    extract_code_block_fn=extract_code_block,
    read_text_file_fn=_read_text_file_wrapper,
    client_ip_fn=_client_ip,

llm_client.py (new executable file)

@@ -0,0 +1,131 @@
from __future__ import annotations
import os
import asyncio
import logging
from typing import List, Dict, Any, Optional
import httpx
from queue_helper import QueueManager
logger = logging.getLogger(__name__)
# -------------------------------------------------------------
# Config for the underlying LLM backend
# -------------------------------------------------------------
# This is NOT your own /v1/chat/completions endpoint,
# but the *real* model backend (e.g. Ollama, vLLM, a Mistral server, etc.).
LLM_API_BASE = os.getenv("LLM_API_BASE", "http://127.0.0.1:11434")
LLM_DEFAULT_MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini")
LLM_REQUEST_TIMEOUT = float(os.getenv("LLM_REQUEST_TIMEOUT", "120"))
# This is set from app.py via init_llm_client(...)
LLM_QUEUE: QueueManager | None = None
def init_llm_client(queue: QueueManager) -> None:
    """
    Attach the global LLM_QUEUE to the QueueManager from app.py.
    This MUST be called exactly once in app.py.
    """
    global LLM_QUEUE
    LLM_QUEUE = queue
    logger.info("llm_client: LLM_QUEUE gekoppeld via init_llm_client.")
def _sync_model_infer(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Synchronous call to the real LLM backend.
    This is the function app.py uses when constructing the QueueManager.
    """
    url = f"{LLM_API_BASE.rstrip('/')}/v1/chat/completions"
    try:
        with httpx.Client(timeout=LLM_REQUEST_TIMEOUT) as client:
            resp = client.post(url, json=payload)
            resp.raise_for_status()
            return resp.json()
    except Exception as exc:
        logger.exception("LLM backend call failed: %s", exc)
        return {
            "object": "chat.completion",
            "choices": [{
                "index": 0,
                "finish_reason": "error",
                "message": {
                    "role": "assistant",
                    "content": f"[LLM-fout] {exc}",
                },
            }],
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
            },
        }
async def _llm_call(
    messages: List[Dict[str, str]],
    *,
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: Optional[int] = None,
    model: Optional[str] = None,
    **extra: Any,
) -> Dict[str, Any]:
    """
    Central helper for tools/agents/smart_rag/repo-agent.
    Important:
    - Uses the *existing* QueueManager from app.py (via init_llm_client).
    - Submits jobs to the agent queue (lower priority than users).
    - NO queue notifications ("you are #...") for these internal calls.
    """
    if stream:
        # We do not use streaming in this agent.
        raise NotImplementedError("_llm_call(stream=True) wordt momenteel niet ondersteund.")
    if LLM_QUEUE is None:
        # Hard fail: this makes it immediately obvious that init_llm_client was never called.
        raise RuntimeError("LLM_QUEUE is niet geïnitialiseerd. Roep init_llm_client(...) aan in app.py")
    payload: Dict[str, Any] = {
        "model": model or LLM_DEFAULT_MODEL,
        "messages": messages,
        "stream": False,
        "temperature": float(temperature),
        "top_p": float(top_p),
    }
    if max_tokens is not None:
        payload["max_tokens"] = int(max_tokens)
    payload.update(extra)
    loop = asyncio.get_running_loop()
    try:
        # request_agent_sync blocks → run it in the threadpool
        response: Dict[str, Any] = await loop.run_in_executor(
            None, lambda: LLM_QUEUE.request_agent_sync(payload)
        )
        return response
    except Exception as exc:
        logger.exception("_llm_call via agent-queue failed: %s", exc)
        return {
            "object": "chat.completion",
            "choices": [{
                "index": 0,
                "finish_reason": "error",
                "message": {
                    "role": "assistant",
                    "content": f"[LLM-queue-fout] {exc}",
                },
            }],
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
            },
        }
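Hypothetical wiring, based on the commented-out lines in app.py above (one-time setup, then internal calls go through the agent queue):

```python
from queue_helper import QueueManager
import llm_client

# once, at startup:
queue = QueueManager(model_infer_fn=llm_client._sync_model_infer)
llm_client.init_llm_client(queue)

# later, from an agent or tool (inside an async context):
# resp = await llm_client._llm_call([{"role": "user", "content": "ping"}], max_tokens=32)
```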

mistral-api.sh (new executable file)

@@ -0,0 +1 @@
docker run -d --name mistral-api --network host -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=13021 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 mistral-api

queue_helper.py

@@ -45,11 +45,20 @@ class QueueManager:
        self._worker.start()
    # ---------- public API ----------
    def enqueue_user(
        self,
        payload: Dict,
        progress_cb: Callable[[Dict], None],
        *,
        notify_position: bool = False,
    ) -> tuple[str, int]:
        job = _Job(payload, progress_cb)
        try: self._user_q.put_nowait(job)
        except queue.Full: raise RuntimeError(f"Userqueue vol (≥{USER_MAX_QUEUE})")
        position = self._user_q.qsize()
        if notify_position:
            # start a separate notifier thread that periodically reports the queue position
            start_position_notifier(job, self._user_q)
        return job.job_id, position
@@ -58,6 +67,32 @@ class QueueManager:
        except queue.Full: raise RuntimeError(f"Agentqueue vol (≥{AGENT_MAX_QUEUE})")
        return job.job_id
    # ---------- sync helper for agents/tools ----------
    def request_agent_sync(self, payload: Dict, timeout: float = WORKER_TIMEOUT) -> Dict:
        """
        Use this for internal calls (agents/tools).
        - The job is placed in the agent queue (lower priority than users).
        - We block until the worker finishes or the timeout expires.
        - NO queue notifications ("U bent #...") are sent.
        """
        result_box: Dict[str, Any] = {}
        def _cb(msg: Dict):
            # only the final result is of interest to tools/agents
            result_box["answer"] = msg
        job = _Job(payload, _cb)
        try:
            self._agent_q.put_nowait(job)
        except queue.Full:
            raise RuntimeError(f"Agent-queue vol (≥{AGENT_MAX_QUEUE})")
        ok = job.event.wait(timeout)
        if not ok:
            raise TimeoutError(f"LLM-inference duurde langer dan {timeout} seconden.")
        if job.error:
            raise RuntimeError(job.error)
        return result_box.get("answer") or {}
    # ---------- worker ----------
    def _run_worker(self):
        while not self._shutdown.is_set():
@@ -79,18 +114,24 @@ class QueueManager:
        self._shutdown.set()
        self._worker.join(timeout=5)
def start_position_notifier(
    job: _Job,
    queue_ref: queue.Queue,
    interval: float = UPDATE_INTERVAL,
):
    """Sends a message with the current queue position every `interval` seconds."""
    def _notifier():
        # Stop as soon as the job event is set (success/error/timeout upstream)
        while not job.event.wait(interval):
            # Take a snapshot of the queue contents in a thread-safe way
            with queue_ref.mutex:
                snapshot = list(queue_ref.queue)
            try:
                pos = snapshot.index(job) + 1  # 1-based
            except ValueError:
                # Job is no longer in the queue → no further updates needed
                break
            job.callback({"info": f"U bent #{pos} in de wachtrij. Even geduld…" })
    t = threading.Thread(target=_notifier, daemon=True)
    t.start()
    return t
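The event.wait-as-sleep trick above generalizes; a self-contained version of the same notifier pattern (names are illustrative):

```python
import queue
import threading

def start_notifier(done: threading.Event, q: queue.Queue, item, interval: float = 1.0) -> threading.Thread:
    def _run():
        # event.wait doubles as the sleep AND the stop signal
        while not done.wait(interval):
            with q.mutex:  # snapshot under the queue's own lock
                snapshot = list(q.queue)
            try:
                pos = snapshot.index(item) + 1
            except ValueError:
                break  # item left the queue; stop reporting
            print(f"position in queue: #{pos}")
    t = threading.Thread(target=_run, daemon=True)
    t.start()
    return t
```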

smart_rag.py

@@ -296,7 +296,7 @@ async def hybrid_retrieve(
# Optional query routing + RRF
use_route = str(os.getenv("RAG_ROUTE", "1")).lower() not in ("0", "false")
use_rrf = str(os.getenv("RAG_RRF", "1")).lower() not in ("0", "false")
# Optional mini multi-query expansion (default on)
use_expand = str(os.getenv("RAG_MULTI_EXPAND", "1")).lower() in ("1","true","yes")
k_variants = max(1, int(os.getenv("RAG_MULTI_K", "3")))
per_query_k = max(1, int(per_query_k))
@@ -317,10 +317,14 @@ async def hybrid_retrieve(
if use_route:
    buckets = _route_query_buckets(qv)
    for b in buckets:
        # combine the global path_contains hint with the bucket-specific filter
        pc = b.get("path_contains")
        if path_contains and not pc:
            pc = path_contains
        res = await rag_query_internal_fn(
            query=b["q"], n_results=per_query_k,
            collection_name=collection_name,
            repo=repo, path_contains=pc, profile=profile
        )
        lst = []
        for item in (res or {}).get("results", []):
@@ -594,5 +598,7 @@ __all__ = [
    "expand_queries",
    "hybrid_retrieve",
    "assemble_context",
    "_laravel_pairs_from_route_text",
    "_laravel_guess_view_paths_from_text",
]

windowing_utils.py

@@ -1,7 +1,7 @@
# windowing_utils.py
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Dict, Callable, Optional, Tuple, Awaitable
import hashlib
import os
import time
@@ -64,7 +64,7 @@ class ConversationWindow:
    async def build_within_budget(
        self,
        system_prompt: Optional[str],
        summarizer: Optional[Callable[[str, List[Dict]], Awaitable[str]]] = None
    ) -> List[Dict]:
        budget = self.max_ctx_tokens - max(1, self.response_reserve)
        working = self.history[:]
@@ -106,6 +106,11 @@ class ConversationWindow:
            self.running_summary = await summarizer(self.running_summary, chunk_buf)
            chunk_buf = []
        # process any leftover buffer so that no turns are lost
        if chunk_buf:
            self.running_summary = await summarizer(self.running_summary, chunk_buf)
            chunk_buf = []
        self.history = working
        return await build_candidate(self.running_summary, working)
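A summarizer compatible with the Awaitable[str] signature above might look like this (a naive stand-in; the real one would call the LLM):

```python
async def summarizer(running_summary: str, chunk: list[dict]) -> str:
    turns = "\n".join(f"{m['role']}: {m['content']}" for m in chunk)
    # naive: append and truncate; a real implementation summarizes via the LLM
    return (running_summary + "\n" + turns)[-2000:]
```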