This commit is contained in:
admin 2026-02-02 10:28:41 +01:00
parent 932144e798
commit 1aaf0d013a
3 changed files with 321 additions and 113 deletions

View File

@ -173,7 +173,7 @@ except Exception:
logger = logging.getLogger("agent_repo") logger = logging.getLogger("agent_repo")
# ---------- Omgeving / Config ---------- # ---------- Omgeving / Config ----------
GITEA_URL = os.environ.get("GITEA_URL", "http://localhost:3080").rstrip("/") GITEA_URL = os.environ.get("GITEA_URL", "http://10.25.138.40:30085").rstrip("/")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d") GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d")
GITEA_API = os.environ.get("GITEA_API", f"{GITEA_URL}/api/v1").rstrip("/") GITEA_API = os.environ.get("GITEA_API", f"{GITEA_URL}/api/v1").rstrip("/")
AGENT_DEFAULT_BRANCH = os.environ.get("AGENT_DEFAULT_BRANCH", "main") AGENT_DEFAULT_BRANCH = os.environ.get("AGENT_DEFAULT_BRANCH", "main")
@ -188,7 +188,7 @@ AGENT_CLARIFY_THRESHOLD = float(os.environ.get("AGENT_CLARIFY_THRESHOLD", "0.6")
# Meilisearch (optioneel) # Meilisearch (optioneel)
MEILI_URL = os.environ.get("MEILI_URL", "http://localhost:7700").strip() MEILI_URL = os.environ.get("MEILI_URL", "http://192.168.100.1:7700").strip()
MEILI_KEY = os.environ.get("MEILI_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ").strip() MEILI_KEY = os.environ.get("MEILI_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ").strip()
MEILI_INDEX_PREFIX = os.environ.get("MEILI_INDEX_PREFIX", "code").strip() MEILI_INDEX_PREFIX = os.environ.get("MEILI_INDEX_PREFIX", "code").strip()
@ -377,7 +377,7 @@ def _qdrant_query(collection_name: str, query: str, n_results: int, where: Dict[
Filter, FieldCondition, MatchValue = _qdrant_models Filter, FieldCondition, MatchValue = _qdrant_models
# Let op: je hebt hier *ook* een embedder nodig (client-side). In dit skeleton verwachten we dat # Let op: je hebt hier *ook* een embedder nodig (client-side). In dit skeleton verwachten we dat
# je server-side search by text hebt geconfigureerd. Anders: voeg hier je embedder toe. # je server-side search by text hebt geconfigureerd. Anders: voeg hier je embedder toe.
client = _qdrant(host=os.getenv("QDRANT_HOST","localhost"), port=int(os.getenv("QDRANT_PORT","6333"))) client = _qdrant(host=os.getenv("QDRANT_HOST","192.168.100.1"), port=int(os.getenv("QDRANT_PORT","6333")))
# Eenvoudig: text search (als ingeschakeld). Anders: raise en laat de mock fallback pakken. # Eenvoudig: text search (als ingeschakeld). Anders: raise en laat de mock fallback pakken.
try: try:
must: List[Any] = [] must: List[Any] = []
@ -4245,7 +4245,7 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
pass pass
st.stage = "ASK" st.stage = "ASK"
base = ("Ik verken de code en doe een voorstel. Geef de repo (bv. `admin/image-viewing-website` of " base = ("Ik verken de code en doe een voorstel. Geef de repo (bv. `admin/image-viewing-website` of "
"`http://localhost:3080/admin/image-viewing-website.git`). " "`http://10.25.138.40:30085/admin/image-viewing-website.git`). "
"Of zeg: **'zoek repo'** als ik zelf moet zoeken.") "Of zeg: **'zoek repo'** als ik zelf moet zoeken.")
return _with_preview(base, st) return _with_preview(base, st)

424
app.py
View File

@ -294,6 +294,87 @@ def detect_toolcalls_any(text: str) -> list[dict]:
}] }]
return [] return []
def _coerce_text_toolcalls_to_openai(data: dict) -> dict:
    """Convert text-encoded tool calls into OpenAI-native ``tool_calls``.

    Some upstream LLMs return tool calls as plain text in
    ``choices[0].message.content`` (for example ``[TOOL_CALLS] [...]``)
    instead of a structured ``tool_calls`` list. When such text is found,
    the first choice is rewritten in place: ``message.tool_calls`` is filled,
    ``message.content`` is set to ``None`` and ``finish_reason`` becomes
    ``"tool_calls"``. Responses that already carry ``tool_calls`` are left
    untouched.

    Args:
        data: Parsed chat-completion response. Anything that is not a dict,
            or has no usable first choice/message, is returned unchanged.

    Returns:
        The same ``data`` object, possibly mutated as described above. This
        is a best-effort shim: on any unexpected error the input is returned
        as-is instead of raising.
    """
    try:
        if not isinstance(data, dict):
            return data
        choices = data.get("choices") or []
        if not choices or not isinstance(choices, list):
            return data
        first_choice = choices[0] or {}
        if not isinstance(first_choice, dict):
            return data
        message = first_choice.get("message") or {}
        if not isinstance(message, dict):
            return data
        # Native tool_calls already present -> nothing to do.
        if message.get("tool_calls"):
            return data
        content = message.get("content")
        if not isinstance(content, str):
            return data
        text = content.strip()
        if not text:
            return data
        # Only attempt a conversion when there is a clear signal in the text.
        markers = ("[TOOL_CALLS]", "call_tool", "tool_calls")
        if not (text.startswith("[") or any(mark in text for mark in markers)):
            return data

        calls = detect_toolcalls_any(text) or []
        if not calls:
            # vLLM / [TOOL_CALLS] style: usually a JSON array after the tag.
            calls = _parse_tagged_toolcall_array(text)
        if calls:
            message["role"] = message.get("role") or "assistant"
            message["content"] = None
            message["tool_calls"] = calls
            first_choice["message"] = message
            first_choice["finish_reason"] = "tool_calls"
            data["choices"][0] = first_choice
        return data
    except Exception:
        # Deliberate best-effort: never let the shim break the response path.
        return data


def _parse_tagged_toolcall_array(text: str) -> list:
    """Parse a ``[TOOL_CALLS] [{...}, ...]`` payload into OpenAI tool_calls.

    Each array item may carry ``name``/``arguments`` directly or nested under
    a ``function`` dict. Returns an empty list when no JSON array can be
    decoded from ``text``.
    """
    payload = re.sub(r"^\s*\[TOOL_CALLS\]\s*", "", text, flags=re.I)
    payload = html.unescape(payload)
    match = re.search(r"\[[\s\S]*\]", payload)
    if not match:
        return []
    try:
        items = json.loads(match.group(0))
    except ValueError:
        return []
    if not isinstance(items, list):
        return []

    calls = []
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name")
        args = item.get("arguments", {})
        function = item.get("function")
        if not name and isinstance(function, dict):
            name = function.get("name")
            args = function.get("arguments", args)
        if isinstance(args, str):
            raw_args = args
            try:
                args = json.loads(raw_args)
            except ValueError:
                args = {"input": raw_args}
        # Consumers decode `arguments` and treat it as a dict, so make sure
        # the encoded value is always a JSON object.
        if args is None:
            args = {}
        elif not isinstance(args, dict):
            args = {"input": args}
        if not (isinstance(name, str) and name):
            continue
        calls.append({
            "id": f"call_{uuid.uuid4().hex[:8]}",
            "type": "function",
            "function": {
                "name": name,
                "arguments": json.dumps(args, ensure_ascii=False),
            },
        })
    return calls
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# App & logging # App & logging
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -384,7 +465,7 @@ async def log_requests(request: Request, call_next):
# Config # Config
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
MISTRAL_MODE = os.getenv("MISTRAL_MODE", "v1").lower() MISTRAL_MODE = os.getenv("MISTRAL_MODE", "v1").lower()
LLM_URL = os.getenv("LLM_URL", "http://localhost:8000/v1/chat/completions").strip() LLM_URL = os.getenv("LLM_URL", "http://192.168.100.1:8000/v1/chat/completions").strip()
RAW_URL = os.getenv("MISTRAL_URL_RAW", "http://host.docker.internal:8000/completion").strip() RAW_URL = os.getenv("MISTRAL_URL_RAW", "http://host.docker.internal:8000/completion").strip()
LLM_CONNECT_TIMEOUT = float(os.getenv("LLM_CONNECT_TIMEOUT", "10")) LLM_CONNECT_TIMEOUT = float(os.getenv("LLM_CONNECT_TIMEOUT", "10"))
LLM_READ_TIMEOUT = float(os.getenv("LLM_READ_TIMEOUT", "1200")) LLM_READ_TIMEOUT = float(os.getenv("LLM_READ_TIMEOUT", "1200"))
@ -392,7 +473,7 @@ LLM_READ_TIMEOUT = float(os.getenv("LLM_READ_TIMEOUT", "1200"))
_UPSTREAM_URLS = [u.strip() for u in os.getenv("LLM_UPSTREAMS","").split(",") if u.strip()] _UPSTREAM_URLS = [u.strip() for u in os.getenv("LLM_UPSTREAMS","").split(",") if u.strip()]
# ==== Meilisearch (optioneel) ==== # ==== Meilisearch (optioneel) ====
MEILI_URL = os.getenv("MEILI_URL", "http://localhost:7700").rstrip("/") MEILI_URL = os.getenv("MEILI_URL", "http://192.168.100.1:7700").rstrip("/")
MEILI_API_KEY = os.getenv("MEILI_API_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ") MEILI_API_KEY = os.getenv("MEILI_API_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ")
MEILI_INDEX = os.getenv("MEILI_INDEX", "code_chunks") MEILI_INDEX = os.getenv("MEILI_INDEX", "code_chunks")
MEILI_ENABLED = bool(MEILI_URL) MEILI_ENABLED = bool(MEILI_URL)
@ -485,7 +566,7 @@ if CELERY_ENABLED:
celery_app = None celery_app = None
# Git / repos # Git / repos
GITEA_URL = os.environ.get("GITEA_URL", "http://localhost:3080").rstrip("/") GITEA_URL = os.environ.get("GITEA_URL", "http://10.25.138.40:30085").rstrip("/")
REPO_PATH = os.environ.get("REPO_PATH", "/tmp/repos") REPO_PATH = os.environ.get("REPO_PATH", "/tmp/repos")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -868,7 +949,7 @@ async def llm_call_openai_compat(
*, *,
model: Optional[str] = None, model: Optional[str] = None,
stream: bool = False, stream: bool = False,
temperature: float = 0.2, temperature: float = 0.02,
top_p: float = 0.9, top_p: float = 0.9,
max_tokens: int = 42000, max_tokens: int = 42000,
extra: Optional[dict] = None, extra: Optional[dict] = None,
@ -1100,7 +1181,7 @@ async def _svg_from_prompt(prompt: str, w: int, h: int, background: str="white")
f"- Thema: {prompt}\n- Gebruik eenvoudige vormen/paths/tekst.") f"- Thema: {prompt}\n- Gebruik eenvoudige vormen/paths/tekst.")
resp = await llm_call_openai_compat( resp = await llm_call_openai_compat(
[{"role":"system","content":sys},{"role":"user","content":user}], [{"role":"system","content":sys},{"role":"user","content":user}],
stream=False, temperature=0.35, top_p=0.9, max_tokens=2048 stream=False, temperature=0.035, top_p=0.9, max_tokens=2048
) )
svg = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","") svg = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
return _svg_wrap_if_needed(_sanitize_svg(svg), w, h, background) return _svg_wrap_if_needed(_sanitize_svg(svg), w, h, background)
@ -1668,7 +1749,7 @@ async def present_make(
f"Max. {max_slides} dia's, 36 bullets per dia.") f"Max. {max_slides} dia's, 36 bullets per dia.")
plan = await llm_call_openai_compat( plan = await llm_call_openai_compat(
[{"role":"system","content":sys},{"role":"user","content":user}], [{"role":"system","content":sys},{"role":"user","content":user}],
stream=False, temperature=0.3, top_p=0.9, max_tokens=13021 stream=False, temperature=0.03, top_p=0.9, max_tokens=13021
) )
raw = (plan.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}") raw = (plan.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}")
try: try:
@ -1712,7 +1793,7 @@ async def vision_ask(
file: UploadFile = File(...), file: UploadFile = File(...),
prompt: str = Form("Beschrijf kort wat je ziet."), prompt: str = Form("Beschrijf kort wat je ziet."),
stream: bool = Form(False), stream: bool = Form(False),
temperature: float = Form(0.2), temperature: float = Form(0.02),
top_p: float = Form(0.9), top_p: float = Form(0.9),
max_tokens: int = Form(1024), max_tokens: int = Form(1024),
): ):
@ -1743,7 +1824,7 @@ async def vision_and_text(
stream: bool = Form(False), stream: bool = Form(False),
max_images: int = Form(6), max_images: int = Form(6),
max_chars: int = Form(25000), max_chars: int = Form(25000),
temperature: float = Form(0.2), temperature: float = Form(0.02),
top_p: float = Form(0.9), top_p: float = Form(0.9),
max_tokens: int = Form(2048), max_tokens: int = Form(2048),
): ):
@ -1801,7 +1882,7 @@ async def vision_health():
# -------- Tool registry (OpenAI-style) -------- # -------- Tool registry (OpenAI-style) --------
LLM_FUNCTION_CALLING_MODE = os.getenv("LLM_FUNCTION_CALLING_MODE", "auto").lower() # "native" | "shim" | "auto" LLM_FUNCTION_CALLING_MODE = os.getenv("LLM_FUNCTION_CALLING_MODE", "auto").lower() # "native" | "shim" | "auto"
OWUI_BASE_URL='http://localhost:3000' OWUI_BASE_URL='http://192.168.100.1:8089'
OWUI_API_TOKEN='sk-f1b7991b054442b5ae388de905019726' OWUI_API_TOKEN='sk-f1b7991b054442b5ae388de905019726'
# Aliassen zodat oudere codepaths blijven werken # Aliassen zodat oudere codepaths blijven werken
OWUI_BASE = OWUI_BASE_URL OWUI_BASE = OWUI_BASE_URL
@ -1976,6 +2057,17 @@ async def t_run_shell(args: dict) -> dict:
async def _execute_tool(name: str, args: dict) -> dict: async def _execute_tool(name: str, args: dict) -> dict:
logger.info("toolcall: "+str(name)+" ("+str(args)+")") logger.info("toolcall: "+str(name)+" ("+str(args)+")")
required=[]
if name in TOOLS_REGISTRY:
required=TOOLS_REGISTRY[name]["parameters"]["required"]
else:
return {"error": f"Unknown tool '{name}'."}
if not all(k in args and args[k] for k in required):
return {"error": f"Missing required arguments for tool '{name}'. Required: {required}"}
for k in args:
if k in required:
if args[k] in ['',None]:
return {"error": f"Missing required arguments for tool '{name}'. Required: {required}"}
if name == "repo_grep": if name == "repo_grep":
repo_url = args.get("repo_url","") repo_url = args.get("repo_url","")
branch = args.get("branch","main") branch = args.get("branch","main")
@ -2050,26 +2142,33 @@ async def _execute_tool(name: str, args: dict) -> dict:
}) })
return out return out
if name == "rag_query": if name == "rag_query":
out= await run_in_threadpool(_rag_index_repo_sync, **{ try:
"repo_url": args.get("repo",""), out= await run_in_threadpool(_rag_index_repo_sync, **{
"branch": "main", "repo_url": args.get("repo",""),
"profile": "auto", "branch": "main",
"include": "", "profile": "auto",
"exclude_dirs": "", "include": "",
"chunk_chars": 3000, "exclude_dirs": "",
"overlap": 400, "chunk_chars": 3000,
"collection_name": "code_docs", "overlap": 400,
"force": False, "collection_name": "code_docs",
}) "force": False,
out = await rag_query_api( })
query=args.get("query",""), except Exception as e:
n_results=int(args.get("n_results",5)), return {"error": f"Error for functioncall '{name}', while doing repo_index. errortext: {str(e)}"}
collection_name=_norm_collection_name(args.get("collection_name","code_docs"), "code_docs"), try:
repo=args.get("repo"), out = await rag_query_api(
path_contains=args.get("path_contains"), query=args.get("query",""),
profile=args.get("profile") n_results=int(args.get("n_results",5)),
) collection_name=_norm_collection_name(args.get("collection_name","code_docs"), "code_docs"),
return out repo=args.get("repo"),
path_contains=args.get("path_contains"),
profile=args.get("profile")
)
return out
except Exception as e:
return {"error": f"Error for functioncall '{name}', while doing repo_query. errortext: {str(e)}"}
# Console tools # Console tools
if name == "run_shell": if name == "run_shell":
@ -2080,7 +2179,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
# Repo # Repo
if name == "repo_qa": if name == "repo_qa":
# High-level QA over een specifieke repo. # High-level QA over een specifieke repo.
out=json.dumps(await repo_qa_answer(repo_hint=args.get("repo"),question=args.get("question"),branch=args.get("branch","main"),n_ctx=10), ensure_ascii=False) out=json.dumps(await repo_qa_answer(repo_hint=args.get("repo").replace('"','').replace("'",""),question=args.get("question"),branch=args.get("branch","main"),n_ctx=10), ensure_ascii=False)
return out return out
# Web tools # Web tools
@ -2203,7 +2302,7 @@ TOOLS_REGISTRY = {
"repo_url":{"type":"string"}, "repo_url":{"type":"string"},
"branch":{"type":"string","default":"main"}, "branch":{"type":"string","default":"main"},
"query":{"type":"string"}, "query":{"type":"string"},
"max_hits":{"type":"integer","default":200} "max_hits":{"type":"integer","default":10}
}, },
"required":["repo_url","query"] "required":["repo_url","query"]
} }
@ -2250,7 +2349,7 @@ TOOLS_REGISTRY = {
"path_contains":{"type":["string","null"]}, "path_contains":{"type":["string","null"]},
"profile":{"type":["string","null"]} "profile":{"type":["string","null"]}
}, },
"required":["query"] "required":["query","repo"]
} }
}, },
"web_search_xng": { "web_search_xng": {
@ -2286,7 +2385,7 @@ TOOLS_REGISTRY = {
"repo":{"type":"string"}, "repo":{"type":"string"},
"question":{"type":"string"}, "question":{"type":"string"},
"branch":{"type":"string"}, "branch":{"type":"string"},
},"required":["repo_hint","question"]} },"required":["repo","question"]}
}, },
"summarize_text": { "summarize_text": {
"description": "Vat tekst samen in bullets met inleiding en actiepunten.", "description": "Vat tekst samen in bullets met inleiding en actiepunten.",
@ -2490,7 +2589,7 @@ async def llm_call_autocont(
*, *,
model: Optional[str] = None, model: Optional[str] = None,
stream: bool = False, stream: bool = False,
temperature: float = 0.2, temperature: float = 0.02,
top_p: float = 0.9, top_p: float = 0.9,
max_tokens: int = 1024, max_tokens: int = 1024,
extra: Optional[dict] = None, extra: Optional[dict] = None,
@ -2581,7 +2680,8 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
stream = bool(body.get("stream", False)) stream = bool(body.get("stream", False))
raw_messages = body.get("messages") or [] raw_messages = body.get("messages") or []
# normaliseer tool-berichten naar plain tekst voor het LLM # normaliseer tool-berichten naar plain tekst voor het LLM
if False: NORMALIZE_TOOL_MESSAGES = os.getenv("NORMALIZE_TOOL_MESSAGES", "0").lower() not in ("0","false","no")
if NORMALIZE_TOOL_MESSAGES:
norm_messages = [] norm_messages = []
for m in raw_messages: for m in raw_messages:
if m.get("role") == "tool": if m.get("role") == "tool":
@ -2616,7 +2716,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
logger.info("🧰 tools_count=%s, tool_choice=%s", len(tools), tool_choice_req) logger.info("🧰 tools_count=%s, tool_choice=%s", len(tools), tool_choice_req)
except Exception: except Exception:
pass pass
if not stream: if RUN_BRIDGE and not stream:
# OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt". # OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt".
# Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool". # Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool".
@ -2648,7 +2748,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
client = app.state.HTTPX client = app.state.HTTPX
r = await client.post(LLM_URL, json=passthrough) r = await client.post(LLM_URL, json=passthrough)
try: try:
return JSONResponse(r.json(), status_code=r.status_code) data = r.json()
data = _coerce_text_toolcalls_to_openai(data)
return JSONResponse(data, status_code=r.status_code)
except Exception: except Exception:
return PlainTextResponse(r.text, status_code=r.status_code) return PlainTextResponse(r.text, status_code=r.status_code)
@ -2700,7 +2802,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
client = app.state.HTTPX client = app.state.HTTPX
r = await client.post(LLM_URL, json=passthrough) r = await client.post(LLM_URL, json=passthrough)
try: try:
return JSONResponse(r.json(), status_code=r.status_code) data = r.json()
data = _coerce_text_toolcalls_to_openai(data)
return JSONResponse(data, status_code=r.status_code)
except Exception: except Exception:
return PlainTextResponse(r.text, status_code=r.status_code) return PlainTextResponse(r.text, status_code=r.status_code)
@ -2849,6 +2953,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
if LLM_FUNCTION_CALLING_MODE in ("native","auto") and stream: if LLM_FUNCTION_CALLING_MODE in ("native","auto") and stream:
passthrough = dict(body); passthrough["messages"]=messages passthrough = dict(body); passthrough["messages"]=messages
if images_b64: passthrough["images"]=images_b64 if images_b64: passthrough["images"]=images_b64
STREAM_TOOLCALL_COERCE = os.getenv("STREAM_TOOLCALL_COERCE","1").lower() not in ("0","false","no")
async def _aiter(): async def _aiter():
import asyncio, contextlib import asyncio, contextlib
client = app.state.HTTPX client = app.state.HTTPX
@ -2867,6 +2972,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
await q.put(b"__EOF__") await q.put(b"__EOF__")
reader_task = asyncio.create_task(_reader()) reader_task = asyncio.create_task(_reader())
try: try:
buf = ""
acc = ""
suppress = False
while True: while True:
try: try:
chunk = await asyncio.wait_for(q.get(), timeout=HEARTBEAT) chunk = await asyncio.wait_for(q.get(), timeout=HEARTBEAT)
@ -2875,7 +2983,91 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
continue continue
if chunk == b"__EOF__": if chunk == b"__EOF__":
break break
yield chunk if not STREAM_TOOLCALL_COERCE:
yield chunk
continue
try:
buf += chunk.decode("utf-8", errors="ignore")
except Exception:
yield chunk
continue
# SSE events zijn gescheiden door een lege regel
while "\n\n" in buf:
event, buf = buf.split("\n\n", 1)
if not event:
continue
if event.startswith(":"):
yield (event + "\n\n").encode("utf-8")
continue
lines = event.splitlines()
data_lines = [ln[5:].lstrip() for ln in lines if ln.startswith("data:")]
if not data_lines:
if not suppress:
yield (event + "\n\n").encode("utf-8")
continue
data_s = "\n".join(data_lines).strip()
if data_s == "[DONE]":
yield b"data: [DONE]\n\n"
return
try:
obj = json.loads(data_s)
except Exception:
if not suppress:
yield (event + "\n\n").encode("utf-8")
continue
try:
ch0 = (obj.get("choices") or [{}])[0] or {}
delta = ch0.get("delta") or {}
except Exception:
delta = {}
# Als upstream al echte tool_calls streamt: pass-through
if isinstance(delta, dict) and delta.get("tool_calls"):
if not suppress:
yield ("data: " + json.dumps(obj, ensure_ascii=False) + "\n\n").encode("utf-8")
continue
content = (delta.get("content") if isinstance(delta, dict) else None)
if isinstance(content, str) and content:
acc += content
if "[TOOL_CALLS" in acc:
suppress = True # onderdruk de text-tag stream
calls = detect_toolcalls_any(acc) or []
if calls:
created = int(time.time())
chunk_id = obj.get("id") or f"chatcmpl-{uuid.uuid4().hex[:24]}"
model_name = obj.get("model") or body.get("model") or "unknown"
tc_delta = []
for i, tc in enumerate(calls):
tcc = dict(tc)
tcc["index"] = i
tc_delta.append(tcc)
first = {
"id": chunk_id,
"object": "chat.completion.chunk",
"created": created,
"model": model_name,
"choices": [{
"index": 0,
"delta": {"role":"assistant", "tool_calls": tc_delta},
"finish_reason": None
}]
}
second = {
"id": chunk_id,
"object": "chat.completion.chunk",
"created": created,
"model": model_name,
"choices": [{
"index": 0,
"delta": {},
"finish_reason": "tool_calls"
}]
}
yield ("data: " + json.dumps(first, ensure_ascii=False) + "\n\n").encode("utf-8")
yield ("data: " + json.dumps(second, ensure_ascii=False) + "\n\n").encode("utf-8")
yield b"data: [DONE]\n\n"
return
if not suppress:
yield ("data: " + json.dumps(obj, ensure_ascii=False) + "\n\n").encode("utf-8")
finally: finally:
reader_task.cancel() reader_task.cancel()
with contextlib.suppress(Exception): with contextlib.suppress(Exception):
@ -2895,65 +3087,76 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
if images_b64: passthrough["images"]=images_b64 if images_b64: passthrough["images"]=images_b64
r = await client.post(LLM_URL, json=passthrough) r = await client.post(LLM_URL, json=passthrough)
try: try:
return JSONResponse(r.json(), status_code=r.status_code) data = r.json()
data = _coerce_text_toolcalls_to_openai(data)
return JSONResponse(data, status_code=r.status_code)
except Exception: except Exception:
return PlainTextResponse(r.text, status_code=r.status_code) return PlainTextResponse(r.text, status_code=r.status_code)
# (A) 1e call: vraag de LLM om tool_calls (geen stream) # Relay-modus: iteratief tools uitvoeren totdat de LLM stopt met tool_calls
first_req = dict(body) max_rounds = int(os.getenv("LLM_TOOL_MAX_ROUNDS", "5"))
first_req["messages"] = messages follow_messages = messages
first_req["stream"] = False last_status = None
if images_b64: first_req["images"] = images_b64 for _round in range(max_rounds):
r1 = await client.post(LLM_URL, json=first_req) req_i = dict(body)
try: req_i["messages"] = follow_messages
data1 = r1.json() req_i["stream"] = False
except Exception: if images_b64: req_i["images"] = images_b64
return PlainTextResponse(r1.text, status_code=r1.status_code) r_i = await client.post(LLM_URL, json=req_i)
msg1 = ((data1.get("choices") or [{}])[0].get("message") or {}) last_status = r_i.status_code
tool_calls = msg1.get("tool_calls") or []
# Geen tool-calls? Geef direct door.
if not tool_calls:
return JSONResponse(data1, status_code=r1.status_code)
# (B) voer tool_calls lokaal uit
tool_msgs = []
for tc in tool_calls:
fn = ((tc or {}).get("function") or {})
tname = fn.get("name")
raw_args = fn.get("arguments") or "{}"
try: try:
args = json.loads(raw_args) if isinstance(raw_args, str) else (raw_args or {}) data_i = r_i.json()
except Exception: except Exception:
args = {} return PlainTextResponse(r_i.text, status_code=r_i.status_code)
if not tname or tname not in TOOLS_REGISTRY: msg_i = ((data_i.get("choices") or [{}])[0].get("message") or {})
out = {"error": f"Unknown tool '{tname}'"} tool_calls = msg_i.get("tool_calls") or []
else: # Fallback: sommige backends gooien toolcalls als tekst (bv. [TOOL_CALLS])
try: if not tool_calls:
out = await _execute_tool(tname, args) txt = (msg_i.get("content") or "")
except Exception as e: tool_calls = detect_toolcalls_any(txt) or []
out = {"error": str(e)} # Geen tool-calls? Geef direct door.
tool_msgs.append({ if not tool_calls:
"role": "tool", data_i = _coerce_text_toolcalls_to_openai(data_i)
"tool_call_id": tc.get("id"), return JSONResponse(data_i, status_code=r_i.status_code)
"name": tname or "unknown",
"content": json.dumps(out, ensure_ascii=False)
})
# (C) 2e call: geef tool outputs terug aan LLM voor eindantwoord # Tools uitvoeren
follow_messages = messages + [ tool_msgs = []
{"role": "assistant", "tool_calls": tool_calls}, for tc in tool_calls:
*tool_msgs # Normaliseer tc structuur
] tc_id = (tc or {}).get("id") or f"call_{uuid.uuid4().hex[:8]}"
second_req = dict(body) fn = ((tc or {}).get("function") or {})
second_req["messages"] = follow_messages tname = fn.get("name")
second_req["stream"] = False logger.info(f"Running tool: '{tname}'")
# images opnieuw meesturen is niet nodig, maar kan geen kwaad: raw_args = fn.get("arguments") or "{}"
if images_b64: second_req["images"] = images_b64 try:
r2 = await client.post(LLM_URL, json=second_req) args = json.loads(raw_args) if isinstance(raw_args, str) else (raw_args or {})
try: except Exception:
return JSONResponse(r2.json(), status_code=r2.status_code) args = {}
except Exception: if not tname or tname not in TOOLS_REGISTRY:
return PlainTextResponse(r2.text, status_code=r2.status_code) out = {"error": f"Unknown tool '{tname}'"}
else:
try:
out = await _execute_tool(tname, args)
except Exception as e:
out = {"error": str(e)}
tool_msgs.append({
"role": "tool",
"tool_call_id": tc_id,
"name": tname or "unknown",
"content": json.dumps(out, ensure_ascii=False)
})
# Zorg dat assistant tool_calls een id heeft
if isinstance(tc, dict) and not tc.get("id"):
tc["id"] = tc_id
follow_messages = follow_messages + [
{"role": "assistant", "tool_calls": tool_calls},
*tool_msgs
]
# Te veel tool-rondes → stop om loops te voorkomen
safe_msg = f"Te veel tool-rondes ({max_rounds}). Stop om loop te voorkomen."
return JSONResponse(_openai_chat_response(model, safe_msg, follow_messages), status_code=(last_status or 200))
# shim (non-stream) # shim (non-stream)
if LLM_FUNCTION_CALLING_MODE == "shim" and not stream: if LLM_FUNCTION_CALLING_MODE == "shim" and not stream:
@ -2967,7 +3170,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
"Otherwise reply with ONLY: {\"final_answer\":\"...\"}\n\nTools:\n" + "\n".join(tool_lines)) "Otherwise reply with ONLY: {\"final_answer\":\"...\"}\n\nTools:\n" + "\n".join(tool_lines))
decide = await llm_call_openai_compat( decide = await llm_call_openai_compat(
[{"role":"system","content":sys}] + messages, [{"role":"system","content":sys}] + messages,
stream=False, temperature=float(body.get("temperature",0.2)), stream=False, temperature=float(body.get("temperature",0.02)),
top_p=float(body.get("top_p",0.9)), max_tokens=min(512, int(body.get("max_tokens",1024))), top_p=float(body.get("top_p",0.9)), max_tokens=min(512, int(body.get("max_tokens",1024))),
extra=extra_payload if extra_payload else None extra=extra_payload if extra_payload else None
) )
@ -3006,7 +3209,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
] ]
final = await llm_call_openai_compat( final = await llm_call_openai_compat(
follow, stream=False, follow, stream=False,
temperature=float(body.get("temperature",0.2)), temperature=float(body.get("temperature",0.02)),
top_p=float(body.get("top_p",0.9)), top_p=float(body.get("top_p",0.9)),
max_tokens=int(body.get("max_tokens",1024)), max_tokens=int(body.get("max_tokens",1024)),
extra=extra_payload if extra_payload else None extra=extra_payload if extra_payload else None
@ -3023,7 +3226,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
LLM_WINDOWING_ENABLE = os.getenv("LLM_WINDOWING_ENABLE", "1").lower() not in ("0","false","no") LLM_WINDOWING_ENABLE = os.getenv("LLM_WINDOWING_ENABLE", "1").lower() not in ("0","false","no")
MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "13021")) MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "13021"))
RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024")) RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024"))
temperature = float(body.get("temperature", 0.2)) temperature = float(body.get("temperature", 0.02))
top_p = float(body.get("top_p", 0.9)) top_p = float(body.get("top_p", 0.9))
# respecteer env-override voor default # respecteer env-override voor default
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "1024")) _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "1024"))
@ -3051,7 +3254,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
{"role":"system","content":"Je bent een bondige notulist. Vat samen in max 10 bullets (feiten/besluiten/acties)."}, {"role":"system","content":"Je bent een bondige notulist. Vat samen in max 10 bullets (feiten/besluiten/acties)."},
{"role":"user","content": f"Vorige samenvatting:\n{old}\n\nNieuwe geschiedenis:\n{chunk_text}\n\nGeef geüpdatete samenvatting."} {"role":"user","content": f"Vorige samenvatting:\n{old}\n\nNieuwe geschiedenis:\n{chunk_text}\n\nGeef geüpdatete samenvatting."}
] ]
resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.1, top_p=1.0, max_tokens=300) resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.01, top_p=1.0, max_tokens=300)
return (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content", old or "") return (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content", old or "")
trimmed_stream_msgs = await win.build_within_budget(system_prompt=None, summarizer=_summarizer) trimmed_stream_msgs = await win.build_within_budget(system_prompt=None, summarizer=_summarizer)
new_summary = getattr(win, "running_summary", running_summary) new_summary = getattr(win, "running_summary", running_summary)
@ -3116,7 +3319,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
else: else:
# --- ÉCHTE streaming (geen tools): direct passthrough met heartbeats --- # --- ÉCHTE streaming (geen tools): direct passthrough met heartbeats ---
if stream: if stream:
temperature = float(body.get("temperature", 0.2)) temperature = float(body.get("temperature", 0.02))
top_p = float(body.get("top_p", 0.9)) top_p = float(body.get("top_p", 0.9))
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021")) _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021"))
max_tokens = int(body.get("max_tokens", _default_max)) max_tokens = int(body.get("max_tokens", _default_max))
@ -3138,7 +3341,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "42000")) MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "42000"))
RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024")) RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024"))
MAX_AUTOCONT = int(os.getenv("LLM_AUTO_CONTINUES", "2")) MAX_AUTOCONT = int(os.getenv("LLM_AUTO_CONTINUES", "2"))
temperature = float(body.get("temperature", 0.2)) temperature = float(body.get("temperature", 0.02))
top_p = float(body.get("top_p", 0.9)) top_p = float(body.get("top_p", 0.9))
# Laat env de default bepalen, zodat OWUI niet hard op 1024 blijft hangen # Laat env de default bepalen, zodat OWUI niet hard op 1024 blijft hangen
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "42000")) _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "42000"))
@ -3165,7 +3368,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
{"role":"system","content":"Je bent een bondige notulist. Vat samen in max 10 bullets (feiten/besluiten/acties)."}, {"role":"system","content":"Je bent een bondige notulist. Vat samen in max 10 bullets (feiten/besluiten/acties)."},
{"role":"user","content": f"Vorige samenvatting:\n{old}\n\nNieuwe geschiedenis:\n{chunk_text}\n\nGeef geüpdatete samenvatting."} {"role":"user","content": f"Vorige samenvatting:\n{old}\n\nNieuwe geschiedenis:\n{chunk_text}\n\nGeef geüpdatete samenvatting."}
] ]
resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.1, top_p=1.0, max_tokens=300) resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.01, top_p=1.0, max_tokens=300)
return (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content", old or "") return (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content", old or "")
trimmed = await win.build_within_budget(system_prompt=None, summarizer=_summarizer) trimmed = await win.build_within_budget(system_prompt=None, summarizer=_summarizer)
new_summary = getattr(win, "running_summary", running_summary) new_summary = getattr(win, "running_summary", running_summary)
@ -3309,7 +3512,7 @@ async def _summarize_files_llm(items: list[tuple[str, str]]) -> dict[str, str]:
{"role":"user","content": f"Pad: {path}\n\nInhoud (ingekort):\n{snippet}\n\nAntwoord: "} {"role":"user","content": f"Pad: {path}\n\nInhoud (ingekort):\n{snippet}\n\nAntwoord: "}
] ]
try: try:
resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.1, top_p=1.0, max_tokens=64) resp = await llm_call_openai_compat(prompt, stream=False, temperature=0.01, top_p=1.0, max_tokens=64)
summ = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content","").strip() summ = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content","").strip()
except Exception: except Exception:
summ = "" summ = ""
@ -3807,14 +4010,19 @@ async def rag_query_api(
collection_name_eff = _collection_effective(collection_name) collection_name_eff = _collection_effective(collection_name)
col = _get_collection(collection_name_eff) col = _get_collection(collection_name_eff)
q_emb = _EMBEDDER.embed_query(query) q_emb = _EMBEDDER.embed_query(query)
where = {} # Chroma: $and/$or moet >=2 where-expressies bevatten.
conds = []
if repo: if repo:
# Accepteer zowel 'repo' (basename) als 'repo_full' (owner/repo) conds.append({"repo_full": {"$eq": repo}})
base = repo.rsplit("/", 1)[-1] if branch:
where = {"$and": [ conds.append({"branch": {"$eq": branch}})
{"repo_full": {"$eq": repo}} if profile:
]} conds.append({"profile": {"$eq": profile}})
if profile: where["profile"] = {"$eq": profile} where = None
if len(conds) == 1:
where = conds[0]
elif len(conds) >= 2:
where = {"$and": conds}
# ---- symbol hit set (repo-scoped) ---- # ---- symbol hit set (repo-scoped) ----
sym_hit_keys: set[str] = set() sym_hit_keys: set[str] = set()
@ -4113,7 +4321,7 @@ async def rag_query_api(
resp = await llm_call_openai_compat( resp = await llm_call_openai_compat(
[{"role":"system","content":"You are precise and return only valid JSON."}, [{"role":"system","content":"You are precise and return only valid JSON."},
{"role":"user","content": prompt+"\n\nOnly JSON array."}], {"role":"user","content": prompt+"\n\nOnly JSON array."}],
stream=False, temperature=0.0, top_p=1.0, max_tokens=1024 stream=False, temperature=0.01, top_p=1.0, max_tokens=1024
) )
try: try:
order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]")) order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]"))

View File

@ -1 +1 @@
docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=16288 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 mistral-api docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=0 -e LLM_CONTEXT_TOKENS=42000 -e LLM_RESPONSE_RESERVE=1024 -e 
LLM_AUTO_CONTINUES=0 -e LLM_FUNCTION_CALLING_MODE=auto -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 -e FORCE_ALL_TOOLS=0 -e AUTO_CONTINUE=0 -e STREAM_PREFER_DIRECT=1 mistral-api