openapi.json changes and native or bridged toolcalling changes

This commit is contained in:
admin 2026-01-08 16:10:04 +01:00
parent ddc1c56cd7
commit 4e2ffac24c
6 changed files with 425 additions and 187 deletions

View File

@ -1,9 +1,15 @@
FROM python:3.11-slim
# ===== Base met CUDA11.8 + cuDNN + conda =====
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
WORKDIR /app
# ===== Model caches op vaste paden (blijven in image) =====
# Hugging Face caches (embeddings) + XDG cache (o.a. whisper)
# Zorg dat conda libs altijd eerst gevonden worden
ENV LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH}
# P5000 = Pascal SM 6.1; handig voor (eventueel) on-the-fly builds
ENV TORCH_CUDA_ARCH_LIST="6.1"
# ===== Model caches op vaste paden =====
ENV HF_HOME=/opt/hf \
HUGGINGFACE_HUB_CACHE=/opt/hf \
TRANSFORMERS_CACHE=/opt/hf \
@ -11,83 +17,92 @@ ENV HF_HOME=/opt/hf \
XDG_CACHE_HOME=/opt/cache \
STT_MODEL=small
# Optioneel build-args om modelkeuzes te pinnen
ARG RAG_EMBEDDINGS=gte-multilingual # of: bge-small / e5-small / gte-base-en
ARG STT_MODEL_ARG=small # tiny | base | small | medium | large-v3, etc.
ARG RAG_EMBEDDINGS=gte-multilingual
ARG STT_MODEL_ARG=small
ENV RAG_EMBEDDINGS=${RAG_EMBEDDINGS}
ENV STT_MODEL=${STT_MODEL_ARG}
# maak directories nu al aan (rechten)
# directories
RUN mkdir -p /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper && \
chmod -R a+rX /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper
# ===== Alleen minimale apt utils (géén multimedia libs!) =====
RUN apt-get update && DEBIAN_FRONTEND=noninteractive \
apt-get install -y --no-install-recommends \
git curl build-essential ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# ===== Multimedia via conda-forge (alles uit één ecosysteem) =====
# - av 10 + ffmpeg<7 (past goed bij pyAV)
# - cairo/pango/gdk-pixbuf/pixman voor cairosvg stack
# VERVANG de vorige conda multimedia regel door deze:
# Tooling voor PyAV build
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
pkg-config git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential && rm -rf /var/lib/apt/lists/*
# FFmpeg via conda-forge (zodat je recente headers/libs hebt)
RUN conda config --system --set channel_priority flexible \
&& conda install -y -c conda-forge "ffmpeg>=6,<8" \
&& conda clean -afy
# Later in je pip stap:
# ... faster-whisper==1.0.0 zal av==11.* trekken en nu WEL kunnen bouwen tegen conda's FFmpeg 6
# ===== Python deps =====
COPY requirements.txt .
RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential
RUN pip install --upgrade pip
# jouw requirements
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25
#RUN pip cache purge
# losse extras (let op: av via conda, niet via pip!)
RUN pip install --no-cache-dir \
PyPDF2 python-multipart gitpython chromadb httpx meilisearch \
pandas openpyxl python-pptx faster-whisper==1.0.0 \
cairosvg sentence-transformers rank-bm25
RUN apt-get update && apt-get install -y --no-install-recommends \
wget ca-certificates libstdc++6 libatomic1 \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /opt/piper \
&& set -eux; \
URL="https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz"; \
wget -O /tmp/piper.tgz "$URL"; \
tar -xzf /tmp/piper.tgz -C /opt/piper --strip-components=1; \
ln -sf /opt/piper/piper /usr/local/bin/piper; \
rm -f /tmp/piper.tgz
# ===== Prefetch models during the build =====
# 1) SentenceTransformers (embeddings) — keep this mapping in sync with
#    _build_embedder() in app.py so runtime and prefetch agree on the model.
RUN python - <<'PY'
import os
from sentence_transformers import SentenceTransformer

mapping = {
    "gte-multilingual": "Alibaba-NLP/gte-multilingual-base",
    "bge-small": "BAAI/bge-small-en-v1.5",
    "e5-small": "intfloat/e5-small-v2",
    "gte-base-en": "thenlper/gte-base",
}
choice = os.environ.get("RAG_EMBEDDINGS", "gte-multilingual").lower()
hf_id = mapping.get(choice, "BAAI/bge-small-en-v1.5")

# cache_folder honours SENTENCE_TRANSFORMERS_HOME/HF_HOME, but we pin it explicitly
cache_root = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers")
local_dir = os.path.join(cache_root, "embedder")
os.makedirs(cache_root, exist_ok=True)

print("Downloading SentenceTransformer:", hf_id)
# Download on CPU only; the final snapshot is saved to local_dir so the
# runtime can load it without touching the network.
model = SentenceTransformer(hf_id, cache_folder=cache_root, device="cpu")  # download only
model.save(local_dir)
print("Prefetched SentenceTransformer:", hf_id)
PY
# 2) faster-whisper (STT) — cache in /opt/cache/whisper
# 2) faster-whisper (prefetch CPU-kant; runtime kan je device kiezen)
RUN python - <<'PY'
import os
from faster_whisper import WhisperModel
# Model name comes from the STT_MODEL env var (set from the STT_MODEL_ARG build-arg above)
name = os.environ.get("STT_MODEL","small")
# Cache under XDG_CACHE_HOME so the runtime finds the model at the same fixed path
cache_root = os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper")
os.makedirs(cache_root, exist_ok=True)
# Always CPU/INT8 at build time (no GPU is needed during the image build);
# the runtime can still choose its own device/compute type.
_ = WhisperModel(name, device="cpu", compute_type="int8", download_root=cache_root)
print("Prefetched faster-whisper:", name, "->", cache_root)
PY
# (optioneel) piper voice kun je hier ook voorcachen; laat ik nu achterwege omdat voice per omgeving wisselt.
# (optioneel) piper skip ik hier; kan later
# ===== App code =====
COPY app.py .
COPY queue_helper.py .
COPY agent_repo.py .
COPY windowing_utils.py .
COPY smart_rag.py .
COPY llm_client.py .
COPY web_search.py .
EXPOSE 8080
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@ -231,7 +231,7 @@ _embed_documents = None
# Non-invasief: behoudt hetzelfde response-shape als _llm_call.
# Harde cap van jouw Mistral-LLM docker (zoals je aangaf)
_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "13027"))
_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "42000"))
# Veiligheidsmarge voor headers/EOS/afwijkingen in token-raming
_BUDGET_SAFETY = int(os.getenv("LLM_BUDGET_SAFETY_TOKENS", "512"))
# Max aantal vervolgstappen als het net afgekapt lijkt
@ -3255,7 +3255,7 @@ def _prepare_contexts_under_budget(
question: str,
stack_summary_text: str,
*,
budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "7000")),
tok_len=approx_token_count
) -> List[dict]:
"""
@ -3392,7 +3392,7 @@ async def _llm_qa_answer(question: str, stack_summary_text: str, contexts: List[
# --- NIEUW: trim contexts onder tokenbudget ---
contexts = _prepare_contexts_under_budget(
contexts, question, stack_summary_text,
budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "7000")),
tok_len=approx_token_count
)
@ -3617,7 +3617,7 @@ async def propose_patches_without_apply(repo_path: str, candidates: List[str], u
# sla deze stap over; ga door naar volgende kandidaat
continue
last_err = None
for mx in [1024]:
for mx in [2048]:
try:
messages = [
{"role":"system","content":"Voer exact de gevraagde wijziging uit. GEEN extra refactors/best practices. Lever de volledige, werkende bestandinformatie als 1 codeblok."},
@ -4215,7 +4215,7 @@ async def handle_repo_agent(messages: List[dict], request) -> str:
)
llm_resp = await _llm_call(
[{"role":"system","content":sys},{"role":"user","content":user}],
stream=False, temperature=0.2, top_p=0.9, max_tokens=1536
stream=False, temperature=0.2, top_p=0.9, max_tokens=2048
)
out = (llm_resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
if out.strip():

257
app.py
View File

@ -14,7 +14,7 @@ import contextlib
from contextlib import contextmanager
import os, re, json, time, uuid, hashlib, logging, asyncio, fnmatch, threading
from dataclasses import dataclass
from typing import List, Dict, Optional, Union, Any
from typing import List, Dict, Optional, Union, Any, Callable
from pathlib import Path
from io import BytesIO
@ -37,6 +37,12 @@ AUTO_CONTINUE_MAX_ROUNDS = int(os.getenv("AUTO_CONTINUE_MAX_ROUNDS", "6"))
AUTO_CONTINUE_TAIL_CHARS = int(os.getenv("AUTO_CONTINUE_TAIL_CHARS", "600"))
from llm_client import init_llm_client, _sync_model_infer
from web_search import Tools, HelpFunctions, EventEmitter
import subprocess
import tempfile
# Optionele libs voor tekst-extractie
try:
@ -299,7 +305,8 @@ def _unique_id(route: APIRoute):
method = list(route.methods)[0].lower() if route.methods else "get"
return f"{route.name}_{route.path.replace('/', '_')}_{method}"
app = FastAPI(title="Mistral Bridge API",generate_unique_id_function=_unique_id)
app = FastAPI(title="Mistral Bridge API",openapi_url=None,generate_unique_id_function=_unique_id)#openapi_url=None,
app.add_middleware(
CORSMiddleware,
allow_credentials=True,
@ -531,9 +538,26 @@ class _Embedder:
return self._encode(docs)
def embed_query(self, q: str) -> list[float]:
    """Embed a single query string, with a small per-process LRU cache.

    The cache avoids re-embedding identical queries when several routed
    buckets are searched with the same question. Capacity comes from the
    RAG_Q_EMBED_CACHE env var (default 2048 entries).

    NOTE: the diff residue that returned before the cache logic has been
    removed; the e5 prefix is applied before the cache key is computed so
    hits and misses agree on the encoded text.
    """
    # e5-family models expect an instruction prefix on the query side
    if self.family == "e5":
        q = f"query: {q}"
    key = (q or "").strip()
    cache = getattr(self, "_q_cache", None)
    if cache is None:
        # Lazily created so instances built elsewhere still work
        from collections import OrderedDict
        self._q_cache = OrderedDict()
        self._q_cache_cap = int(os.getenv("RAG_Q_EMBED_CACHE", "2048"))
        cache = self._q_cache
    if key in cache:
        v = cache.pop(key)
        cache[key] = v  # re-insert: mark as most recently used
        return v
    v = self._encode([q])[0]
    cache[key] = v
    if len(cache) > getattr(self, "_q_cache_cap", 2048):
        cache.popitem(last=False)  # evict least-recently-used entry
    return v
def _build_embedder() -> _Embedder:
import inspect
@ -553,7 +577,16 @@ def _build_embedder() -> _Embedder:
model_name, family, slug = mapping[choice]
cache_dir = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers")
local_dir = os.path.join(cache_dir, "embedder")
st_kwargs = {"device": "cpu"}
# --- device auto-select: cuda als beschikbaar, anders cpu; override met RAG_EMBED_DEVICE ---
dev_req = os.getenv("RAG_EMBED_DEVICE", "auto").strip().lower()
dev = "cpu"
try:
import torch
if dev_req in ("cuda", "gpu", "auto") and torch.cuda.is_available():
dev = "cuda"
except Exception:
dev = "cpu"
st_kwargs = {"device": dev}
if os.path.isdir(local_dir):
# Prefetched model in image → gebruik dat
model_source = local_dir
@ -568,7 +601,9 @@ def _build_embedder() -> _Embedder:
if "trust_remote_code" in inspect.signature(SentenceTransformer).parameters:
st_kwargs["trust_remote_code"] = True
model = SentenceTransformer(model_source, **st_kwargs)
# optioneel: CPU thread-telling forceren
# logging: inzichtelijk maken waar embeds draaien
print(f"[embeddings] model={model_name} device={dev}")
# optioneel: CPU thread-telling forceren (fallback)
try:
thr = int(os.getenv("RAG_TORCH_THREADS", "0"))
if thr > 0:
@ -576,7 +611,7 @@ def _build_embedder() -> _Embedder:
torch.set_num_threads(thr)
except Exception:
pass
return _Embedder(slug=slug, family=family, model=model, device="cpu")
return _Embedder(slug=slug, family=family, model=model, device=dev)
except Exception as e:
print("ERROR building embedder:",str(e))
@ -835,7 +870,7 @@ async def llm_call_openai_compat(
stream: bool = False,
temperature: float = 0.2,
top_p: float = 0.9,
max_tokens: int = 13027,
max_tokens: int = 42000,
extra: Optional[dict] = None,
stop: Optional[Union[str, list[str]]] = None,
**kwargs
@ -846,7 +881,8 @@ async def llm_call_openai_compat(
"temperature": temperature,
"top_p": top_p,
"max_tokens": max_tokens,
"stream": bool(stream)
"stream": bool(stream),
"repeat_penalty": 1.5,
}
# OpenAI-compat: optionele stop-sequenties doorgeven indien aanwezig
if stop is not None:
@ -1447,6 +1483,77 @@ def _stt_transcribe_path(path: str, lang: str | None):
text = "".join(seg.text for seg in segments).strip()
return text, getattr(info, "language", None)
# Initialize the shared Tools instance (holds the default "valves" configuration)
tools_instance = Tools()

@app.post("/web/search/xng")
async def web_search_xng(
    query: str = Form(...),
    SEARXNG_ENGINE_API_BASE_URL: str = tools_instance.valves.SEARXNG_ENGINE_API_BASE_URL,
    IGNORED_WEBSITES: str = tools_instance.valves.IGNORED_WEBSITES,
    RETURNED_SCRAPPED_PAGES_NO: int = tools_instance.valves.RETURNED_SCRAPPED_PAGES_NO,
    SCRAPPED_PAGES_NO: int = tools_instance.valves.SCRAPPED_PAGES_NO,
    PAGE_CONTENT_WORDS_LIMIT: int = tools_instance.valves.PAGE_CONTENT_WORDS_LIMIT,
    CITATION_LINKS: bool = tools_instance.valves.CITATION_LINKS,
) -> JSONResponse:
    """Search the web using SearXNG and return the content of the relevant pages.

    Form fields: query (required) plus optional overrides for the SearXNG
    valves (engine base URL, ignored websites, page counts, word limit,
    citation links). Returns a JSON response with the search results.
    """
    if not query:
        raise HTTPException(status_code=400, detail="Lege query")
    # Apply the per-request overrides to the shared valves.
    # NOTE(review): this mutates module-global state, so concurrent requests
    # can observe each other's settings — confirm this is acceptable.
    tools_instance.valves = tools_instance.Valves(**{
        "SEARXNG_ENGINE_API_BASE_URL": SEARXNG_ENGINE_API_BASE_URL,
        "IGNORED_WEBSITES": IGNORED_WEBSITES,
        "RETURNED_SCRAPPED_PAGES_NO": RETURNED_SCRAPPED_PAGES_NO,
        "SCRAPPED_PAGES_NO": SCRAPPED_PAGES_NO,
        "PAGE_CONTENT_WORDS_LIMIT": PAGE_CONTENT_WORDS_LIMIT,
        "CITATION_LINKS": CITATION_LINKS,
    })
    result = await tools_instance.search_web(query)
    return JSONResponse(result)
@app.post("/web/get_website/xng")
async def get_website_xng(
    url: str = Form(...),
    SEARXNG_ENGINE_API_BASE_URL: str = tools_instance.valves.SEARXNG_ENGINE_API_BASE_URL,
    IGNORED_WEBSITES: str = tools_instance.valves.IGNORED_WEBSITES,
    PAGE_CONTENT_WORDS_LIMIT: int = tools_instance.valves.PAGE_CONTENT_WORDS_LIMIT,
    CITATION_LINKS: bool = tools_instance.valves.CITATION_LINKS,
) -> JSONResponse:
    """Scrape the given website and return its content.

    Form fields: url (required) plus optional overrides for the SearXNG
    valves. Returns a JSON response with the scraped page content.
    """
    if not url:
        raise HTTPException(status_code=400, detail="Lege URL")
    # Apply the per-request overrides to the shared valves.
    # NOTE(review): mutates module-global state — same concurrency caveat as
    # web_search_xng; confirm this is acceptable.
    tools_instance.valves = tools_instance.Valves(**{
        "SEARXNG_ENGINE_API_BASE_URL": SEARXNG_ENGINE_API_BASE_URL,
        "IGNORED_WEBSITES": IGNORED_WEBSITES,
        "PAGE_CONTENT_WORDS_LIMIT": PAGE_CONTENT_WORDS_LIMIT,
        "CITATION_LINKS": CITATION_LINKS,
    })
    result = await tools_instance.get_website(url)
    return JSONResponse(result)
@app.post("/v1/audio/transcriptions")
async def audio_transcriptions(
file: UploadFile = File(...),
@ -1521,6 +1628,22 @@ async def images_generations(payload: dict = Body(...)):
out_items.append({"b64_json": b64})
return {"created": int(time.time()), "data": out_items}
@app.get("/improve_web_query")
async def improve_web_query(request: Request, format: str = "proxy"):
    """Ask the LLM to rewrite a raw search query into an optimized one.

    Query params: text (the query to improve), max_chars (truncation limit,
    default 20000), objective and style (optional prompt overrides).
    The `format` parameter is currently unused; kept for caller compatibility.
    """
    qp = request.query_params
    limit = int(qp.get("max_chars", 20000))
    text = (qp.get("text") or "")[:limit]
    objective = qp.get("objective") or "Verbeter deze search query door de kern van de tekst te identificeren en deze om te zetten in een betere query. Gebruik AND, OR, of andere logische operatoren om de query te optimaliseren."
    style = qp.get("style") or "Hanteer best practices, behoud inhoudelijke betekenis."
    prompt = (
        f"Je taak is om deze search query te verbeteren: {text}\n"
        f"Objectief: {objective}\n"
        f"Stijl: {style}\n"
        f"Verbeterde query:"
    )
    resp = await llm_call_openai_compat(
        [{"role": "user", "content": prompt}],
        stream=False, max_tokens=200,
    )
    return resp
@app.get("/v1/images/health")
def images_health():
    """Report whether SVG→PNG conversion is available (cairosvg imported)."""
    svg_ok = cairosvg is not None
    return {"svg_to_png": svg_ok}
@ -1751,10 +1874,26 @@ def _normalize_files_arg(args: dict):
@app.get("/openapi.json", include_in_schema=False)
async def openapi_endpoint():
    """Serve a reduced OpenAPI schema containing only the /openapi/* tool routes.

    The app's automatic schema is disabled (openapi_url=None at construction),
    so this endpoint publishes just the tool-server routes, keeping the schema
    small for LLM tool discovery.
    """
    # Only concrete APIRoutes under /openapi/ are exposed.
    # (The previous prefix-list loop was dead code: its result was immediately
    # overwritten by this comprehension.)
    tool_routes = [
        r for r in app.routes
        if isinstance(r, APIRoute) and r.path.startswith("/openapi/")
    ]
    logger.info("OpenAPI tool_routes=%d", len(tool_routes))
    for r in tool_routes:
        logger.info(" tool route: %s", r.path)
    return get_openapi(
        title="Tool Server",
        version="0.1.0",
        routes=tool_routes,
    )
def _openai_tools_from_registry(reg: dict):
@ -1794,6 +1933,7 @@ def _parse_validation_results(text: str) -> list[str]:
return issues
async def _execute_tool(name: str, args: dict) -> dict:
logger.info("toolcall: "+str(name)+" ("+str(args)+")")
if name == "repo_grep":
repo_url = args.get("repo_url","")
branch = args.get("branch","main")
@ -1856,6 +1996,18 @@ async def _execute_tool(name: str, args: dict) -> dict:
)
return out
# Web tools
if name == "web_search_xng":
# Search the web using SearXNG and get the content of the relevant pages.
out=await web_search_xng(query=args.get("query",""))
return out
if name == "get_website_xng":
# Web scrape the website provided and get the content of it.
out=await get_website_xng(
url=args.get("url", ""))
return out
# Tekst tools
if name == "summarize_text":
text = (args.get("text") or "")[:int(args.get("max_chars",16000))]
@ -2010,6 +2162,22 @@ TOOLS_REGISTRY = {
"required":["query"]
}
},
"web_search_xng": {
"description": "Search the web using SearXNG and get the content of the relevant pages.",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"},
},"required":["query"]}
},
"get_website_xng": {
"description": "Web scrape the website provided and get the content of it.",
"parameters": {
"type": "object",
"properties": {
"url": {"type": "string"},
},"required":["url"]}
},
"summarize_text": {
"description": "Vat tekst samen in bullets met inleiding en actiepunten.",
"parameters": {"type":"object","properties":{
@ -2252,15 +2420,54 @@ async def llm_call_autocont(
# Bouw een standaard OpenAI-chat response met het samengevoegde antwoord
return _openai_chat_response(mdl, full_text, messages)
def _normalize_tools_and_choice(body: dict) -> None:
tools = body.get("tools") or []
tc = body.get("tool_choice")
# 1) OWUI: tool_choice="required" -> maak het OpenAI-compat
if tc == "required":
names = []
for t in tools:
fn = (t.get("function") or {}) if isinstance(t, dict) else {}
n = fn.get("name")
if n:
names.append(n)
uniq = list(dict.fromkeys(names))
if len(uniq) == 1:
# force die ene tool
body["tool_choice"] = {"type": "function", "function": {"name": uniq[0]}}
else:
# meerdere tools: upstream snapt "required" niet -> kies auto
body["tool_choice"] = "auto"
# 2) Sommige clients sturen tools als [{"name":..., "parameters":...}] i.p.v. OpenAI {"type":"function","function":{...}}
if tools and isinstance(tools, list) and isinstance(tools[0], dict):
if "type" not in tools[0] and "name" in tools[0]:
body["tools"] = [{"type": "function", "function": t} for t in tools]
# 3) (optioneel) backward compat: functions -> tools
if (not body.get("tools")) and body.get("functions"):
body["tools"] = [{"type": "function", "function": f} for f in body["functions"]]
body.pop("functions", None)
if body.get("function_call") and not body.get("tool_choice"):
# grof: map function_call->tool_choice
fc = body["function_call"]
if isinstance(fc, dict) and fc.get("name"):
body["tool_choice"] = {"type": "function", "function": {"name": fc["name"]}}
body.pop("function_call", None)
@app.post("/v1/chat/completions")
async def openai_chat_completions(body: dict = Body(...), request: Request = None):
model = (body.get("model") or os.getenv("LLM_MODEL", "mistral-medium")).strip()
#logging.info(str(body))
#logging.info(str(request))
_normalize_tools_and_choice(body)
logging.info(str(body))
logging.info(str(request))
stream = bool(body.get("stream", False))
raw_messages = body.get("messages") or []
# normaliseer tool-berichten naar plain tekst voor het LLM
if False:
norm_messages = []
for m in raw_messages:
if m.get("role") == "tool":
@ -2271,16 +2478,31 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
})
else:
norm_messages.append(m)
else:
norm_messages=raw_messages
# --- minimal tool-calling glue (laat rest van je functie intact) ---
tools = body.get("tools") or []
RUN_BRIDGE = os.getenv("LLM_TOOL_RUNNER", "bridge").lower() == "bridge"
# (optioneel maar vaak nodig) forceer jouw eigen tools i.p.v. wat OWUI meestuurt
if RUN_BRIDGE and os.getenv("FORCE_ALL_TOOLS", "1").lower() not in ("0","false","no"):
body["tools"] = _openai_tools_from_registry(_visible_registry(TOOLS_REGISTRY))
tools = body["tools"]
# bridge werkt het makkelijkst non-stream
if RUN_BRIDGE and stream and (body.get("tools") or []):
body["stream"] = False
stream = False
# 'tool_choice' hier alleen lezen; later in de native branch wordt opnieuw naar body gekeken
tool_choice_req = body.get("tool_choice") # 'auto' | 'none' | 'required' | {...}
try:
logger.info("🧰 tools_count=%s, tool_choice=%s", len(tools), tool_choice_req)
except Exception:
pass
if not stream:
# OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt".
# Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool".
@ -2388,7 +2610,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
# Snelle escape: bij streaming en geen expliciete 'required' tool -> forceer directe streaming
if stream and tools and tool_choice_req in (None, "auto", "none") and \
os.getenv("STREAM_PREFER_DIRECT", "1").lower() not in ("0","false","no"):
os.getenv("STREAM_PREFER_DIRECT", "0").lower() not in ("0","false","no"):
tools = [] # bypass tool glue zodat we rechtstreeks naar de echte streaming gaan
# (2) Auto: vraag de LLM om 1+ function calls te produceren
@ -2549,7 +2771,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
if LLM_FUNCTION_CALLING_MODE in ("native","auto") and not stream:
# Relay-modus: laat LLM tools kiezen, bridge voert uit, daarna 2e run.
relay = os.getenv("LLM_TOOL_RUNNER", "passthrough").lower() == "bridge"
relay = os.getenv("LLM_TOOL_RUNNER", "bridge").lower() == "bridge" #passthrough
client = app.state.HTTPX
if not relay:
passthrough = dict(body); passthrough["messages"]=messages
@ -2781,6 +3003,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
top_p = float(body.get("top_p", 0.9))
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021"))
max_tokens = int(body.get("max_tokens", _default_max))
logger.info("UP tool_choice=%r tools0=%s", body.get("tool_choice"), (body.get("tools") or [{}])[0])
return await llm_call_openai_compat(
messages,
model=model,
@ -2795,13 +3018,13 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
# --- non-stream: windowing + auto-continue (zoals eerder gepatcht) ---
LLM_WINDOWING_ENABLE = os.getenv("LLM_WINDOWING_ENABLE", "1").lower() not in ("0","false","no")
MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "13021"))
MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "42000"))
RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024"))
MAX_AUTOCONT = int(os.getenv("LLM_AUTO_CONTINUES", "2"))
temperature = float(body.get("temperature", 0.2))
top_p = float(body.get("top_p", 0.9))
# Laat env de default bepalen, zodat OWUI niet hard op 1024 blijft hangen
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13027"))
_default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "42000"))
max_tokens = int(body.get("max_tokens", _default_max))
trimmed = messages
@ -3689,7 +3912,7 @@ async def rag_query_api(
resp = await llm_call_openai_compat(
[{"role":"system","content":"You are precise and return only valid JSON."},
{"role":"user","content": prompt+"\n\nOnly JSON array."}],
stream=False, temperature=0.0, top_p=1.0, max_tokens=512
stream=False, temperature=0.0, top_p=1.0, max_tokens=1024
)
try:
order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]"))

View File

@ -1 +1 @@
docker run -d --name mistral-api --network host -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=13021 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ mistral-api
docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=16288 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 mistral-api

View File

@ -213,7 +213,7 @@ async def enrich_intent(llm_call_fn, messages: List[Dict]) -> Dict:
try:
resp = await llm_call_fn(
[{"role":"system","content":sys},{"role":"user","content":usr}],
stream=False, temperature=0.1, top_p=1.0, max_tokens=300
stream=False, temperature=0.1, top_p=1.0, max_tokens=512
)
raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}")
spec = _safe_json_loads(raw) or {"task": user_text, "constraints": [], "file_hints": [], "keywords": [], "acceptance": [], "ask": None} #json.loads(raw.strip())
@ -245,7 +245,7 @@ async def expand_queries(llm_call_fn, q: str, k: int = 3) -> List[str]:
try:
resp = await llm_call_fn(
[{"role":"system","content":sys},{"role":"user","content":usr}],
stream=False, temperature=0.2, top_p=0.9, max_tokens=120
stream=False, temperature=0.2, top_p=0.9, max_tokens=240
)
raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]")
arr = _safe_json_loads(raw) or []

View File

@ -156,9 +156,9 @@ def fit_context_under_budget(
def build_repo_context(
files_ranked: List[Tuple[str, str, float]],
per_chunk_tokens: int = 1200,
per_chunk_tokens: int = 2100,
overlap_tokens: int = 60,
ctx_budget_tokens: int = 4000,
ctx_budget_tokens: int = 5000,
tok_len: Callable[[str], int] = approx_token_count
) -> str:
expanded: List[Tuple[str,str]] = []