diff --git a/Dockerfile b/Dockerfile index de84bdc..e273002 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,15 @@ -FROM python:3.11-slim +# ===== Base met CUDA11.8 + cuDNN + conda ===== +FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime WORKDIR /app -# ===== Model caches op vaste paden (blijven in image) ===== -# Hugging Face caches (embeddings) + XDG cache (o.a. whisper) +# Zorg dat conda libs altijd eerst gevonden worden +ENV LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH} + +# P5000 = Pascal SM 6.1; handig voor (eventueel) on-the-fly builds +ENV TORCH_CUDA_ARCH_LIST="6.1" + +# ===== Model caches op vaste paden ===== ENV HF_HOME=/opt/hf \ HUGGINGFACE_HUB_CACHE=/opt/hf \ TRANSFORMERS_CACHE=/opt/hf \ @@ -11,83 +17,92 @@ ENV HF_HOME=/opt/hf \ XDG_CACHE_HOME=/opt/cache \ STT_MODEL=small -# Optioneel build-args om modelkeuzes te pinnen -ARG RAG_EMBEDDINGS=gte-multilingual # of: bge-small / e5-small / gte-base-en -ARG STT_MODEL_ARG=small # tiny | base | small | medium | large-v3, etc. +ARG RAG_EMBEDDINGS=gte-multilingual +ARG STT_MODEL_ARG=small ENV RAG_EMBEDDINGS=${RAG_EMBEDDINGS} ENV STT_MODEL=${STT_MODEL_ARG} -# maak directories nu al aan (rechten) +# directories RUN mkdir -p /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper && \ chmod -R a+rX /opt/hf /opt/cache /opt/sentence-transformers /opt/whisper +# ===== Alleen minimale apt utils (géén multimedia libs!) ===== +RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y --no-install-recommends \ + git curl build-essential ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# ===== Multimedia via conda-forge (alles uit één ecosysteem) ===== +# - av 10 + ffmpeg<7 (past goed bij pyAV) +# - cairo/pango/gdk-pixbuf/pixman voor cairosvg stack +# VERVANG de vorige conda multimedia regel door deze: +# Tooling voor PyAV build +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + pkg-config git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential && rm -rf /var/lib/apt/lists/* + +# FFmpeg via conda-forge (zodat je recente headers/libs hebt) +RUN conda config --system --set channel_priority flexible \ + && conda install -y -c conda-forge "ffmpeg>=6,<8" \ + && conda clean -afy + +# Later in je pip stap: +# ... faster-whisper==1.0.0 zal av==11.* trekken en nu WEL kunnen bouwen tegen conda’s FFmpeg 6 + +# ===== Python deps ===== COPY requirements.txt . -RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils pkg-config libavformat-dev libavcodec-dev libavdevice-dev libavutil-dev libavfilter-dev libswscale-dev libswresample-dev build-essential RUN pip install --upgrade pip +# jouw requirements RUN pip install --no-cache-dir -r requirements.txt -RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25 -#RUN pip cache purge +# losse extras (let op: av via conda, niet via pip!) +RUN pip install --no-cache-dir \ + PyPDF2 python-multipart gitpython chromadb httpx meilisearch \ + pandas openpyxl python-pptx faster-whisper==1.0.0 \ + cairosvg sentence-transformers rank-bm25 -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget ca-certificates libstdc++6 libatomic1 \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /opt/piper \ - && set -eux; \ - URL="https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz"; \ - wget -O /tmp/piper.tgz "$URL"; \ - tar -xzf /tmp/piper.tgz -C /opt/piper --strip-components=1; \ - ln -sf /opt/piper/piper /usr/local/bin/piper; \ - rm -f /tmp/piper.tgz +# ===== Prefetch modellen ===== -# ===== Prefetch modellen tijdens de build ===== -# 1) SentenceTransformers (embeddings) — volgens je mapping in app.py +# 1) SentenceTransformers RUN python - <<'PY' import os from sentence_transformers import SentenceTransformer mapping = { - "gte-multilingual": ("Alibaba-NLP/gte-multilingual-base"), - "bge-small": ("BAAI/bge-small-en-v1.5"), - "e5-small": ("intfloat/e5-small-v2"), - "gte-base-en": ("thenlper/gte-base"), + "gte-multilingual": "Alibaba-NLP/gte-multilingual-base", + "bge-small": "BAAI/bge-small-en-v1.5", + "e5-small": "intfloat/e5-small-v2", + "gte-base-en": "thenlper/gte-base", } choice = os.environ.get("RAG_EMBEDDINGS","gte-multilingual").lower() hf_id = mapping.get(choice, "BAAI/bge-small-en-v1.5") -# cache_folder respecteert SENTENCE_TRANSFORMERS_HOME/HF_HOME, maar we forceren expliciet: cache_root = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers") local_dir = os.path.join(cache_root, "embedder") -os.makedirs(cache_root, exist_ok = True) - +os.makedirs(cache_root, exist_ok=True) print("Downloading SentenceTransformer:", hf_id) -# Laat SentenceTransformer zelf hun cache doen (HF_HOME etc) -# wij saven het eindresultaat daarna naar local_dir -model = SentenceTransformer(hf_id, cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME","/opt/sentence-transformers")) +model = SentenceTransformer(hf_id, cache_folder=cache_root, device="cpu") # download only model.save(local_dir) print("Prefetched SentenceTransformer:", hf_id) PY -# 2) faster-whisper (STT) — cache in /opt/cache/whisper +# 2) faster-whisper (prefetch CPU-kant; runtime kan je device kiezen) RUN python - <<'PY' import os from faster_whisper import WhisperModel name = os.environ.get("STT_MODEL","small") cache_root = os.path.join(os.environ.get("XDG_CACHE_HOME","/opt/cache"), "whisper") os.makedirs(cache_root, exist_ok=True) -# Build-time altijd CPU/INT8 (geen GPU nodig tijdens build) _ = WhisperModel(name, device="cpu", compute_type="int8", download_root=cache_root) print("Prefetched faster-whisper:", name, "->", cache_root) PY -# (optioneel) piper voice kun je hier ook voorcachen; laat ik nu achterwege omdat voice per omgeving wisselt. - - +# (optioneel) piper skip ik hier; kan later +# ===== App code ===== COPY app.py . COPY queue_helper.py . COPY agent_repo.py . COPY windowing_utils.py . COPY smart_rag.py . COPY llm_client.py . +COPY web_search.py . EXPOSE 8080 - CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/agent_repo.py b/agent_repo.py index 4c6ec2e..a7913c3 100644 --- a/agent_repo.py +++ b/agent_repo.py @@ -231,7 +231,7 @@ _embed_documents = None # Non-invasief: behoudt hetzelfde response-shape als _llm_call. # Harde cap van jouw Mistral-LLM docker (zoals je aangaf) -_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "13027")) +_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "42000")) # Veiligheidsmarge voor headers/EOS/afwijkingen in token-raming _BUDGET_SAFETY = int(os.getenv("LLM_BUDGET_SAFETY_TOKENS", "512")) # Max aantal vervolgstappen als het net afgekapt lijkt @@ -3255,7 +3255,7 @@ def _prepare_contexts_under_budget( question: str, stack_summary_text: str, *, - budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")), + budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "7000")), tok_len=approx_token_count ) -> List[dict]: """ @@ -3392,7 +3392,7 @@ async def _llm_qa_answer(question: str, stack_summary_text: str, contexts: List[ # --- NIEUW: trim contexts onder tokenbudget --- contexts = _prepare_contexts_under_budget( contexts, question, stack_summary_text, - budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")), + budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "7000")), tok_len=approx_token_count ) @@ -3617,7 +3617,7 @@ async def propose_patches_without_apply(repo_path: str, candidates: List[str], u # sla deze stap over; ga door naar volgende kandidaat continue last_err = None - for mx in [1024]: + for mx in [2048]: try: messages = [ {"role":"system","content":"Voer exact de gevraagde wijziging uit. GEEN extra refactors/best practices. Lever de volledige, werkende bestandinformatie als 1 codeblok."}, @@ -4215,7 +4215,7 @@ async def handle_repo_agent(messages: List[dict], request) -> str: ) llm_resp = await _llm_call( [{"role":"system","content":sys},{"role":"user","content":user}], - stream=False, temperature=0.2, top_p=0.9, max_tokens=1536 + stream=False, temperature=0.2, top_p=0.9, max_tokens=2048 ) out = (llm_resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","") if out.strip(): diff --git a/app.py b/app.py index d2ca986..d9d647c 100644 --- a/app.py +++ b/app.py @@ -14,7 +14,7 @@ import contextlib from contextlib import contextmanager import os, re, json, time, uuid, hashlib, logging, asyncio, fnmatch, threading from dataclasses import dataclass -from typing import List, Dict, Optional, Union, Any +from typing import List, Dict, Optional, Union, Any, Callable from pathlib import Path from io import BytesIO @@ -37,6 +37,12 @@ AUTO_CONTINUE_MAX_ROUNDS = int(os.getenv("AUTO_CONTINUE_MAX_ROUNDS", "6")) AUTO_CONTINUE_TAIL_CHARS = int(os.getenv("AUTO_CONTINUE_TAIL_CHARS", "600")) from llm_client import init_llm_client, _sync_model_infer +from web_search import Tools, HelpFunctions, EventEmitter + +import subprocess +import tempfile + + # Optionele libs voor tekst-extractie try: @@ -299,7 +305,8 @@ def _unique_id(route: APIRoute): method = list(route.methods)[0].lower() if route.methods else "get" return f"{route.name}_{route.path.replace('/', '_')}_{method}" -app = FastAPI(title="Mistral Bridge API",generate_unique_id_function=_unique_id) + +app = FastAPI(title="Mistral Bridge API",openapi_url=None,generate_unique_id_function=_unique_id)#openapi_url=None, app.add_middleware( CORSMiddleware, allow_credentials=True, @@ -531,9 +538,26 @@ class _Embedder: return self._encode(docs) def embed_query(self, q: str) -> list[float]: + # e5 prefix blijft identiek if self.family == "e5": q = f"query: {q}" - return self._encode([q])[0] + # --- LRU cache (per proces) om dubbele embeds bij routed buckets te vermijden --- + key = (q or "").strip() + cache = getattr(self, "_q_cache", None) + if cache is None: + from collections import OrderedDict + self._q_cache = OrderedDict() + self._q_cache_cap = int(os.getenv("RAG_Q_EMBED_CACHE", "2048")) + cache = self._q_cache + if key in cache: + v = cache.pop(key) + cache[key] = v + return v + v = self._encode([q])[0] + cache[key] = v + if len(cache) > getattr(self, "_q_cache_cap", 2048): + cache.popitem(last=False) # evict oldest + return v def _build_embedder() -> _Embedder: import inspect @@ -553,7 +577,16 @@ def _build_embedder() -> _Embedder: model_name, family, slug = mapping[choice] cache_dir = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "/opt/sentence-transformers") local_dir = os.path.join(cache_dir, "embedder") - st_kwargs = {"device": "cpu"} + # --- device auto-select: cuda als beschikbaar, anders cpu; override met RAG_EMBED_DEVICE --- + dev_req = os.getenv("RAG_EMBED_DEVICE", "auto").strip().lower() + dev = "cpu" + try: + import torch + if dev_req in ("cuda", "gpu", "auto") and torch.cuda.is_available(): + dev = "cuda" + except Exception: + dev = "cpu" + st_kwargs = {"device": dev} if os.path.isdir(local_dir): # Prefetched model in image → gebruik dat model_source = local_dir @@ -568,7 +601,9 @@ def _build_embedder() -> _Embedder: if "trust_remote_code" in inspect.signature(SentenceTransformer).parameters: st_kwargs["trust_remote_code"] = True model = SentenceTransformer(model_source, **st_kwargs) - # optioneel: CPU thread-telling forceren + # logging: inzichtelijk maken waar embeds draaien + print(f"[embeddings] model={model_name} device={dev}") + # optioneel: CPU thread-telling forceren (fallback) try: thr = int(os.getenv("RAG_TORCH_THREADS", "0")) if thr > 0: @@ -576,7 +611,7 @@ def _build_embedder() -> _Embedder: torch.set_num_threads(thr) except Exception: pass - return _Embedder(slug=slug, family=family, model=model, device="cpu") + return _Embedder(slug=slug, family=family, model=model, device=dev) except Exception as e: print("ERROR building embedder:",str(e)) @@ -835,7 +870,7 @@ async def llm_call_openai_compat( stream: bool = False, temperature: float = 0.2, top_p: float = 0.9, - max_tokens: int = 13027, + max_tokens: int = 42000, extra: Optional[dict] = None, stop: Optional[Union[str, list[str]]] = None, **kwargs @@ -846,7 +881,8 @@ async def llm_call_openai_compat( "temperature": temperature, "top_p": top_p, "max_tokens": max_tokens, - "stream": bool(stream) + "stream": bool(stream), + "repeat_penalty": 1.5, } # OpenAI-compat: optionele stop-sequenties doorgeven indien aanwezig if stop is not None: @@ -1447,6 +1483,77 @@ def _stt_transcribe_path(path: str, lang: str | None): text = "".join(seg.text for seg in segments).strip() return text, getattr(info, "language", None) +# Initialize the Tools instance +tools_instance = Tools() + +@app.post("/web/search/xng") +async def web_search_xng( + query: str = Form(...), + SEARXNG_ENGINE_API_BASE_URL: str = tools_instance.valves.SEARXNG_ENGINE_API_BASE_URL, + IGNORED_WEBSITES: str = tools_instance.valves.IGNORED_WEBSITES, + RETURNED_SCRAPPED_PAGES_NO: int = tools_instance.valves.RETURNED_SCRAPPED_PAGES_NO, + SCRAPPED_PAGES_NO: int = tools_instance.valves.SCRAPPED_PAGES_NO, + PAGE_CONTENT_WORDS_LIMIT: int = tools_instance.valves.PAGE_CONTENT_WORDS_LIMIT, + CITATION_LINKS: bool = tools_instance.valves.CITATION_LINKS, +) -> str: + omsch=""" + Search the web using SearXNG and get the content of the relevant pages. + OpenAI-compat: { "query": "string", "SEARXNG_ENGINE_API_BASE_URL": "string", "IGNORED_WEBSITES": "string", "RETURNED_SCRAPPED_PAGES_NO": "integer", "SCRAPPED_PAGES_NO": "integer", "PAGE_CONTENT_WORDS_LIMIT": "integer", "CITATION_LINKS": "boolean" } + Return: JSON string with search results. + """ + #query = (body.get("query") or "").strip() + if not query: + raise HTTPException(status_code=400, detail="Lege query") + + # Extract parameters from the body + params = { + "SEARXNG_ENGINE_API_BASE_URL": SEARXNG_ENGINE_API_BASE_URL, + "IGNORED_WEBSITES": IGNORED_WEBSITES, + "RETURNED_SCRAPPED_PAGES_NO": RETURNED_SCRAPPED_PAGES_NO, + "SCRAPPED_PAGES_NO": SCRAPPED_PAGES_NO, + "PAGE_CONTENT_WORDS_LIMIT": PAGE_CONTENT_WORDS_LIMIT, + "CITATION_LINKS": CITATION_LINKS, + } + + # Update the valves with the provided parameters + tools_instance.valves = tools_instance.Valves(**params) + + # Call the existing search_web function from tools_instance + result = await tools_instance.search_web(query) + return JSONResponse(result) + +@app.post("/web/get_website/xng") +async def get_website_xng( + url: str = Form(...), + SEARXNG_ENGINE_API_BASE_URL: str = tools_instance.valves.SEARXNG_ENGINE_API_BASE_URL, + IGNORED_WEBSITES: str = tools_instance.valves.IGNORED_WEBSITES, + PAGE_CONTENT_WORDS_LIMIT: int = tools_instance.valves.PAGE_CONTENT_WORDS_LIMIT, + CITATION_LINKS: bool = tools_instance.valves.CITATION_LINKS, +) -> str: + omsch=""" + Web scrape the website provided and get the content of it. + OpenAI-compat: { "url": "string", "SEARXNG_ENGINE_API_BASE_URL": "string", "IGNORED_WEBSITES": "string", "PAGE_CONTENT_WORDS_LIMIT": "integer", "CITATION_LINKS": "boolean" } + Return: JSON string with website content. + """ + #url = (body.get("url") or "").strip() + if not url: + raise HTTPException(status_code=400, detail="Lege URL") + + # Extract parameters from the body + params = { + "SEARXNG_ENGINE_API_BASE_URL": SEARXNG_ENGINE_API_BASE_URL, + "IGNORED_WEBSITES": IGNORED_WEBSITES, + "PAGE_CONTENT_WORDS_LIMIT": PAGE_CONTENT_WORDS_LIMIT, + "CITATION_LINKS": CITATION_LINKS, + } + + # Update the valves with the provided parameters + tools_instance.valves = tools_instance.Valves(**params) + + # Call the existing get_website function from tools_instance + result = await tools_instance.get_website(url) + return JSONResponse(result) + @app.post("/v1/audio/transcriptions") async def audio_transcriptions( file: UploadFile = File(...), @@ -1521,6 +1628,22 @@ async def images_generations(payload: dict = Body(...)): out_items.append({"b64_json": b64}) return {"created": int(time.time()), "data": out_items} +@app.get("/improve_web_query") +async def improve_web_query(request: Request, format: str = "proxy"): + text = (request.query_params.get("text") or "")[:int(request.query_params.get("max_chars", 20000))] + objective = request.query_params.get("objective") or "Verbeter deze search query door de kern van de tekst te identificeren en deze om te zetten in een betere query. Gebruik AND, OR, of andere logische operatoren om de query te optimaliseren." + style = request.query_params.get("style") or "Hanteer best practices, behoud inhoudelijke betekenis." + + resp = await llm_call_openai_compat( + [{"role": "user", "content": f"" + f"Je taak is om deze search query te verbeteren: {text}\n" + f"Objectief: {objective}\n" + f"Stijl: {style}\n" + f"Verbeterde query:"}], + stream=False, max_tokens=200 + ) + return resp + @app.get("/v1/images/health") def images_health(): return {"svg_to_png": bool(cairosvg is not None)} @@ -1751,10 +1874,26 @@ def _normalize_files_arg(args: dict): @app.get("/openapi.json", include_in_schema=False) async def openapi_endpoint(): + tool_routes=[] + for r in app.routes: + if getattr(r, "path", "").split("/")[1] in ["v1","web","openapi","vision","file","rag","repo"]: + tool_routes.append(r) + logger.info("toolwithpath: %s",getattr(r, "path", "")) + tool_routes = [ + r for r in app.routes + if isinstance(r, APIRoute) and r.path.startswith("/openapi/") + ] + logger.info("OpenAPI tool_routes=%d", len(tool_routes)) + for r in tool_routes: + logger.info(" tool route: %s", r.path) + #tool_routes = [ + # r for r in app.routes + # if getattr(r, "path", "").startswith("/openapi/") + #] return get_openapi( title="Tool Server", version="0.1.0", - routes=app.routes, + routes=tool_routes, ) def _openai_tools_from_registry(reg: dict): @@ -1794,6 +1933,7 @@ def _parse_validation_results(text: str) -> list[str]: return issues async def _execute_tool(name: str, args: dict) -> dict: + logger.info("toolcall: "+str(name)+" ("+str(args)+")") if name == "repo_grep": repo_url = args.get("repo_url","") branch = args.get("branch","main") @@ -1856,6 +1996,18 @@ async def _execute_tool(name: str, args: dict) -> dict: ) return out + # Web tools + if name == "web_search_xng": + # Search the web using SearXNG and get the content of the relevant pages. + out=await web_search_xng(query=args.get("query","")) + return out + + if name == "get_website_xng": + # Web scrape the website provided and get the content of it. + out=await get_website_xng( + url=args.get("url", "")) + return out + # Tekst tools if name == "summarize_text": text = (args.get("text") or "")[:int(args.get("max_chars",16000))] @@ -2010,6 +2162,22 @@ TOOLS_REGISTRY = { "required":["query"] } }, + "web_search_xng": { + "description": "Search the web using SearXNG and get the content of the relevant pages.", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + },"required":["query"]} + }, + "get_website_xng": { + "description": "Web scrape the website provided and get the content of it.", + "parameters": { + "type": "object", + "properties": { + "url": {"type": "string"}, + },"required":["url"]} + }, "summarize_text": { "description": "Vat tekst samen in bullets met inleiding en actiepunten.", "parameters": {"type":"object","properties":{ @@ -2252,111 +2420,116 @@ async def llm_call_autocont( # Bouw een standaard OpenAI-chat response met het samengevoegde antwoord return _openai_chat_response(mdl, full_text, messages) +def _normalize_tools_and_choice(body: dict) -> None: + tools = body.get("tools") or [] + tc = body.get("tool_choice") + + # 1) OWUI: tool_choice="required" -> maak het OpenAI-compat + if tc == "required": + names = [] + for t in tools: + fn = (t.get("function") or {}) if isinstance(t, dict) else {} + n = fn.get("name") + if n: + names.append(n) + + uniq = list(dict.fromkeys(names)) + if len(uniq) == 1: + # force die ene tool + body["tool_choice"] = {"type": "function", "function": {"name": uniq[0]}} + else: + # meerdere tools: upstream snapt "required" niet -> kies auto + body["tool_choice"] = "auto" + + # 2) Sommige clients sturen tools als [{"name":..., "parameters":...}] i.p.v. OpenAI {"type":"function","function":{...}} + if tools and isinstance(tools, list) and isinstance(tools[0], dict): + if "type" not in tools[0] and "name" in tools[0]: + body["tools"] = [{"type": "function", "function": t} for t in tools] + + # 3) (optioneel) backward compat: functions -> tools + if (not body.get("tools")) and body.get("functions"): + body["tools"] = [{"type": "function", "function": f} for f in body["functions"]] + body.pop("functions", None) + if body.get("function_call") and not body.get("tool_choice"): + # grof: map function_call->tool_choice + fc = body["function_call"] + if isinstance(fc, dict) and fc.get("name"): + body["tool_choice"] = {"type": "function", "function": {"name": fc["name"]}} + body.pop("function_call", None) + @app.post("/v1/chat/completions") async def openai_chat_completions(body: dict = Body(...), request: Request = None): model = (body.get("model") or os.getenv("LLM_MODEL", "mistral-medium")).strip() - #logging.info(str(body)) - #logging.info(str(request)) + _normalize_tools_and_choice(body) + logging.info(str(body)) + logging.info(str(request)) stream = bool(body.get("stream", False)) raw_messages = body.get("messages") or [] # normaliseer tool-berichten naar plain tekst voor het LLM - norm_messages = [] - for m in raw_messages: - if m.get("role") == "tool": - nm = m.get("name") or "tool" - norm_messages.append({ - "role": "user", - "content": f"[{nm} RESULT]\n{m.get('content') or ''}" - }) - else: - norm_messages.append(m) - + if False: + norm_messages = [] + for m in raw_messages: + if m.get("role") == "tool": + nm = m.get("name") or "tool" + norm_messages.append({ + "role": "user", + "content": f"[{nm} RESULT]\n{m.get('content') or ''}" + }) + else: + norm_messages.append(m) + else: + norm_messages=raw_messages # --- minimal tool-calling glue (laat rest van je functie intact) --- tools = body.get("tools") or [] + + RUN_BRIDGE = os.getenv("LLM_TOOL_RUNNER", "bridge").lower() == "bridge" + + # (optioneel maar vaak nodig) forceer jouw eigen tools i.p.v. wat OWUI meestuurt + if RUN_BRIDGE and os.getenv("FORCE_ALL_TOOLS", "1").lower() not in ("0","false","no"): + body["tools"] = _openai_tools_from_registry(_visible_registry(TOOLS_REGISTRY)) + tools = body["tools"] + + # bridge werkt het makkelijkst non-stream + if RUN_BRIDGE and stream and (body.get("tools") or []): + body["stream"] = False + stream = False + # 'tool_choice' hier alleen lezen; later in de native branch wordt opnieuw naar body gekeken tool_choice_req = body.get("tool_choice") # 'auto' | 'none' | 'required' | {...} try: logger.info("🧰 tools_count=%s, tool_choice=%s", len(tools), tool_choice_req) except Exception: pass + if not stream: - # OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt". - # Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool". - if tool_choice_req == "required" and tools: - names = [ (t.get("function") or {}).get("name") for t in tools if t.get("function") ] - names = [ n for n in names if n ] - # 1) exact 1 → force die - if len(set(names)) == 1: - tool_choice_req = {"type":"function","function":{"name": names[0]}} - else: - # 2) meerdere → kies op basis van user prompt (noem de toolnaam) - last_user = next((m for m in reversed(norm_messages) if m.get("role")=="user"), {}) - utext = (last_user.get("content") or "").lower() - mentioned = [n for n in names if n and n.lower() in utext] - if mentioned: - tool_choice_req = {"type":"function","function":{"name": mentioned[0]}} - logger.info("🔧 required->picked tool by mention: %s", mentioned[0]) + # OWUI stuurt vaak "required" als: "er MOET een tool worden gebruikt". + # Als er precies 1 tool is meegegeven, normaliseren we dat naar "force deze tool". + if tool_choice_req == "required" and tools: + names = [ (t.get("function") or {}).get("name") for t in tools if t.get("function") ] + names = [ n for n in names if n ] + # 1) exact 1 → force die + if len(set(names)) == 1: + tool_choice_req = {"type":"function","function":{"name": names[0]}} + else: + # 2) meerdere → kies op basis van user prompt (noem de toolnaam) + last_user = next((m for m in reversed(norm_messages) if m.get("role")=="user"), {}) + utext = (last_user.get("content") or "").lower() + mentioned = [n for n in names if n and n.lower() in utext] + if mentioned: + tool_choice_req = {"type":"function","function":{"name": mentioned[0]}} + logger.info("🔧 required->picked tool by mention: %s", mentioned[0]) - # (1) Force: OWUI dwingt een specifieke tool af - if isinstance(tool_choice_req, dict) and (tool_choice_req.get("type") == "function"): - fname = (tool_choice_req.get("function") or {}).get("name") - if fname and fname not in TOOLS_REGISTRY: - # Onbekende tool → laat de LLM zelf native tool_calls teruggeven. - passthrough = dict(body) - passthrough["messages"] = norm_messages - passthrough["stream"] = False - client = app.state.HTTPX - r = await client.post(LLM_URL, json=passthrough) - try: - return JSONResponse(r.json(), status_code=r.status_code) - except Exception: - return PlainTextResponse(r.text, status_code=r.status_code) - - if fname: - # Probeer lichte heuristiek voor bekende tools - last_user = next((m for m in reversed(norm_messages) if m.get("role")=="user"), {}) - utext = (last_user.get("content") or "") - args: dict = {} - - if fname == "rag_index_repo": - m = re.search(r'(https?://\S+)', utext) - if m: args["repo_url"] = m.group(1) - mb = re.search(r'\bbranch\s+([A-Za-z0-9._/-]+)', utext, re.I) - if mb: args["branch"] = mb.group(1) - elif fname == "rag_query": - args["query"] = utext.strip() - elif fname == "summarize_text": - m = re.search(r':\s*(.+)$', utext, re.S) - args["text"] = (m.group(1).strip() if m else utext.strip())[:16000] - elif fname == "analyze_text": - m = re.search(r':\s*(.+)$', utext, re.S) - args["text"] = (m.group(1).strip() if m else utext.strip())[:20000] - elif fname == "improve_text": - m = re.search(r':\s*(.+)$', utext, re.S) - args["text"] = (m.group(1).strip() if m else utext.strip())[:20000] - elif fname == "validate_code_text": - code = re.search(r"```.*?\n(.*?)```", utext, re.S) - args["code"] = (code.group(1).strip() if code else utext.strip()) - elif fname == "improve_code_text": - code = re.search(r"```.*?\n(.*?)```", utext, re.S) - args["code"] = (code.group(1).strip() if code else utext.strip()) - elif fname == "vision_analyze": - m = re.search(r'(data:image\/[a-zA-Z]+;base64,[A-Za-z0-9+/=]+|https?://\S+)', utext) - if m: args["image_url"] = m.group(1) - - # Check verplichte velden; zo niet → native passthrough met alleen deze tool - required = (TOOLS_REGISTRY.get(fname, {}).get("parameters", {}) or {}).get("required", []) - if not all(k in args and args[k] for k in required): + # (1) Force: OWUI dwingt een specifieke tool af + if isinstance(tool_choice_req, dict) and (tool_choice_req.get("type") == "function"): + fname = (tool_choice_req.get("function") or {}).get("name") + if fname and fname not in TOOLS_REGISTRY: + # Onbekende tool → laat de LLM zelf native tool_calls teruggeven. passthrough = dict(body) passthrough["messages"] = norm_messages - # Alleen deze tool meegeven + dwing deze tool af - only = [t for t in (body.get("tools") or []) if (t.get("function") or {}).get("name")==fname] - if only: passthrough["tools"] = only - passthrough["tool_choice"] = {"type":"function","function":{"name": fname}} passthrough["stream"] = False client = app.state.HTTPX r = await client.post(LLM_URL, json=passthrough) @@ -2365,52 +2538,101 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non except Exception: return PlainTextResponse(r.text, status_code=r.status_code) + if fname: + # Probeer lichte heuristiek voor bekende tools + last_user = next((m for m in reversed(norm_messages) if m.get("role")=="user"), {}) + utext = (last_user.get("content") or "") + args: dict = {} - # Heuristiek geslaagd → stuur tool_calls terug (compat met OWUI) - return { - "id": f"chatcmpl-{uuid.uuid4().hex}", - "object": "chat.completion", - "created": int(time.time()), - "model": model, - "choices": [{ - "index": 0, - "finish_reason": "tool_calls", - "message": { - "role": "assistant", - "tool_calls": [{ - "id": f"call_{uuid.uuid4().hex[:8]}", - "type": "function", - "function": {"name": fname, "arguments": json.dumps(args, ensure_ascii=False)} - }] - } - }] - } + if fname == "rag_index_repo": + m = re.search(r'(https?://\S+)', utext) + if m: args["repo_url"] = m.group(1) + mb = re.search(r'\bbranch\s+([A-Za-z0-9._/-]+)', utext, re.I) + if mb: args["branch"] = mb.group(1) + elif fname == "rag_query": + args["query"] = utext.strip() + elif fname == "summarize_text": + m = re.search(r':\s*(.+)$', utext, re.S) + args["text"] = (m.group(1).strip() if m else utext.strip())[:16000] + elif fname == "analyze_text": + m = re.search(r':\s*(.+)$', utext, re.S) + args["text"] = (m.group(1).strip() if m else utext.strip())[:20000] + elif fname == "improve_text": + m = re.search(r':\s*(.+)$', utext, re.S) + args["text"] = (m.group(1).strip() if m else utext.strip())[:20000] + elif fname == "validate_code_text": + code = re.search(r"```.*?\n(.*?)```", utext, re.S) + args["code"] = (code.group(1).strip() if code else utext.strip()) + elif fname == "improve_code_text": + code = re.search(r"```.*?\n(.*?)```", utext, re.S) + args["code"] = (code.group(1).strip() if code else utext.strip()) + elif fname == "vision_analyze": + m = re.search(r'(data:image\/[a-zA-Z]+;base64,[A-Za-z0-9+/=]+|https?://\S+)', utext) + if m: args["image_url"] = m.group(1) - # Snelle escape: bij streaming en geen expliciete 'required' tool -> forceer directe streaming - if stream and tools and tool_choice_req in (None, "auto", "none") and \ - os.getenv("STREAM_PREFER_DIRECT", "1").lower() not in ("0","false","no"): - tools = [] # bypass tool glue zodat we rechtstreeks naar de echte streaming gaan + # Check verplichte velden; zo niet → native passthrough met alleen deze tool + required = (TOOLS_REGISTRY.get(fname, {}).get("parameters", {}) or {}).get("required", []) + if not all(k in args and args[k] for k in required): + passthrough = dict(body) + passthrough["messages"] = norm_messages + # Alleen deze tool meegeven + dwing deze tool af + only = [t for t in (body.get("tools") or []) if (t.get("function") or {}).get("name")==fname] + if only: passthrough["tools"] = only + passthrough["tool_choice"] = {"type":"function","function":{"name": fname}} + passthrough["stream"] = False + client = app.state.HTTPX + r = await client.post(LLM_URL, json=passthrough) + try: + return JSONResponse(r.json(), status_code=r.status_code) + except Exception: + return PlainTextResponse(r.text, status_code=r.status_code) - # (2) Auto: vraag de LLM om 1+ function calls te produceren - if (tool_choice_req in (None, "auto")) and tools: - sys = _build_tools_system_prompt(tools) - ask = [{"role": "system", "content": sys}] + norm_messages - # jouw bestaande helper; hou 'm zoals je al gebruikt - resp = await llm_call_openai_compat(ask, stream=False, max_tokens=512) - txt = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content", "") or "" - calls = detect_toolcalls_any(txt) #_extract_tool_calls_from_text(txt) - if calls: - return { - "id": f"chatcmpl-{uuid.uuid4().hex}", - "object": "chat.completion", - "created": int(time.time()), - "model": model, - "choices": [{ - "index": 0, - "finish_reason": "tool_calls", - "message": {"role": "assistant", "tool_calls": calls} - }] - } + + # Heuristiek geslaagd → stuur tool_calls terug (compat met OWUI) + return { + "id": f"chatcmpl-{uuid.uuid4().hex}", + "object": "chat.completion", + "created": int(time.time()), + "model": model, + "choices": [{ + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "tool_calls": [{ + "id": f"call_{uuid.uuid4().hex[:8]}", + "type": "function", + "function": {"name": fname, "arguments": json.dumps(args, ensure_ascii=False)} + }] + } + }] + } + + # Snelle escape: bij streaming en geen expliciete 'required' tool -> forceer directe streaming + if stream and tools and tool_choice_req in (None, "auto", "none") and \ + os.getenv("STREAM_PREFER_DIRECT", "0").lower() not in ("0","false","no"): + tools = [] # bypass tool glue zodat we rechtstreeks naar de echte streaming gaan + + # (2) Auto: vraag de LLM om 1+ function calls te produceren + if (tool_choice_req in (None, "auto")) and tools: + sys = _build_tools_system_prompt(tools) + ask = [{"role": "system", "content": sys}] + norm_messages + # jouw bestaande helper; hou 'm zoals je al gebruikt + resp = await llm_call_openai_compat(ask, stream=False, max_tokens=512) + txt = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content", "") or "" + calls = detect_toolcalls_any(txt) #_extract_tool_calls_from_text(txt) + if calls: + return { + "id": f"chatcmpl-{uuid.uuid4().hex}", + "object": "chat.completion", + "created": int(time.time()), + "model": model, + "choices": [{ + "index": 0, + "finish_reason": "tool_calls", + "message": {"role": "assistant", "tool_calls": calls} + }] + } # --- einde minimal tool-calling glue --- # Vision normalisatie (na tool->tekst normalisatie) @@ -2549,7 +2771,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non if LLM_FUNCTION_CALLING_MODE in ("native","auto") and not stream: # Relay-modus: laat LLM tools kiezen, bridge voert uit, daarna 2e run. - relay = os.getenv("LLM_TOOL_RUNNER", "passthrough").lower() == "bridge" + relay = os.getenv("LLM_TOOL_RUNNER", "bridge").lower() == "bridge" #passthrough client = app.state.HTTPX if not relay: passthrough = dict(body); passthrough["messages"]=messages @@ -2781,6 +3003,7 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non top_p = float(body.get("top_p", 0.9)) _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13021")) max_tokens = int(body.get("max_tokens", _default_max)) + logger.info("UP tool_choice=%r tools0=%s", body.get("tool_choice"), (body.get("tools") or [{}])[0]) return await llm_call_openai_compat( messages, model=model, @@ -2795,13 +3018,13 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non # --- non-stream: windowing + auto-continue (zoals eerder gepatcht) --- LLM_WINDOWING_ENABLE = os.getenv("LLM_WINDOWING_ENABLE", "1").lower() not in ("0","false","no") - MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "13021")) + MAX_CTX_TOKENS = int(os.getenv("LLM_CONTEXT_TOKENS", "42000")) RESP_RESERVE = int(os.getenv("LLM_RESPONSE_RESERVE", "1024")) MAX_AUTOCONT = int(os.getenv("LLM_AUTO_CONTINUES", "2")) temperature = float(body.get("temperature", 0.2)) top_p = float(body.get("top_p", 0.9)) # Laat env de default bepalen, zodat OWUI niet hard op 1024 blijft hangen - _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "13027")) + _default_max = int(os.getenv("LLM_DEFAULT_MAX_TOKENS", "42000")) max_tokens = int(body.get("max_tokens", _default_max)) trimmed = messages @@ -3689,7 +3912,7 @@ async def rag_query_api( resp = await llm_call_openai_compat( [{"role":"system","content":"You are precise and return only valid JSON."}, {"role":"user","content": prompt+"\n\nOnly JSON array."}], - stream=False, temperature=0.0, top_p=1.0, max_tokens=512 + stream=False, temperature=0.0, top_p=1.0, max_tokens=1024 ) try: order = json.loads((resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]")) diff --git a/mistral-api.sh b/mistral-api.sh index 1291b57..b0187e0 100755 --- a/mistral-api.sh +++ b/mistral-api.sh @@ -1 +1 @@ -docker run -d --name mistral-api --network host -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=13021 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ mistral-api +docker run -d --rm --name mistral-api --network host -v /opt/SentenceTransformer:/opt/sentence-transformers -v /opt/piper/voices:/voices:ro -e LLM_TOOL_RUNNER=bridge -e LLM_UPSTREAMS="http://localhost:8000/v1/chat/completions,http://localhost:8001/v1/chat/completions" -e LLM_MAX_CONCURRENCY=2 -e REPO_AGENT_SMART=1 -e RAG_EXPAND_QUERIES=1 -e RAG_EXPAND_K=3 -e RAG_PER_QUERY_K=30 -e RAG_N_RESULT=8 -e RAG_EMB_WEIGHT=0.6 -e REPO_AGENT_CONTEXT_CHARS=24000 -e REPO_AGENT_ASK_CLARIFY=1 -e REPO_AGENT_ASK_THRESHOLD=0.35 -e PIPER_BIN=/usr/local/bin/piper -e PIPER_VOICE=/voices/nl_NL-mls-medium.onnx.gz -e LLM_WINDOWING_ENABLE=1 -e LLM_CONTEXT_TOKENS=16288 -e LLM_RESPONSE_RESERVE=1024 -e LLM_AUTO_CONTINUES=2 -e LLM_FUNCTION_CALLING_MODE=shim -e RAG_EMB_WEIGHT=0.6 -e LLM_URL="http://localhost:8000/v1/chat/completions" -e NO_PROXY="127.0.0.1,localhost,::1,host.docker.internal" -e RAG_TORCH_THREADS=6 -e OMP_NUM_THREADS=6 -e MKL_NUM_THREADS=6 -e OPENBLAS_NUM_THREADS=6 -e NUMEXPR_NUM_THREADS=6 -e LLM_READ_TIMEOUT=3600 -e NO_PROXY=localhost,127.0.0.1,::1,192.168.100.1,192.168.100.2 -e HTTP_PROXY=http://192.168.100.2:8118 -e HTTPS_PROXY=http://192.168.100.2:8118 -e MEILI_URL=http://localhost:7700 -e MEILI_KEY=0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ --gpus device=0 -e CUDA_VISIBLE_DEVICES=0 mistral-api diff --git a/smart_rag.py b/smart_rag.py index 55c1c2d..06d49f6 100644 --- a/smart_rag.py +++ b/smart_rag.py @@ -213,7 +213,7 @@ async def enrich_intent(llm_call_fn, messages: List[Dict]) -> Dict: try: resp = await llm_call_fn( [{"role":"system","content":sys},{"role":"user","content":usr}], - stream=False, temperature=0.1, top_p=1.0, max_tokens=300 + stream=False, temperature=0.1, top_p=1.0, max_tokens=512 ) raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}") spec = _safe_json_loads(raw) or {"task": user_text, "constraints": [], "file_hints": [], "keywords": [], "acceptance": [], "ask": None} #json.loads(raw.strip()) @@ -245,7 +245,7 @@ async def expand_queries(llm_call_fn, q: str, k: int = 3) -> List[str]: try: resp = await llm_call_fn( [{"role":"system","content":sys},{"role":"user","content":usr}], - stream=False, temperature=0.2, top_p=0.9, max_tokens=120 + stream=False, temperature=0.2, top_p=0.9, max_tokens=240 ) raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]") arr = _safe_json_loads(raw) or [] diff --git a/windowing_utils.py b/windowing_utils.py index 81a144f..3f8606b 100644 --- a/windowing_utils.py +++ b/windowing_utils.py @@ -156,9 +156,9 @@ def fit_context_under_budget( def build_repo_context( files_ranked: List[Tuple[str, str, float]], - per_chunk_tokens: int = 1200, + per_chunk_tokens: int = 2100, overlap_tokens: int = 60, - ctx_budget_tokens: int = 4000, + ctx_budget_tokens: int = 5000, tok_len: Callable[[str], int] = approx_token_count ) -> str: expanded: List[Tuple[str,str]] = []