"""Async LLM client that routes chat requests through the shared agent queue."""
from __future__ import annotations

import asyncio
import logging
import os
from typing import Any, Dict, List, Optional

import httpx

from queue_helper import QueueManager

logger = logging.getLogger(__name__)

# -------------------------------------------------------------
# Config for the underlying LLM backend / proxy
# -------------------------------------------------------------
# You can set either of these:
# - LLM_PROXY_URL: full URL of an OpenAI-compatible endpoint
#   (e.g. http://host:8081/v1/completions or /v1/chat/completions)
# - LLM_API_BASE : base URL (fallback); /v1/chat/completions is appended
LLM_PROXY_URL = (os.getenv("LLM_PROXY_URL") or "").strip()
LLM_API_BASE = os.getenv("LLM_API_BASE", "").strip() or "http://127.0.0.1:11434"
LLM_DEFAULT_MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini")
LLM_REQUEST_TIMEOUT = float(os.getenv("LLM_REQUEST_TIMEOUT", "180"))

# Set from app.py via init_llm_client(...)
LLM_QUEUE: QueueManager | None = None


def init_llm_client(queue: QueueManager) -> None:
    """Attach the shared agent queue; called once from app.py at startup."""
    global LLM_QUEUE
    LLM_QUEUE = queue
    logger.info("llm_client: LLM_QUEUE attached via init_llm_client.")


def _resolve_llm_url() -> str:
    """Return the endpoint URL, preferring LLM_PROXY_URL over LLM_API_BASE."""
    if LLM_PROXY_URL:
        return LLM_PROXY_URL.rstrip("/")
    # fallback: base URL -> chat completions
    return f"{LLM_API_BASE.rstrip('/')}/v1/chat/completions"


def _messages_to_prompt(messages: List[Dict[str, Any]]) -> str:
    # simple, robust prompt serialisation for /v1/completions proxies
    parts: list[str] = []
    for m in messages:
        role = (m.get("role") or "user").upper()
        content = m.get("content") or ""
        parts.append(f"{role}: {content}")
    parts.append("ASSISTANT:")
    return "\n".join(parts)


def _chat_from_text(text: str) -> Dict[str, Any]:
    """Wrap plain text in an OpenAI chat.completion-shaped response."""
    return {
        "object": "chat.completion",
        "choices": [{
            "index": 0,
            "finish_reason": "stop",
            "message": {"role": "assistant", "content": text},
        }],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }


def _sync_model_infer(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Blocking call to the backend; always returns a chat.completion-shaped dict."""
    url = _resolve_llm_url()
    try:
        with httpx.Client(timeout=LLM_REQUEST_TIMEOUT) as client:
            # detect endpoint type
            is_chat = "/chat/" in url or url.endswith("/chat/completions")
            if is_chat:
                resp = client.post(url, json=payload)
                resp.raise_for_status()
                return resp.json()

            # /v1/completions style: flatten the chat messages into a prompt
            messages = payload.get("messages") or []
            prompt = payload.get("prompt")
            if not prompt:
                prompt = _messages_to_prompt(messages)
            comp_payload: Dict[str, Any] = {
                "model": payload.get("model") or LLM_DEFAULT_MODEL,
                "prompt": prompt,
                "max_tokens": payload.get("max_tokens", 2048),
                "temperature": payload.get("temperature", 0.2),
                "top_p": payload.get("top_p", 0.9),
                "stream": False,
            }
            # pass-through extras if present
            for k in ("stop", "presence_penalty", "frequency_penalty"):
                if k in payload:
                    comp_payload[k] = payload[k]
            resp = client.post(url, json=comp_payload)
            resp.raise_for_status()
            data = resp.json()
            # normalize to chat.completion
            try:
                choice = (data.get("choices") or [{}])[0]
                txt = choice.get("text") or choice.get("message", {}).get("content") or ""
                return _chat_from_text(txt)
            except Exception:
                return _chat_from_text(str(data)[:2000])
    except Exception as exc:
        logger.exception("LLM backend call failed: %s", exc)
        return _chat_from_text(f"[LLM error] {exc}")


async def _llm_call(
    messages: List[Dict[str, str]],
    *,
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: Optional[int] = None,
    model: Optional[str] = None,
    **extra: Any,
) -> Dict[str, Any]:
    """Send a chat request through the agent queue and return a chat.completion dict."""
    if stream:
        raise NotImplementedError("_llm_call(stream=True) is not currently supported.")
    if LLM_QUEUE is None:
        raise RuntimeError("LLM_QUEUE is not initialised. Call init_llm_client(...) in app.py")
    queue = LLM_QUEUE  # local reference for type narrowing and thread safety

    payload: Dict[str, Any] = {
        "model": model or LLM_DEFAULT_MODEL,
        "messages": messages,
        "stream": False,
        "temperature": float(temperature),
        "top_p": float(top_p),
    }
    if max_tokens is not None:
        payload["max_tokens"] = int(max_tokens)
    payload.update(extra)

    loop = asyncio.get_running_loop()
    try:
        # request_agent_sync is blocking, so run it in the default executor
        response: Dict[str, Any] = await loop.run_in_executor(
            None, lambda: queue.request_agent_sync(payload)
        )
        return response
    except Exception as exc:
        logger.exception("_llm_call via agent-queue failed: %s", exc)
        return _chat_from_text(f"[LLM queue error] {exc}")
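

# -------------------------------------------------------------
# Usage sketch (illustrative only; not part of the module API).
# Assumes app.py owns a QueueManager instance -- its constructor
# arguments depend on queue_helper and are not shown here -- and
# that either LLM_PROXY_URL or LLM_API_BASE is set in the environment
# (e.g. LLM_PROXY_URL=http://host:8081/v1/completions).
# -------------------------------------------------------------
async def _example_usage(queue: QueueManager) -> str:
    """Hypothetical helper showing the intended call sequence."""
    # Wire the shared queue in once, then issue chat-style calls.
    init_llm_client(queue)
    response = await _llm_call(
        [{"role": "user", "content": "Say hello in one sentence."}],
        temperature=0.0,
        max_tokens=64,
    )
    # Responses follow the OpenAI chat.completion shape, so the assistant
    # text lives under choices[0].message.content.
    return response["choices"][0]["message"]["content"]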