142 lines
5.0 KiB
Python
142 lines
5.0 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
import os
|
||
|
|
import asyncio
|
||
|
|
import logging
|
||
|
|
from typing import List, Dict, Any, Optional
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
from queue_helper import QueueManager
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)

# -------------------------------------------------------------
# Configuration for the underlying LLM backend / proxy
# -------------------------------------------------------------
# Set one of these environment variables:
#  - LLM_PROXY_URL: full URL of an OpenAI-compatible endpoint
#    (e.g. http://host:8081/v1/completions or /v1/chat/completions)
#  - LLM_API_BASE : base URL (fallback); /v1/chat/completions is appended to it
#
# Every variable treats an empty or whitespace-only value as "unset", so a
# blank entry in an env file falls back to the default instead of producing
# an empty model name or crashing the import.
LLM_PROXY_URL = (os.getenv("LLM_PROXY_URL") or "").strip()
LLM_API_BASE = (os.getenv("LLM_API_BASE") or "").strip() or "http://127.0.0.1:11434"
LLM_DEFAULT_MODEL = (os.getenv("LLM_MODEL") or "").strip() or "gpt-4o-mini"
# Timeout in seconds handed to httpx. A non-numeric value still raises
# ValueError at import time so a broken configuration is noticed immediately.
LLM_REQUEST_TIMEOUT = float((os.getenv("LLM_REQUEST_TIMEOUT") or "").strip() or "180")

# Assigned by init_llm_client(...); per the original note this happens in app.py.
LLM_QUEUE: QueueManager | None = None
|
||
|
|
|
||
|
|
|
||
|
|
def init_llm_client(queue: QueueManager) -> None:
    """Register the queue manager used by this module.

    Stores *queue* in the module-level ``LLM_QUEUE`` global and logs that the
    link was made. ``_llm_call`` refuses to run while that global is ``None``.

    Args:
        queue: The queue manager instance to route LLM requests through.
    """
    global LLM_QUEUE

    LLM_QUEUE = queue
    logger.info("llm_client: LLM_QUEUE gekoppeld via init_llm_client.")
|
||
|
|
|
||
|
|
|
||
|
|
def _resolve_llm_url() -> str:
    """Return the endpoint URL that backend requests are posted to.

    An explicit ``LLM_PROXY_URL`` wins and is returned with trailing slashes
    removed. Otherwise the chat-completions path is appended to
    ``LLM_API_BASE``.
    """
    if not LLM_PROXY_URL:
        # No explicit proxy configured: derive the chat endpoint from the base URL.
        api_root = LLM_API_BASE.rstrip("/")
        return f"{api_root}/v1/chat/completions"
    return LLM_PROXY_URL.rstrip("/")
|
||
|
|
|
||
|
|
|
||
|
|
def _messages_to_prompt(messages: List[Dict[str, Any]]) -> str:
    """Flatten chat messages into one plain-text prompt.

    Each message becomes a ``ROLE: content`` line (role upper-cased, a
    missing/empty role treated as ``user``, missing/falsy content rendered as
    an empty string). A trailing ``ASSISTANT:`` line is appended so a
    /v1/completions style backend continues from the assistant's turn.

    Args:
        messages: Chat messages as dicts with optional ``role``/``content`` keys.

    Returns:
        The newline-joined prompt text.
    """
    lines = [
        f"{(msg.get('role') or 'user').upper()}: {msg.get('content') or ''}"
        for msg in messages
    ]
    lines.append("ASSISTANT:")
    return "\n".join(lines)
|
||
|
|
|
||
|
|
|
||
|
|
def _chat_from_text(text: str) -> Dict[str, Any]:
    """Wrap plain text in a minimal ``chat.completion`` shaped dict.

    The result holds exactly one choice (index 0, finish reason ``stop``)
    whose assistant message carries *text*. The usage counters are
    hard-coded to zero.

    Args:
        text: Content to place in the assistant message.

    Returns:
        A dict with the keys ``object``, ``choices`` and ``usage``.
    """
    assistant_message = {"role": "assistant", "content": text}
    only_choice = {
        "index": 0,
        "finish_reason": "stop",
        "message": assistant_message,
    }
    zero_usage = dict.fromkeys(
        ("prompt_tokens", "completion_tokens", "total_tokens"), 0
    )
    return {
        "object": "chat.completion",
        "choices": [only_choice],
        "usage": zero_usage,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def _sync_model_infer(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Post *payload* to the configured LLM endpoint (blocking).

    The endpoint from ``_resolve_llm_url()`` is treated as a chat endpoint
    when its URL contains ``/chat/``. Chat endpoints receive *payload*
    unchanged and their JSON body is returned as-is. Any other endpoint is
    treated as /v1/completions style: the messages are flattened into a
    prompt (unless *payload* already has a non-empty ``prompt``), a
    completion request is sent, and the answer is normalised into a
    ``chat.completion`` dict via ``_chat_from_text``.

    Failures while talking to the backend are not propagated: they are
    logged and returned as a chat-style dict whose content starts with
    ``[LLM-fout]``.

    Args:
        payload: OpenAI-style request dict (``model``, ``messages``, ...).

    Returns:
        The backend's chat JSON, or a normalised / error ``chat.completion`` dict.
    """
    endpoint = _resolve_llm_url()
    try:
        uses_chat_api = endpoint.endswith("/chat/completions") or "/chat/" in endpoint
        with httpx.Client(timeout=LLM_REQUEST_TIMEOUT) as http:
            if uses_chat_api:
                # Chat endpoints already speak the target format: pass through.
                chat_resp = http.post(endpoint, json=payload)
                chat_resp.raise_for_status()
                return chat_resp.json()

            # /v1/completions style: an explicit non-empty prompt wins,
            # otherwise serialise the chat messages into one.
            prompt_text = payload.get("prompt") or _messages_to_prompt(
                payload.get("messages") or []
            )
            request_body: Dict[str, Any] = {
                "model": payload.get("model") or LLM_DEFAULT_MODEL,
                "prompt": prompt_text,
                "max_tokens": payload.get("max_tokens", 2048),
                "temperature": payload.get("temperature", 0.2),
                "top_p": payload.get("top_p", 0.9),
                "stream": False,
            }
            # Forward optional sampling extras only when the caller supplied them.
            request_body.update(
                {
                    key: payload[key]
                    for key in ("stop", "presence_penalty", "frequency_penalty")
                    if key in payload
                }
            )

            completion_resp = http.post(endpoint, json=request_body)
            completion_resp.raise_for_status()
            body = completion_resp.json()

            # Normalise to chat.completion; an unexpected response shape falls
            # back to a truncated dump of the raw body.
            try:
                first_choice = (body.get("choices") or [{}])[0]
                answer = first_choice.get("text")
                if not answer:
                    answer = first_choice.get("message", {}).get("content")
                return _chat_from_text(answer or "")
            except Exception:
                return _chat_from_text(str(body)[:2000])
    except Exception as exc:
        logger.exception("LLM backend call failed: %s", exc)
        return _chat_from_text(f"[LLM-fout] {exc}")
|
||
|
|
|
||
|
|
|
||
|
|
async def _llm_call(
    messages: List[Dict[str, str]],
    *,
    stream: bool = False,
    temperature: float = 0.2,
    top_p: float = 0.9,
    max_tokens: Optional[int] = None,
    model: Optional[str] = None,
    **extra: Any,
) -> Dict[str, Any]:
    """Run one non-streaming chat request through the module's queue.

    Builds an OpenAI-style payload and hands it to
    ``LLM_QUEUE.request_agent_sync`` in the event loop's default executor, so
    the synchronous queue call does not run on the event loop thread.

    Args:
        messages: Chat messages to send.
        stream: Must be ``False``; streaming is not implemented.
        temperature: Sampling temperature, coerced to ``float``.
        top_p: Nucleus sampling value, coerced to ``float``.
        max_tokens: Optional token limit; only sent when not ``None``.
        model: Model name; falls back to ``LLM_DEFAULT_MODEL`` when falsy.
        **extra: Additional payload keys, merged last (they override the
            keys built here on collision).

    Returns:
        Whatever the queue call returns, or a ``chat.completion`` dict whose
        content starts with ``[LLM-queue-fout]`` when that call raised.

    Raises:
        NotImplementedError: If ``stream`` is true.
        RuntimeError: If ``init_llm_client`` has not set ``LLM_QUEUE`` yet.
    """
    if stream:
        raise NotImplementedError("_llm_call(stream=True) wordt momenteel niet ondersteund.")
    if LLM_QUEUE is None:
        raise RuntimeError("LLM_QUEUE is niet geïnitialiseerd. Roep init_llm_client(...) aan in app.py")

    request: Dict[str, Any] = dict(
        model=model or LLM_DEFAULT_MODEL,
        messages=messages,
        stream=False,
        temperature=float(temperature),
        top_p=float(top_p),
    )
    if max_tokens is not None:
        request["max_tokens"] = int(max_tokens)
    request.update(extra)

    def _dispatch() -> Dict[str, Any]:
        # Executed in a worker thread; looks up LLM_QUEUE at call time.
        return LLM_QUEUE.request_agent_sync(request)

    event_loop = asyncio.get_running_loop()
    try:
        result: Dict[str, Any] = await event_loop.run_in_executor(None, _dispatch)
        return result
    except Exception as exc:
        logger.exception("_llm_call via agent-queue failed: %s", exc)
        return _chat_from_text(f"[LLM-queue-fout] {exc}")
|