Initial proxy MVP
This commit is contained in:
commit
6f533fc15a
18
Dockerfile
Normal file
18
Dockerfile
Normal file
@ -0,0 +1,18 @@
|
||||
FROM python:3.11-slim

WORKDIR /app

# Runtime deps
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps before copying source so the layer cache survives code edits.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src

# Log straight to stdout/stderr without buffering (container-friendly logging).
ENV PYTHONUNBUFFERED=1
EXPOSE 8080

CMD ["uvicorn", "src.queuegate_proxy.app:app", "--host", "0.0.0.0", "--port", "8080"]
|
||||
40
README.md
Normal file
40
README.md
Normal file
@ -0,0 +1,40 @@
|
||||
# QueueGate
|
||||
|
||||
QueueGate is an OpenAI-compatible **LLM proxy** that:
|
||||
- routes requests to multiple upstream OpenAI-compatible backends (e.g. llama.cpp)
|
||||
- provides a simple **priority queue** (user > agent)
|
||||
- supports **stream** and **non-stream** (no fake streaming)
|
||||
- supports **sticky worker affinity** (same chat -> same upstream when possible)
|
||||
|
||||
## Quick start
|
||||
|
||||
### 1) Configure
|
||||
|
||||
Minimal env:
|
||||
- `LLM_UPSTREAMS` (comma-separated URLs)
|
||||
- e.g. `http://llama0:8000/v1/chat/completions,http://llama1:8000/v1/chat/completions`
|
||||
|
||||
Optional:
|
||||
- `LLM_MAX_CONCURRENCY` (defaults to number of upstreams)
|
||||
- `STICKY_HEADER` (default: `X-Chat-Id`)
|
||||
- `AFFINITY_TTL_SEC` (default: `60`)
|
||||
- `QUEUE_NOTIFY_USER` = `auto|always|never` (default: `auto`)
|
||||
- `QUEUE_NOTIFY_MIN_MS` (default: `1200`)
|
||||
|
||||
### 2) Run
|
||||
|
||||
```bash
|
||||
uvicorn src.queuegate_proxy.app:app --host 0.0.0.0 --port 8080
|
||||
```
|
||||
|
||||
### 3) Health
|
||||
|
||||
`GET /healthz`
|
||||
|
||||
### 4) OpenAI endpoint
|
||||
|
||||
`POST /v1/chat/completions`
|
||||
|
||||
## Notes
|
||||
- Tool calls are detected and suppressed in streaming output (to prevent leakage).
|
||||
- This first version is a **proxy-only MVP**; tool execution can be wired in later.
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@ -0,0 +1,5 @@
|
||||
fastapi==0.115.8
|
||||
uvicorn[standard]==0.34.0
|
||||
httpx==0.27.2
|
||||
pydantic==2.10.6
|
||||
python-dotenv==1.0.1
|
||||
17
run.sh
Executable file
17
run.sh
Executable file
@ -0,0 +1,17 @@
|
||||
# Build and run the QueueGate proxy container behind a local HTTP proxy.
# NOTE(review): proxy and upstream IPs are hard-coded for one specific host setup.
export HTTP_PROXY=http://192.168.100.2:8118
export HTTPS_PROXY=http://192.168.100.2:8118
export http_proxy=http://192.168.100.2:8118
export https_proxy=http://192.168.100.2:8118
# Pass the proxy into the image build too (apt/pip need it).
docker build -t queuegate:proxy . --build-arg http_proxy=http://192.168.100.2:8118 --build-arg https_proxy=http://192.168.100.2:8118

# Expose the proxy on host port 8081; NO_PROXY keeps the upstream LLM host direct.
docker run --rm -p 8081:8080 \
  -e LLM_UPSTREAMS="http://192.168.100.1:8000/v1/chat/completions,http://192.168.100.1:8001/v1/chat/completions" \
  -e AFFINITY_TTL_SEC=240 \
  -e HTTP_PROXY=http://192.168.100.2:8118 \
  -e HTTPS_PROXY=http://192.168.100.2:8118 \
  -e NO_PROXY=localhost,127.0.0.1,192.168.100.1 \
  -e QUEUE_NOTIFY_USER=auto \
  -e QUEUE_NOTIFY_MIN_MS=1200 \
  -e LLM_READ_TIMEOUT=3600 \
  -e TEXT_TOOLCALL_DETECT=1 \
  queuegate:proxy
|
||||
0
src/queuegate_proxy/__init__.py
Normal file
0
src/queuegate_proxy/__init__.py
Normal file
BIN
src/queuegate_proxy/__pycache__/app.cpython-311.pyc
Normal file
BIN
src/queuegate_proxy/__pycache__/app.cpython-311.pyc
Normal file
Binary file not shown.
472
src/queuegate_proxy/app.py
Normal file
472
src/queuegate_proxy/app.py
Normal file
@ -0,0 +1,472 @@
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
from fastapi import Body, FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
|
||||
def env_bool(key: str, default: bool = False) -> bool:
    """Interpret the env var *key* as a boolean ("1"/"true"/"yes"/"on" are truthy)."""
    raw = os.getenv(key)
    return default if raw is None else raw.strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def env_int(key: str, default: int) -> int:
    """Interpret the env var *key* as an int, falling back to *default* when unset or unparsable."""
    try:
        return int(os.environ[key])
    except (KeyError, ValueError):
        return default
|
||||
|
||||
|
||||
def now_ts() -> float:
    """Current Unix wall-clock time in seconds."""
    return time.time()
|
||||
|
||||
|
||||
def job_id() -> str:
    """Return a fresh 32-character lowercase hex identifier for a job."""
    return "%032x" % uuid.uuid4().int
|
||||
|
||||
|
||||
def sha1(text: str) -> str:
    """Hex SHA-1 of *text* (UTF-8, unencodable chars dropped).

    Used as a cheap routing fingerprint, not for anything security-sensitive.
    """
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8", errors="ignore"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
@dataclass
class ProxyConfig:
    """Static proxy settings, resolved once at startup from environment variables."""

    # Upstream OpenAI-compatible chat-completions URLs; one Worker is created per entry.
    upstreams: List[str]

    # Header used to identify a chat thread for sticky worker routing.
    sticky_header: str = "X-Chat-Id"
    # Seconds a thread->worker affinity entry stays valid after its last use.
    affinity_ttl_sec: int = 60

    # Whether streaming user requests get an in-band "queued" notice.
    queue_notify_user: str = "auto"  # auto|always|never
    # Minimum time (ms) a job must sit queued before the notice is emitted.
    queue_notify_min_ms: int = 1200

    # Timeout (seconds) applied to the shared upstream HTTP client.
    read_timeout_sec: int = 3600

    # Reserved for a future tool-execution backend; unused in the proxy-only MVP.
    toolserver_url: Optional[str] = None  # for later
    # When true, plain-text tool-call payloads in model output are suppressed.
    text_toolcall_detect: bool = False
|
||||
|
||||
|
||||
def load_config() -> ProxyConfig:
    """Build a ProxyConfig from the environment.

    Raises:
        RuntimeError: if LLM_UPSTREAMS is missing or empty.
    """
    raw = (os.getenv("LLM_UPSTREAMS") or "").strip()
    if not raw:
        raise RuntimeError("LLM_UPSTREAMS is required (comma-separated URLs)")
    urls = [part.strip() for part in raw.split(",") if part.strip()]

    sticky = (os.getenv("STICKY_HEADER") or "X-Chat-Id").strip() or "X-Chat-Id"
    notify_mode = (os.getenv("QUEUE_NOTIFY_USER") or "auto").strip().lower()
    tool_url = (os.getenv("TOOLSERVER_URL") or "").strip() or None

    return ProxyConfig(
        upstreams=urls,
        sticky_header=sticky,
        affinity_ttl_sec=env_int("AFFINITY_TTL_SEC", 60),
        queue_notify_user=notify_mode,
        queue_notify_min_ms=env_int("QUEUE_NOTIFY_MIN_MS", 1200),
        read_timeout_sec=env_int("LLM_READ_TIMEOUT", 3600),
        toolserver_url=tool_url,
        text_toolcall_detect=env_bool("TEXT_TOOLCALL_DETECT", False),
    )
|
||||
|
||||
@dataclass
class Job:
    """A single proxied chat-completions request, pinned to one worker's queue."""

    job_id: str
    created_ts: float
    kind: str  # user_chat | agent_call
    stream: bool
    # Parsed OpenAI-style request body; forwarded upstream (stream flag overridden).
    body: Dict[str, Any]
    # Subset of inbound headers kept for routing (see _copy_headers).
    headers: Dict[str, str]
    # Sticky-routing key: chat-id header value or a hash of the request.
    thread_key: str
    # Index into ProxyState.workers this job was assigned to.
    assigned_worker: int

    status: str = "queued"  # queued|running|done|error
    error: Optional[str] = None
    result: Optional[Dict[str, Any]] = None

    # For waiting results
    # NOTE(review): asyncio.Future()/Queue() created here bind to the running loop;
    # Jobs are only constructed inside request handlers, so a loop is active —
    # confirm before constructing Jobs from sync code.
    done_fut: asyncio.Future = field(default_factory=asyncio.Future)
    stream_q: asyncio.Queue = field(default_factory=asyncio.Queue)
    saw_any_output: bool = False
|
||||
|
||||
|
||||
class Worker:
    """Single-upstream consumer: processes one job at a time, user jobs first.

    One Worker owns one upstream URL plus two FIFO queues (user / agent).
    """

    def __init__(self, idx: int, upstream_url: str, client: httpx.AsyncClient):
        self.idx = idx
        self.upstream_url = upstream_url
        self.client = client
        self.user_q: asyncio.Queue[Job] = asyncio.Queue()
        self.agent_q: asyncio.Queue[Job] = asyncio.Queue()
        self.current_job_id: Optional[str] = None
        self.task: Optional[asyncio.Task] = None

    def pending_count(self) -> int:
        """Queued jobs plus the currently running one (0 or 1)."""
        return self.user_q.qsize() + self.agent_q.qsize() + (1 if self.current_job_id else 0)

    async def start(self, state: "ProxyState") -> None:
        """Spawn the consumer loop as a background task."""
        self.task = asyncio.create_task(self._run(state), name=f"worker-{self.idx}")

    async def _next_job(self) -> "Job":
        """Return the next job to run, preferring user jobs.

        BUGFIX: the previous loop blocked on ``agent_q.get()`` alone whenever
        ``user_q`` was momentarily empty, so a user job arriving during that wait
        was stuck until some agent job showed up. We now wait on BOTH queues
        concurrently and still honour user priority.
        """
        try:
            return self.user_q.get_nowait()
        except asyncio.QueueEmpty:
            pass
        user_t = asyncio.create_task(self.user_q.get())
        agent_t = asyncio.create_task(self.agent_q.get())
        done, pending = await asyncio.wait({user_t, agent_t}, return_when=asyncio.FIRST_COMPLETED)
        for t in pending:
            t.cancel()
            try:
                item = await t
            except asyncio.CancelledError:
                continue
            # get() won the race against cancellation: requeue instead of dropping.
            (self.user_q if t is user_t else self.agent_q).put_nowait(item)
        if user_t in done and agent_t in done:
            # Both completed in the same tick: keep priority, requeue the agent job.
            self.agent_q.put_nowait(agent_t.result())
            return user_t.result()
        return done.pop().result()

    async def _run(self, state: "ProxyState") -> None:
        """Consumer loop: fetch, execute, and finalize jobs forever."""
        while True:
            job = await self._next_job()

            self.current_job_id = job.job_id
            job.status = "running"

            try:
                if job.stream:
                    await state.handle_stream_job(job, self)
                else:
                    await state.handle_non_stream_job(job, self)
            except Exception as e:
                job.status = "error"
                job.error = str(e)
                if not job.done_fut.done():
                    job.done_fut.set_exception(e)
                # Unblock any streaming reader waiting on this job.
                await job.stream_q.put(None)
            finally:
                self.current_job_id = None
                # Best-effort bookkeeping; nothing calls Queue.join() today.
                try:
                    if job.kind == "user_chat":
                        self.user_q.task_done()
                    else:
                        self.agent_q.task_done()
                except Exception:
                    pass
|
||||
|
||||
|
||||
def get_header_any(headers: Dict[str, str], names: List[str]) -> Optional[str]:
    """Return the first non-empty value among *names*, matching case-insensitively.

    HTTP header names are case-insensitive, and the framework hands them to us
    in lowercase, so the previous exact-key lookup missed e.g. "X-Chat-Id".
    An exact-case match is still tried first, so existing callers are unaffected.
    """
    lowered = {k.lower(): v for k, v in headers.items() if v}
    for n in names:
        v = headers.get(n) or lowered.get(n.lower())
        if v:
            return v
    return None
|
||||
|
||||
|
||||
def infer_kind(headers: Dict[str, str]) -> str:
    """Classify a request as "agent_call" or "user_chat" from the X-Job-Kind header.

    The header name is matched case-insensitively: the framework lowercases
    header keys, so the previous exact ``headers.get("X-Job-Kind")`` lookup
    never matched and every request was treated as user_chat.
    """
    jk = ""
    for k, v in headers.items():
        if k.lower() == "x-job-kind":
            jk = (v or "").strip().lower()
            break
    if jk in {"agent", "agent_call", "repo_agent"}:
        return "agent_call"
    return "user_chat"
|
||||
|
||||
|
||||
def infer_thread_key(cfg: ProxyConfig, headers: Dict[str, str], body: Dict[str, Any]) -> str:
    """Derive the sticky-routing key for a request.

    An explicit chat-id header wins; otherwise the request is fingerprinted
    from stable parts of the body.
    """
    header_names = [cfg.sticky_header, "X-OpenWebUI-Chat-Id", "X-Chat-Id", "X-Conversation-Id"]
    explicit = get_header_any(headers, header_names)
    if explicit:
        return explicit

    # No chat id supplied: hash the model + first two messages + user field.
    seed = {
        "model": (body.get("model") or "").strip(),
        "messages": (body.get("messages") or [])[:2],
        "user": body.get("user"),
    }
    try:
        txt = json.dumps(seed, sort_keys=True, ensure_ascii=False)
    except Exception:
        # Unserializable body content: fall back to repr for a stable-enough key.
        txt = str(seed)
    return "h:" + sha1(txt)
|
||||
|
||||
|
||||
class ProxyState:
    """Shared proxy state: HTTP client, per-upstream workers, affinity map, job registry."""

    def __init__(self, cfg: ProxyConfig):
        self.cfg = cfg
        # One shared client; the single Timeout value covers connect/read/write/pool.
        self.http = httpx.AsyncClient(timeout=httpx.Timeout(cfg.read_timeout_sec))
        self.workers: List[Worker] = [Worker(i, u, self.http) for i, u in enumerate(cfg.upstreams)]
        # thread_key -> (worker index, last-used unix ts). NOTE(review): entries are
        # never pruned, so this grows with distinct threads — consider a TTL sweep.
        self.affinity: Dict[str, Tuple[int, float]] = {}
        # job_id -> Job; also never pruned (completed jobs stay resident).
        self.jobs: Dict[str, Job] = {}

    async def start(self) -> None:
        """Launch every worker's consumer loop."""
        for w in self.workers:
            await w.start(self)

    async def close(self) -> None:
        """Dispose of the shared HTTP client."""
        await self.http.aclose()

    def pick_worker(self, thread_key: str) -> int:
        """Choose a worker index: fresh sticky affinity if any, else least loaded."""
        now = now_ts()
        sticky = self.affinity.get(thread_key)
        if sticky:
            widx, last = sticky
            if now - last <= self.cfg.affinity_ttl_sec and 0 <= widx < len(self.workers):
                # Refresh the affinity timestamp on every hit (sliding TTL).
                self.affinity[thread_key] = (widx, now)
                return widx

        # Affinity missing/expired: pick the worker with the fewest pending jobs.
        best = 0
        best_load = None
        for w in self.workers:
            load = w.pending_count()
            if best_load is None or load < best_load:
                best_load = load
                best = w.idx

        self.affinity[thread_key] = (best, now)
        return best

    def enqueue(self, job: Job) -> None:
        """Queue *job* on its assigned worker (user jobs go to the priority queue)."""
        w = self.workers[job.assigned_worker]
        if job.kind == "user_chat":
            w.user_q.put_nowait(job)
        else:
            w.agent_q.put_nowait(job)
        self.jobs[job.job_id] = job
|
||||
|
||||
|
||||
|
||||
def sse_pack(obj: Dict[str, Any]) -> bytes:
    """Encode *obj* as one Server-Sent-Events data frame."""
    payload = json.dumps(obj, ensure_ascii=False)
    return f"data: {payload}\n\n".encode("utf-8")
|
||||
|
||||
|
||||
def sse_done() -> bytes:
    """Terminal SSE frame, matching the OpenAI streaming protocol."""
    return "data: [DONE]\n\n".encode("ascii")
|
||||
|
||||
|
||||
def make_chunk(job_id: str, model: str, delta: Dict[str, Any], finish_reason: Optional[str] = None) -> Dict[str, Any]:
    """Build an OpenAI-style chat.completion.chunk carrying a single choice."""
    choice = {"index": 0, "delta": delta, "finish_reason": finish_reason}
    return {
        "id": job_id,
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": model,
        "choices": [choice],
    }
|
||||
|
||||
|
||||
def looks_like_toolcalls_text(txt: str) -> bool:
    """True when the stripped text begins like a plain-text tool-call payload."""
    return (txt or "").lstrip().startswith(("[TOOL_CALLS]", '{"tool_calls"'))
|
||||
|
||||
|
||||
def strip_toolcalls_text(txt: str) -> str:
    """Replace a raw tool-call payload with a fixed placeholder (input is ignored)."""
    placeholder = "(tool-call output suppressed; tools not enabled in proxy-only MVP)"
    return placeholder
|
||||
|
||||
|
||||
def capture_tool_calls_from_delta(delta: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
    """Append any tool_calls list found in *delta* onto *acc* (mutates in place)."""
    tcs = delta.get("tool_calls")
    if not isinstance(tcs, list):
        return
    for tc in tcs:
        acc.append(tc)
|
||||
|
||||
|
||||
|
||||
class ProxyState(ProxyState):  # type: ignore
    """Continuation of ProxyState (the class is re-opened to group job handlers)."""

    def queue_position(self, job: Job) -> int:
        """Rough queue position: jobs currently waiting in the same queue."""
        w = self.workers[job.assigned_worker]
        if job.kind == "user_chat":
            return w.user_q.qsize()
        return w.agent_q.qsize()

    async def handle_non_stream_job(self, job: Job, worker: Worker) -> None:
        """Forward a non-streaming request upstream and resolve job.done_fut."""
        body = dict(job.body)
        body["stream"] = False

        r = await self.http.post(worker.upstream_url, json=body)
        r.raise_for_status()
        data = r.json()

        if self.cfg.text_toolcall_detect:
            # Best-effort scrub of plain-text tool-call payloads from the reply.
            try:
                msg = data.get("choices", [{}])[0].get("message", {})
                content = msg.get("content")
                if isinstance(content, str) and looks_like_toolcalls_text(content):
                    msg["content"] = strip_toolcalls_text(content)
            except Exception:
                pass

        job.result = data
        job.status = "done"
        if not job.done_fut.done():
            job.done_fut.set_result(data)

    async def handle_stream_job(self, job: Job, worker: Worker) -> None:
        """Relay an upstream SSE stream into job.stream_q, filtering tool calls."""
        body = dict(job.body)
        body["stream"] = True
        model = (body.get("model") or "").strip() or "unknown"

        tool_calls: List[Dict[str, Any]] = []

        async with self.http.stream("POST", worker.upstream_url, json=body) as r:
            r.raise_for_status()
            async for line in r.aiter_lines():
                if not line:
                    continue
                if not line.startswith("data:"):
                    continue
                payload = line[len("data:"):].strip()
                if payload == "[DONE]":
                    break

                try:
                    obj = json.loads(payload)
                except Exception:
                    # non-json chunk; pass through
                    await job.stream_q.put((line + "\n\n").encode("utf-8"))
                    job.saw_any_output = True
                    continue

                choice0 = (obj.get("choices") or [{}])[0]
                delta = choice0.get("delta") or {}

                if "tool_calls" in delta:
                    # Swallow structured tool calls; remember them for the fallback notice.
                    capture_tool_calls_from_delta(delta, tool_calls)
                    continue

                if self.cfg.text_toolcall_detect:
                    c = delta.get("content")
                    if isinstance(c, str) and looks_like_toolcalls_text(c):
                        continue

                await job.stream_q.put((line + "\n\n").encode("utf-8"))
                job.saw_any_output = True

        if tool_calls and not job.saw_any_output:
            # The model produced only tool calls: emit a human-readable placeholder.
            msg = "(tool-call requested but tools are not enabled yet in the proxy-only MVP)"
            await job.stream_q.put(sse_pack(make_chunk(job.job_id, model, {"role": "assistant", "content": msg})))
            await job.stream_q.put(sse_pack(make_chunk(job.job_id, model, {}, finish_reason="stop")))

        await job.stream_q.put(sse_done())
        # BUGFIX: signal end-of-stream to the response generator. Without this
        # sentinel the generator in chat_completions() awaited stream_q forever
        # after [DONE] — only the worker error path ever queued None before.
        await job.stream_q.put(None)
        job.status = "done"
        if not job.done_fut.done():
            job.done_fut.set_result({"status": "streamed"})
|
||||
|
||||
|
||||
app = FastAPI(title="QueueGate Proxy", version="0.1.0")

# Populated in _startup; None until then (endpoints guard via _require_state / 503).
CFG: Optional[ProxyConfig] = None
STATE: Optional[ProxyState] = None
|
||||
|
||||
|
||||
@app.on_event("startup")
async def _startup() -> None:
    """Load config and spin up the worker loops once the event loop is running.

    NOTE(review): on_event is deprecated in recent FastAPI in favour of lifespan
    handlers — works today, worth migrating later.
    """
    global CFG, STATE
    CFG = load_config()
    STATE = ProxyState(CFG)
    await STATE.start()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
async def _shutdown() -> None:
    """Close the shared HTTP client on shutdown (worker tasks die with the loop)."""
    global STATE
    if STATE:
        await STATE.close()
|
||||
|
||||
|
||||
@app.get("/healthz")
async def healthz() -> Dict[str, Any]:
    """Liveness/readiness probe with a per-worker load snapshot; 503 before startup."""
    if not STATE:
        raise HTTPException(status_code=503, detail="not ready")
    worker_stats = [
        {
            "id": w.idx,
            "pending": w.pending_count(),
            "busy": bool(w.current_job_id),
            "upstream": w.upstream_url,
        }
        for w in STATE.workers
    ]
    return {"ok": True, "upstreams": len(STATE.workers), "workers": worker_stats}
|
||||
|
||||
|
||||
@app.get("/v1/jobs/{job_id}")
async def job_status(job_id: str) -> Dict[str, Any]:
    """Inspect a job by id: status, kind, worker, and a rough queue position."""
    if not STATE:
        raise HTTPException(status_code=503, detail="not ready")
    job = STATE.jobs.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="unknown job")
    position = STATE.queue_position(job) if job.status == "queued" else 0
    return {
        "job_id": job.job_id,
        "status": job.status,
        "kind": job.kind,
        "created_ts": job.created_ts,
        "worker": job.assigned_worker,
        "queue_position_est": position,
        "error": job.error,
    }
|
||||
|
||||
|
||||
def _require_state() -> ProxyState:
    """Return the global ProxyState, or fail with 503 while still starting up."""
    state = STATE
    if state is None:
        raise HTTPException(status_code=503, detail="not ready")
    return state
|
||||
|
||||
|
||||
def _copy_headers(req: Request) -> Dict[str, str]:
    """Copy the routing-relevant request headers, canonicalizing their case.

    The framework exposes header names lowercased, while the downstream exact-key
    lookups (infer_kind, get_header_any) expect spellings like "X-Job-Kind" —
    the original case-sensitive ``k in keep`` test therefore never matched.
    Compare case-insensitively and emit the canonical spelling.
    """
    keep = {
        "X-Job-Kind",
        "X-Chat-Id",
        "X-Conversation-Id",
        "X-OpenWebUI-Chat-Id",
    }
    canonical = {name.lower(): name for name in keep}
    sticky = (CFG.sticky_header if CFG else "").lower()
    out: Dict[str, str] = {}
    for k, v in req.headers.items():
        kl = k.lower()
        if kl in canonical:
            out[canonical[kl]] = v
        elif sticky and kl == sticky:
            # Preserve the configured spelling so exact lookups downstream match.
            out[CFG.sticky_header] = v
    return out
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
async def chat_completions(body: Dict[str, Any] = Body(...), request: Request = None):
    """OpenAI-compatible chat endpoint: enqueue on a worker, then wait or stream.

    Routing: sticky thread key -> worker index; user jobs outrank agent jobs.
    Non-stream requests await the job future; stream requests relay stream_q.
    """
    state = _require_state()
    cfg = state.cfg

    stream = bool(body.get("stream", False))
    # NOTE(review): FastAPI injects Request when annotated; the `= None` default
    # is unnecessary but harmless here.
    headers = _copy_headers(request)
    kind = infer_kind(headers)
    thread_key = infer_thread_key(cfg, headers, body)
    worker_idx = state.pick_worker(thread_key)

    jid = job_id()
    job = Job(
        job_id=jid,
        created_ts=now_ts(),
        kind=kind,
        stream=stream,
        body=body,
        headers=headers,
        thread_key=thread_key,
        assigned_worker=worker_idx,
    )
    state.enqueue(job)

    if not stream:
        # wait for result
        try:
            data = await job.done_fut
        except Exception as e:
            # Worker set the exception; surface it as a gateway error.
            raise HTTPException(status_code=502, detail=f"upstream error: {e}")
        return JSONResponse(content=data)

    # streaming
    async def gen() -> Any:
        # Optional: queue notice (user jobs only)
        if job.kind == "user_chat" and cfg.queue_notify_user != "never":
            t0 = now_ts()
            # wait until running or until threshold
            while job.status == "queued" and (now_ts() - t0) * 1000 < cfg.queue_notify_min_ms:
                await asyncio.sleep(0.05)
            if job.status == "queued" and cfg.queue_notify_user in {"auto", "always"}:
                pos = state.queue_position(job)
                model = (body.get("model") or "unknown").strip() or "unknown"
                # Dutch UI string: "waiting in queue (position ~N)".
                msg = f"⏳ In wachtrij (positie ~{pos})…"
                yield sse_pack(make_chunk(job.job_id, model, {"role": "assistant", "content": msg}))

        # Relay worker output until a None sentinel arrives on stream_q.
        while True:
            item = await job.stream_q.get()
            if item is None:
                break
            yield item

    return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
Loading…
Reference in New Issue
Block a user