diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b3194c0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,46 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# OS deps in ONE layer: git/curl for repo + download tooling, ffmpeg for STT
+# audio decoding, cairo/pango/gdk-pixbuf for cairosvg, libstdc++6/libatomic1
+# for the piper binary. --no-install-recommends keeps the image lean and the
+# apt lists are removed in the same layer so they never persist in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        ffmpeg \
+        git \
+        libatomic1 \
+        libcairo2 \
+        libgdk-pixbuf2.0-0 \
+        libpango-1.0-0 \
+        libstdc++6 \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Python deps before app code for better layer caching: this layer is only
+# invalidated when requirements change, not on every source edit.
+# NOTE(review): the ad-hoc package list below should move into
+# requirements.txt with pinned versions for reproducible builds.
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt \
+    && pip install --no-cache-dir PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25
+
+# piper TTS binary (single layer: download, extract, link, clean up)
+RUN set -eux; \
+    mkdir -p /opt/piper; \
+    URL="https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz"; \
+    wget -O /tmp/piper.tgz "$URL"; \
+    tar -xzf /tmp/piper.tgz -C /opt/piper --strip-components=1; \
+    ln -sf /opt/piper/piper /usr/local/bin/piper; \
+    rm -f /tmp/piper.tgz
+
+# Application code last: a source edit only rebuilds this cheap layer.
+COPY app.py queue_helper.py agent_repo.py windowing_utils.py smart_rag.py ./
+
+EXPOSE 8080
+
+# NOTE(review): container runs as root; consider adding a non-root USER once
+# writable paths (repo clones, model caches) are confirmed.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/README.md b/README.md
index e69de29..681d35f 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,314 @@
+# Repo Agent & QA — LLM Proxy • OpenWebUI Tools • Agent Repo/QA
+
+Een compacte, productierijpe stack om (1) chat/LLM-verzoeken af te handelen, (2) OpenWebUI-compatibele tools (STT/TTS/Retrieval) aan te bieden, en (3) een **Repo Agent** te draaien die **kandidaten zoekt, diffs genereert** (dry-run) en — na akkoord — **schrijft & pusht** op een nieuwe branch. Inclusief **Repo-QA** (vraag-antwoord over je codebase met padverwijzingen), **Laravel-bewuste heuristiek**, **hybride retrieval** (Meili/BM25 + embeddings/Chroma), **slim chunking**, **niet-destructieve guards** en een **lichte graaf-boost** (route ⇄ controller ⇄ view ⇄ lang). 
+ +> Dockerfile inbegrepen (Python 3.11-slim), met o.a. `faster-whisper`, `piper` TTS, Meili/Chroma-clients, `gitpython`, `sentence-transformers`, `rank-bm25`. + +--- + +## Inhoudsopgave + +* [Architectuur in 3 onderdelen](#architectuur-in-3-onderdelen) +* [Belangrijkste features](#belangrijkste-features) +* [Snel starten](#snel-starten) + + * [Docker (aanbevolen)](#docker-aanbevolen) + * [Zonder Docker (dev)](#zonder-docker-dev) +* [Configuratie (ENV)](#configuratie-env) +* [Endpoints](#endpoints) +* [Hoe het werkt (flow prompt → diffs/qa)](#hoe-het-werkt-flow-prompt--diffsqa) +* [Retrieval & indexing](#retrieval--indexing) +* [Chunking & contextbudget](#chunking--contextbudget) +* [Laravel-bewust + lichte graaf-boost](#laravel-bewust--lichte-graaf-boost) +* [Veiligheid: diff-guard & apply](#veiligheid-diff-guard--apply) +* [Troubleshooting](#troubleshooting) +* [Roadmap (suggesties)](#roadmap-suggesties) +* [Licentie](#licentie) + +--- + +## Architectuur in 3 onderdelen + +| Onderdeel | Doel | Taken | Bestanden in deze repo | +| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- | +| **LLM Proxy** | Dunne HTTP-laag voor chat/LLM + (optioneel) tool-calls; OpenWebUI-vriendelijk. | `/chat` endpoint, promptverrijking, streaming; kan tools/agent aanroepen. | `app.py` (routes) | +| **OpenWebUI-compatible Tools** | Kleine helper-endpoints die je rechtstreeks vanuit OpenWebUI of de proxy kunt aanroepen. | STT (faster-whisper), TTS (piper), retrieval (Meili/BM25/Chroma), PDF/afbeelding utils. 
| `app.py` (tool-routes), helpers in `queue_helper.py` | +| **Agent Repo & Repo QA** | “De smid”: zoekt relevante files, bouwt context, **maakt diffs** (dry-run), en kan **apply & push** doen op nieuwe branch. QA over repo. | Kandidaten zoeken, slim chunken, LLM-editplannen, diff-guards, branch & push. Repo-QA antwoord met padbronnen. | **`agent_repo.py`**, `smart_rag.py`, `windowing_utils.py`, `queue_helper.py` | + +**Mijn advies (prod vs dev):** + +* **Prod:** scheid *LLM Proxy + Tools* (publiek) van *Agent Repo/QA* (afschermen; heeft Gitea-rechten). +* **Dev:** alles in één Uvicorn-app (zoals in deze Dockerfile) is prima. + +--- + +## Belangrijkste features + +* **Hybride retrieval**: Meili/BM25 (file-niveau **harde signalen**) + Chroma/embeddings (chunk-niveau **semantiek**). +* **Laravel-bewuste heuristiek**: routes scannen, controller ↔ view ↔ lang keys, FormRequests/Policies. +* **Slim chunken**: taalspecifiek (PHP/Blade/JS/MD), functie/section-grenzen; overlap; contextbudget. +* **Lichte graaf-boost**: route ⇄ controller ⇄ view ⇄ lang relaties wegen mee in ranking. +* **Niet-destructieve patches**: diff-guard op deletieratio; kleine, anker-gebaseerde edits; sanity-checks. +* **Repo-QA**: compacte, bronverwijzende antwoorden (“Bronnen: padnamen”). +* **OpenWebUI-tools**: STT (faster-whisper), TTS (piper), retrieval als losse endpoints. +* **Dry-run → Apply**: eerst diffs tonen; na *“Akkoord apply”* nieuwe branch + push. + +--- + +## Snel starten + +### Docker (aanbevolen) + +**Dockerfile (samenvatting)** + +```dockerfile +FROM python:3.11-slim +WORKDIR /app +COPY requirements.txt . 
+RUN apt-get update && apt-get -y install git curl ffmpeg libcairo2 libpango-1.0-0 libgdk-pixbuf2.0-0 apt-utils +RUN pip install --upgrade pip +RUN pip install --no-cache-dir -r requirements.txt +RUN pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25 +# piper TTS +RUN apt-get update && apt-get install -y --no-install-recommends wget ca-certificates libstdc++6 libatomic1 \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /opt/piper \ + && set -eux; URL="https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz"; \ + wget -O /tmp/piper.tgz "$URL"; tar -xzf /tmp/piper.tgz -C /opt/piper --strip-components=1; \ + ln -sf /opt/piper/piper /usr/local/bin/piper; rm -f /tmp/piper.tgz +COPY app.py . +COPY queue_helper.py . +COPY agent_repo.py . +COPY windowing_utils.py . +COPY smart_rag.py . +EXPOSE 8080 +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"] +``` + +**Build & run** + +```bash +# 1) Build +docker build -t repo-agent:latest . + +# 2) Run (met voorbeeld-ENV; pas aan op jouw omgeving) +docker run --rm -p 8080:8080 \ + -e MEILI_URL=http://host.docker.internal:7700 \ + -e MEILI_MASTER_KEY=your_meili_key \ + -e GITEA_URL=http://host.docker.internal:3000 \ + -e GITEA_TOKEN=your_gitea_token \ + -e REPO_AGENT_SMART=1 \ + -e AGENT_DESTRUCTIVE_RATIO=0.25 \ + -e RAG_GRAPH_ENABLE=1 \ + repo-agent:latest +``` + +> **Tip:** start MeiliSearch naast deze container (of gebruik bestaande). Chroma is optioneel; zonder Chroma valt retrieval semantisch terug op BM25/Meili. 
+ +### Zonder Docker (dev) + +```bash +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +pip install PyPDF2 python-multipart gitpython chromadb httpx meilisearch pandas openpyxl python-pptx faster-whisper==1.0.0 cairosvg sentence-transformers rank-bm25 +uvicorn app:app --host 0.0.0.0 --port 8080 --reload +``` + +--- + +## Configuratie (ENV) + +| Variabele | Betekenis | Default / Opmerking | +| ---------------------------------------------- | --------------------------------------------------------- | ---------------------------- | +| `MEILI_URL`, `MEILI_MASTER_KEY` | MeiliSearch voor file-retrieval & (optioneel) indexing. | Aanbevolen. | +| `REPO_AGENT_SMART` | Schakel intent-/expansie + hybride retrieval in de agent. | `1` (aan) | +| `GITEA_URL`, `GITEA_TOKEN` | Nodig voor private repos en **apply/push**. | Sterk aanbevolen voor Agent. | +| `AGENT_DEFAULT_BRANCH` | Basisbranch om op te clonen/indexeren. | `main` (fallback `master`) | +| `AGENT_DESTRUCTIVE_RATIO` | Max. deletieratio vóór blokkeren (0–1). | `0.25` (voorbeeld) | +| `RAG_GRAPH_ENABLE` | **Lichte graaf-boost** op ranking inschakelen. | **`1` (aan)** | +| `RAG_EMB_WEIGHT` | Gewicht embeddings t.o.v. lexicaal signaal. | `0.6` typisch | +| `RAG_PER_QUERY_K`, `RAG_N_RESULTS` | Recall parameters hybrid retrieval. | 30 / 18 (voorbeeld) | +| `RAG_NEIGHBORS` | Laravel “buren” (routes→controllers→views). | `1` (aan) | +| `LLM_PRIORS_ENABLE`, `LLM_PRIORS_K` | LLM-gebaseerde pad-prior suggesties. | `1` / `12` | +| `LARAVEL_PRIORS_K` | Max #Laravel priors vóór RAG. | `8` | +| `CHUNK_CHARS_LARAVEL`, `CHUNK_OVERLAP_LARAVEL` | Chunkgrootte/overlap voor Laravel-stacks. | 1800 / 300 | +| `CHUNK_CHARS_DEFAULT`, `CHUNK_OVERLAP_DEFAULT` | Idem voor generieke stacks. | 2600 / 350 | +| `AGENT_QA_CTX_BUDGET_TOKENS` | Tokenbudget voor Repo-QA context. | 6000 | +| `QA_MIN_PER_SNIPPET`, `QA_MAX_PER_SNIPPET` | Contextdistributie per snippet. 
| 180 / 900 | +| `QA_KEEP_TOP_K` | Max snippets na trimming. | 8 | + +> **NB**: Veel van bovenstaande komen rechtstreeks terug in `agent_repo.py` (zie env-reads aldaar). Waarden hierboven zijn “goede defaults”. + +--- + +## Endpoints + +> Endpoints staan in `app.py`. Onderstaande is het **standaardpatroon** in deze setup. + +### 1) LLM Proxy + +* `POST /chat` — voer een chat uit (optioneel met tool-calls) + +Voorbeeld: + +```bash +curl -X POST http://localhost:8080/chat \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Geef een korte samenvatting van de meldingenmodule."}]}' +``` + +### 2) OpenWebUI-compatible Tools + +* `POST /tools/stt` — audio → tekst (faster-whisper) +* `POST /tools/tts` — tekst → audio (piper) +* `POST /tools/retrieve` — retrieval (Meili/BM25 + embeddings) + +Voorbeeld: + +```bash +curl -X POST http://localhost:8080/tools/retrieve \ + -H "Content-Type: application/json" \ + -d '{"query":"waar staat het wachtwoordbeleid?", "k": 12}' +``` + +### 3) Agent Repo & Repo QA + +* `POST /agent/dryrun` — **genereer diffs**, geen writes +* `POST /agent/apply` — **schrijf & push** (na akkoord) +* `POST /qa/ask` — Repo-QA met paden als bronnen + +Voorbeelden: + +```bash +# Dry-run diffs +curl -X POST http://localhost:8080/agent/dryrun \ + -H "Content-Type: application/json" \ + -d '{"repo":"owner/project","user_goal":"Vervang \"Versturen\" → \"Verzenden\" in meldingen-create"}' + +# Apply (na akkoord) +curl -X POST http://localhost:8080/agent/apply \ + -H "Content-Type: application/json" \ + -d '{"repo":"owner/project","confirm":"Akkoord apply"}' + +# Repo-QA +curl -X POST http://localhost:8080/qa/ask \ + -H "Content-Type: application/json" \ + -d '{"repo":"owner/project","question":"Waar wordt de storing-aanmaak afgehandeld?"}' +``` + +--- + +## Hoe het werkt (flow prompt → diffs/qa) + +**Samengevatte pipeline (Agent Repo):** + +1. **Prompt binnen** → intent & scope (optioneel refine; NL/EN synonyms). +2. 
**Repo selecteren** → clone/update op basisbranch; meelifall-cache. +3. **Candidate discovery (hybride)** + + * **Meili/BM25 (file-niveau, hard signal)**: top-N files. + * **LLM-priors + Laravel-heuristiek**: routes, controllers, views, lang-keys. + * **Chroma/embeddings (chunk-niveau, zacht signaal)**. + * **Lichte graaf-boost**: route ⇄ controller ⇄ view ⇄ lang. +4. **Slim chunken**: functie/section-grenzen; overlap; metadata (path, class/method, blade-section, start/end). +5. **Contextbouw (RAG)**: top-chunks + korte file/dir-summaries → compact context. +6. **Patchvoorstel per bestand** + + * **Veilige literal-replaces** (UI-labels) → minimaal. + * **LLM edit-plan** met regex/insert/replace-operaties (max 4 stappen). + * **Volledige rewrite (guarded)** als laatste redmiddel. +7. **Diff-guard & checks**: deletieratio drempel; syntaxis/parse waar mogelijk. +8. **Resultaat**: toon diffs + “waarom geselecteerd”; **Akkoord apply** → branch + push. + +**Repo-QA:** + +* Zelfde discovery/ctx, maar i.p.v. diffs levert QA een **kort antwoord + bronpaden**. + +--- + +## Retrieval & indexing + +* **MeiliSearch**: snelle file-retriever. Gebruik `name/path/summary/content` als searchable; boost bekende paden (`routes/**`, `resources/views/**`, `app/Http/Controllers/**`) bij route/view/vertaling-taken. +* **BM25 fallback**: aanwezig als Meili niet beschikbaar is. +* **Chroma (optioneel)**: embeddings per **chunk** met metadata (`path`, `lang`, `class`, `function`, `route`, `blade_section`, `start/end`). Wordt gebruikt als **zachte** rankingbron naast Meili. + +**Reranking (vuistregel)** +`FinalScore(file) = 0.55 * Meili + 0.35 * Embeddings + 0.10 * Heuristiek + PathBoost + Recency` + +--- + +## Chunking & contextbudget + +* **Taalspecifiek**: + + * PHP → class/method/closure; docblocks mee. + * Blade → `@section`, componenten, top-level HTML-blokken. + * JS/TS → function/module-grenzen. + * MD/Tekst → alinea’s; headers intact. 
+* **Budget**: context trimmer verdeelt tokens over snippets (`QA_MIN/MAX_PER_SNIPPET`, `QA_KEEP_TOP_K`), deduplication, novelty-score en overlap. + +--- + +## Laravel-bewust & lichte graaf-boost + +* **Route-mapping**: scan `routes/web.php`/`api.php` → `Controller@method`. +* **View & lang-koppeling**: `return view('foo.bar')` → `resources/views/foo/bar.blade.php`; `__('key')`, `@lang('key')` → `resources/lang/**`. +* **Neighbors**: controller → view(s), route → controller, view → partials/layouts (dichtbij). +* **Graph-boost (aan)**: deze relaties wegen mee in ranking (standaard **aan** via `RAG_GRAPH_ENABLE=1`). + +**Tree + samenvattingen** + +* Projecttree krijgt **korte omschrijvingen per dir/file** (README’s, docblocks, eerste regels) zodat de LLM **snapt welke lagen** er zijn. Deze summaries worden hergebruikt in retrieval en prompt-context. + +--- + +## Veiligheid: diff-guard & apply + +* **Destructiviteits-guard**: schat deletieratio met `difflib.ndiff`; blokkeer als `> AGENT_DESTRUCTIVE_RATIO` (files < ~6 regels worden soepeler behandeld). +* **Editstrategie (minimaal eerst)**: + + 1. gerichte literal-replaces (quotes, fallbacks), + 2. scoped HTML/Blade vervanging, + 3. LLM edit-plan (max 4 bewerkingen), + 4. guarded rewrite. +* **Apply**: alleen na “Akkoord apply” → nieuwe branch `task/-YYYYMMDD-HHMMSS` → **push** (Gitea token vereist). + +--- + +## Troubleshooting + +* **Geen kandidaten gevonden** → specificeer een route, bestand of label-tekst; controleer Meili/Chroma beschikbaarheid. +* **Apply faalt** → check `GITEA_URL`/`GITEA_TOKEN` en repo-rechten; branch bestaat al? +* **STT/TTS werkt niet** → controleer `ffmpeg` in container en piper/faster-whisper install (zitten in Dockerfile). +* **Lage precisie** → verhoog `RAG_PER_QUERY_K` en `RAG_N_RESULTS`, zet `RAG_GRAPH_ENABLE=1`, gebruik `LLM_PRIORS_ENABLE=1`. + +--- + +## Roadmap (suggesties) + +* **AST-chunking voor PHP** (nikic/php-parser) voor nog scherpere grenzen. 
+* **Cross-encoder reranker** (klein model) bovenop cosine. +* **Blade compile-check** (sanity) en `php -l` op changed files. +* **Auto test-suggesties** bij diffs. +* **Meer tools** (PDF tabel-extractie, image-OCR) als OpenWebUI-endpoints. + +--- + +## Licentie + +Kies een licentie (MIT/Apache-2.0/GPL-3.0). Voeg `LICENSE` toe aan de repo. + +--- + +### Bestandsoverzicht + +``` +app.py # Uvicorn/FastAPI app; /chat, tools, agent/qa routes +agent_repo.py # Repo Agent & Repo QA: retrieval, chunking, diffs, apply +smart_rag.py # Intent/expansie, hybride retrieval helpers +windowing_utils.py # Chunking & contextbudget utils +queue_helper.py # Hulpfuncties voor taken/IO +Dockerfile # Container build (Python 3.11-slim, piper, faster-whisper, Meili/Chroma clients) +requirements.txt # Basis Python dependencies +``` + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4f85179 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +fastapi +uvicorn[standard] +requests +python-docx +pypdf diff --git a/smart_rag.py b/smart_rag.py new file mode 100644 index 0000000..05c3759 --- /dev/null +++ b/smart_rag.py @@ -0,0 +1,598 @@ +# smart_rag.py +# Kleine util-laag voor intent + hybride retrieval + context-assemblage. 
+from __future__ import annotations +import os, re, json, math, hashlib +from typing import List, Dict, Tuple, DefaultDict, Optional +from collections import defaultdict + + + +def _decamel(s: str) -> str: + s = re.sub(r"([a-z])([A-Z])", r"\1 \2", s) + s = s.replace("_", " ") + return re.sub(r"\s+", " ", s).strip() + +def _symbol_guess(q: str) -> list[str]: + # pak langste 'code-achtig' token als symboolkandidaat + toks = re.findall(r"[A-Za-z_][A-Za-z0-9_]{2,}", q) + toks.sort(key=len, reverse=True) + return toks[:2] + +def _simple_variants(q: str, max_k: int = 3) -> list[str]: + base = [q] + lo = q.lower().strip() + if lo and lo not in base: + base.append(lo) + dec = _decamel(q) + if dec and dec.lower() != lo and dec not in base: + base.append(dec) + syms = _symbol_guess(q) + for s in syms: + v = s.replace("_", " ") + if v not in base: + base.append(v) + v2 = s # raw symbool + if v2 not in base: + base.append(v2) + # cap + return base[: max(1, min(len(base), max_k))] + + +# --- Query routing + RRF fuse --- + +def _route_query_buckets(q: str) -> list[dict]: + """Hele lichte router: retourneert lijst subqueries met optionele path filters en boost.""" + lo = (q or "").lower() + buckets = [] + + # Queue/Jobs/Event pipeline (Laravel) + if any(w in lo for w in ["job", "queue", "listener", "event", "dispatch"]): + buckets.append({"q": q, "path_contains": "app/Jobs", "boost": 1.18}) + buckets.append({"q": q, "path_contains": "app/Listeners", "boost": 1.12}) + buckets.append({"q": q, "path_contains": "app/Events", "boost": 1.10}) + # Models / Migrations + if any(w in lo for w in ["model", "eloquent", "scope", "attribute"]): + buckets.append({"q": q, "path_contains": "app/Models", "boost": 1.12}) + if any(w in lo for w in ["migration", "schema", "table", "column"]): + buckets.append({"q": q, "path_contains": "database/migrations", "boost": 1.08}) + + # Laravel/Blade/UI + if any(w in lo for w in ["blade", "view", "template", "button", "placeholder", "label"]): + 
buckets.append({"q": q, "path_contains": "resources/views", "boost": 1.2}) + # Routes/controllers + if any(w in lo for w in ["route", "controller", "middleware", "api", "web.php", "controller@"]): + buckets.append({"q": q, "path_contains": "routes", "boost": 1.15}) + buckets.append({"q": q, "path_contains": "app/Http/Controllers", "boost": 1.2}) + # Config/ENV + if any(w in lo for w in ["env", "config", "database", "queue", "cache"]): + buckets.append({"q": q, "path_contains": "config", "boost": 1.15}) + buckets.append({"q": q, "path_contains": ".env", "boost": 1.1}) + # Docs/README + if any(w in lo for w in ["readme", "install", "setup", "document", "usage"]): + buckets.append({"q": q, "path_contains": "README", "boost": 1.05}) + buckets.append({"q": q, "path_contains": "docs", "boost": 1.05}) + + # Fallback: generiek + buckets.append({"q": q, "path_contains": None, "boost": 1.0}) + # dedup op (q, path_contains) + seen = set(); out = [] + for b in buckets: + key = (b["q"], b["path_contains"]) + if key in seen: continue + seen.add(key); out.append(b) + return out + +def rrf_fuse_ranked_lists(ranked_lists: list[list[dict]], k: int = 60) -> list[dict]: + """ + ranked_lists: bv. [[{key,score,item},...], ...] (elk al per kanaal/bucket gesorteerd) + Return: één samengevoegde lijst (dicts) met veld 'score_fused'. 
+ """ + # bouw mapping + pos_maps: list[dict] = [] + for rl in ranked_lists or []: + pos = {} + for i, it in enumerate(rl, 1): + meta = it.get("metadata") or {} + key = f"{meta.get('repo','')}::{meta.get('path','')}::{meta.get('chunk_index','')}" + pos[key] = i + pos_maps.append(pos) + + fused: dict[str, float] = {} + ref_item: dict[str, dict] = {} + for idx, rl in enumerate(ranked_lists or []): + pos_map = pos_maps[idx] + for it in rl: + meta = it.get("metadata") or {} + key = f"{meta.get('repo','')}::{meta.get('path','')}::{meta.get('chunk_index','')}" + r = pos_map.get(key, 10**9) + fused[key] = fused.get(key, 0.0) + 1.0 / (k + r) + ref_item[key] = it + + out = [] + for key, f in fused.items(): + it = dict(ref_item[key]) + it["score_fused"] = f + out.append(it) + out.sort(key=lambda x: x.get("score_fused", 0.0), reverse=True) + return out + + +def _rrf_from_ranklists(ranklists: List[List[str]], k: int = int(os.getenv("RRF_K", "60"))) -> Dict[str, float]: + """ + Reciprocal Rank Fusion: neemt geordende lijsten (best eerst) en + geeft samengevoegde scores {key: rrf_score}. + """ + acc = defaultdict(float) + for lst in ranklists: + for i, key in enumerate(lst): + acc[key] += 1.0 / (k + i + 1) + return acc + +def _path_prior(path: str) -> float: + """ + Light-weight prior per pad. 0..1 schaal. Laravel paden krijgen bonus, + generieke code dirs ook een kleine bonus; binaire/test/asset minder. 
+ """ + p = (path or "").replace("\\", "/").lower() + bonus = 0.0 + # Laravel priors + if p.startswith("routes/"): bonus += 0.35 + if p.startswith("app/http/controllers/"): bonus += 0.30 + if p.startswith("resources/views/"): bonus += 0.25 + if p.endswith(".blade.php"): bonus += 0.15 + # Generieke priors + if p.startswith(("src/", "app/", "lib/", "pages/", "components/")): bonus += 0.12 + if p.endswith((".php",".ts",".tsx",".js",".jsx",".py",".go",".rb",".java",".cs",".vue",".html",".md")): + bonus += 0.05 + # Demote obvious low-signal + if "/tests/" in p or p.startswith(("tests/", "test/")): bonus -= 0.10 + if p.endswith((".lock",".map",".min.js",".min.css")): bonus -= 0.10 + return max(0.0, min(1.0, bonus)) + + +def _safe_json_loads(s: str): + if not s: + return None + t = s.strip() + if t.startswith("```"): + t = re.sub(r"^```(?:json)?", "", t, count=1, flags=re.IGNORECASE).strip() + if t.endswith("```"): + t = t[:-3].strip() + try: + return json.loads(t) + except Exception: + return None + + +def _tok(s: str) -> List[str]: + return re.findall(r"[A-Za-z0-9_]+", s.lower()) + +def _jaccard(a: str, b: str) -> float: + A, B = set(_tok(a)), set(_tok(b)) + if not A or not B: return 0.0 + # heel kleine set-caps (noodrem tegen pathologische inputs) + if len(B) > 8000: + # reduceer B met stabiele (deterministische) sampling op basis van sha1 + def _stable_byte(tok: str) -> int: + return hashlib.sha1(tok.encode("utf-8")).digest()[0] + B = {t for t in B if _stable_byte(t) < 64} # ~25% sample + return len(A & B) / max(1, len(A | B)) + + +def _normalize(xs: List[float]) -> List[float]: + if not xs: return xs + lo, hi = min(xs), max(xs) + if hi <= lo: return [0.0]*len(xs) + return [(x - lo) / (hi - lo) for x in xs] + +async def enrich_intent(llm_call_fn, messages: List[Dict]) -> Dict: + """ + Zet ongestructureerde vraag om naar een compact plan. + Velden: task, constraints, file_hints, keywords, acceptance, ask(optional). 
+ """ + user_text = "" + for m in reversed(messages): + if m.get("role") == "user": + user_text = m.get("content","").strip() + break + + sys = ("Je herstructureert een developer-vraag naar JSON. " + "Geef ALLEEN JSON, geen toelichting.") + usr = ( + "Zet de essentie van de vraag om naar dit schema:\n" + "{" + "\"task\": str, " + "\"constraints\": [str,...], " + "\"file_hints\": [str,...], " + "\"keywords\": [str,...], " + "\"acceptance\": [str,...], " + "\"ask\": str|null " + "}\n\n" + f"Vraag:\n{user_text}" + ) + try: + resp = await llm_call_fn( + [{"role":"system","content":sys},{"role":"user","content":usr}], + stream=False, temperature=0.1, top_p=1.0, max_tokens=300 + ) + raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","{}") + spec = _safe_json_loads(raw) or {"task": user_text, "constraints": [], "file_hints": [], "keywords": [], "acceptance": [], "ask": None} #json.loads(raw.strip()) + except Exception: + # Veilige defaults + spec = { + "task": user_text, + "constraints": [], + "file_hints": [], + "keywords": [], + "acceptance": [], + "ask": None + } + # Minimalistische fallback sanity + for k in ("constraints","file_hints","keywords","acceptance"): + if not isinstance(spec.get(k), list): + spec[k] = [] + if not isinstance(spec.get("task"), str): + spec["task"] = user_text + if spec.get("ask") is not None and not isinstance(spec["ask"], str): + spec["ask"] = None + return spec + +async def expand_queries(llm_call_fn, q: str, k: int = 3) -> List[str]: + if str(os.getenv("RAG_EXPAND_QUERIES","1")).lower() in ("0","false"): + return [q] + sys = "Geef 3-4 korte NL/EN zoekvarianten als JSON array. Geen toelichting." + usr = f"Bronvraag:\n{q}\n\nAlleen JSON array." 
+ try: + resp = await llm_call_fn( + [{"role":"system","content":sys},{"role":"user","content":usr}], + stream=False, temperature=0.2, top_p=0.9, max_tokens=120 + ) + raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","[]") + arr = _safe_json_loads(raw) or [] + arr = [str(x).strip() for x in arr if str(x).strip()] + seen = {q.lower()} + base = [q] + for v in arr: + lv = v.lower() + if lv not in seen: + base.append(v); seen.add(lv) + return base[: max(1, min(len(base), k + 1))] + except Exception: + return [q] + +def _sim_from_chroma_distance(d: float|None) -> float: + """ + Converteer (Chroma) distance naar similarity in [0,1]; defensief tegen None/NaN/negatief. + """ + if d is None: + return 0.0 + try: + dv = float(d) + except Exception: + dv = 0.0 + if not math.isfinite(dv) or dv < 0: + return 0.0 + return 1.0 / (1.0 + dv) + + +async def hybrid_retrieve( + rag_query_internal_fn, + query: str, + *, + repo: str|None = None, + profile: str|None = None, + path_contains: str|None = None, + per_query_k: int = 30, + n_results: int = 8, + alpha: float = 0.6, + collection_name: str = "code_docs", + llm_call_fn=None, +) -> List[Dict]: + """ + Multi-variant retrieval met RRF-fusie + path-prior. 
+ Return: lijst met dict(document, metadata, score) + """ + + # Optionele query-routing + RRF + use_route = str(os.getenv("RAG_ROUTE", "1")).lower() not in ("0", "false") + use_rrf = str(os.getenv("RAG_RRF", "1")).lower() not in ("0", "false") + # Optionele mini multi-query expansion (default uit) + use_expand = str(os.getenv("RAG_MULTI_EXPAND", "1")).lower() in ("1","true","yes") + k_variants = max(1, int(os.getenv("RAG_MULTI_K", "3"))) + per_query_k = max(1, int(per_query_k)) + n_results = max(1, int(n_results)) + if not (query or "").strip(): + return [] + # Multi-query variants: + if use_expand: + if llm_call_fn is not None: + variants = await expand_queries(llm_call_fn, query, k=k_variants) + else: + variants = _simple_variants(query, max_k=k_variants) + else: + variants = [query] + + ranked_lists = [] # voor RRF (alle varianten/buckets) + for qv in variants: + if use_route: + buckets = _route_query_buckets(qv) + for b in buckets: + res = await rag_query_internal_fn( + query=b["q"], n_results=per_query_k, + collection_name=collection_name, + repo=repo, path_contains=b["path_contains"], profile=profile + ) + lst = [] + for item in (res or {}).get("results", []): + # distance kan ontbreken bij oudere backends; defensieve cast + dist = item.get("distance", None) + try: dist = float(dist) if dist is not None else None + except Exception: dist = None + emb_sim = _sim_from_chroma_distance(dist) * float(b.get("boost",1.0)) + lst.append({**item, "emb_sim_routed": emb_sim}) + lst.sort(key=lambda x: x.get("emb_sim_routed",0.0), reverse=True) + # Laat RRF voldoende kandidaten zien (niet te vroeg afsnijden): + ranked_lists.append(lst[:per_query_k]) + else: + # geen routing: per variant direct query'en (consistent scoren/sorteren) + res = await rag_query_internal_fn( + query=qv, n_results=per_query_k, + collection_name=collection_name, + repo=repo, path_contains=path_contains, profile=profile + ) + lst = [] + for item in (res or {}).get("results", []): + dist = 
item.get("distance", None) + try: dist = float(dist) if dist is not None else None + except Exception: dist = None + emb_sim = _sim_from_chroma_distance(dist) + lst.append({**item, "emb_sim_routed": emb_sim}) + lst.sort(key=lambda x: x.get("emb_sim_routed", 0.0), reverse=True) + ranked_lists.append(lst[:per_query_k]) + + + + # Als RRF aanstaat: fuseer nu + items = rrf_fuse_ranked_lists(ranked_lists) if use_rrf else [x for rl in ranked_lists for x in rl] + + if not items: + return [] + + # Eenvoudige lexicale score (op samengevoegde set): + # neem het BESTE van alle varianten i.p.v. alleen de hoofdquery. + bm: List[float] = [] + if variants and len(variants) > 1: + for it in items: + doc = it.get("document", "") or "" + bm.append(max((_jaccard(v, doc) for v in variants), default=_jaccard(query, doc))) + else: + bm = [_jaccard(query, it.get("document","")) for it in items] + bm_norm = _normalize(bm) + + out = [] + for i, it in enumerate(items): + # Betere fallback: gebruik routed emb sim → plain emb_sim → distance + emb = ( + float(it.get("emb_sim_routed", 0.0)) + or float(it.get("emb_sim", 0.0)) + or _sim_from_chroma_distance(it.get("distance")) + ) + score = alpha * emb + (1.0 - alpha) * bm_norm[i] + meta = (it.get("metadata") or {}) + path = meta.get("path","") or "" + # — optioneel: path-prior + symbol-boost via env — + pp_w = float(os.getenv("RAG_PATH_PRIOR_W", "0.08")) + if pp_w > 0.0: + score += pp_w * _path_prior(path) + sym_w = float(os.getenv("RAG_SYM_BOOST", "0.04")) + if sym_w > 0.0: + syms_raw = meta.get("symbols") + if isinstance(syms_raw, str): + syms = [s.strip().lower() for s in syms_raw.split(",") if s.strip()] + elif isinstance(syms_raw, list): + syms = [str(s).strip().lower() for s in syms_raw if str(s).strip()] + else: + syms = [] + if syms: + q_terms = set(_tok(query)) + if q_terms & set(syms): + score += sym_w + out.append({**it, "score": float(score)}) + + out.sort(key=lambda x: x["score"], reverse=True) + return out[:int(n_results)] + +def 
assemble_context(chunks: List[Dict], *, max_chars: int = 24000) -> Tuple[str, float]: + """ + Budgeted stitching: + - groepeer per path + - per path: neem 1-3 fragmenten (op volgorde van chunk_index indien beschikbaar) + - verdeel char-budget over paden, zwaarder voor hogere scores + - behoud Laravel stitching + Retour: (context_text, top_score) + """ + if not chunks: + return "", 0.0 + + # 1) Groepeer per path en verzamel scores + (optioneel) chunk_index + by_path: Dict[str, List[Dict]] = {} + top_score = 0.0 + for r in chunks: + meta = (r.get("metadata") or {}) + path = meta.get("path","") or "" + r["_chunk_index"] = meta.get("chunk_index") + r["_score"] = float(r.get("score", 0.0) or 0.0) + top_score = max(top_score, r["_score"]) + by_path.setdefault(path, []).append(r) + + # 2) Per path: sorteer op chunk_index (indien beschikbaar) anders score; cap op N stukken + def _sort_key(x): + ci = x.get("_chunk_index") + return (0, int(ci)) if isinstance(ci, int) or (isinstance(ci, str) and str(ci).isdigit()) else (1, -x["_score"]) + + path_items = [] + max_pieces = int(os.getenv("CTX_PIECES_PER_PATH_CAP", "3")) + for p, lst in by_path.items(): + lst_sorted = sorted(lst, key=_sort_key) + path_items.append({ + "path": p, + "best_score": max(x["_score"] for x in lst_sorted), + "pieces": lst_sorted[:max(1, max_pieces)], # cap per bestand + }) + + # 3) Sorteer paden op best_score en bereken budgetverdeling (softmax-achtig, maar bounded) + path_items.sort(key=lambda t: t["best_score"], reverse=True) + # clamp scores naar [0,1] voor stabielere allocatie + scores = [min(1.0, max(0.0, t["best_score"])) for t in path_items] + # softmax-lite: exp(score*beta) normaliseren; beta iets lager om niet te scherp te verdelen + beta = float(os.getenv("CTX_ALLOC_BETA", "2.2")) + w = [math.exp(beta * s) for s in scores] + S = max(1e-9, sum(w)) + weights = [x / S for x in w] + + # 4) Bouw snelle lookup path->full body (voor Laravel stitching) + by_path_first_body: Dict[str, str] = {} + for t 
in path_items: + doc0 = (t["pieces"][0].get("document") or "").strip() + by_path_first_body[t["path"]] = doc0 + + # 5) Render met budget per pad + out = [] + used = 0 + for t, w_i in zip(path_items, weights): + p = t["path"] + # minimaal & maximaal budget per pad (chars) + min_chars = int(os.getenv("CTX_ALLOC_MIN_PER_PATH", "1200")) + max_chars_path = int(os.getenv("CTX_ALLOC_MAX_PER_PATH", "6000")) + alloc = min(max(min_chars, int(max_chars * w_i)), max_chars_path) + + # stitch 1..3 stukken van dit pad binnen alloc + header = f"### {p} (score={t['best_score']:.3f})" + block_buf = [header] + remaining = max(0, alloc - len(header) - 1) + + + for piece in t["pieces"]: + body = (piece.get("document") or "").strip() + # knip niet middenin een regel: neem tot remaining en rol terug tot laatste newline + if remaining <= 0: + break + if len(body) > remaining: + cut = body[:remaining] + nl = cut.rfind("\n") + if nl > 300: # laat niet té kort + body = cut[:nl] + "\n…" + else: + body = cut + "…" + block_buf.append(body) + remaining -= len(body) + if remaining <= 300: # hou wat over voor stitching + break + + block = "\n".join(block_buf) + + # --- Laravel mini-stitch zoals voorheen, maar budgetbewust + stitched = [] + if p in ("routes/web.php", "routes/api.php"): + for ctrl_path, _meth in _laravel_pairs_from_route_text(by_path_first_body.get(p,"")): + if ctrl_path in by_path_first_body and remaining > 400: + snippet = by_path_first_body[ctrl_path][:min(400, remaining)] + stitched.append(f"\n### {ctrl_path} (stitch)\n{snippet}") + remaining -= len(snippet) + if p.startswith("app/Http/Controllers/"): + for vpath in _laravel_guess_view_paths_from_text(by_path_first_body.get(p,"")): + if vpath in by_path_first_body and remaining > 400: + snippet = by_path_first_body[vpath][:min(400, remaining)] + stitched.append(f"\n### {vpath} (stitch)\n{snippet}") + remaining -= len(snippet) + + if stitched: + block += "\n" + "\n".join(stitched) + + # Past het volledige blok niet meer, knip 
netjes i.p.v. alles laten vallen + remaining_total = max_chars - used + if remaining_total <= 0: + break + if len(block) > remaining_total: + # Zorg dat we niet midden in markdown header afkappen + trimmed = block[:max(0, remaining_total - 1)] + block = trimmed + "…" + out.append(block) + used = max_chars + break + else: + out.append(block) + used += len(block) + + + # stop vroeg als we het budget bijna op hebben + if max_chars - used < 800: + break + + return ("\n\n".join(out), float(top_score)) + +# --- Laravel route/controller/view helpers (lightweight, cycle-safe) --- + +def _laravel_pairs_from_route_text(route_text: str): + """ + Parse routes/web.php|api.php tekst en yield (controller_path, method) guesses. + Ondersteunt: + - 'Controller@method' + - FQCN zoals App\\Http\\Controllers\\Foo\\BarController::class + """ + out = [] + + # 1) 'Controller@method' + for m in re.finditer(r"['\"]([A-Za-z0-9_\\]+)@([A-Za-z0-9_]+)['\"]", route_text): + fq = m.group(1) + method = m.group(2) + ctrl = fq.replace("\\\\","/").replace("\\","/") + name = ctrl.split("/")[-1] + guess = f"app/Http/Controllers/{ctrl}.php" + alt = f"app/Http/Controllers/{name}.php" + out.append((guess, method)) + out.append((alt, method)) + + # 2) FQCN ::class + for m in re.finditer(r"([A-Za-z_][A-Za-z0-9_\\]+)\s*::\s*class", route_text): + fq = m.group(1) + ctrl = fq.replace("\\\\","/").replace("\\","/") + name = ctrl.split("/")[-1] + guess = f"app/Http/Controllers/{ctrl}.php" + alt = f"app/Http/Controllers/{name}.php" + out.append((guess, None)) + out.append((alt, None)) + + # dedupe, behoud orde + seen = set(); dedup = [] + for p in out: + if p not in seen: + seen.add(p); dedup.append(p) + return dedup + + +def _laravel_guess_view_paths_from_text(controller_text: str): + """ + Parse simpele 'return view(\"foo.bar\")' patronen → resources/views/foo/bar.blade.php + """ + out = [] + for m in re.finditer(r"view\(\s*['\"]([A-Za-z0-9_.\/-]+)['\"]\s*\)", controller_text): + view = 
m.group(1).strip().strip(".") + # 'foo.bar' of 'foo/bar' + path = view.replace(".", "/") + out.append(f"resources/views/{path}.blade.php") + # dedupe + seen = set(); dedup = [] + for p in out: + if p not in seen: + seen.add(p); dedup.append(p) + return dedup + +# Public API surface +__all__ = [ + "enrich_intent", + "expand_queries", + "hybrid_retrieve", + "assemble_context", +] + diff --git a/windowing_utils.py b/windowing_utils.py new file mode 100644 index 0000000..a4354e0 --- /dev/null +++ b/windowing_utils.py @@ -0,0 +1,167 @@ +# windowing_utils.py +from __future__ import annotations +from dataclasses import dataclass, field +from typing import List, Dict, Callable, Optional, Tuple +import hashlib +import os +import time + +# ---------- Token counting (vervang door echte tokenizer indien je wilt) +def approx_token_count(text: str) -> int: + # ~4 chars ≈ 1 token (ruwe maar stabiele vuistregel) + return max(1, len(text) // 4) + +def count_message_tokens(messages: List[Dict], tok_len: Callable[[str], int]) -> int: + total = 0 + for m in messages: + total += tok_len(m.get("content", "")) + return total + +# ---------- Thread ID + summary store +def derive_thread_id(body: Dict) -> str: + for key in ("conversation_id", "thread_id", "chat_id", "session_id", "room_id"): + if key in body and body[key]: + return str(body[key]) + parts = [str(body.get("model", ""))] + msgs = body.get("messages", [])[:2] + for m in msgs: + parts.append(m.get("role", "")) + parts.append(m.get("content", "")[:256]) + raw = "||".join(parts) + return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16] + +class RunningSummaryStore: + def __init__(self): + self._mem: dict[str, str] = {} + def get(self, thread_id: str) -> str: + return self._mem.get(thread_id, "") + def update(self, thread_id: str, new_summary: str): + self._mem[thread_id] = new_summary + +SUMMARY_STORE = RunningSummaryStore() + +# ---------- Sliding window + running summary +@dataclass +class ConversationWindow: + 
max_ctx_tokens: int + response_reserve: int = 2048 + tok_len: Callable[[str], int] = approx_token_count + running_summary: str = "" + summary_header: str = "Samenvatting tot nu toe" + history: List[Dict] = field(default_factory=list) + + def add(self, role: str, content: str): + self.history.append({"role": role, "content": content}) + + def _base_messages(self, system_prompt: Optional[str]) -> List[Dict]: + msgs: List[Dict] = [] + if system_prompt: + msgs.append({"role": "system", "content": system_prompt}) + if self.running_summary: + msgs.append({"role": "system", "content": f"{self.summary_header}:\n{self.running_summary}"}) + return msgs + + async def build_within_budget( + self, + system_prompt: Optional[str], + summarizer: Optional[Callable[[str, List[Dict]], "awaitable[str]"]] = None + ) -> List[Dict]: + budget = self.max_ctx_tokens - max(1, self.response_reserve) + working = self.history[:] + candidate = self._base_messages(system_prompt) + working + if count_message_tokens(candidate, self.tok_len) <= budget: + return candidate + + # 1) trim oudste turns + while working and count_message_tokens(self._base_messages(system_prompt) + working, self.tok_len) > budget: + working.pop(0) + candidate = self._base_messages(system_prompt) + working + if count_message_tokens(candidate, self.tok_len) <= budget: + self.history = working + return candidate + + # 2) samenvatten indien mogelijk + if summarizer is None: + while working and count_message_tokens(self._base_messages(system_prompt) + working, self.tok_len) > budget: + working.pop(0) + self.history = working + return self._base_messages(system_prompt) + working + + # samenvat in batches + working = self.history[:] + chunk_buf: List[Dict] = [] + + async def build_candidate(_summary: str, _working: List[Dict]) -> List[Dict]: + base = [] + if system_prompt: + base.append({"role": "system", "content": system_prompt}) + if _summary: + base.append({"role": "system", "content": f"{self.summary_header}:\n{_summary}"}) + 
return base + _working + + while working and count_message_tokens(await build_candidate(self.running_summary, working), self.tok_len) > budget: + chunk_buf.append(working.pop(0)) + # bij ~1500 tokens in buffer (ruw) samenvatten + if count_message_tokens([{"role":"system","content":str(chunk_buf)}], self.tok_len) > 1500 or not working: + self.running_summary = await summarizer(self.running_summary, chunk_buf) + chunk_buf = [] + + self.history = working + return await build_candidate(self.running_summary, working) + +# ---------- Repo chunking +from typing import Iterable +def split_text_tokens( + text: str, + tok_len: Callable[[str], int], + max_tokens: int, + overlap_tokens: int = 60 +) -> List[str]: + if tok_len(text) <= max_tokens: + return [text] + approx_ratio = max_tokens / max(1, tok_len(text)) + step = max(1000, int(len(text) * approx_ratio)) + chunks: List[str] = [] + i = 0 + while i < len(text): + ch = text[i:i+step] + while tok_len(ch) > max_tokens and len(ch) > 200: + ch = ch[:-200] + chunks.append(ch) + if overlap_tokens > 0: + ov_chars = max(100, overlap_tokens * 4) + i += max(1, len(ch) - ov_chars) + else: + i += len(ch) + return chunks + +def fit_context_under_budget( + items: List[Tuple[str,str]], tok_len: Callable[[str], int], budget_tokens: int +) -> List[Tuple[str,str]]: + res: List[Tuple[str,str]] = [] + used = 0 + for title, text in items: + t = tok_len(text) + if used + t <= budget_tokens: + res.append((title, text)) + used += t + else: + break + return res + +def build_repo_context( + files_ranked: List[Tuple[str, str, float]], + per_chunk_tokens: int = 1200, + overlap_tokens: int = 60, + ctx_budget_tokens: int = 4000, + tok_len: Callable[[str], int] = approx_token_count +) -> str: + expanded: List[Tuple[str,str]] = [] + for path, content, _ in files_ranked: + for i, ch in enumerate(split_text_tokens(content, tok_len, per_chunk_tokens, overlap_tokens)): + expanded.append((f"{path}#chunk{i+1}", ch)) + selected = 
fit_context_under_budget(expanded, tok_len, ctx_budget_tokens) + ctx = "" + for title, ch in selected: + ctx += f"\n\n=== {title} ===\n{ch}" + return ctx.strip()