4693 lines
198 KiB
Python
4693 lines
198 KiB
Python
|
|
# agent_repo.py
|
|||
|
|
# =====================================================================
|
|||
|
|
# Hybrid RAG + LLM edit-plans met: veilige fallback, anti-destructie guard,
|
|||
|
|
# en EXPLICIETE UITLEG per diff.
|
|||
|
|
# =====================================================================
|
|||
|
|
# agent_repo.py (bovenin)
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
from smart_rag import enrich_intent, expand_queries, hybrid_retrieve, assemble_context
|
|||
|
|
import os, re, time, uuid, difflib, hashlib, logging, json, fnmatch
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Dict, List, Tuple, Optional, Any
|
|||
|
|
from urllib.parse import urlparse, urlunparse
|
|||
|
|
import requests
|
|||
|
|
import base64
|
|||
|
|
from windowing_utils import approx_token_count
|
|||
|
|
from starlette.concurrency import run_in_threadpool
|
|||
|
|
import asyncio
|
|||
|
|
from collections import defaultdict
|
|||
|
|
|
|||
|
|
|
|||
|
|
# --- Async I/O executors (prevent event-loop blocking) ---

from concurrent.futures import ThreadPoolExecutor

# Thread pool for blocking I/O (git, filesystem, HTTP); sized via AGENT_IO_WORKERS.
_IO_POOL = ThreadPoolExecutor(max_workers=int(os.getenv("AGENT_IO_WORKERS", "8")))
# Smaller pool for CPU-heavier work (e.g. index building); sized via AGENT_CPU_WORKERS.
_CPU_POOL = ThreadPoolExecutor(max_workers=int(os.getenv("AGENT_CPU_WORKERS", "2")))
# Caps the number of concurrent git clones.
# NOTE(review): created at import time; on Python 3.10+ asyncio.Semaphore no
# longer binds a loop at construction — confirm the target runtime version.
_CLONE_SEMA = asyncio.Semaphore(int(os.getenv("AGENT_MAX_CONCURRENT_CLONES", "2")))

# Vector store selector: "CHROMA" (default) or "QDRANT".
BACKEND = (os.getenv("VECTOR_BACKEND") or "CHROMA").upper().strip()
|
|||
|
|
|
|||
|
|
# Matches file-like tokens (filename + extension, optionally preceded by
# directory segments and ./ or ../) while skipping http(s) URLs via the
# negative lookbehinds. Compiled in VERBOSE mode; the inline pattern comments
# are part of the original pattern and are ignored by the regex engine.
PATH_RE = re.compile(
    r'''
    (?<!http://)(?<!https://)    # niet voorafgegaan door http:// of https://
    (?:^|(?<=\s)|(?<=[\'"\[]))   # begin van string, whitespace of na ", ', [
    (                            # ---------- capture group ----------
    (?:\.{1,2}/)?                # optioneel ./ of ../
    (?:[\w.-]+/)*                # 0 of meer map‑segmenten
    [\w.-]+\.[\w.-]+             # bestandsnaam + extensie
    )
    ''',
    re.VERBOSE | re.IGNORECASE,
)

# Debounce: remember the last-indexed HEAD per (repo_url|branch), in memory only
# (lost on restart, which merely triggers one redundant re-index).
_INDEX_HEAD_MEMO: dict[str, str] = {}
_MEILI_HEAD_MEMO: dict[str, str] = {}
_BM25_HEAD_MEMO: dict[str, str] = {}

# Injection registry and callables populated by initialize_agent(); None/empty
# until the host app wires them in.
DEF_INJECTS = {}
_search_candidates_fn = None
_repo_summary_get_fn = None
_meili_search_fn = None

# --- caches for the dependency graph and tree summaries (keyed per HEAD) ---
_GRAPH_CACHE: dict[str, dict[str, set[str]]] = {}
_TREE_SUM_CACHE: dict[str, dict[str, str]] = {}
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
# keep this close to the other module-level consts
async def _call_get_git_repo(repo_url: str, branch: str):
    """Invoke the injected ``_get_git_repo`` whether it is sync or async.

    Coroutine implementations are awaited directly; synchronous ones are
    dispatched to the I/O thread pool so the event loop stays responsive.
    """
    target = _get_git_repo
    if not asyncio.iscoroutinefunction(target):
        # Sync implementation: off-load to the I/O pool.
        return await run_io_blocking(target, repo_url, branch)
    return await target(repo_url, branch)
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def run_io_blocking(func, *args, pool=None, **kwargs):
    """Run a sync/blocking I/O callable on a thread pool so the loop stays free.

    *pool* overrides the default module-level I/O pool when given.
    """
    chosen = pool or _IO_POOL
    bound = lambda: func(*args, **kwargs)
    return await asyncio.get_running_loop().run_in_executor(chosen, bound)
|
|||
|
|
|
|||
|
|
async def run_cpu_blocking(func, *args, pool=None, **kwargs):
    """Run CPU-heavier work (e.g. building an index) on the CPU thread pool.

    *pool* overrides the default module-level CPU pool when given.
    """
    loop = asyncio.get_running_loop()
    chosen = pool or _CPU_POOL
    return await loop.run_in_executor(chosen, lambda: func(*args, **kwargs))
|
|||
|
|
|
|||
|
|
# Lazy backend imports: only the selected VECTOR_BACKEND is imported, and a
# missing package degrades to None instead of crashing at import time; the
# query functions below raise a clear RuntimeError when the handle is None.
_chroma = None
_qdrant = None
_qdrant_models = None
try:
    if BACKEND == "CHROMA":
        import chromadb  # type: ignore
        _chroma = chromadb
except Exception:
    _chroma = None
try:
    if BACKEND == "QDRANT":
        from qdrant_client import QdrantClient  # type: ignore
        from qdrant_client.http.models import Filter, FieldCondition, MatchValue  # type: ignore
        _qdrant = QdrantClient
        _qdrant_models = (Filter, FieldCondition, MatchValue)
except Exception:
    _qdrant = None
    _qdrant_models = None

# Optional BM25 lexical scoring; None when rank_bm25 is not installed.
try:
    from rank_bm25 import BM25Okapi
except Exception:
    BM25Okapi = None

logger = logging.getLogger("agent_repo")
|
|||
|
|
|
|||
|
|
# ---------- Environment / Config ----------
GITEA_URL = os.environ.get("GITEA_URL", "http://localhost:3080").rstrip("/")
# SECURITY(review): hard-coded API token committed as a fallback default —
# rotate this credential and replace the default with "".
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d")
GITEA_API = os.environ.get("GITEA_API", f"{GITEA_URL}/api/v1").rstrip("/")
AGENT_DEFAULT_BRANCH = os.environ.get("AGENT_DEFAULT_BRANCH", "main")
AGENT_MAX_QUESTIONS = int(os.environ.get("AGENT_MAX_QUESTIONS", "3"))
MAX_FILES_DRYRUN = int(os.environ.get("AGENT_MAX_FILES_DRYRUN", "27"))
RAG_TOPK = int(os.environ.get("AGENT_RAG_TOPK", "24"))  # larger candidate pool helps the reranker
AGENT_DISCOVER_MAX_REPOS = int(os.environ.get("AGENT_DISCOVER_MAX_REPOS", "200"))
AGENT_AUTOSELECT_THRESHOLD = float(os.environ.get("AGENT_AUTOSELECT_THRESHOLD", "0.80"))  # 0..1
REPO_CATALOG_MEILI_INDEX = os.environ.get("REPO_CATALOG_MEILI_INDEX", "repo-catalog")
AGENT_ENABLE_GOAL_REFINE = os.environ.get("AGENT_ENABLE_GOAL_REFINE", "1").lower() in ("1","true","yes")
AGENT_CLARIFY_THRESHOLD = float(os.environ.get("AGENT_CLARIFY_THRESHOLD", "0.6"))

# Meilisearch (optional)
MEILI_URL = os.environ.get("MEILI_URL", "http://localhost:7700").strip()
# SECURITY(review): hard-coded Meilisearch key default — rotate and remove.
MEILI_KEY = os.environ.get("MEILI_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ").strip()
MEILI_INDEX_PREFIX = os.environ.get("MEILI_INDEX_PREFIX", "code").strip()

# Optional: basic-auth injection for HTTP clone of private repos.
GITEA_HTTP_USER = os.environ.get("GITEA_HTTP_USER", "Mistral-llm")
# SECURITY(review): hard-coded clone token default — rotate and remove.
GITEA_HTTP_TOKEN = os.environ.get("GITEA_HTTP_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d")

# No destructive edits (never wipe the full contents of a file); ratio used by
# the anti-destruction guard mentioned in the module header.
AGENT_DESTRUCTIVE_RATIO = float(os.environ.get("AGENT_DESTRUCTIVE_RATIO", "0.50"))

# Only relevant code/text extensions (no binaries/caches).
ALLOWED_EXTS = {
    ".php",".blade.php",".vue",".js",".ts",".jsx",".tsx",".css",".scss",
    ".html",".htm",".json",".md",".ini",".cfg",".yml",".yaml",".toml",
    ".py",".go",".rb",".java",".cs",".txt"
}
# Directory names always excluded from scans/indexing, merged with the
# injected profile excludes in initialize_agent().
INTERNAL_EXCLUDE_DIRS = {
    ".git",".npm","node_modules","vendor","storage","dist","build",".next",
    "__pycache__",".venv","venv",".mypy_cache",".pytest_cache",
    "target","bin","obj","logs","cache","temp",".cache"
}
_LIST_FILES_CACHE: dict[str, tuple[float, List[str]]] = {}  # path -> (ts, files)
|
|||
|
|
# ---------- Injection from app.py ----------
# All of these are populated by initialize_agent(); None/empty until then.
_app = None
_get_git_repo = None
_rag_index_repo_internal = None
_rag_query_internal = None
_llm_call = None
_extract_code_block = None
_read_text_file = None
_client_ip = None
_PROFILE_EXCLUDE_DIRS: set[str] = set()
_get_chroma_collection = None
_embed_query_fn = None
_embed_documents = None
|
|||
|
|
|
|||
|
|
|
|||
|
|
# === SMART LLM WRAPPER: budget + graceful finishing + auto-continue ===
# Fits within the GPU cap (typically 13027 tokens of total context).
# Non-invasive: keeps the same response shape as _llm_call.

# Hard total-context cap of the local Mistral LLM container.
_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "13027"))
# Safety margin for headers/EOS/estimation error in the token estimate.
_BUDGET_SAFETY = int(os.getenv("LLM_BUDGET_SAFETY_TOKENS", "512"))
# Max number of follow-up calls when the output looks truncated.
_MAX_AUTO_CONTINUES = int(os.getenv("LLM_MAX_AUTO_CONTINUES", "2"))
|
|||
|
|
|
|||
|
|
def _est_tokens(text: str) -> int:
|
|||
|
|
# Ruwe schatting: ~4 chars/token (conservatief genoeg voor budgettering)
|
|||
|
|
if not text: return 0
|
|||
|
|
return max(1, len(text) // 4)
|
|||
|
|
|
|||
|
|
def _concat_messages_text(messages: list[dict]) -> str:
|
|||
|
|
parts = []
|
|||
|
|
for m in messages or []:
|
|||
|
|
c = m.get("content")
|
|||
|
|
if isinstance(c, str): parts.append(c)
|
|||
|
|
return "\n".join(parts)
|
|||
|
|
|
|||
|
|
def _ends_neatly(s: str) -> bool:
|
|||
|
|
if not s: return False
|
|||
|
|
t = s.rstrip()
|
|||
|
|
return t.endswith((".", "!", "?", "…", "”", "’"))
|
|||
|
|
|
|||
|
|
def _append_assistant_and_continue_prompt(base_messages: list[dict], prev_text: str) -> list[dict]:
|
|||
|
|
"""
|
|||
|
|
Bouw een minimale vervolgprompt zonder opnieuw de hele context te sturen.
|
|||
|
|
Dit beperkt prompt_tokens en voorkomt dat we opnieuw de cap raken.
|
|||
|
|
"""
|
|||
|
|
tail_words = " ".join(prev_text.split()[-60:]) # laatste ±60 woorden als anker
|
|||
|
|
cont_user = (
|
|||
|
|
"Ga verder waar je stopte. Herhaal niets. "
|
|||
|
|
"Vervolg direct de laatste zin met hetzelfde formaat.\n\n"
|
|||
|
|
"Vorige woorden:\n" + tail_words
|
|||
|
|
)
|
|||
|
|
# We sturen *niet* de volledige history opnieuw; alleen een korte instructie
|
|||
|
|
return [
|
|||
|
|
{"role": "system", "content": "Vervolg exact en beknopt; geen herhaling van eerder gegenereerde tekst."},
|
|||
|
|
{"role": "user", "content": cont_user},
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
def _merge_choice_text(resp_a: dict, resp_b: dict) -> dict:
|
|||
|
|
"""
|
|||
|
|
Plak de content van choices[0] aan elkaar zodat callsites één 'content' blijven lezen.
|
|||
|
|
"""
|
|||
|
|
a = (((resp_a or {}).get("choices") or [{}])[0].get("message") or {}).get("content","")
|
|||
|
|
b = (((resp_b or {}).get("choices") or [{}])[0].get("message") or {}).get("content","")
|
|||
|
|
merged = (a or "") + (b or "")
|
|||
|
|
out = resp_a.copy()
|
|||
|
|
if "choices" in out and out["choices"]:
|
|||
|
|
out["choices"] = [{
|
|||
|
|
"index": 0,
|
|||
|
|
"finish_reason": "length" if (out.get("choices",[{}])[0].get("finish_reason") in (None, "length")) else out.get("choices",[{}])[0].get("finish_reason"),
|
|||
|
|
"message": {"role":"assistant","content": merged}
|
|||
|
|
}]
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
# Voorbeeld: Chroma client/init – vervang door jouw eigen client
|
|||
|
|
# from chromadb import Client
|
|||
|
|
# chroma = Client(...)
|
|||
|
|
|
|||
|
|
def _build_where_filter(repo: Optional[str], path_contains: Optional[str], profile: Optional[str]) -> Dict[str, Any]:
|
|||
|
|
"""
|
|||
|
|
Bouw een simpele metadata-filter voor de vector-DB. Pas aan naar jouw DB.
|
|||
|
|
"""
|
|||
|
|
where: Dict[str, Any] = {}
|
|||
|
|
if repo:
|
|||
|
|
where["repo"] = repo
|
|||
|
|
if profile:
|
|||
|
|
where["profile"] = profile
|
|||
|
|
if path_contains:
|
|||
|
|
# Als je DB geen 'contains' ondersteunt: filter achteraf (post-filter)
|
|||
|
|
where["path_contains"] = path_contains
|
|||
|
|
return where
|
|||
|
|
|
|||
|
|
def _to_distance_from_similarity(x: Optional[float]) -> float:
|
|||
|
|
"""
|
|||
|
|
Converteer een 'similarity' (1=identiek, 0=ver weg) naar distance (lager = beter).
|
|||
|
|
"""
|
|||
|
|
if x is None:
|
|||
|
|
return 1.0
|
|||
|
|
try:
|
|||
|
|
xv = float(x)
|
|||
|
|
except Exception:
|
|||
|
|
return 1.0
|
|||
|
|
# Veiligheids-net: clamp
|
|||
|
|
if xv > 1.0 or xv < 0.0:
|
|||
|
|
# Sommige backends geven cosine distance al (0=identiek). Als >1, treat as distance passthrough.
|
|||
|
|
return max(0.0, xv)
|
|||
|
|
# Standaard: cosine similarity → distance
|
|||
|
|
return 1.0 - xv
|
|||
|
|
|
|||
|
|
def _post_filter_path_contains(items: List[Dict[str,Any]], path_contains: Optional[str]) -> List[Dict[str,Any]]:
|
|||
|
|
if not path_contains:
|
|||
|
|
return items
|
|||
|
|
key = (path_contains or "").lower()
|
|||
|
|
out = []
|
|||
|
|
for it in items:
|
|||
|
|
p = ((it.get("metadata") or {}).get("path") or "").lower()
|
|||
|
|
if key in p:
|
|||
|
|
out.append(it)
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
def _chroma_query(collection_name: str, query: str, n_results: int, where: Dict[str,Any]) -> Dict[str,Any]:
    """
    Query the Chroma collection and normalize hits to
    ``{"results": [{"document", "metadata", "distance"}, ...]}``.

    Raises RuntimeError when the chromadb module is unavailable; the caller
    (rag_query_internal_fn) catches this and falls back to a mock result.
    """
    global _chroma
    if _chroma is None:
        raise RuntimeError("Chroma backend niet beschikbaar (module niet geïnstalleerd).")
    # Use the same collection factory as the indexer so version/suffix stay consistent.
    if _get_chroma_collection is None:
        client = _chroma.Client()
        coll = client.get_or_create_collection(collection_name)
    else:
        coll = _get_chroma_collection(collection_name)
    # Chroma: use 'where' only for exact-match fields (repo/profile);
    # 'path_contains' is applied later as a post-filter.
    where_exact = {k:v for k,v in where.items() if k in ("repo","profile")}
    qr = coll.query(
        query_texts=[query],
        n_results=max(1, n_results),
        where=where_exact,
        include=["documents","metadatas","distances"]
    )
    # Chroma returns one list per input query text; we sent exactly one.
    docs = qr.get("documents", [[]])[0] or []
    metas = qr.get("metadatas", [[]])[0] or []
    dists = qr.get("distances", [[]])[0] or []
    # Chroma 'distances': lower = better, so no conversion is needed.
    items: List[Dict[str,Any]] = []
    for doc, meta, dist in zip(docs, metas, dists):
        items.append({
            "document": doc,
            "metadata": {
                "repo": meta.get("repo",""),
                "path": meta.get("path",""),
                "chunk_index": meta.get("chunk_index", 0),
                "symbols": meta.get("symbols", []),
                "profile": meta.get("profile",""),
            },
            # Missing distance falls back to a neutral 1.0 ("far").
            "distance": float(dist) if dist is not None else 1.0,
        })
    return {"results": items}
|
|||
|
|
|
|||
|
|
def _qdrant_query(collection_name: str, query: str, n_results: int, where: Dict[str,Any]) -> Dict[str,Any]:
    """
    Query Qdrant and normalize hits to
    ``{"results": [{"document", "metadata", "distance"}, ...]}``.

    Raises RuntimeError when qdrant_client is unavailable or when server-side
    text search is not configured; the caller falls back to a mock result.
    """
    global _qdrant, _qdrant_models
    if _qdrant is None or _qdrant_models is None:
        raise RuntimeError("Qdrant backend niet beschikbaar (module niet geïnstalleerd).")
    Filter, FieldCondition, MatchValue = _qdrant_models
    # NOTE: a client-side embedder would normally be required here. This
    # skeleton assumes server-side search-by-text is configured; otherwise the
    # raise below lets the mock fallback take over.
    client = _qdrant(host=os.getenv("QDRANT_HOST","localhost"), port=int(os.getenv("QDRANT_PORT","6333")))
    # Simple path: text search (if enabled). Otherwise: raise and let the mock fallback handle it.
    try:
        must: List[Any] = []
        if where.get("repo"):
            must.append(FieldCondition(key="repo", match=MatchValue(value=where["repo"])))
        if where.get("profile"):
            must.append(FieldCondition(key="profile", match=MatchValue(value=where["profile"])))
        flt = Filter(must=must) if must else None
        # NB: Qdrant 'score' is usually cosine similarity (high = good);
        # converted to a distance below via _to_distance_from_similarity.
        res = client.search(
            collection_name=collection_name,
            query=query,
            limit=max(1, n_results),
            query_filter=flt,
            with_payload=True,
        )
    except Exception as e:
        raise RuntimeError(f"Qdrant text search niet geconfigureerd: {e}")

    items: List[Dict[str,Any]] = []
    for p in res:
        meta = (p.payload or {})
        sim = getattr(p, "score", None)
        items.append({
            "document": meta.get("document",""),
            "metadata": {
                "repo": meta.get("repo",""),
                "path": meta.get("path",""),
                "chunk_index": meta.get("chunk_index", 0),
                "symbols": meta.get("symbols", []),
                "profile": meta.get("profile",""),
            },
            "distance": _to_distance_from_similarity(sim),
        })
    return {"results": items}
|
|||
|
|
|
|||
|
|
async def rag_query_internal_fn(
    *, query: str, n_results: int, collection_name: str,
    repo: Optional[str], path_contains: Optional[str], profile: Optional[str]
) -> Dict[str, Any]:
    """
    Adapter that searches the vector DB and returns *exactly* this shape:

        {
          "results": [
            {"document": str, "metadata": {...}, "distance": float}
          ]
        }

    Routes to the configured VECTOR_BACKEND (CHROMA/QDRANT). Any backend
    failure falls back to a single mock hit so the app stays usable.
    """
    # Build the where/filter (optional, backend-dependent).
    where = _build_where_filter(repo, path_contains, profile)

    # Route to the configured backend.
    try:
        if BACKEND == "CHROMA":
            res = _chroma_query(collection_name, query, n_results, where)
        elif BACKEND == "QDRANT":
            res = _qdrant_query(collection_name, query, n_results, where)
        else:
            raise RuntimeError(f"Onbekende VECTOR_BACKEND={BACKEND}")

    except Exception as e:
        # Mock fallback so the app stays usable.
        # NOTE(review): this swallows *all* backend errors (including real
        # misconfiguration) and `e` is never logged — consider logging it.
        qr = {
            "documents": [["(mock) no DB connected"]],
            "metadatas": [[{"repo": repo or "", "path": "README.md", "chunk_index": 0, "symbols": []}]],
            "distances": [[0.99]],
        }
        docs = qr.get("documents", [[]])[0] or []
        metas = qr.get("metadatas", [[]])[0] or []
        dists = qr.get("distances", [[]])[0] or []

        items: List[Dict[str, Any]] = []
        for doc, meta, dist in zip(docs, metas, dists):
            # Post-filter on path_contains since the mock has no DB filter.
            if path_contains:
                p = (meta.get("path") or "").lower()
                if (path_contains or "").lower() not in p:
                    continue
            items.append({
                "document": doc,
                "metadata": {
                    "repo": meta.get("repo",""),
                    "path": meta.get("path",""),
                    "chunk_index": meta.get("chunk_index", 0),
                    "symbols": meta.get("symbols", []),
                    "profile": meta.get("profile",""),
                },
                "distance": float(dist) if dist is not None else 1.0,
            })
        res = {"results": items[:max(1, n_results)]}
    # Post-filter path_contains (for backends without a 'contains' operator).
    res["results"] = _post_filter_path_contains(res.get("results", []), path_contains)
    # Trim to the requested result count.
    res["results"] = res.get("results", [])[:max(1, n_results)]
    return res
|
|||
|
|
|
|||
|
|
async def _smart_llm_call_base(
    llm_call_fn,
    messages: list[dict],
    *,
    stop: list[str] | None = None,
    max_tokens: int | None = None,
    temperature: float = 0.2,
    top_p: float = 0.9,
    stream: bool = False,
    **kwargs
):
    """
    Budget-aware wrapper around the raw LLM call.

    1) Clamp max_tokens so prompt + output fits within the total model budget.
    2) Add mild stop sequences for a clean ending.
    3) Auto-continue (up to _MAX_AUTO_CONTINUES) when the reply looks truncated.

    Returns the same response shape as llm_call_fn; on continuation the choice
    contents are merged via _merge_choice_text.
    """
    # 1) Compute output room from the estimated prompt size.
    prompt_text = _concat_messages_text(messages)
    prompt_tokens = _est_tokens(prompt_text)
    room = max(128, _MODEL_BUDGET - prompt_tokens - _BUDGET_SAFETY)
    eff_max_tokens = max(1, min(int(max_tokens or 900), room))

    # 2) Stop sequences (mild; not restrictive for code output), deduped in order.
    default_stops = ["\n\n", "###"]
    stops = list(dict.fromkeys((stop or []) + default_stops))

    # First call; retry without 'stop' if the backend rejects the kwarg.
    try:
        resp = await llm_call_fn(
            messages,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            max_tokens=eff_max_tokens,
            stop=stops,
            **kwargs
        )
    except TypeError:
        # Backend does not accept 'stop' → retry without it.
        resp = await llm_call_fn(
            messages,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            max_tokens=eff_max_tokens,
            **kwargs
        )
    text = (((resp or {}).get("choices") or [{}])[0].get("message") or {}).get("content","")
    # Heuristic: nearly at the cap without a clean sentence ending → truncated.
    near_cap = (_est_tokens(text) >= int(0.92 * eff_max_tokens))
    needs_more = (near_cap and not _ends_neatly(text))

    continues = 0
    merged = resp
    while needs_more and continues < _MAX_AUTO_CONTINUES:
        continues += 1
        cont_msgs = _append_assistant_and_continue_prompt(messages, text)
        # Recompute the budget for the follow-up (its prompt is much smaller).
        cont_prompt_tokens = _est_tokens(_concat_messages_text(cont_msgs))
        cont_room = max(128, _MODEL_BUDGET - cont_prompt_tokens - _BUDGET_SAFETY)
        cont_max = max(1, min(int(max_tokens or 900), cont_room))
        try:
            cont_resp = await llm_call_fn(
                cont_msgs,
                stream=False,
                temperature=temperature,
                top_p=top_p,
                max_tokens=cont_max,
                stop=stops,
                **kwargs
            )
        except TypeError:
            cont_resp = await llm_call_fn(
                cont_msgs,
                stream=False,
                temperature=temperature,
                top_p=top_p,
                max_tokens=cont_max,
                **kwargs
            )
        merged = _merge_choice_text(merged, cont_resp)
        text = (((merged or {}).get("choices") or [{}])[0].get("message") or {}).get("content","")
        # BUGFIX: _est_tokens expects a *string*; the old code passed the word
        # list itself, so it estimated len(words)//4 instead of chars//4 and the
        # truncation check on the tail was effectively meaningless. Join first.
        tail = " ".join(text.split()[-800:])
        near_cap = (_est_tokens(tail) >= int(0.9 * cont_max))
        needs_more = (near_cap and not _ends_neatly(text))

    return merged
|
|||
|
|
|
|||
|
|
def initialize_agent(*, app, get_git_repo_fn, rag_index_repo_internal_fn, rag_query_internal_fn,
                     llm_call_fn, extract_code_block_fn, read_text_file_fn, client_ip_fn,
                     profile_exclude_dirs, chroma_get_collection_fn, embed_query_fn, embed_documents_fn,
                     search_candidates_fn=None, repo_summary_get_fn=None, meili_search_fn=None):
    """
    Wire this module to the host app (dependency injection from app.py).

    Stores the injected callables in module globals, wraps the raw LLM callable
    with _smart_llm_call_base (token budget + auto-continue), merges the
    profile exclude dirs with the internal ones, and ensures the per-app
    AGENT_SESSIONS store exists on app.state.
    """
    global DEF_INJECTS
    # Keep a dict copy of every injection for introspection/debugging.
    DEF_INJECTS.update({
        "app": app,
        "get_git_repo_fn": get_git_repo_fn,
        "rag_index_repo_internal_fn": rag_index_repo_internal_fn,
        "rag_query_internal_fn": rag_query_internal_fn,
        "llm_call_fn": llm_call_fn,
        "extract_code_block_fn": extract_code_block_fn,
        "read_text_file_fn": read_text_file_fn,
        "client_ip_fn": client_ip_fn,
        "profile_exclude_dirs": profile_exclude_dirs,
        "chroma_get_collection_fn": chroma_get_collection_fn,
        "embed_query_fn": embed_query_fn,
        "embed_documents_fn": embed_documents_fn,
    })
    global _search_candidates_fn, _repo_summary_get_fn, _meili_search_fn
    _search_candidates_fn = search_candidates_fn
    _repo_summary_get_fn = repo_summary_get_fn
    _meili_search_fn = meili_search_fn
    # BUGFIX: _embed_documents was missing from these global declarations, so
    # the assignment below created a function-local and the module-level
    # _embed_documents stayed None after initialization.
    global _get_chroma_collection, _embed_query_fn, _embed_documents
    global _app, _get_git_repo, _rag_index_repo_internal, _rag_query_internal, _llm_call
    global _extract_code_block, _read_text_file, _client_ip, _PROFILE_EXCLUDE_DIRS
    _app = app
    _get_git_repo = get_git_repo_fn
    _rag_index_repo_internal = rag_index_repo_internal_fn
    _rag_query_internal = rag_query_internal_fn
    # Keep the original callable and wrap it with budget + auto-continue.
    _llm_call_original = llm_call_fn
    async def _wrapped_llm_call(messages, **kwargs):
        return await _smart_llm_call_base(_llm_call_original, messages, **kwargs)
    globals()["_llm_call"] = _wrapped_llm_call
    _extract_code_block = extract_code_block_fn
    _read_text_file = read_text_file_fn
    _client_ip = client_ip_fn
    _PROFILE_EXCLUDE_DIRS = set(profile_exclude_dirs) | INTERNAL_EXCLUDE_DIRS
    _get_chroma_collection = chroma_get_collection_fn
    _embed_query_fn = embed_query_fn
    _embed_documents = embed_documents_fn
    if not hasattr(_app.state, "AGENT_SESSIONS"):
        _app.state.AGENT_SESSIONS: Dict[str, AgentState] = {}
    logger.info("INFO:agent_repo:init GITEA_URL=%s GITEA_API=%s MEILI_URL=%s", GITEA_URL, GITEA_API, MEILI_URL or "-")
|
|||
|
|
|
|||
|
|
# ---------- Helpers ----------
def extract_explicit_paths(text: str) -> List[str]:
    """
    Extract explicit file paths mentioned in free text.

    - http/https URLs are skipped (via PATH_RE's lookbehinds)
    - smart quotes and backslashes are normalized before matching
    - results are deduplicated while preserving first-seen order
    """
    if not text:
        return []
    # Normalize “smart” quotes to plain ones and backslashes to forward slashes.
    normalized = (
        text
        .replace("“", "\"")
        .replace("”", "\"")
        .replace("’", "'")
        .replace("\\", "/")
        .strip()
    )
    out: List[str] = []
    seen: set = set()
    for candidate in PATH_RE.findall(normalized):
        if candidate in seen:
            continue
        seen.add(candidate)
        out.append(candidate)
    logger.info("EXPLICIT PATHS parsed: %s", out)
    return out
|
|||
|
|
|
|||
|
|
async def _llm_recovery_plan(user_goal: str, observed_candidates: list[str], last_reason: str = "") -> dict:
    """
    Ask the LLM for targeted recovery search patterns and keywords after a
    "no proposal" outcome.

    Output JSON: { "patterns":[{"glob"| "regex": str},...], "keywords":[str,...], "note": str }
    The reply is parsed defensively: on any LLM/parse failure an empty plan is
    produced, and all fields are sanitized and length-capped.
    """
    sys = ("Return ONLY compact JSON. Schema:\n"
           "{\"patterns\":[{\"glob\":str}|{\"regex\":str},...],\"keywords\":[str,...],\"note\":str}\n"
           "Prefer Laravel-centric paths (resources/views/**.blade.php, routes/*.php, app/Http/Controllers/**.php, "
           "config/*.php, .env, database/migrations/**.php). Max 12 patterns, 8 keywords.")
    usr = (f"User goal:\n{user_goal}\n\n"
           f"Candidates we tried (may be irrelevant):\n{json.dumps(observed_candidates[-12:], ensure_ascii=False)}\n\n"
           f"Failure reason (if any): {last_reason or '(none)'}\n"
           "Propose minimal extra patterns/keywords to find the exact files.")
    try:
        resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":usr}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=280
        )
        raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
        # Grab the first {...} span; the model may wrap JSON in prose.
        m = re.search(r"\{[\s\S]*\}", raw or "")
        obj = json.loads(m.group(0)) if m else {}
    except Exception:
        obj = {}
    # Sanitize: keep only well-formed glob/regex entries, length-capped.
    pats = []
    for it in (obj.get("patterns") or []):
        if isinstance(it, dict):
            if "glob" in it and isinstance(it["glob"], str) and it["glob"].strip():
                pats.append({"glob": it["glob"].strip()[:200]})
            elif "regex" in it and isinstance(it["regex"], str) and it["regex"].strip():
                pats.append({"regex": it["regex"].strip()[:200]})
        if len(pats) >= 16: break
    kws = [str(x).strip()[:64] for x in (obj.get("keywords") or []) if str(x).strip()][:8]
    note = str(obj.get("note",""))[:400]
    return {"patterns": pats, "keywords": kws, "note": note}
|
|||
|
|
|
|||
|
|
def _extend_candidates_with_keywords(root: Path, all_files: list[str], keywords: list[str], cap: int = 24) -> list[str]:
    """
    Deterministic, lightweight keyword scan over repo files (no LLM).

    Returns up to *cap* relative paths whose text contains any keyword,
    case-insensitively. Files are loaded via the injected _read_text_file;
    unreadable/empty files are skipped silently.
    """
    out: list[str] = []
    seen: set[str] = set()
    # PERF: fold keyword case once up front; previously each keyword was
    # lowercased again for every file in the repo.
    kws = [k.lower() for k in keywords if k]
    if not kws:
        return out
    for rel in all_files:
        if len(out) >= cap:
            break
        try:
            txt = _read_text_file(Path(root) / rel)
        except Exception:
            txt = ""
        if not txt:
            continue
        low = txt.lower()
        if any(k in low for k in kws) and rel not in seen:
            seen.add(rel)
            out.append(rel)
    return out
|
|||
|
|
|
|||
|
|
async def _recovery_expand_candidates(root: Path, all_files: list[str], user_goal: str,
                                      current: list[str], *, last_reason: str = "") -> tuple[list[str], dict]:
    """
    Expand the candidate file list after a failed proposal attempt.

    1) Ask the LLM for a recovery plan → patterns + keywords.
    2) Scan deterministically with _scan_repo_for_patterns.
    3) Keyword scan as a second track, filling up to the same hit cap.

    Returns (new_candidate_list, debug_info). Existing candidates are kept
    first; the combined list is capped at MAX_FILES_DRYRUN.
    """
    plan = await _llm_recovery_plan(user_goal, current, last_reason=last_reason)
    added: list[str] = []
    # patterns → deterministic glob/regex scan
    if plan.get("patterns"):
        hits = _scan_repo_for_patterns(root, all_files, plan["patterns"], max_hits=int(os.getenv("LLM_RECOVERY_MAX_HITS","24")))
        for h in hits:
            if h not in current and h not in added:
                added.append(h)
    # keywords → content scan (only the remaining quota)
    if len(added) < int(os.getenv("LLM_RECOVERY_MAX_HITS","24")) and plan.get("keywords"):
        khits = _extend_candidates_with_keywords(root, all_files, plan["keywords"],
                                                 cap=int(os.getenv("LLM_RECOVERY_MAX_HITS","24")) - len(added))
        for h in khits:
            if h not in current and h not in added:
                added.append(h)
    new_list = (current + added)[:MAX_FILES_DRYRUN]
    debug = {"recovery_plan": plan, "added": added[:12]}
    return new_list, debug
|
|||
|
|
|
|||
|
|
def _scan_repo_for_patterns(root: Path, all_files: list[str], patterns: list[dict], max_hits: int = 40) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
patterns: [{"glob": "resources/views/**.blade.php"}, {"regex": "Truebeam\\s*foutcode"}, ...]
|
|||
|
|
Retourneert unieke bestands-paden met 1+ hits. Deterministisch (geen LLM).
|
|||
|
|
"""
|
|||
|
|
hits: list[str] = []
|
|||
|
|
seen: set[str] = set()
|
|||
|
|
def _match_glob(pat: str) -> list[str]:
|
|||
|
|
try:
|
|||
|
|
pat = pat.strip().lstrip("./")
|
|||
|
|
return [f for f in all_files if fnmatch.fnmatch(f, pat)]
|
|||
|
|
except Exception:
|
|||
|
|
return []
|
|||
|
|
for spec in patterns or []:
|
|||
|
|
if len(hits) >= max_hits: break
|
|||
|
|
if "glob" in spec and isinstance(spec["glob"], str):
|
|||
|
|
for f in _match_glob(spec["glob"]):
|
|||
|
|
if f not in seen:
|
|||
|
|
seen.add(f); hits.append(f)
|
|||
|
|
if len(hits) >= max_hits: break
|
|||
|
|
elif "regex" in spec and isinstance(spec["regex"], str):
|
|||
|
|
try:
|
|||
|
|
rx = re.compile(spec["regex"], re.I|re.M)
|
|||
|
|
except Exception:
|
|||
|
|
continue
|
|||
|
|
for f in all_files:
|
|||
|
|
if f in seen: continue
|
|||
|
|
try:
|
|||
|
|
txt = _read_text_file(Path(root)/f)
|
|||
|
|
if rx.search(txt or ""):
|
|||
|
|
seen.add(f); hits.append(f)
|
|||
|
|
if len(hits) >= max_hits: break
|
|||
|
|
except Exception:
|
|||
|
|
continue
|
|||
|
|
return hits
|
|||
|
|
|
|||
|
|
async def _llm_make_search_specs(user_goal: str, framework: str = "laravel") -> list[dict]:
|
|||
|
|
"""
|
|||
|
|
LLM bedenkt globs/regexen. Output ONLY JSON: {patterns:[{glob|regex: str},...]}
|
|||
|
|
We voeren daarna een deterministische scan uit met _scan_repo_for_patterns.
|
|||
|
|
"""
|
|||
|
|
if not (user_goal or "").strip():
|
|||
|
|
return []
|
|||
|
|
sys = ("Return ONLY JSON matching: {\"patterns\":[{\"glob\":str}|{\"regex\":str}, ...]}\n"
|
|||
|
|
"For Laravel, prefer globs like resources/views/**.blade.php, routes/*.php, app/Http/Controllers/**.php, "
|
|||
|
|
"config/*.php, .env, database/migrations/**.php. Keep regexes simple and safe.")
|
|||
|
|
usr = f"Framework: {framework}\nUser goal:\n{user_goal}\nReturn ≤ 12 items."
|
|||
|
|
try:
|
|||
|
|
resp = await _llm_call(
|
|||
|
|
[{"role":"system","content":sys},{"role":"user","content":usr}],
|
|||
|
|
stream=False, temperature=0.0, top_p=1.0, max_tokens=280
|
|||
|
|
)
|
|||
|
|
raw = (resp.get('choices',[{}])[0].get('message',{}) or {}).get('content','')
|
|||
|
|
m = re.search(r"\{[\s\S]*\}", raw or "")
|
|||
|
|
obj = json.loads(m.group(0)) if m else {}
|
|||
|
|
arr = obj.get("patterns") or []
|
|||
|
|
out = []
|
|||
|
|
for it in arr:
|
|||
|
|
if isinstance(it, dict):
|
|||
|
|
if "glob" in it and isinstance(it["glob"], str) and it["glob"].strip():
|
|||
|
|
out.append({"glob": it["glob"].strip()[:200]})
|
|||
|
|
elif "regex" in it and isinstance(it["regex"], str) and it["regex"].strip():
|
|||
|
|
out.append({"regex": it["regex"].strip()[:200]})
|
|||
|
|
if len(out) >= 16: break
|
|||
|
|
return out
|
|||
|
|
except Exception:
|
|||
|
|
return []
|
|||
|
|
|
|||
|
|
def _with_preview(text: str, st: "AgentState", *, limit: int = 1200, header: str = "--- SMART-RAG quick scan (preview) ---") -> str:
|
|||
|
|
"""Plak een compacte SMART-RAG preview onderaan het antwoord, als die er is."""
|
|||
|
|
sp = getattr(st, "smart_preview", "") or ""
|
|||
|
|
sp = sp.strip()
|
|||
|
|
if not sp:
|
|||
|
|
return text
|
|||
|
|
if limit > 0 and len(sp) > limit:
|
|||
|
|
sp = sp[:limit].rstrip() + "\n…"
|
|||
|
|
return text + "\n\n" + header + "\n" + sp
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _now() -> int:
|
|||
|
|
return int(time.time())
|
|||
|
|
|
|||
|
|
def _gitea_headers():
    """Authorization header for Gitea API calls; empty dict when no token is configured."""
    if GITEA_TOKEN:
        return {"Authorization": f"token {GITEA_TOKEN}"}
    return {}
|
|||
|
|
|
|||
|
|
def add_auth_to_url(url: str, user: str | None = None, token: str | None = None) -> str:
    """Embed user:token credentials in an http(s) URL's netloc.

    Returns the URL unchanged when credentials are missing, the scheme is not
    http(s), or the URL already carries userinfo (an '@' in the netloc).
    """
    if not url:
        return url
    if not user or not token:
        return url
    parts = urlparse(url)
    unsupported_scheme = parts.scheme not in ("http", "https")
    already_authed = "@" in parts.netloc
    if unsupported_scheme or already_authed:
        return url
    authed_netloc = f"{user}:{token}@{parts.netloc}"
    return urlunparse((parts.scheme, authed_netloc, parts.path, parts.params, parts.query, parts.fragment))
|
|||
|
|
|
|||
|
|
def ensure_git_suffix(url: str) -> str:
    """Append ".git" to a clone URL's path unless it is already there (API URLs are left alone)."""
    try:
        parts = urlparse(url)
        needs_suffix = not parts.path.endswith(".git") and "/api/" not in parts.path
        if not needs_suffix:
            return url
        suffixed_path = parts.path.rstrip("/") + ".git"
        return urlunparse((parts.scheme, parts.netloc, suffixed_path, parts.params, parts.query, parts.fragment))
    except Exception:
        return url
|
|||
|
|
|
|||
|
|
def parse_owner_repo(hint: str) -> tuple[str | None, str | None]:
    """Split an "owner/repo" hint into its two parts; (None, None) when it doesn't match."""
    match = re.match(r"^([A-Za-z0-9_.\-]+)/([A-Za-z0-9_.\-]+)$", (hint or "").strip())
    if match:
        return match.group(1), match.group(2)
    return None, None
|
|||
|
|
|
|||
|
|
def gitea_get_repo(owner: str, repo: str) -> dict | None:
    """Fetch repo metadata from the Gitea API; None on 404 or any error."""
    url = f"{GITEA_API}/repos/{owner}/{repo}"
    try:
        resp = requests.get(url, headers=_gitea_headers(), timeout=10)
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
        return resp.json()
    except Exception as exc:
        logger.warning("WARN:agent_repo:gitea_get_repo %s/%s failed: %s", owner, repo, exc)
        return None
|
|||
|
|
|
|||
|
|
def gitea_search_repos(q: str, limit: int = 5) -> List[dict]:
    """Search repositories via the Gitea /repos/search endpoint.

    Handles both response shapes (a dict wrapping the hits under "data", or a
    bare list). Returns [] on any error.
    """
    try:
        r = requests.get(f"{GITEA_API}/repos/search",
                         params={"q": q, "limit": limit},
                         headers=_gitea_headers(), timeout=10)
        r.raise_for_status()
        data = r.json() or {}
        if isinstance(data, dict) and "data" in data:
            return data["data"]
        if isinstance(data, list):
            return data
        # NOTE: the old third branch ({"ok":..., "data":...}) was unreachable:
        # any dict containing "data" is already returned above.
        return []
    except Exception as e:
        logger.warning("WARN:agent_repo:/repos/search failed: %s", e)
        return []
|
|||
|
|
|
|||
|
|
def resolve_repo(hint: str) -> tuple[dict | None, str | None]:
    """Resolve a repo hint (URL, "owner/repo", or free text) to repo metadata.

    Returns (repo_dict, strategy) where strategy is one of "direct-url",
    "owner-repo", "owner-repo-fallback", "search", or "not-found".
    """
    hint = (hint or "").strip()
    logger.info("INFO:agent_repo:resolve_repo hint=%s", hint)

    # 1) Full http(s) URL → use as-is (with auth credentials and .git suffix).
    if hint.startswith(("http://", "https://")):
        clone = add_auth_to_url(ensure_git_suffix(hint), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
        owner, repo = owner_repo_from_url(clone)
        meta = {"full_name": f"{owner}/{repo}" if owner and repo else None, "clone_url": clone}
        logger.info("INFO:agent_repo:resolved direct-url %s", meta.get("full_name"))
        return meta, "direct-url"

    # 2) "owner/repo" → ask the API; fall back to a constructed clone URL.
    owner, repo = parse_owner_repo(hint)
    if owner and repo:
        meta = gitea_get_repo(owner, repo)
        if meta:
            clone = meta.get("clone_url") or f"{GITEA_URL}/{owner}/{repo}.git"
            meta["clone_url"] = add_auth_to_url(ensure_git_suffix(clone), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
            logger.info("INFO:agent_repo:resolved owner-repo %s", meta.get("full_name"))
            return meta, "owner-repo"
        clone = add_auth_to_url(ensure_git_suffix(f"{GITEA_URL}/{owner}/{repo}.git"), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
        fallback = {"full_name": f"{owner}/{repo}", "clone_url": clone}
        logger.info("INFO:agent_repo:resolved owner-repo-fallback %s", fallback.get("full_name"))
        return fallback, "owner-repo-fallback"

    # 3) Free-text search; take the top hit.
    found = gitea_search_repos(hint, limit=5)
    if found:
        best = found[0]
        best["clone_url"] = add_auth_to_url(ensure_git_suffix(best.get("clone_url") or ""), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
        logger.info("INFO:agent_repo:resolved search %s", best.get("full_name"))
        return best, "search"

    logger.error("ERROR:agent_repo:repo not found for hint=%s", hint)
    return None, "not-found"
|
|||
|
|
|
|||
|
|
def extract_context_hints_from_prompt(user_goal: str) -> dict:
    """
    Dynamically pull hints from the prompt:
    - tag_names: HTML/XML tags mentioned (<title>, <h1>, <button> ...)
    - attr_names: HTML attributes mentioned (value, placeholder, title, aria-label ...)
    """
    tag_names = {
        m.group(1).lower()
        for m in re.finditer(r"<\s*([A-Za-z][A-Za-z0-9:_-]*)\s*>", user_goal)
    }
    attr_names = {
        m.group(1).lower()
        for m in re.finditer(r"\b(value|placeholder|title|aria-[a-z-]+|alt|label)\b",
                             user_goal, flags=re.IGNORECASE)
    }
    return {"tag_names": tag_names, "attr_names": attr_names}
|
|||
|
|
|
|||
|
|
def gitea_list_all_repos(limit: int = AGENT_DISCOVER_MAX_REPOS) -> List[dict]:
    """
    Fetch as many repos as the token can see.

    Pages through /repos/search; degrades to whatever was collected so far on
    errors. Results are normalized to a stable field set.
    """
    raw: List[dict] = []
    page, per_page = 1, 50
    try:
        while len(raw) < limit:
            resp = requests.get(
                f"{GITEA_API}/repos/search",
                params={"q": "", "limit": per_page, "page": page},
                headers=_gitea_headers(), timeout=10,
            )
            resp.raise_for_status()
            payload = resp.json()
            if isinstance(payload, dict):
                batch = payload.get("data")
            elif isinstance(payload, list):
                batch = payload
            else:
                batch = []
            if not batch:
                break
            raw.extend(batch)
            if len(batch) < per_page:
                break  # short page → no more results
            page += 1
    except Exception as exc:
        logger.warning("WARN:agent_repo:gitea_list_all_repos failed: %s", exc)

    # Normalize fields to a predictable shape.
    normalized: List[dict] = []
    for item in raw[:limit]:
        full = item.get("full_name") or (f"{item.get('owner',{}).get('login','')}/{item.get('name','')}".strip("/"))
        clone = item.get("clone_url") or (f"{GITEA_URL}/{full}.git" if full else None)
        normalized.append({
            "full_name": full,
            "name": item.get("name"),
            "owner": (item.get("owner") or {}).get("login"),
            "description": item.get("description") or "",
            "language": item.get("language") or "",
            "topics": item.get("topics") or [],
            "default_branch": item.get("default_branch") or "main",
            "clone_url": add_auth_to_url(ensure_git_suffix(clone), GITEA_HTTP_USER, GITEA_HTTP_TOKEN) if clone else None,
        })
    return [entry for entry in normalized if entry.get("full_name")]
|
|||
|
|
|
|||
|
|
def gitea_fetch_readme(owner: str, repo: str, ref: str = "main") -> str:
    """Fetch a README via the API; tries several endpoint variants; decodes base64 when needed."""
    endpoints = (
        f"{GITEA_API}/repos/{owner}/{repo}/readme",
        f"{GITEA_API}/repos/{owner}/{repo}/contents/README.md",
        f"{GITEA_API}/repos/{owner}/{repo}/contents/README",
        f"{GITEA_API}/repos/{owner}/{repo}/contents/readme.md",
    )
    for url in endpoints:
        try:
            resp = requests.get(url, params={"ref": ref}, headers=_gitea_headers(), timeout=10)
            if resp.status_code == 404:
                continue
            resp.raise_for_status()
            payload = resp.json()
            # Inline content is usually base64-encoded.
            if isinstance(payload, dict) and "content" in payload:
                try:
                    return base64.b64decode(payload["content"]).decode("utf-8", errors="ignore")
                except Exception:
                    pass
            # Some Gitea versions expose a 'download_url' instead.
            download = payload.get("download_url") if isinstance(payload, dict) else None
            if download:
                raw = requests.get(download, timeout=10, headers=_gitea_headers())
                raw.raise_for_status()
                return raw.text
        except Exception:
            continue
    return ""
|
|||
|
|
|
|||
|
|
def gitea_repo_exists(owner: str, name: str) -> bool:
    """Check via the Gitea API whether owner/name exists (and the token can see it)."""
    try:
        resp = requests.get(f"{GITEA_API}/repos/{owner}/{name}",
                            headers=_gitea_headers(), timeout=5)
    except Exception:
        return False
    return resp.status_code == 200
|
|||
|
|
|
|||
|
|
def owner_repo_from_url(url: str) -> tuple[str|None, str|None]:
    """
    Extract owner/repo from an http(s) .git URL.

    Example: http://host:3080/owner/repo.git -> ('owner', 'repo')
    Returns (None, None) when the path has fewer than two segments or parsing fails.
    """
    try:
        # NOTE: removed the redundant function-local `from urllib.parse import
        # urlparse` — urlparse is already imported at module level.
        parsed = urlparse(url)
        segments = [seg for seg in (parsed.path or "").split("/") if seg]
        if len(segments) >= 2:
            owner, repo = segments[-2], segments[-1]
            if repo.endswith(".git"):
                repo = repo[:-len(".git")]
            return owner, repo
    except Exception:
        pass
    return None, None
|
|||
|
|
|
|||
|
|
|
|||
|
|
# === Repo-catalogus indexeren in Meili (optioneel) en Chroma ===
|
|||
|
|
def meili_get_index(name: str):
    """Get (or lazily create) a Meilisearch index; None when Meili is unavailable."""
    client = get_meili()
    if not client:
        return None
    try:
        return client.index(name)
    except Exception:
        pass
    # index() failed → try creating it; give up quietly on failure.
    try:
        return client.create_index(uid=name, options={"primaryKey": "id"})
    except Exception:
        return None
|
|||
|
|
|
|||
|
|
def meili_catalog_upsert(docs: List[dict]):
    """Upsert repo-catalog documents into Meilisearch (best effort)."""
    idx = meili_get_index(REPO_CATALOG_MEILI_INDEX)
    if not idx or not docs:
        return
    try:
        idx.add_documents(docs)
        try:
            idx.update_searchable_attributes(["full_name", "name", "description", "readme", "topics", "language"])
            idx.update_filterable_attributes(["full_name", "owner", "language", "topics"])
        except Exception:
            pass  # attribute tuning is optional
    except Exception as e:
        logger.warning("WARN:agent_repo:meili_catalog_upsert: %s", e)
|
|||
|
|
|
|||
|
|
def meili_catalog_search(q: str, limit: int = 10) -> List[dict]:
    """Search the repo catalog in Meilisearch; [] when unavailable or on error."""
    idx = meili_get_index(REPO_CATALOG_MEILI_INDEX)
    if not idx:
        return []
    try:
        result = idx.search(q, {"limit": limit})
        return result.get("hits", [])
    except Exception as e:
        logger.warning("WARN:agent_repo:meili_catalog_search: %s", e)
        return []
|
|||
|
|
|
|||
|
|
def chroma_catalog_upsert(docs: List[dict]):
    """Index/upsert the repo catalog into Chroma using the INJECTED embedding_function (in HTTP mode, embeddings are sent client-side)."""
    try:
        if not docs or _get_chroma_collection is None:
            return
        # app.py suffixes the collection name with __<slug>__v<ver>.
        col = _get_chroma_collection("repo_catalog")
        ids = [entry["id"] for entry in docs]
        texts = [entry["doc"] for entry in docs]
        metas = [entry["meta"] for entry in docs]
        # Best-effort cleanup of stale entries before re-adding.
        try:
            col.delete(ids=ids)
        except Exception:
            pass
        if _embed_documents:
            col.add(ids=ids, documents=texts, embeddings=_embed_documents(texts), metadatas=metas)
        else:
            col.add(ids=ids, documents=texts, metadatas=metas)
    except Exception as e:
        logger.warning("WARN:agent_repo:chroma_catalog_upsert: %s", e)
|
|||
|
|
|
|||
|
|
def chroma_catalog_search(q: str, n: int = 8) -> List[dict]:
    """Vector-search the repo catalog in Chroma; returns [{full_name, score, preview}, ...]."""
    try:
        if _get_chroma_collection is None or _embed_query_fn is None:
            return []
        col = _get_chroma_collection("repo_catalog")
        res = col.query(query_embeddings=[_embed_query_fn(q)], n_results=n,
                        include=["documents", "metadatas", "distances"])
        docs = (res.get("documents") or [[]])[0]
        metas = (res.get("metadatas") or [[]])[0]
        dists = (res.get("distances") or [[]])[0]
        results: List[dict] = []
        for doc, meta, dist in zip(docs, metas, dists):
            if not isinstance(meta, dict):
                continue
            similarity = 1.0 / (1.0 + float(dist or 0.0))  # simple distance → similarity
            results.append({"full_name": meta.get("full_name"), "score": float(similarity), "preview": doc})
        return results
    except Exception as e:
        logger.warning("WARN:agent_repo:chroma_catalog_search: %s", e)
        return []
|
|||
|
|
|
|||
|
|
|
|||
|
|
# === Documenten maken voor catalogus ===
|
|||
|
|
def build_repo_catalog_doc(meta: dict, readme: str) -> dict:
    """Build one catalog document (id/doc/meta) from repo metadata plus a README excerpt."""
    full_name = meta.get("full_name", "")
    name = meta.get("name", "")
    desc = meta.get("description", "")
    lang = meta.get("language", "")
    topics = " ".join(meta.get("topics") or [])
    preview = (readme or "")[:2000]
    body = "\n".join([
        full_name,
        name,
        desc,
        f"language: {lang}",
        f"topics: {topics}",
        f"README:\n{preview}",
    ])
    return {
        "id": f"repo:{full_name}",
        "doc": body,
        "meta": {
            "full_name": full_name,
            "name": name,
            "description": desc,
            "language": lang,
            "topics": topics,
        },
    }
|
|||
|
|
|
|||
|
|
# === Heuristische (lexicale) score als fallback ===
|
|||
|
|
def lexical_repo_score(q: str, meta: dict, readme: str) -> float:
    """Heuristic lexical relevance of a repo for query `q` (fallback scorer).

    Counts query-token occurrences in full_name/name/description/topics and the
    first 4000 chars of the README, plus a small bonus (+2) for each token that
    also appears in the repo name (e.g. 'mainten', 'admin', 'viewer').
    """
    # Build the token set once (the old code computed set(qtokens) twice).
    unique_tokens = set(re.findall(r"[A-Za-z0-9_]{2,}", q.lower()))
    text = " ".join([
        meta.get("full_name", ""),
        meta.get("name", ""),
        meta.get("description", ""),
        " ".join(meta.get("topics") or []),
        (readme or "")[:4000],
    ]).lower()
    if not unique_tokens or not text:
        return 0.0
    score = sum(text.count(tok) for tok in unique_tokens)
    name = (meta.get("name") or "").lower()
    score += 2 * sum(1 for tok in unique_tokens if tok in name)
    return float(score)
|
|||
|
|
|
|||
|
|
# === LLM-rerank voor repo's (hergebruik van je bestaande reranker) ===
|
|||
|
|
async def llm_rerank_repos(user_goal: str, candidates: List[dict], topk: int = 5) -> List[dict]:
    """Rerank candidate repos with the LLM; falls back to the input order on any failure."""
    if not candidates:
        return []
    numbered = []
    for idx, cand in enumerate(candidates[:12], 1):
        preview = cand.get("preview", "")[:700]
        numbered.append(f"{idx}. REPO: {cand['full_name']}\nDESC: {cand.get('description','')}\nPREVIEW:\n{preview}")
    prompt = (
        "Rangschik onderstaande repositories op geschiktheid voor het doel. "
        "Geef een geldige JSON-array met objecten: {\"full_name\":\"...\",\"score\":0-100}.\n\n"
        "DOEL:\n" + user_goal + "\n\nCANDIDATES:\n" + "\n\n".join(numbered)
    )
    try:
        resp = await _llm_call(
            [{"role": "system", "content": "Alleen geldige JSON."},
             {"role": "user", "content": prompt}],
            stream=False, temperature=0.0, top_p=0.9, max_tokens=600
        )
        raw = resp.get("choices", [{}])[0].get("message", {}).get("content", "")
        parsed = safe_json_loads(raw)
        if not isinstance(parsed, list):
            return candidates[:topk]
        scores = {}
        for entry in parsed:
            if not isinstance(entry, dict):
                continue
            full_name = entry.get("full_name")
            try:
                if isinstance(full_name, str):
                    scores[full_name] = float(entry.get("score"))
            except Exception:
                continue
        # LLM scores are 0-100; normalize to 0..1, defaulting missing repos to 0.
        ranked = [{**cand, "score": scores.get(cand["full_name"], 0.0) / 100.0} for cand in candidates]
        ranked.sort(key=lambda c: c.get("score", 0.0), reverse=True)
        return ranked[:topk]
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_rerank_repos failed: %s", e)
        return candidates[:topk]
|
|||
|
|
|
|||
|
|
# --- Intent/goal refine ---
|
|||
|
|
async def llm_refine_goal(raw_goal: str) -> tuple[str, List[str], float]:
    """
    Have the LLM produce a compact, concrete 'refined_goal' plus at most two
    clarifying questions. Returns (refined_goal, clarifying_questions,
    confidence in 0..1); falls back to the raw goal on any failure.
    """
    system_msg = "Geef uitsluitend geldige JSON; geen uitleg."
    user_msg = (
        "Vat de bedoeling van deze opdracht ultra-kort en concreet samen als 'refined_goal'. "
        "Als er kritieke onduidelijkheden zijn: geef max 2 korte 'clarifying_questions'. "
        "Geef ook 'confidence' (0..1). JSON:\n"
        "{ \"refined_goal\": \"...\", \"clarifying_questions\": [\"...\"], \"confidence\": 0.0 }\n\n"
        f"RAW_GOAL:\n{raw_goal}"
    )
    try:
        resp = await _llm_call(
            [{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
            stream=False, temperature=0.0, top_p=0.9, max_tokens=300
        )
        raw = resp.get("choices", [{}])[0].get("message", {}).get("content", "")
        parsed = safe_json_loads(raw) or {}
        refined = (parsed.get("refined_goal") or "").strip() or raw_goal
        questions = [q.strip() for q in (parsed.get("clarifying_questions") or [])
                     if isinstance(q, str) and q.strip()][:2]
        confidence = min(1.0, max(0.0, float(parsed.get("confidence", 0.0) or 0.0)))
        return refined, questions, confidence
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_refine_goal failed: %s", e)
        return raw_goal, [], 0.0
|
|||
|
|
|
|||
|
|
|
|||
|
|
# === Discovery pipeline ===
|
|||
|
|
async def discover_candidate_repos(user_goal: str) -> List[dict]:
    """Find a suitable repo purely from the question (no repo hint given)."""
    repos = await run_io_blocking(gitea_list_all_repos, limit=AGENT_DISCOVER_MAX_REPOS)
    if not repos:
        return []

    # Concurrent README fetches (parallelism lightly capped for stability).
    sem = asyncio.Semaphore(int(os.getenv("AGENT_DISCOVER_README_CONCURRENCY", "8")))

    async def _fetch_readme(meta):
        async with sem:
            return await run_io_blocking(
                gitea_fetch_readme,
                meta.get("owner", ""), meta.get("name", ""), meta.get("default_branch", "main")
            )

    readmes = await asyncio.gather(*[_fetch_readme(meta) for meta in repos], return_exceptions=True)

    # Collect (short) READMEs and build catalog documents.
    docs_meili: List[dict] = []
    docs_chroma: List[dict] = []
    cands: List[dict] = []
    for i, meta in enumerate(repos):
        readme = "" if isinstance(readmes[i], Exception) else (readmes[i] or "")
        docs_chroma.append(build_repo_catalog_doc(meta, readme))
        docs_meili.append({
            "id": meta["full_name"],
            "full_name": meta["full_name"],
            "name": meta.get("name", ""),
            "owner": meta.get("owner", ""),
            "description": meta.get("description", ""),
            "language": meta.get("language", ""),
            "topics": " ".join(meta.get("topics") or []),
            "readme": (readme or "")[:5000],
        })
        cands.append({
            "full_name": meta["full_name"],
            "description": meta.get("description", ""),
            "clone_url": meta.get("clone_url"),
            "preview": (readme or "")[:1200],
            "base_score": 0.0,  # filled in below
        })

    # Index the catalog (best effort).
    if MEILI_URL:
        meili_catalog_upsert(docs_meili)
    chroma_catalog_upsert(docs_chroma)

    # Multi-query expansion.
    queries = await llm_expand_queries(user_goal, extract_quotes(user_goal), extract_word_hints(user_goal), k=5)

    # Heuristic scoring plus Meili/Chroma boosts per expanded query.
    score_map: Dict[str, float] = {c["full_name"]: 0.0 for c in cands}
    for q in queries:
        # lexical score
        for i, meta in enumerate(repos):
            readme_txt = docs_meili[i].get("readme") if i < len(docs_meili) else ""
            score_map[meta["full_name"]] += 0.2 * lexical_repo_score(q, meta, readme_txt)

        # Meili boost
        if MEILI_URL:
            for hit in meili_catalog_search(q, limit=10):
                fn = hit.get("full_name")
                if fn in score_map:
                    score_map[fn] += 2.0

        # Chroma boost
        for hit in chroma_catalog_search(q, n=6):
            fn = hit.get("full_name")
            if fn in score_map:
                score_map[fn] += 1.2

    # Merge scores into the candidates.
    for c in cands:
        c["score"] = score_map.get(c["full_name"], 0.0)

    # Quick preselection, then LLM rerank with explanation score.
    cands.sort(key=lambda c: c["score"], reverse=True)
    shortlist = cands[:8]
    return await llm_rerank_repos(user_goal, shortlist, topk=5)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- Chroma collection naam ----------
|
|||
|
|
def sanitize_collection_name(s: str) -> str:
    """Normalize to a safe collection name: only [A-Za-z0-9._-], trimmed to 128 chars."""
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", s).strip("-")[:128]
    if cleaned:
        return cleaned
    return "code_docs"
|
|||
|
|
|
|||
|
|
def repo_collection_name(owner_repo: str | None, branch: str) -> str:
    """Chroma collection name for a repo+branch (sanitized)."""
    label = owner_repo or "repo"
    return sanitize_collection_name(f"code_docs-{label}-{branch}")
|
|||
|
|
|
|||
|
|
def _get_session_id(messages: List[dict], request) -> str:
|
|||
|
|
for m in messages:
|
|||
|
|
if m.get("role") == "system" and str(m.get("content","")).startswith("session:"):
|
|||
|
|
return str(m["content"]).split("session:",1)[1].strip()
|
|||
|
|
key = (messages[0].get("content","") + "|" + _client_ip(request)).encode("utf-8", errors="ignore")
|
|||
|
|
return hashlib.sha256(key).hexdigest()[:16]
|
|||
|
|
|
|||
|
|
# ---------- Files & filters ----------
|
|||
|
|
def allowed_file(p: Path) -> bool:
    """True when the filename ends in one of the indexable extensions (ALLOWED_EXTS)."""
    name = p.name.lower()
    for ext in ALLOWED_EXTS:
        if name.endswith(ext):
            return True
    return False
|
|||
|
|
|
|||
|
|
def list_repo_files(repo_root: Path) -> List[str]:
    """List indexable files under repo_root as repo-relative paths.

    Results go through a light TTL cache to limit repeated rglob/IO work
    (faster with multi-query flows).
    """
    ttl = float(os.getenv("AGENT_LIST_CACHE_TTL", "20"))
    cache_key = str(repo_root.resolve())
    now = time.time()
    cached_entry = _LIST_FILES_CACHE.get(cache_key)
    if cached_entry is not None:
        stamp, cached_files = cached_entry
        if now - stamp <= ttl:
            return list(cached_files)

    files: List[str] = []
    for path in repo_root.rglob("*"):
        if path.is_dir():
            continue
        if any(part in _PROFILE_EXCLUDE_DIRS for part in path.parts):
            continue
        try:
            if path.stat().st_size > 2_000_000:  # skip very large files
                continue
        except Exception:
            continue
        if not allowed_file(path):
            continue
        files.append(str(path.relative_to(repo_root)))
    _LIST_FILES_CACHE[cache_key] = (now, files)
    return files
|
|||
|
|
|
|||
|
|
# ---------- Query parsing ----------
|
|||
|
|
def extract_quotes(text: str) -> List[str]:
    """Pull quoted fragments (2+ chars) out of text; curly quotes are normalized first."""
    if not text:
        return []
    normalized = (text or "").replace("“", "\"").replace("”", "\"").replace("’", "'").strip()
    return re.findall(r"['\"]([^'\"]{2,})['\"]", normalized)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_word_hints(text: str) -> List[str]:
    """Identifier-like words from text, minus a small NL/EN stopword list (order unspecified)."""
    if not text:
        return []
    stopwords = {"de", "het", "een", "and", "the", "voor", "naar", "op", "in", "of", "to",
                 "is", "are", "van", "met", "die", "dat"}
    candidates = set(re.findall(r"[A-Za-z_][A-Za-z0-9_]{1,}", text))
    return [word for word in candidates if word.lower() not in stopwords]
|
|||
|
|
|
|||
|
|
# ---------- SAFE JSON loader ----------
|
|||
|
|
def safe_json_loads(s: str):
    """json.loads that tolerates ```json fences and returns None instead of raising."""
    if not s:
        return None
    body = s.strip()
    if body.startswith("```"):
        body = re.sub(r"^```(?:json)?", "", body, count=1).strip()
        if body.endswith("```"):
            body = body[:-3].strip()
    try:
        return json.loads(body)
    except Exception:
        return None
|
|||
|
|
|
|||
|
|
# ---------- Meilisearch (optioneel) ----------
|
|||
|
|
_meili_client = None
|
|||
|
|
def get_meili():
    """Lazily create and memoize the Meilisearch client; None when disabled or unavailable."""
    global _meili_client
    if _meili_client is not None:
        return _meili_client
    if not MEILI_URL:
        return None
    try:
        from meilisearch import Client
        _meili_client = Client(MEILI_URL, MEILI_KEY or None)
    except Exception as e:
        logger.warning("WARN:agent_repo:Meilisearch not available: %s", e)
        return None
    return _meili_client
|
|||
|
|
|
|||
|
|
def meili_index_name(owner_repo: Optional[str], branch: str) -> str:
    """Meilisearch index name for a repo+branch, prefixed and sanitized."""
    repo_part = sanitize_collection_name((owner_repo or "repo") + "-" + branch)
    return sanitize_collection_name(f"{MEILI_INDEX_PREFIX}-{repo_part}")
|
|||
|
|
|
|||
|
|
# --- Slimmere, taalbewuste chunker ---
|
|||
|
|
|
|||
|
|
_LANG_BY_EXT = {
|
|||
|
|
".php": "php", ".blade.php": "blade", ".js": "js", ".ts": "ts",
|
|||
|
|
".jsx": "js", ".tsx": "ts", ".py": "py", ".go": "go",
|
|||
|
|
".rb": "rb", ".java": "java", ".cs": "cs",
|
|||
|
|
".css": "css", ".scss": "css",
|
|||
|
|
".html": "html", ".htm": "html", ".md": "md",
|
|||
|
|
".yml": "yaml", ".yaml": "yaml", ".toml": "toml", ".ini": "ini",
|
|||
|
|
".json": "json",
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
def _detect_lang_from_path(path: str) -> str:
    """Detect a language id from a file path's extension ("txt" when unknown).

    Extensions are tried longest-first so compound suffixes win: the previous
    insertion-order scan classified "x.blade.php" as "php" because the path
    also ends with ".php".
    """
    lo = path.lower()
    for ext in sorted(_LANG_BY_EXT, key=len, reverse=True):
        if lo.endswith(ext):
            return _LANG_BY_EXT[ext]
    return "txt"
|
|||
|
|
|
|||
|
|
def _find_breakpoints(text: str, lang: str) -> list[int]:
|
|||
|
|
"""
|
|||
|
|
Retourneer lijst met 'mooie' breekposities (char indices) om chunks te knippen.
|
|||
|
|
We houden het conservatief; false-positives zijn OK (we kiezen toch dichtstbij).
|
|||
|
|
"""
|
|||
|
|
bps = set()
|
|||
|
|
# Altijd: lege-regelblokken en paragrafen
|
|||
|
|
for m in re.finditer(r"\n\s*\n\s*", text):
|
|||
|
|
bps.add(m.end())
|
|||
|
|
|
|||
|
|
if lang in ("php", "js", "ts", "java", "cs", "go", "rb", "py"):
|
|||
|
|
# Functie/klasse boundaries
|
|||
|
|
pats = [
|
|||
|
|
r"\n\s*(class|interface|trait)\s+[A-Za-z_][A-Za-z0-9_]*\b",
|
|||
|
|
r"\n\s*(public|private|protected|static|\s)*\s*function\b",
|
|||
|
|
r"\n\s*def\s+[A-Za-z_][A-Za-z0-9_]*\s*\(", # py
|
|||
|
|
r"\n\s*func\s+[A-Za-z_][A-Za-z0-9_]*\s*\(", # go
|
|||
|
|
r"\n\s*[A-Za-z0-9_<>\[\]]+\s+[A-Za-z_][A-Za-z0-9_]*\s*\(", # java/cs method-ish
|
|||
|
|
r"\n\}", # sluitende brace op kolom 0 → goed eind
|
|||
|
|
]
|
|||
|
|
for p in pats:
|
|||
|
|
for m in re.finditer(p, text):
|
|||
|
|
bps.add(m.start())
|
|||
|
|
|
|||
|
|
if lang == "blade":
|
|||
|
|
for p in [r"\n\s*@section\b", r"\n\s*@endsection\b", r"\n\s*@if\b", r"\n\s*@endif\b", r"\n\s*<\w"]:
|
|||
|
|
for m in re.finditer(p, text, flags=re.I):
|
|||
|
|
bps.add(m.start())
|
|||
|
|
|
|||
|
|
if lang in ("html", "css"):
|
|||
|
|
for p in [r"\n\s*<\w", r"\n\s*</\w", r"\n\s*}\s*\n"]:
|
|||
|
|
for m in re.finditer(p, text):
|
|||
|
|
bps.add(m.start())
|
|||
|
|
|
|||
|
|
if lang in ("md",):
|
|||
|
|
for p in [r"\n#+\s", r"\n\-{3,}\n", r"\n\*\s", r"\n\d+\.\s"]:
|
|||
|
|
for m in re.finditer(p, text):
|
|||
|
|
bps.add(m.start())
|
|||
|
|
|
|||
|
|
if lang in ("yaml", "toml", "ini"):
|
|||
|
|
# secties/keys aan kolom 0
|
|||
|
|
for m in re.finditer(r"\n[A-Za-z0-9_\-]+\s*[:=]", text):
|
|||
|
|
bps.add(m.start())
|
|||
|
|
|
|||
|
|
# JSON: split op object/array boundaries (conservatief: op { of [ aan kolom 0-ish)
|
|||
|
|
if lang == "json":
|
|||
|
|
for m in re.finditer(r"\n\s*[\{\[]\s*\n", text):
|
|||
|
|
bps.add(m.start())
|
|||
|
|
|
|||
|
|
# Altijd: regelgrenzen
|
|||
|
|
for m in re.finditer(r"\n", text):
|
|||
|
|
bps.add(m.start()+1)
|
|||
|
|
|
|||
|
|
# sorteer & filter binnen range
|
|||
|
|
out = sorted([bp for bp in bps if 0 < bp < len(text)])
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
def smart_chunk_text(text: str, path_hint: str, target_chars: int = 1800,
                     hard_max: int = 2600, min_chunk: int = 800) -> List[str]:
    """
    Chunk at ~target_chars, breaking at the nearest semantic breakpoint.

    - Without a usable breakpoint: break near the ideal position instead.
    - Adaptive overlap: 200 chars after a clean break, 350 after a rough one.
    """
    if not text:
        return []
    lang = _detect_lang_from_path(path_hint or "")
    breakpoints = _find_breakpoints(text, lang)
    n = len(text)

    if not breakpoints:
        # Fallback: fixed strides with overlap.
        pieces: List[str] = []
        stride = max(min_chunk, target_chars - 300)
        pos = 0
        while pos < n:
            pieces.append(text[pos:min(n, pos + target_chars)])
            pos = min(n, pos + stride)
        return pieces

    pieces: List[str] = []
    pos = 0
    while pos < n:
        # Aim for pos+target_chars, but prefer a 'nice' breakpoint inside
        # [pos+min_chunk, pos+hard_max].
        ideal = pos + target_chars
        window_lo = pos + min_chunk
        window_hi = min(n, pos + hard_max)
        in_window = [bp for bp in breakpoints if window_lo <= bp <= window_hi]
        if in_window:
            # Clean break: pick the candidate closest to the ideal cut.
            cut = min(in_window, key=lambda bp: abs(bp - ideal))
            pieces.append(text[pos:cut])
            pos = cut - 200 if cut - 200 > pos else cut  # small overlap
        else:
            # No nice candidate; cut roughly at the ideal (or the end).
            cut = min(n, ideal)
            pieces.append(text[pos:cut])
            pos = cut - 350 if cut - 350 > pos else cut  # larger overlap

    # Drop empty / whitespace-only tails.
    return [piece for piece in pieces if piece and piece.strip()]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def meili_index_repo(repo_root: Path, owner_repo: Optional[str], branch: str):
    """Chunk every listed repo file and index the chunks into Meilisearch.

    Also collects the same chunks to build an in-memory BM25 index as a
    local fallback for when Meilisearch is unavailable at query time.
    Best-effort: returns silently when no Meili client is configured, and
    never raises on BM25-build failure (warning only).
    """
    cli = get_meili()
    if not cli: return
    idx_name = meili_index_name(owner_repo, branch)
    try:
        idx = cli.index(idx_name)
    except Exception:
        # Index handle failed: create the index with "id" as primary key.
        idx = cli.create_index(uid=idx_name, options={"primaryKey":"id"})
    docs = []
    bm25_docs = []  # collected separately for the BM25 fallback index
    count = 0
    for rel in list_repo_files(repo_root):
        p = repo_root / rel
        try:
            txt = _read_text_file(p) or ""
        except Exception:
            continue  # unreadable/binary file: skip
        for ci, chunk in enumerate(smart_chunk_text(txt, rel, target_chars=int(os.getenv("CHUNK_TARGET_CHARS","1800")),hard_max=int(os.getenv("CHUNK_HARD_MAX","2600")),min_chunk=int(os.getenv("CHUNK_MIN_CHARS","800")))):
            doc_id = f"{owner_repo}:{branch}:{rel}:{ci}"
            item = {"id": doc_id, "path": rel, "repo": owner_repo, "branch": branch, "content": chunk}
            docs.append(item)
            bm25_docs.append(item)  # also kept for the BM25 cache below
            count += 1
            if len(docs) >= 1000:
                # Flush in batches of 1000 to bound memory / request size.
                idx.add_documents(docs); docs.clear()
    if docs:
        idx.add_documents(docs)
    try:
        idx.update_searchable_attributes(["content","path","repo","branch"])
        idx.update_filterable_attributes(["repo","branch","path"])
    except Exception:
        pass  # best-effort: attribute config may not be supported/available
    logger.info("INFO:agent_repo:meili indexed ~%d chunks into %s", count, idx_name)

    # Build the local BM25 cache from bm25_docs (NOT from docs, which may
    # have been cleared by the batch flushing above).
    try:
        if BM25Okapi and bm25_docs:
            toks = [re.findall(r"[A-Za-z0-9_]+", d["content"].lower()) for d in bm25_docs]
            bm = BM25Okapi(toks) if toks else None
            if bm:
                _BM25_CACHE[idx_name] = {"bm25": bm, "docs": bm25_docs}
    except Exception as e:
        logger.warning("WARN:agent_repo:bm25 build failed: %s", e)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def meili_search(owner_repo: Optional[str], branch: str, q: str, limit: int = 10) -> List[dict]:
    """Query the Meilisearch index for this repo/branch.

    Returns the raw hit dicts, or [] when no client is configured or the
    search fails (failures are logged as warnings, never raised).
    """
    client = get_meili()
    if not client:
        return []
    try:
        index = client.index(meili_index_name(owner_repo, branch))
        return index.search(q, {"limit": limit}).get("hits", [])
    except Exception as e:
        logger.warning("WARN:agent_repo:meili_search failed: %s", e)
        return []
|
|||
|
|
|
|||
|
|
# ---------- BM25 fallback ----------
# Per-index cache: {index_name: {"bm25": BM25Okapi, "docs": [chunk dicts]}}.
_BM25_CACHE: Dict[str, dict] = {}
|
|||
|
|
|
|||
|
|
# module-scope
# NOTE(review): not referenced anywhere in this part of the file —
# presumably a leftover or used elsewhere; verify before removing.
_BM25_BY_REPO: dict[str, tuple[BM25Okapi, list[dict]]] = {}
|
|||
|
|
def _tok(s: str) -> list[str]:
|
|||
|
|
return re.findall(r"[A-Za-z0-9_]+", s.lower())
|
|||
|
|
|
|||
|
|
# --- Lightweight symbol index (in-memory, per repo collection) ---
# Structure: { collection_name: { symbol_lower: { path: count } } }
_SYMBOL_INDEX: dict[str, dict[str, dict[str, int]]] = {}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def bm25_index_name(owner_repo: Optional[str], branch: str) -> str:
    """Key for the BM25 cache: same name as the Meili index, separate cache."""
    return meili_index_name(owner_repo, branch)  # same name, different cache
|
|||
|
|
|
|||
|
|
def bm25_build_index(repo_root: Path, owner_repo: Optional[str], branch: str):
    """Build the in-memory BM25 index over chunked repo files.

    No-op when rank_bm25 is unavailable or the repo yields no chunks.
    The result is stored in _BM25_CACHE under bm25_index_name(...).
    """
    if not BM25Okapi:
        return  # rank_bm25 not installed: silently skip
    cache_key = bm25_index_name(owner_repo, branch)
    target = int(os.getenv("CHUNK_TARGET_CHARS", "1800"))
    hard = int(os.getenv("CHUNK_HARD_MAX", "2600"))
    minimum = int(os.getenv("CHUNK_MIN_CHARS", "800"))

    chunk_docs: list[dict] = []
    for rel in list_repo_files(repo_root):
        try:
            content = _read_text_file(repo_root / rel) or ""
        except Exception:
            continue  # unreadable file: skip
        for ci, piece in enumerate(smart_chunk_text(content, rel,
                                                    target_chars=target,
                                                    hard_max=hard,
                                                    min_chunk=minimum)):
            chunk_docs.append({
                "id": f"{owner_repo}:{branch}:{rel}:{ci}",
                "path": rel,
                "repo": owner_repo,
                "branch": branch,
                "content": piece,
            })

    tokenized = [re.findall(r"[A-Za-z0-9_]+", d["content"].lower()) for d in chunk_docs]
    if tokenized:
        _BM25_CACHE[cache_key] = {"bm25": BM25Okapi(tokenized), "docs": chunk_docs}
|
|||
|
|
|
|||
|
|
def bm25_search(owner_repo: Optional[str], branch: str, q: str, limit: int = 10) -> List[dict]:
    """Rank cached chunks for this repo/branch with BM25.

    Returns the top *limit* chunk dicts, or [] when there is no cached
    index, no BM25 model, or the query yields no tokens.
    """
    cached = _BM25_CACHE.get(bm25_index_name(owner_repo, branch))
    if not cached:
        return []
    model = cached.get("bm25")
    chunk_docs = cached.get("docs") or []
    if not model:
        return []
    query_tokens = re.findall(r"[A-Za-z0-9_]+", (q or "").lower())
    if not query_tokens:
        return []
    scores = model.get_scores(query_tokens)
    best = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:limit]
    return [chunk_docs[i] for i in best]
|
|||
|
|
|
|||
|
|
def _extract_symbols_generic(path: str, text: str) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
Ultra-simpele symbol scraper (taal-agnostisch):
|
|||
|
|
- class/interface/trait namen
|
|||
|
|
- function foo(...), Foo::bar, "Controller@method"
|
|||
|
|
- Laravel: ->name('route.name')
|
|||
|
|
- React-ish: function Foo(...) { return ( ... ) }, export default function Foo(...)
|
|||
|
|
- Blade-ish: @section('...'), @component('...'), <x-foo-bar>
|
|||
|
|
- Basename van file als pseudo-symbool
|
|||
|
|
"""
|
|||
|
|
if not text:
|
|||
|
|
return []
|
|||
|
|
syms = set()
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"\b(class|interface|trait)\s+([A-Za-z_][A-Za-z0-9_\\]*)", text):
|
|||
|
|
syms.add(m.group(2))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"\bfunction\s+([A-Za-z_][A-Za-z0-9_]*)\s*\(", text):
|
|||
|
|
syms.add(m.group(1))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"([A-Za-z_][A-Za-z0-9_\\]*)::([A-Za-z_][A-Za-z0-9_]*)", text):
|
|||
|
|
syms.add(m.group(1) + "::" + m.group(2))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"['\"]([A-Za-z0-9_\\]+)@([A-Za-z0-9_]+)['\"]", text):
|
|||
|
|
syms.add(m.group(1) + "@" + m.group(2))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"->\s*name\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", text):
|
|||
|
|
syms.add(m.group(1))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"\bfunction\s+([A-Z][A-Za-z0-9_]*)\s*\(", text):
|
|||
|
|
syms.add(m.group(1))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"export\s+default\s+function\s+([A-Za-z_][A-Za-z0-9_]*)\s*\(", text):
|
|||
|
|
syms.add(m.group(1))
|
|||
|
|
|
|||
|
|
for m in re.finditer(r"@\s*(section|component|slot)\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", text):
|
|||
|
|
syms.add(m.group(2))
|
|||
|
|
for m in re.finditer(r"<\s*x-([a-z0-9\-:]+)", text, flags=re.IGNORECASE):
|
|||
|
|
syms.add("x-" + m.group(1).lower())
|
|||
|
|
|
|||
|
|
base = os.path.basename(path)
|
|||
|
|
if base:
|
|||
|
|
syms.add(base)
|
|||
|
|
|
|||
|
|
return list(syms)
|
|||
|
|
|
|||
|
|
def _symbol_index_name(owner_repo: Optional[str], branch: str) -> str:
    """Key for the symbol index: reuse the repo's collection name."""
    return repo_collection_name(owner_repo, branch)
|
|||
|
|
|
|||
|
|
def symbol_index_repo(repo_root: Path, owner_repo: Optional[str], branch: str):
    """Best-effort: build/refresh the in-memory symbol index for this repo/branch.

    Scans every listed repo file (skipping files larger than 500 KB),
    extracts symbols with _extract_symbols_generic, and stores per-symbol
    path counts in _SYMBOL_INDEX under the repo's collection name.
    Never raises; any failure is logged as a warning.
    """
    try:
        coll = _symbol_index_name(owner_repo, branch)
        store: dict[str, dict[str, int]] = {}
        for rel in list_repo_files(repo_root):
            p = repo_root / rel
            try:
                if p.stat().st_size > 500_000:
                    continue  # skip very large files to bound scan time
                txt = _read_text_file(p) or ""
            except Exception:
                continue  # unreadable file: skip
            for s in _extract_symbols_generic(rel, txt):
                k = s.strip().lower()
                if not k:
                    continue
                # Count occurrences of this symbol per file path.
                bucket = store.setdefault(k, {})
                bucket[rel] = bucket.get(rel, 0) + 1
        _SYMBOL_INDEX[coll] = store
    except Exception as e:
        logger.warning("WARN:agent_repo:symbol_index_repo: %s", e)
|
|||
|
|
|
|||
|
|
def symbol_search(owner_repo: Optional[str], branch: str, q: str, limit: int = 10) -> list[tuple[str, int]]:
    """Simple symbol lookup -> [(path, score)], best first.

    Tokens are taken from quoted phrases and word-like runs in *q*
    (deduplicated, first 12 used).  Exact symbol matches score 3 per
    occurrence; soft substring matches add 1 per path.  Note that an exact
    match also collects the substring bonus, since "t in sym" holds when
    t == sym.
    """
    coll = _symbol_index_name(owner_repo, branch)
    idx = _SYMBOL_INDEX.get(coll) or {}
    if not idx or not q:
        return []
    quoted = re.findall(r"['\"]([^'\"]{2,})['\"]", q)
    words = re.findall(r"[A-Za-z0-9_:\\.\-]{2,}", q)
    seen = set(); tokens = []
    for t in quoted + words:
        tl = t.lower()
        if tl not in seen:
            seen.add(tl); tokens.append(tl)

    scores: dict[str, int] = {}
    # exact match
    for t in tokens[:12]:
        if t in idx:
            for path, c in idx[t].items():
                scores[path] = scores.get(path, 0) + 3 * c
        # soft substring match
        for sym, paths in idx.items():
            if t in sym:
                for path, c in paths.items():
                    scores[path] = scores.get(path, 0) + 1

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- Signal-first scan ----------
|
|||
|
|
def glob_match(rel: str, patterns: List[str]) -> bool:
    """Return True when *rel* matches any shell-style glob in *patterns*."""
    return any(fnmatch.fnmatch(rel, pattern) for pattern in (patterns or []))
|
|||
|
|
|
|||
|
|
def scan_with_signals(repo_root: Path, files: List[str], sig: dict, phrase_boosts: List[str], hint_boosts: List[str], limit: int = 20) -> List[Tuple[str,int,dict]]:
    """Score files against an (LLM-produced) signal spec.

    *sig* keys: file_globs, must_substrings, maybe_substrings, regexes,
    path_hints, exclude_dirs.  Scoring: +1 per path-hint hit, +3 per
    must-substring hit, +1 per maybe hit, +2 per regex hit, +2 per phrase
    hit.  A file missing any non-empty must-substring is dropped outright.
    Returns [(path, score, hit-counts)] for score > 0, best first, capped
    at *limit*.
    """
    file_globs = sig.get("file_globs") or []
    must = [s.lower() for s in (sig.get("must_substrings") or [])]
    maybe = [s.lower() for s in (sig.get("maybe_substrings") or [])]
    regexes = sig.get("regexes") or []
    path_hints = [s.lower() for s in (sig.get("path_hints") or [])]
    exclude_dirs = set(sig.get("exclude_dirs") or [])

    # Fold caller-provided boosts into the signal lists (capped at 20 each).
    maybe = list(set(maybe + [p.lower() for p in phrase_boosts]))[:20]
    path_hints = list(set(path_hints + [h.lower() for h in hint_boosts]))[:20]

    scored: List[Tuple[str,int,dict]] = []
    for rel in files:
        if any(part in exclude_dirs for part in Path(rel).parts): continue
        if file_globs and not glob_match(rel, file_globs): continue
        score = 0
        meta = {"must_hits":0,"maybe_hits":0,"regex_hits":0,"path_hits":0,"phrase_hits":0}
        rel_lo = rel.lower()
        for h in path_hints:
            if h and h in rel_lo: meta["path_hits"] += 1; score += 1
        try:
            txt = _read_text_file(repo_root / rel) or ""
        except Exception:
            continue  # unreadable file: skip
        txt_lo = txt.lower()
        if any(m and (m not in txt_lo) for m in must):
            continue  # hard filter: every non-empty must-substring is required
        meta["must_hits"] = len([m for m in must if m and m in txt_lo]); score += 3*meta["must_hits"]
        meta["maybe_hits"] = len([m for m in maybe if m and m in txt_lo]); score += meta["maybe_hits"]
        for rp in regexes:
            try:
                if re.search(rp, txt, flags=re.IGNORECASE|re.DOTALL):
                    meta["regex_hits"] += 1; score += 2
            except re.error:
                pass  # ignore invalid (LLM-produced) regexes
        phrase_hits = 0
        for ph in phrase_boosts:
            if ph and ph.lower() in txt_lo:
                phrase_hits += 1
        if phrase_hits:
            meta["phrase_hits"] = phrase_hits
            score += 2*phrase_hits
        if score > 0:
            scored.append((rel, score, meta))
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:limit]
|
|||
|
|
|
|||
|
|
# ---------- Simple keyword fallback ----------
|
|||
|
|
def simple_keyword_search(repo_root: Path, files: List[str], query: str, limit: int = 8) -> List[Tuple[str,int]]:
    """Cheap keyword fallback search over repo paths and contents.

    Scores by path-token hits first; only when a path matches no token is
    the file body read and occurrence counts added.  Returns the top
    *limit* (path, score) pairs with score > 0, best first.
    """
    tokens = set(re.findall(r"[A-Za-z0-9_]{2,}", (query or "").lower()))
    ranked: List[Tuple[str, int]] = []
    for rel in files:
        rel_lo = rel.lower()
        hits = sum(1 for tok in tokens if tok in rel_lo)
        if hits == 0:
            # No path match: fall back to counting occurrences in the body.
            try:
                body_lo = (_read_text_file(Path(repo_root) / rel) or "").lower()
                hits += sum(body_lo.count(tok) for tok in tokens)
            except Exception:
                pass  # unreadable file: leave score at 0
        if hits > 0:
            ranked.append((rel, hits))
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:limit]
|
|||
|
|
|
|||
|
|
# ---------- Expliciete paden ----------
|
|||
|
|
|
|||
|
|
|
|||
|
|
def best_path_by_basename(all_files: List[str], hint: str) -> str | None:
|
|||
|
|
base = os.path.basename(hint)
|
|||
|
|
if not base: return None
|
|||
|
|
hint_tokens = set(re.findall(r"[A-Za-z0-9_]+", hint.lower()))
|
|||
|
|
scored = []
|
|||
|
|
for rel in all_files:
|
|||
|
|
if os.path.basename(rel).lower() == base.lower():
|
|||
|
|
score = 1
|
|||
|
|
lo = rel.lower()
|
|||
|
|
for t in hint_tokens:
|
|||
|
|
if t in lo: score += 1
|
|||
|
|
scored.append((rel, score))
|
|||
|
|
if not scored: return None
|
|||
|
|
scored.sort(key=lambda x: x[1], reverse=True)
|
|||
|
|
return scored[0][0]
|
|||
|
|
|
|||
|
|
# ---------- Hybrid RAG ----------
|
|||
|
|
def _append_ctx_preview(answer: str, chunks: list[dict], limit: int = 12) -> str:
|
|||
|
|
paths = []
|
|||
|
|
for h in chunks:
|
|||
|
|
meta = h.get("metadata") or {}
|
|||
|
|
p = meta.get("path");
|
|||
|
|
if p and p not in paths: paths.append(p)
|
|||
|
|
if not paths: return answer
|
|||
|
|
head = paths[:limit]
|
|||
|
|
return answer + "\n\n--- context (paths) ---\n" + "\n".join(f"- {p}" for p in head)
|
|||
|
|
|
|||
|
|
async def smart_rag_answer(messages: list[dict], *, n_ctx: int = 8,
                           owner_repo: Optional[str] = None,
                           branch: Optional[str] = None,
                           collection_name: Optional[str] = None,
                           add_preview: bool = True) -> str:
    """Full RAG pipeline: enrich intent -> expand queries -> hybrid retrieve
    -> assemble context -> answer with the LLM.

    Collection resolution: explicit *collection_name* wins, then
    (owner_repo, branch), then the "code_docs" default.  The answer
    optionally gets a path preview appended (controlled by *add_preview*
    and the REPO_AGENT_PREVIEW env var).
    """
    # 1) intent
    spec = await enrich_intent(_llm_call, messages)
    task = (spec.get("task") or "").strip()
    if not task:
        return "Geen vraag gedetecteerd."

    # 2) query variants
    variants = await expand_queries(_llm_call, task, k=3)

    # 3) hybrid retrieve — must use the same collection as indexing
    #    ('code_docs' is already versioned in app.py via _collection_versioned).
    # Resolve collection: explicit > (owner_repo, branch) > default.
    coll = collection_name or (repo_collection_name(owner_repo, branch or AGENT_DEFAULT_BRANCH) if owner_repo else "code_docs")
    all_hits = []
    for q in variants:
        hits = await hybrid_retrieve(
            _rag_query_internal,
            q,
            n_results=n_ctx,
            per_query_k=max(30, n_ctx * 6),
            alpha=0.6,
            # Pass explicitly so the version-suffixed collection is hit.
            collection_name=coll,
        )
        all_hits.extend(hits)

    # De-dup on (path, chunk_index), keeping the highest-scored hits first.
    seen = set()
    uniq = []
    for h in sorted(all_hits, key=lambda x: x.get("score", 0), reverse=True):
        meta = h.get("metadata") or {}
        key = (meta.get("path"), meta.get("chunk_index"))
        if key in seen:
            continue
        seen.add(key)
        uniq.append(h)
        if len(uniq) >= n_ctx:
            break

    # 4) assemble the context window (char-budgeted)
    ctx, top = assemble_context(uniq, max_chars=int(os.getenv("REPO_AGENT_CONTEXT_CHARS","640000")))
    if not ctx:
        return "Geen context gevonden."

    # 5) let the LLM answer using only the assembled context
    sys = "Beantwoord concreet en kort. Citeer relevante paths. Als iets onzeker is: zeg dat."
    usr = f"Vraag: {task}\n\n--- CONTEXT ---\n{ctx}"
    resp = await _llm_call(
        [{"role":"system","content":sys},{"role":"user","content":usr}],
        stream=False, temperature=0.2, top_p=0.9, max_tokens=700
    )
    ans = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
    return _append_ctx_preview(ans, uniq) if (add_preview and os.getenv("REPO_AGENT_PREVIEW","1") not in ("0","false")) else ans
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def llm_expand_queries(user_goal: str, quotes: List[str], hints: List[str], k: int = 5, extra_seeds: Optional[List[str]] = None) -> List[str]:  # already defined above
    # (duplicate name kept intentionally — Python allows redef; using the latest one)
    """Ask the LLM for up to *k* alternative search queries.

    Seeds the prompt with quotes, up to 6 hints and up to 6 extra seeds
    (deduplicated, first 8).  The original *user_goal* is always the first
    returned query; on any failure only [user_goal] is returned.
    """
    seed = []
    if quotes: seed += quotes
    if hints: seed += hints[:6]
    if extra_seeds: seed += extra_seeds[:6]
    seed = list(dict.fromkeys(seed))[:8]  # order-preserving de-dup, capped at 8
    prompt = (
        f"Maak {k} alternatieve zoekqueries (kort, divers). Mix NL/EN, synoniemen, veldnamen."
        " Alleen geldige JSON-array met strings.\n"
        f"Doel:\n{user_goal}\n\nHints:\n" + ", ".join(seed)
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":"Alleen geldige JSON, geen uitleg."},
             {"role":"user","content":prompt}],
            stream=False, temperature=0.3, top_p=0.9, max_tokens=400
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","")
        arr = safe_json_loads(raw)
        base = [user_goal]
        if isinstance(arr, list):
            base += [s for s in arr if isinstance(s, str) and s.strip()]
        # Normalize whitespace and drop duplicates while keeping order.
        out = []
        for q in base:
            qn = re.sub(r"\s+", " ", q.strip())
            if qn and qn not in out: out.append(qn)
        return out[:1+k]
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_expand_queries failed: %s", e)
        return [user_goal]
|
|||
|
|
|
|||
|
|
def get_file_preview(repo_root: Path, rel: str, terms: List[str], window: int = 180) -> str:
    """Return a snippet of *rel* centred on the first matching term.

    Without terms, or when no term matches, the head of the file
    (2 * window chars) is returned; unreadable or empty files yield "".
    """
    try:
        content = _read_text_file(repo_root / rel) or ""
    except Exception:
        return ""
    if not content:
        return ""
    if not terms:
        return content[:window * 2]

    lowered = content.lower()
    for term in terms:
        hit = lowered.find(term.lower())
        if hit < 0:
            continue
        start = max(0, hit - window)
        end = min(len(content), hit + len(term) + window)
        return content[start:end]
    return content[:window * 2]
|
|||
|
|
|
|||
|
|
async def llm_rerank_candidates(user_goal: str, candidates: List[dict], topk: int = 8) -> List[dict]:
    """Ask the LLM to rank candidate files by relevance to *user_goal*.

    Each candidate dict needs at least "path" and may carry "preview"
    (truncated to 600 chars for the prompt; at most 20 candidates are
    shown).  Returns the candidates with an added "score" key, best first,
    capped at *topk*.  On parse failure or any exception the first *topk*
    candidates are returned unchanged.
    """
    if not candidates: return []
    pack = []
    for i, c in enumerate(candidates[:20], 1):
        pv = c.get("preview","")[:600]
        pth = c["path"]
        base = os.path.basename(pth)
        dr = os.path.dirname(pth)
        pack.append(f"{i}. PATH: {pth}\nDIR: {dr}\nBASENAME: {base}\nPREVIEW:\n{pv}")

    prompt = (
        "Rangschik de onderstaande codefragmenten op relevantie om het doel te behalen. "
        "Geef een JSON-array met objecten: {\"path\":\"...\",\"score\":0-100}."
        "\n\nDOEL:\n" + user_goal + "\n\nFRAGMENTEN:\n" + "\n\n".join(pack)
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":"Alleen geldige JSON zonder uitleg."},
             {"role":"user","content":prompt}],
            stream=False, temperature=0.0, top_p=0.9, max_tokens=600
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","")
        arr = safe_json_loads(raw)
        if not isinstance(arr, list):
            return candidates[:topk]
        # Paths the LLM did not mention default to score 0.0.
        score_map = {d.get("path"): float(d.get("score",0)) for d in arr if isinstance(d, dict) and "path" in d}
        rescored = []
        for c in candidates:
            rescored.append({**c, "score": score_map.get(c["path"], 0.0)})
        rescored.sort(key=lambda x: x.get("score",0.0), reverse=True)
        return rescored[:topk]
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_rerank_candidates failed: %s", e)
        return candidates[:topk]
|
|||
|
|
|
|||
|
|
def _rrf_fuse_paths(*ordered_lists: List[str], k: int = int(os.getenv("RRF_K","60"))) -> List[str]:
|
|||
|
|
"""
|
|||
|
|
Neem meerdere geordende padlijsten (beste eerst) en geef een RRF-fusie.
|
|||
|
|
"""
|
|||
|
|
acc = defaultdict(float)
|
|||
|
|
for lst in ordered_lists:
|
|||
|
|
for i, p in enumerate(lst):
|
|||
|
|
acc[p] += 1.0 / (k + i + 1)
|
|||
|
|
# path prior
|
|||
|
|
def _prior(p: str) -> float:
|
|||
|
|
return (
|
|||
|
|
(0.35 if p.lower().startswith("routes/") else 0.0) +
|
|||
|
|
(0.30 if p.lower().startswith("app/http/controllers/") else 0.0) +
|
|||
|
|
(0.25 if p.lower().startswith("resources/views/") or p.lower().endswith(".blade.php") else 0.0) +
|
|||
|
|
(0.12 if p.lower().startswith(("src/","app/","lib/","pages/","components/")) else 0.0) +
|
|||
|
|
(0.05 if p.lower().endswith((".php",".ts",".tsx",".js",".jsx",".py",".go",".rb",".java",".cs",".vue",".html",".md")) else 0.0) -
|
|||
|
|
(0.10 if ("/tests/" in p.lower() or p.lower().startswith(("tests/","test/"))) else 0.0) -
|
|||
|
|
(0.10 if p.lower().endswith((".lock",".map",".min.js",".min.css")) else 0.0)
|
|||
|
|
)
|
|||
|
|
for p in list(acc.keys()):
|
|||
|
|
acc[p] += float(os.getenv("RRF_PATH_PRIOR_WEIGHT","0.25")) * _prior(p)
|
|||
|
|
return [p for p,_ in sorted(acc.items(), key=lambda t: t[1], reverse=True)]
|
|||
|
|
|
|||
|
|
async def hybrid_rag_select_paths(repo_root: Path,
                                  owner_repo: Optional[str],
                                  branch: str,
                                  user_goal: str,
                                  all_files: List[str],
                                  max_out: int = 8) -> List[str]:
    """Select up to *max_out* repo paths most relevant to *user_goal*.

    Channels: (1) LLM-generated signal scan (strict + lenient pass),
    (2) explicit paths mentioned in the prompt, (3) Chroma vector search,
    (4) Meilisearch — or BM25 when Meili is disabled, (5) Laravel
    heuristics, (6) the symbol index.  Channels are fused (optionally with
    weighted RRF), then the pooled candidates are re-ranked by the LLM and
    lightly boosted for symbol hits.
    """
    quotes = extract_quotes(user_goal)
    hints = extract_word_hints(user_goal)
    # Ask the LLM for a compact search-signal spec (globs/must/maybe/regex/...).
    sig_messages = [
        {"role":"system","content":"Produceer alleen geldige JSON zonder uitleg."},
        {"role":"user","content":(
            "Bedenk een compacte zoekstrategie als JSON om relevante bestanden te vinden (globs/must/maybe/regex/path_hints/excludes). Wijziging:\n"
            + user_goal
        )}
    ]
    try:
        resp = await _llm_call(sig_messages, stream=False, temperature=0.1, top_p=0.9, max_tokens=384)
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","").strip()
        sig = safe_json_loads(raw) or {}
    except Exception as e:
        logger.warning("WARN:agent_repo:signals LLM failed: %s", e)
        sig = {}
    # Two-pass scan: lenient first (recall: no must/regex filters), then strict (precision).
    sig_lenient = dict(sig or {})
    sig_lenient["must_substrings"] = []
    sig_lenient["regexes"] = []
    scan_hits_lenient = scan_with_signals(
        repo_root, all_files, sig_lenient,
        phrase_boosts=quotes, hint_boosts=hints, limit=24
    )
    scan_hits_strict = scan_with_signals(
        repo_root, all_files, sig,
        phrase_boosts=quotes, hint_boosts=hints, limit=20
    )
    # Combine, preferring strict hits.
    seen_paths_local = set()
    prepicked = []
    for rel, _sc, _m in scan_hits_strict + scan_hits_lenient:
        if rel not in seen_paths_local:
            seen_paths_local.add(rel); prepicked.append(rel)

    # Give explicit path hints from the user prompt priority.
    try:
        explicit = extract_explicit_paths(user_goal)
    except Exception:
        explicit = []
    explicit_resolved: List[str] = []
    for ep in explicit:
        if ep in all_files:
            explicit_resolved.append(ep)
        else:
            # Not an exact repo path: resolve by basename similarity.
            bp = best_path_by_basename(all_files, ep)
            if bp: explicit_resolved.append(bp)
    # Put explicit paths in front (deduped; reversed so order is preserved).
    for ep in reversed(explicit_resolved):
        if ep not in seen_paths_local:
            prepicked.insert(0, ep); seen_paths_local.add(ep)

    # Light stack-specific seed terms for query expansion.
    seeds = []
    if (repo_root / "artisan").exists() or (repo_root / "composer.json").exists():
        seeds += ["Route::get", "Controller", "blade", "resources/views", "routes/web.php", "app/Http/Controllers"]
    if (repo_root / "package.json").exists():
        seeds += ["component", "pages", "src/components", "useState", "useEffect"]
    queries = await llm_expand_queries(user_goal, quotes, hints, k=5, extra_seeds=seeds)

    # Channel: Chroma vector search (version-consistent collection).
    chroma_paths: List[str] = []
    for q in queries:
        try:
            rag_res = await _rag_query_internal(
                query=q, n_results=RAG_TOPK,
                collection_name=repo_collection_name(owner_repo, branch),
                repo=None, path_contains=None, profile=None
            )
            for item in rag_res.get("results", []):
                meta = item.get("metadata") or {}
                pth = meta.get("path")
                if pth and pth in all_files:
                    chroma_paths.append(pth)
        except Exception as e:
            logger.warning("WARN:agent_repo:Chroma query failed: %s", e)

    # Channel: Meilisearch, or BM25 fallback when Meili is disabled.
    meili_paths: List[str] = []
    if MEILI_URL:
        for q in queries:
            hits = meili_search(owner_repo, branch, q, limit=RAG_TOPK)
            for h in hits:
                p = h.get("path")
                if p and p in all_files:
                    meili_paths.append(p)
    else:
        # Ensure a (one-time) BM25 index exists for this repo/branch.
        try:
            if bm25_index_name(owner_repo, branch) not in _BM25_CACHE:
                bm25_build_index(repo_root, owner_repo, branch)
        except Exception:
            pass
        for q in queries:
            hits = bm25_search(owner_repo, branch, q, limit=RAG_TOPK)
            for h in hits:
                p = h.get("path")
                if p and p in all_files:
                    meili_paths.append(p)

    # Channel: Laravel-specific heuristics (best effort).
    try:
        laravel_picks = laravel_signal_candidates(repo_root, user_goal, all_files, max_out=6)
    except Exception:
        laravel_picks = []

    # Channel: symbol-driven candidates.
    sym_hits = symbol_search(owner_repo, branch, user_goal, limit=12)
    sym_paths = [p for p, _sc in sym_hits if p in all_files]

    # Optional weighted RRF fusion of the channels (enabled by default via RRF_ENABLE).
    use_rrf = str(os.getenv("RRF_ENABLE", "1")).lower() in ("1","true","yes")
    if use_rrf:
        k = int(os.getenv("RRF_K", "30"))
        # Simple per-channel weights (tunable via env).
        w_signals = float(os.getenv("RRF_W_SIGNALS", "1.0"))
        w_chroma = float(os.getenv("RRF_W_CHROMA", "1.0"))
        w_meili = float(os.getenv("RRF_W_MEILI", "0.8"))
        w_sym = float(os.getenv("RRF_W_SYMBOLS", "1.3"))
        w_lara = float(os.getenv("RRF_W_LARAVEL", "1.2"))

        sources = [
            ("signals", prepicked, w_signals),
            ("chroma", chroma_paths, w_chroma),
            ("meili", meili_paths, w_meili),
            ("symbols", sym_paths, w_sym),
            ("laravel", laravel_picks, w_lara),
        ]

        rrf_scores: dict[str, float] = {}
        seen_any = set()
        for _name, paths, w in sources:
            for rank, p in enumerate(paths, start=1):
                if p not in all_files:
                    continue
                seen_any.add(p)
                rrf_scores[p] = rrf_scores.get(p, 0.0) + (w * (1.0 / (k + rank)))

        # Take the RRF top; fall back to the plain union when empty.
        fused_paths = [p for p, _ in sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)]
        base_pool = fused_paths[: max_out*3] if fused_paths else []

        # Build the pool (deduped) and top up with the legacy order if needed.
        pool, seen = [], set()
        def add(p):
            if p not in seen and p in all_files:
                seen.add(p); pool.append(p)

        for p in base_pool: add(p)
        if len(pool) < max_out:
            for lst in (prepicked, chroma_paths, meili_paths, sym_paths, laravel_picks):
                for p in lst:
                    add(p)
    else:
        # Legacy pooling without RRF: plain channel union, deduped.
        pool, seen = [], set()
        def add(p):
            if p not in seen and p in all_files:
                seen.add(p); pool.append(p)
        for lst in (prepicked, chroma_paths, meili_paths, sym_paths, laravel_picks):
            for p in lst:
                add(p)

    # LLM rerank over the first 20 pooled candidates (with previews).
    cands = [{"path": p, "preview": get_file_preview(repo_root, p, quotes+hints)} for p in pool[:20]]
    ranked = await llm_rerank_candidates(user_goal, cands, topk=max_out)

    # Light symbol boost applied after the LLM rerank.
    sym_map = {p: sc for p, sc in sym_hits}
    boost = float(os.getenv("SYMBOL_LIGHT_BOOST", "0.15"))
    rescored = []
    for c in ranked:
        base = float(c.get("score", 0.0))
        s = sym_map.get(c["path"], 0)
        adj = base + (boost if s > 0 else 0.0)
        rescored.append({**c, "score": adj})
    rescored.sort(key=lambda x: x["score"], reverse=True)
    return [c["path"] for c in rescored[:max_out]]
|
|||
|
|
|
|||
|
|
# ---------- Focus-snippets ----------
|
|||
|
|
def extract_focus_snippets(text: str, needles: List[str], window: int = 240, max_snippets: int = 3) -> str:
    """Cut up to *max_snippets* windows around needle occurrences in *text*.

    Each needle contributes at most 4 occurrences; snippets contained in
    (or containing) an already-kept snippet are dropped.  Falls back to the
    head of *text* when nothing matches or no needles were given; an empty
    *text* yields "".
    """
    if not text:
        return ""
    if not needles:
        return text[:window * 2]

    haystack = text.lower()
    raw_snippets: List[str] = []
    for needle in needles:
        target = (needle or "").lower()
        if not target:
            continue
        search_from = 0
        # Cap at 4 occurrences per needle.
        for _ in range(4):
            pos = haystack.find(target, search_from)
            if pos < 0:
                break
            lo_bound = max(0, pos - window)
            hi_bound = min(len(text), pos + len(target) + window)
            raw_snippets.append(text[lo_bound:hi_bound])
            search_from = pos + len(target)

    kept: List[str] = []
    for snippet in raw_snippets:
        # De-dup via mutual containment (avoids overlapping/embedded snippets).
        if all(snippet not in other and other not in snippet for other in kept):
            kept.append(snippet)
        if len(kept) >= max_snippets:
            break

    if not kept:
        return text[:window * 2]
    return "\n----- CONTEXT SPLIT -----\n".join(kept)
|
|||
|
|
|
|||
|
|
# ---------- LLM edit-plan ----------
|
|||
|
|
async def llm_plan_edits_for_file(user_goal: str, rel: str, focus_snippet: str) -> dict | None:
    """
    Ask the LLM for a minimal JSON edit plan for one file.

    Parameters: the user goal, the relative file path, and pre-extracted focus
    snippets from that file. Returns the parsed plan dict (must contain an
    "edits" list) or None on any failure / invalid response.

    Bug fix: the stashed tree hint read from the module-level
    _LLM_EDIT_TREE_HINT was immediately overwritten with "" (dead assignment),
    so the hint never reached the prompt. It is now only replaced by the
    placeholder when no hint was stashed and tree hints are enabled.
    """
    SYSTEM = "Produceer uitsluitend geldige JSON; geen verdere uitleg. Minimaliseer edits; raak zo min mogelijk regels."
    # Tree hint: reuse a prepared overview when a caller stashed one; otherwise,
    # when AGENT_TREE_PROMPT is enabled (default), emit a short placeholder.
    tree_block = globals().get("_LLM_EDIT_TREE_HINT", "")
    tree_hint = os.getenv("AGENT_TREE_PROMPT", "1").lower() not in ("0", "false")
    if tree_hint and not tree_block:
        # Only local context (siblings + folder info) would be used to save
        # tokens; without repo_root available here we can only note absence.
        tree_block = "\n(Tree-overzicht niet beschikbaar in deze context)\n"
    USER = (
        "Doel:\n" + user_goal + "\n\n" +
        f"Bestand: {rel}\n" +
        "Relevante contextfragmenten:\n----- BEGIN SNIPPETS -----\n" +
        focus_snippet + "\n----- EIND SNIPPETS -----\n\n" +
        ("Korte tree-hint:\n" + tree_block + "\n") +
        "JSON schema:\n" +
        "{ \"allow_destructive\": false, \"edits\": [\n" +
        "  {\"type\":\"regex_replace\",\"pattern\":\"...\",\"replacement\":\"...\",\"flags\":\"ims\",\"count\":1,\"explain\":\"...\"},\n" +
        "  {\"type\":\"string_replace\",\"find\":\"...\",\"replace\":\"...\",\"count\":1,\"explain\":\"...\"},\n" +
        "  {\"type\":\"insert_after\",\"anchor_regex\":\"...\",\"text\":\"...\",\"occur\":\"first|last\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        "  {\"type\":\"insert_before\",\"anchor_regex\":\"...\",\"text\":\"...\",\"occur\":\"first|last\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        "  {\"type\":\"replace_between_anchors\",\"start_regex\":\"...\",\"end_regex\":\"...\",\"replacement\":\"...\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        "  {\"type\":\"delete_between_anchors\",\"start_regex\":\"...\",\"end_regex\":\"...\",\"keep_anchors\":false,\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        "  {\"type\":\"conditional_insert\",\"absent_regex\":\"...\",\"anchor_regex\":\"...\",\"text\":\"...\",\"occur\":\"first|last\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        "  {\"type\":\"insert_at_top\",\"text\":\"...\",\"explain\":\"...\"},\n" +
        "  {\"type\":\"insert_at_bottom\",\"text\":\"...\",\"explain\":\"...\"}\n" +
        "]}\n" +
        "Maximaal 4 edits. Geef bij elke edit een korte 'explain'."
    )
    try:
        resp = await _llm_call(
            [{"role": "system", "content": SYSTEM}, {"role": "user", "content": USER}],
            stream=False, temperature=0.1, top_p=0.9, max_tokens=800
        )
        raw = resp.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        plan = safe_json_loads(raw)
        # Accept only a dict with an "edits" list; anything else is rejected.
        if isinstance(plan, dict) and isinstance(plan.get("edits"), list):
            return plan
        return None
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_plan_edits_for_file failed for %s: %s", rel, e)
        return None
|
|||
|
|
|
|||
|
|
# ---------- Apply helpers ----------
|
|||
|
|
def _regex_flags(flag_str: str) -> int:
|
|||
|
|
flags = 0
|
|||
|
|
if not flag_str: return flags
|
|||
|
|
for ch in flag_str.lower():
|
|||
|
|
if ch == 'i': flags |= re.IGNORECASE
|
|||
|
|
if ch == 'm': flags |= re.MULTILINE
|
|||
|
|
if ch == 's': flags |= re.DOTALL
|
|||
|
|
return flags
|
|||
|
|
|
|||
|
|
def apply_edit_plan(original: str, plan: dict) -> tuple[str, int, List[str], bool]:
    """
    Apply an LLM-produced edit plan to `original`.

    Each edit is applied best-effort; a failing edit is logged and skipped so
    one bad regex cannot abort the whole plan. Insertion edits carry a local
    idempotency check (skip when the text already appears within ±200 chars of
    the insertion point).

    Returns: (modified_text, changes_count, explains[], allow_destructive)
    """
    if not original or not plan or not isinstance(plan.get("edits"), list):
        return original, 0, [], False
    txt = original
    changes = 0
    explains: List[str] = []
    for ed in plan["edits"]:
        try:
            et = (ed.get("type") or "").lower()
            # Fall back to the edit type itself as the explanation.
            ex = ed.get("explain") or et
            if et == "string_replace":
                find = ed.get("find") or ""; rep = ed.get("replace") or ""
                # count=0 (or missing/invalid) is normalized to 1 replacement.
                cnt = int(ed.get("count") or 0) or 1
                if find:
                    new = txt.replace(find, rep, cnt)
                    if new != txt: changes += 1; txt = new; explains.append(f"string_replace: {ex}")
            elif et == "regex_replace":
                pat = ed.get("pattern") or ""; rep = ed.get("replacement") or ""
                flags = _regex_flags(ed.get("flags") or ""); cnt = int(ed.get("count") or 0) or 1
                if pat:
                    new, n = re.subn(pat, rep, txt, count=cnt, flags=flags)
                    if n > 0: changes += 1; txt = new; explains.append(f"regex_replace: {ex}")
            elif et in ("insert_after","insert_before"):
                anchor = ed.get("anchor_regex") or ""; ins = ed.get("text") or ""
                occur = (ed.get("occur") or "first").lower(); flags = _regex_flags(ed.get("flags") or "")
                if not anchor or not ins: continue
                matches = list(re.finditer(anchor, txt, flags))
                if not matches: continue
                m = matches[0] if occur != "last" else matches[-1]
                pos = m.end() if et == "insert_after" else m.start()
                # Idempotency: do not insert again when the text already sits nearby.
                win_a, win_b = max(0, pos-200), min(len(txt), pos+200)
                if ins in txt[win_a:win_b]:
                    continue
                txt = txt[:pos] + ins + txt[pos:]; changes += 1; explains.append(f"{et}: {ex}")
            elif et in ("replace_between_anchors","delete_between_anchors"):
                srx = ed.get("start_regex") or ""; erx = ed.get("end_regex") or ""
                # replace_* always keeps the anchors; delete_* honours keep_anchors.
                flags = _regex_flags(ed.get("flags") or ""); keep_anchors = bool(ed.get("keep_anchors")) if et == "delete_between_anchors" else True
                repl = ed.get("replacement") or ""
                if not srx or not erx: continue
                s_matches = list(re.finditer(srx, txt, flags))
                e_matches = list(re.finditer(erx, txt, flags))
                if not s_matches or not e_matches: continue
                s0 = s_matches[0]
                # Pick the first end anchor AFTER the start anchor.
                e0 = next((em for em in e_matches if em.start() >= s0.end()), None)
                if not e0: continue
                a = s0.end(); b = e0.start()
                if et == "replace_between_anchors":
                    txt = txt[:a] + repl + txt[b:]; changes += 1; explains.append(f"replace_between_anchors: {ex}")
                else:
                    if keep_anchors: txt = txt[:a] + txt[b:]
                    else: txt = txt[:s0.start()] + txt[e0.end():]
                    changes += 1; explains.append(f"delete_between_anchors: {ex}")
            elif et == "conditional_insert":
                absent = ed.get("absent_regex") or ""; anchor = ed.get("anchor_regex") or ""
                occur = (ed.get("occur") or "first").lower(); ins = ed.get("text") or ""
                flags = _regex_flags(ed.get("flags") or "")
                if not anchor or not ins: continue
                # Skip entirely when the "absent" pattern is already present.
                if absent and re.search(absent, txt, flags): continue
                matches = list(re.finditer(anchor, txt, flags))
                if not matches: continue
                m = matches[0] if occur != "last" else matches[-1]
                pos = m.end()
                # Idempotency: local window check around the insertion point.
                win_a, win_b = max(0, pos-200), min(len(txt), pos+200)
                if ins in txt[win_a:win_b]:
                    continue
                txt = txt[:pos] + ins + txt[pos:]; changes += 1; explains.append(f"conditional_insert: {ex}")
            elif et == "insert_at_top":
                ins = ed.get("text") or ""
                if ins: txt = ins + txt; changes += 1; explains.append(f"insert_at_top: {ex}")
            elif et == "insert_at_bottom":
                ins = ed.get("text") or ""
                if ins: txt = txt + ins; changes += 1; explains.append(f"insert_at_bottom: {ex}")
        except Exception as e:
            # Best effort: a single malformed edit must not abort the plan.
            logger.warning("WARN:agent_repo:apply_edit_plan step failed: %s", e)
            continue
    allow_destructive = bool(plan.get("allow_destructive"))
    return txt, changes, explains, allow_destructive
|
|||
|
|
|
|||
|
|
# ==== BEGIN PATCH A: destructiviteit op diff-basis + drempel via env ====
|
|||
|
|
# Veilige default voor AGENT_DESTRUCTIVE_RATIO (voorkom NameError als niet gedefinieerd)
|
|||
|
|
try:
    # Existence probe: succeeds when AGENT_DESTRUCTIVE_RATIO was already
    # defined earlier in the file (prevents NameError below).
    AGENT_DESTRUCTIVE_RATIO
except NameError:
    # Safe default: read the deletion-ratio threshold from the environment
    # (fraction of original lines that may disappear before an edit is
    # considered destructive; default 0.45).
    AGENT_DESTRUCTIVE_RATIO = float(os.getenv("AGENT_DESTRUCTIVE_RATIO", "0.45"))
|
|||
|
|
|
|||
|
|
def _deletion_ratio(original: str, modified: str) -> float:
|
|||
|
|
"""Schat welk deel van de originele regels als deletions wegvalt."""
|
|||
|
|
ol = original.splitlines()
|
|||
|
|
ml = modified.splitlines()
|
|||
|
|
if not ol:
|
|||
|
|
return 0.0
|
|||
|
|
# ndiff: regels met prefix '- ' tellen we als deletions
|
|||
|
|
dels = 0
|
|||
|
|
for line in difflib.ndiff(ol, ml):
|
|||
|
|
if line.startswith("- "):
|
|||
|
|
dels += 1
|
|||
|
|
return dels / max(1, len(ol))
|
|||
|
|
|
|||
|
|
def is_destructive(original: str, modified: str, allow_destructive: bool) -> bool:
    """Block an edit only when it demonstrably deletes a large share of lines."""
    if allow_destructive:
        return False
    # Very small files pass unconditionally; we do not want to be too strict.
    if len(original.splitlines()) < 6:
        return False
    return _deletion_ratio(original, modified) > AGENT_DESTRUCTIVE_RATIO
|
|||
|
|
|
|||
|
|
# ==== END PATCH A ====
|
|||
|
|
|
|||
|
|
def list_sibling_files(repo_root: Path, rel: str, limit: int = 12) -> List[str]:
    """
    List up to `limit` editable files (<500KB, passing allowed_file) in the
    directory of `rel`, sorted case-insensitively for stable output.
    """
    folder = (repo_root / rel).parent
    if not folder.exists():
        # The directory may not exist yet; walk up to the nearest existing ancestor.
        folder = repo_root / os.path.dirname(rel)
        while not folder.exists() and folder != repo_root:
            folder = folder.parent
    names: List[str] = []
    if folder.exists():
        for entry in folder.iterdir():
            if entry.is_file() and allowed_file(entry) and entry.stat().st_size < 500_000:
                names.append(str(entry.name))
    # Stable output instead of filesystem order.
    names.sort(key=str.lower)
    return names[:limit]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_snippet(p: Path, max_chars: int = 2000) -> str:
    """Best-effort read of at most `max_chars` characters; '' on any failure."""
    try:
        content = _read_text_file(p) or ""
    except Exception:
        return ""
    return content[:max_chars]
|
|||
|
|
|
|||
|
|
async def propose_new_file(repo_root: Path, rel: str, user_goal: str) -> tuple[Optional[str], str]:
    """
    Ask the LLM to generate a *complete new file* at path `rel` with minimal
    assumptions, using up to three sibling files as style references.

    Returns (content, reason); content is None when generation failed.
    """
    ext = os.path.splitext(rel)[1].lower()  # NOTE(review): computed but unused in this body
    siblings = list_sibling_files(repo_root, rel)
    sibling_snippets = []
    # Collect short snippets from at most three siblings as nearby references.
    for name in siblings[:3]:
        snippet = read_snippet(repo_root / os.path.join(os.path.dirname(rel), name), max_chars=1600)
        if snippet:
            sibling_snippets.append({"name": name, "snippet": snippet[:1600]})

    SYSTEM = "Je bent een zorgvuldige codegenerator. Lever exact één compleet bestand. Geen extra refactors."
    USER = (
        f"Doel (nieuwe file aanmaken):\n{user_goal}\n\n"
        f"Bestandspad: {rel}\n"
        f"Directory siblings: {', '.join(siblings) if siblings else '(geen)'}\n\n"
        "Enkele nabije referenties (indien aanwezig):\n" +
        "\n".join([f"--- {s['name']} ---\n{s['snippet']}" for s in sibling_snippets]) +
        "\n\nEisen:\n"
        "- Maak een minimal-werkende versie van dit bestand die past bij de context hierboven.\n"
        "- Raak geen andere paden aan; geen includes naar niet-bestaande bestanden.\n"
        "- Gebruik hetzelfde framework/stack als de referenties suggereren (indien duidelijk).\n"
        "- Output: alleen de VOLLEDIGE bestandinformatie in één codeblok, niets anders."
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":SYSTEM},{"role":"user","content":USER}],
            stream=False, temperature=0.2, top_p=0.9, max_tokens=2048
        )
        # The model is instructed to answer with a single code block; extract it.
        content = _extract_code_block(
            resp.get("choices",[{}])[0].get("message",{}).get("content","")
        ) or ""
        content = content.strip()
        if not content:
            return None, "LLM gaf geen inhoud terug."
        # Simple sanity limit on generated size.
        if len(content) > 200_000:
            content = content[:200_000]
        return content, "Nieuw bestand voorgesteld op basis van directory-context en doel."
    except Exception as e:
        logger.warning("WARN:agent_repo:propose_new_file failed for %s: %s", rel, e)
        return None, f"Kon geen nieuwe file genereren: {e}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- Diff helper ----------
|
|||
|
|
def make_diffs(original: str, modified: str, filename: str, max_lines: int = 200) -> str:
    """
    Render a unified diff between `original` and `modified`.

    Bug fix: the a/ and b/ headers previously hard-coded "(unknown)" and
    ignored the `filename` argument; they now embed the real filename so the
    diff is attributable. Output is truncated after `max_lines` diff lines.
    """
    diff = list(difflib.unified_diff(
        original.splitlines(keepends=True),
        modified.splitlines(keepends=True),
        fromfile=f"a/{filename}",
        tofile=f"b/{filename}",
        lineterm=""
    ))
    if len(diff) > max_lines:
        return "".join(diff[:max_lines]) + "\n... (diff ingekort)"
    return "".join(diff)
|
|||
|
|
|
|||
|
|
def make_new_file_diff(filename: str, content: str, max_lines: int = 400) -> str:
    """
    Render a unified diff representing a newly created file (/dev/null -> b/filename).

    Bug fix: the b/ header previously hard-coded "(unknown)" and ignored the
    `filename` argument; it now embeds the real filename. Output is truncated
    after `max_lines` diff lines.
    """
    new_lines = content.splitlines(keepends=True)
    diff = list(difflib.unified_diff(
        [], new_lines,
        fromfile="/dev/null",
        tofile=f"b/{filename}",
        lineterm=""
    ))
    if len(diff) > max_lines:
        return "".join(diff[:max_lines]) + "\n... (diff ingekort)"
    return "".join(diff)
|
|||
|
|
|
|||
|
|
# ---------- Lightweight Laravel Graph helpers ----------
|
|||
|
|
def _view_name_to_path(repo_root: Path, view_name: str) -> Optional[str]:
|
|||
|
|
"""
|
|||
|
|
'users.index' -> resources/views/users/index.blade.php (als bestaand)
|
|||
|
|
'users/index' -> idem. Return relatieve path of None als niet gevonden.
|
|||
|
|
"""
|
|||
|
|
if not view_name:
|
|||
|
|
return None
|
|||
|
|
cand = view_name.replace(".", "/").strip("/ ")
|
|||
|
|
for ext in [".blade.php", ".php"]:
|
|||
|
|
rel = f"resources/views/{cand}{ext}"
|
|||
|
|
if (repo_root / rel).exists():
|
|||
|
|
return rel
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def _controller_extract_views(text: str, repo_root: Path) -> list[str]:
    """
    Collect blade view paths referenced by a controller body.

    Recognizes `return view('x.y')`, `View::make('x.y')` and
    `Inertia::render('X/Y')` (the last mapped onto blade paths best-effort).
    Returns an order-preserving, de-duplicated list of relative paths.
    """
    patterns = (
        r"(?:return\s+)?view\s*\(\s*['\"]([^'\"]+)['\"]",
        r"View::make\s*\(\s*['\"]([^'\"]+)['\"]",
        r"Inertia::render\s*\(\s*['\"]([^'\"]+)['\"]",
    )
    resolved: list[str] = []
    for rx in patterns:
        for m in re.finditer(rx, text, flags=re.I):
            rel = _view_name_to_path(repo_root, m.group(1))
            if rel:
                resolved.append(rel)
    # Order-preserving de-dupe.
    known = set()
    uniq: list[str] = []
    for rel in resolved:
        if rel not in known:
            known.add(rel)
            uniq.append(rel)
    return uniq
|
|||
|
|
|
|||
|
|
def _blade_extract_lang_keys(text: str) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
Haal vertaalkeys uit Blade/PHP: __('x.y'), @lang('x.y'), trans('x.y')
|
|||
|
|
"""
|
|||
|
|
keys = []
|
|||
|
|
for rx in [
|
|||
|
|
r"__\(\s*['\"]([^'\"]+)['\"]\s*\)",
|
|||
|
|
r"@lang\(\s*['\"]([^'\"]+)['\"]\s*\)",
|
|||
|
|
r"trans\(\s*['\"]([^'\"]+)['\"]\s*\)"
|
|||
|
|
]:
|
|||
|
|
for m in re.finditer(rx, text):
|
|||
|
|
keys.append(m.group(1))
|
|||
|
|
# dedupe
|
|||
|
|
seen=set(); out=[]
|
|||
|
|
for k in keys:
|
|||
|
|
if k not in seen:
|
|||
|
|
out.append(k); seen.add(k)
|
|||
|
|
return out
|
|||
|
|
|
|||
|
|
def _grep_lang_files_for_key(repo_root: Path, key: str, limit: int = 6) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
Zoek in resources/lang/**/*.(json|php) naar KEY. Best-effort, klein limiet.
|
|||
|
|
"""
|
|||
|
|
base = repo_root / "resources/lang"
|
|||
|
|
if not base.exists():
|
|||
|
|
return []
|
|||
|
|
hits=[]
|
|||
|
|
try:
|
|||
|
|
for p in base.rglob("*"):
|
|||
|
|
if p.is_dir():
|
|||
|
|
continue
|
|||
|
|
if not (str(p).endswith(".json") or str(p).endswith(".php")):
|
|||
|
|
continue
|
|||
|
|
if p.stat().st_size > 300_000:
|
|||
|
|
continue
|
|||
|
|
txt = p.read_text(encoding="utf-8", errors="ignore")
|
|||
|
|
if key in txt:
|
|||
|
|
hits.append(str(p.relative_to(repo_root)))
|
|||
|
|
if len(hits) >= limit:
|
|||
|
|
break
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
return hits
|
|||
|
|
|
|||
|
|
def _build_laravel_graph(repo_root: Path) -> dict[str, set[str]]:
    """
    Build a light undirected graph over a Laravel repo:
      - routes/web.php|api.php <-> controller files
      - controller <-> views (via `return view(...)` and friends)
      - view <-> lang files (for translation keys occurring in the view)
    Node labels are relative path names; edges are undirected (neighbours).
    """
    g: dict[str, set[str]] = {}
    def _add(a: str, b: str):
        # Undirected edge: register both directions.
        g.setdefault(a, set()).add(b)
        g.setdefault(b, set()).add(a)

    # 1) routes -> controllers (reuse the existing route scanner)
    routes = laravel_scan_routes(repo_root)
    for r in routes:
        rp = r.get("file") or ""
        ctrl = r.get("controller") or ""
        if not ctrl:
            continue
        for cpath in _candidate_paths_for_controller(repo_root, ctrl):
            _add(rp, cpath)
            # 2) controllers -> views (parse the controller file)
            try:
                txt = _read_text_file(repo_root / cpath) or ""
            except Exception:
                txt = ""
            for vrel in _controller_extract_views(txt, repo_root):
                _add(cpath, vrel)
                # 3) views -> lang files (based on translation keys)
                try:
                    vtxt = _read_text_file(repo_root / vrel) or ""
                except Exception:
                    vtxt = ""
                for key in _blade_extract_lang_keys(vtxt):
                    for lrel in _grep_lang_files_for_key(repo_root, key, limit=4):
                        _add(vrel, lrel)
    return g
|
|||
|
|
|
|||
|
|
def _graph_bfs_boosts(graph: dict[str, set[str]], seeds: list[str], max_depth: int = 3) -> dict[str, tuple[int, str]]:
|
|||
|
|
"""
|
|||
|
|
BFS vanaf seed-nodes. Return: {node: (distance, via)} met via=eerste buur of route.
|
|||
|
|
"""
|
|||
|
|
from collections import deque
|
|||
|
|
dist: dict[str, int] = {}
|
|||
|
|
via: dict[str, str] = {}
|
|||
|
|
q = deque()
|
|||
|
|
for s in seeds:
|
|||
|
|
if s in graph:
|
|||
|
|
dist[s] = 0
|
|||
|
|
via[s] = s
|
|||
|
|
q.append(s)
|
|||
|
|
while q:
|
|||
|
|
cur = q.popleft()
|
|||
|
|
if dist[cur] >= max_depth:
|
|||
|
|
continue
|
|||
|
|
for nb in graph.get(cur, ()):
|
|||
|
|
if nb not in dist:
|
|||
|
|
dist[nb] = dist[cur] + 1
|
|||
|
|
via[nb] = cur if via.get(cur) == cur else via.get(cur, cur)
|
|||
|
|
q.append(nb)
|
|||
|
|
return {n: (d, via.get(n, "")) for n, d in dist.items()}
|
|||
|
|
|
|||
|
|
def _get_graph_cached(repo_root: Path, memo_key: str) -> dict[str, set[str]]:
    """Memoized Laravel graph per memo_key; fully disabled via AGENT_GRAPH_ENABLE=0/false."""
    if os.getenv("AGENT_GRAPH_ENABLE", "1").lower() in ("0", "false"):
        return {}
    cached = _GRAPH_CACHE.get(memo_key)
    if cached is not None:
        return cached
    try:
        built = _build_laravel_graph(repo_root)
    except Exception:
        built = {}
    _GRAPH_CACHE[memo_key] = built
    return built
|
|||
|
|
|
|||
|
|
# ---------- Tree summaries (korte per-file beschrijving) ----------
|
|||
|
|
def _summarize_file_for_tree(path: Path) -> str:
|
|||
|
|
"""
|
|||
|
|
Heuristische mini-samenvatting (<=160 chars):
|
|||
|
|
- eerste docblock / commentregel / heading
|
|||
|
|
- anders eerste niet-lege regel
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
txt = path.read_text(encoding="utf-8", errors="ignore")
|
|||
|
|
except Exception:
|
|||
|
|
return ""
|
|||
|
|
head = txt[:1200]
|
|||
|
|
# PHP docblock
|
|||
|
|
m = re.search(r"/\*\*([\s\S]{0,400}?)\*/", head)
|
|||
|
|
if m:
|
|||
|
|
s = re.sub(r"[*\s]+", " ", m.group(1)).strip()
|
|||
|
|
return (s[:160])
|
|||
|
|
# single-line comments / headings
|
|||
|
|
for rx in [r"^\s*//\s*(.+)$", r"^\s*#\s*(.+)$", r"^\s*<!--\s*(.+?)\s*-->", r"^\s*<h1[^>]*>([^<]+)</h1>", r"^\s*<title[^>]*>([^<]+)</title>"]:
|
|||
|
|
mm = re.search(rx, head, flags=re.M|re.I)
|
|||
|
|
if mm:
|
|||
|
|
return mm.group(1).strip()[:160]
|
|||
|
|
# first non-empty line
|
|||
|
|
for line in head.splitlines():
|
|||
|
|
ln = line.strip()
|
|||
|
|
if ln:
|
|||
|
|
return ln[:160]
|
|||
|
|
return ""
|
|||
|
|
|
|||
|
|
def _build_tree_summaries(repo_root: Path, all_files: list[str], max_files: int = 2000) -> dict[str, str]:
    """Summarize up to `max_files` small files (<=200KB) as {rel_path: one-liner}."""
    summaries: dict[str, str] = {}
    processed = 0
    for rel in all_files:
        if processed >= max_files:
            break
        path = repo_root / rel
        try:
            if path.stat().st_size > 200_000:
                continue
        except Exception:
            continue
        one_liner = _summarize_file_for_tree(path)
        if one_liner:
            summaries[rel] = one_liner
        processed += 1
    return summaries
|
|||
|
|
|
|||
|
|
def _get_tree_cached(repo_root: Path, memo_key: str, all_files: list[str]) -> dict[str, str]:
    """Memoized tree summaries per memo_key; fully disabled via AGENT_TREE_ENABLE=0/false."""
    if os.getenv("AGENT_TREE_ENABLE", "1").lower() in ("0", "false"):
        return {}
    cached = _TREE_SUM_CACHE.get(memo_key)
    if cached is not None:
        return cached
    try:
        summaries = _build_tree_summaries(repo_root, all_files)
    except Exception:
        summaries = {}
    _TREE_SUM_CACHE[memo_key] = summaries
    return summaries
|
|||
|
|
|
|||
|
|
# ---------- Mini tree-hint voor LLM edit-plannen ----------
|
|||
|
|
def _make_local_tree_hint(repo_root: Path, rel: str, max_siblings: int = 14) -> str:
    """
    Build a compact overview of the directory of `rel`: a "Map:" header plus up
    to `max_siblings` nearby files, each with a short one-line summary.
    Kept short and predictable for LLM prompting; returns "" when the directory
    cannot be resolved.
    """
    try:
        base_dir = (repo_root / rel).parent
    except Exception:
        return ""
    lines = []
    try:
        folder = str(base_dir.relative_to(repo_root))
    except Exception:
        # `rel` outside repo_root: fall back to the bare directory name.
        folder = base_dir.name
    lines.append(f"Map: {folder or '.'}")

    items = []
    try:
        # Case-insensitive sort for stable, predictable prompt output.
        for p in sorted(base_dir.iterdir(), key=lambda x: x.name.lower()):
            if not p.is_file():
                continue
            try:
                # Skip disallowed files and anything over 200KB.
                if not allowed_file(p) or p.stat().st_size > 200_000:
                    continue
            except Exception:
                continue
            summ = _summarize_file_for_tree(p)
            name = p.name
            if summ:
                items.append(f"- {name}: {summ[:120]}")
            else:
                items.append(f"- {name}")
            if len(items) >= max_siblings:
                break
    except Exception:
        # Best effort: an unreadable directory just yields the header only.
        pass
    lines.extend(items)
    return "\n".join(lines)
|
|||
|
|
|
|||
|
|
# ---------- Basic syntax guards ----------
|
|||
|
|
def _write_tmp(content: str, suffix: str) -> Path:
|
|||
|
|
import tempfile
|
|||
|
|
fd, path = tempfile.mkstemp(suffix=suffix)
|
|||
|
|
os.close(fd)
|
|||
|
|
p = Path(path)
|
|||
|
|
p.write_text(content, encoding="utf-8")
|
|||
|
|
return p
|
|||
|
|
|
|||
|
|
def _php_lint_ok(tmp_path: Path) -> bool:
|
|||
|
|
# disable via AGENT_SYNTAX_GUARD=0
|
|||
|
|
if os.getenv("AGENT_SYNTAX_GUARD","1").lower() in ("0","false"):
|
|||
|
|
return True
|
|||
|
|
try:
|
|||
|
|
import subprocess
|
|||
|
|
res = subprocess.run(["php","-l",str(tmp_path)], capture_output=True, text=True, timeout=8)
|
|||
|
|
return res.returncode == 0
|
|||
|
|
except Exception:
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
def _blade_balance_ok(text: str) -> bool:
|
|||
|
|
# Zeer conservatieve balans-check voor veelvoorkomende Blade directives
|
|||
|
|
tl = (text or "").lower()
|
|||
|
|
pairs = [("section","endsection"),("if","endif"),("foreach","endforeach"),("isset","endisset"),("php","endphp")]
|
|||
|
|
for a,b in pairs:
|
|||
|
|
if tl.count("@"+a) != tl.count("@"+b):
|
|||
|
|
return False
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- Gerichte, veilige literal fallback ----------
|
|||
|
|
# === PATCH: generieke HTML-scope vervanging ===
|
|||
|
|
|
|||
|
|
def html_scoped_literal_replace(html: str, old: str, new: str, tag_names: set[str]) -> tuple[str, bool, str]:
    """
    Replace `old` with `new`, but ONLY inside the bodies of the named HTML tags
    (at most one replacement per tag block). Pure-regex, no external libs;
    conservative DOTALL matching per <tag ...>...</tag> block.
    Returns (modified, changed, rationale).
    """
    if not html or not old or not tag_names:
        return html, False, ""
    changed = False
    notes: list[str] = []
    out = html

    for tag in sorted(tag_names):
        block_re = re.compile(
            rf"(<\s*{re.escape(tag)}\b[^>]*>)(.*?)(</\s*{re.escape(tag)}\s*>)",
            flags=re.IGNORECASE | re.DOTALL,
        )

        def _swap(match):
            nonlocal changed
            opener, body, closer = match.group(1), match.group(2), match.group(3)
            if old not in body:
                return match.group(0)
            # At most one replacement per tag block (per the contract).
            replaced = body.replace(old, new, 1)
            if replaced == body:
                return match.group(0)
            changed = True
            notes.append(f"'{old}' vervangen binnen <{tag}> (1x)")
            return opener + replaced + closer

        out = block_re.sub(_swap, out)

    return out, changed, "; ".join(notes) if changed else ""
|
|||
|
|
|
|||
|
|
# === PATCH: veilige, algemene string-literal vervanging ===
|
|||
|
|
|
|||
|
|
def quoted_literal_replace(original: str, old: str, new: str, max_occurrences: int = 2) -> tuple[str, bool, str]:
    """
    Replace 'old' or "old" as a quoted string literal, at most
    `max_occurrences` times. Language-agnostic: only string values change,
    never identifiers. Returns (modified, changed, rationale).
    """
    if not original or not old:
        return original, False, ""
    literal_re = re.compile(rf"(?P<q>['\"])({re.escape(old)})(?P=q)")
    done = 0

    def _swap(m):
        nonlocal done
        if done >= max_occurrences:
            return m.group(0)
        done += 1
        quote = m.group("q")
        return f"{quote}{new}{quote}"

    replaced = literal_re.sub(_swap, original)
    if replaced != original and done > 0:
        return replaced, True, f"'{old}' → '{new}' als string-literal ({done}x, limiet {max_occurrences})"
    return original, False, ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ==== BEGIN PATCH B: per-bestand oud/nieuw bepalen + generieke fallback ====
|
|||
|
|
def _literal_matches_with_context(src: str, needle: str, window: int = 160):
|
|||
|
|
"""Vind alle posities waar 'needle' als literal voorkomt en geef de operator-context terug."""
|
|||
|
|
escaped = re.escape(needle)
|
|||
|
|
pat = re.compile(r"(?P<q>['\"])(" + escaped + r")(?P=q)")
|
|||
|
|
for m in pat.finditer(src):
|
|||
|
|
a, b = m.span()
|
|||
|
|
before = src[max(0, a - window):a]
|
|||
|
|
op = None
|
|||
|
|
if re.search(r"\?\?\s*$", before):
|
|||
|
|
op = "??"
|
|||
|
|
elif re.search(r"\?\s*[^:\n]{0,120}:\s*$", before):
|
|||
|
|
op = "?:"
|
|||
|
|
elif re.search(r"\|\|\s*$", before):
|
|||
|
|
op = "||"
|
|||
|
|
elif re.search(r"\bor\b\s*$", before, flags=re.IGNORECASE):
|
|||
|
|
op = "or"
|
|||
|
|
yield (a, b, op)
|
|||
|
|
|
|||
|
|
def deduce_old_new_literals(user_goal: str, original: str) -> tuple[Optional[str], Optional[str], str]:
    """
    Pick OLD as the quoted prompt string that occurs in the file and scores
    highest on fallback-operator context (??, ?:, ||, or); pick NEW as another
    quoted prompt string, preferring one absent from the file.
    Returns (old, new, rationale).
    """
    quotes = extract_quotes(user_goal)
    if not quotes:
        return None, None, "Geen quoted strings in prompt gevonden."
    # Score OLD candidates: 2 * context-hits + total hits.
    candidates = []
    for q in quotes:
        hits = list(_literal_matches_with_context(original, q))
        if not hits:
            continue
        ctx_hits = sum(1 for _, _, op in hits if op)
        candidates.append((q, 2 * ctx_hits + len(hits), ctx_hits))
    if not candidates:
        # None of the prompt quotes occur in the file: no targeted fallback.
        return None, None, "Geen van de quotes uit prompt kwam in de file voor."
    candidates.sort(key=lambda item: (item[1], item[2]), reverse=True)
    old = candidates[0][0]

    # NEW comes from the remaining quotes, preferring one absent from the file.
    remaining = [q for q in quotes if q != old]
    if not remaining:
        return old, None, f"OLD='{old}' gekozen; geen 'new' gevonden."
    absent = [q for q in remaining if q not in original]
    new = absent[0] if absent else remaining[0]

    return old, new, f"OLD='{old}' (meeste fallback-contexthits), NEW='{new}'."
|
|||
|
|
|
|||
|
|
def targeted_fallback_replace(original: str, old: str, new: str) -> tuple[str, bool, str]:
    """
    Replace ONLY the quoted literal OLD when it clearly acts as a fallback
    value directly after ??, ?:, || or 'or'; the first qualifying occurrence
    wins. Returns (modified, changed, rationale).
    """
    if not original or not old:
        return original, False, ""
    window = 160
    literal_re = re.compile(r"(?P<q>['\"])(" + re.escape(old) + r")(?P=q)")
    for m in literal_re.finditer(original):
        quote = m.group("q")
        start, end = m.span()
        prefix = original[max(0, start - window):start]
        if re.search(r"\?\?\s*$", prefix):
            op = "??"
        elif re.search(r"\?\s*[^:\n]{0,120}:\s*$", prefix):
            op = "?:"
        elif re.search(r"\|\|\s*$", prefix):
            op = "||"
        elif re.search(r"\bor\b\s*$", prefix, flags=re.IGNORECASE):
            op = "or"
        else:
            # Not a fallback position: leave this occurrence untouched.
            continue
        rebuilt = original[:start] + quote + new + quote + original[end:]
        return rebuilt, True, f"Gerichte vervanging van fallback-literal nabij operator '{op}'"
    return original, False, ""
|
|||
|
|
|
|||
|
|
# ==== END PATCH B ====
|
|||
|
|
|
|||
|
|
# === Repo-QA: vraag-antwoord over 1 specifieke repository ===
|
|||
|
|
# Keyword hints (mixed Dutch/English vocabulary) used by the repo-QA flow to
# recognise "create a new report/incident/ticket" style intents in Laravel repos.
_LARAVEL_CREATE_HINTS = {
    "verbs": ["create", "store", "new", "aanmaken", "aanmaak", "nieuw", "toevoegen", "add"],
    "nouns": ["melding", "incident", "ticket", "aanvraag", "report", "issue", "storingen", "storing"]
}
|
|||
|
|
|
|||
|
|
def _read_file_safe(p: Path) -> str:
    """Read a text file, returning the empty string on any failure."""
    try:
        content = _read_text_file(p)
    except Exception:
        return ""
    return content or ""
|
|||
|
|
|
|||
|
|
def laravel_scan_routes(repo_root: Path) -> list[dict]:
    """Scan Laravel route files and return one dict per discovered route.

    Each entry has the keys: file, verb, uri, target, controller, method, name.
    Only routes/web.php and routes/api.php are inspected; parsing is
    regex-based and best-effort (route groups/closures may be missed).
    """
    out = []
    for rp in ["routes/web.php", "routes/api.php"]:
        p = repo_root / rp
        if not p.exists():
            continue
        txt = _read_file_safe(p)
        # Route::verb('uri', <target>) — target is everything up to the first ')'
        for m in re.finditer(r"Route::(get|post|put|patch|delete|match|resource)\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*([^)]+)\)", txt, flags=re.I):
            verb, uri, target = m.group(1).lower(), m.group(2), m.group(3)
            ctrl = None; method = None; name = None
            # 'Controller@method' string syntax
            m2 = re.search(r"['\"]([A-Za-z0-9_\\]+)@([A-Za-z0-9_]+)['\"]", target)
            if m2:
                ctrl, method = m2.group(1), m2.group(2)
            else:
                # array syntax: ['Foo\\BarController::class', 'index'] or [Foo\\BarController::class, 'index']
                m2b = re.search(r"\[\s*([A-Za-z0-9_\\]+)::class\s*,\s*['\"]([A-Za-z0-9_]+)['\"]\s*\]", target)
                if m2b:
                    ctrl, method = m2b.group(1), m2b.group(2)
            # optional ->name('...') suffix; only scan a short window past the call
            tail = txt[m.end(): m.end()+140]
            m3 = re.search(r"->\s*name\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", tail)
            if m3: name = m3.group(1)
            out.append({"file": rp, "verb": verb, "uri": uri, "target": target, "controller": ctrl, "method": method, "name": name})
        # Route::resource('uri', 'Controller') — string-target form only.
        # NOTE(review): resource routes also match the generic regex above, so a
        # resource route can appear twice in the output — confirm this is intended.
        for m in re.finditer(r"Route::resource\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*['\"]([^'\"]+)['\"]\s*\)", txt, flags=re.I):
            res, ctrl = m.group(1), m.group(2)
            out.append({"file": rp, "verb": "resource", "uri": res, "target": ctrl, "controller": ctrl, "method": None, "name": None})
    return out
|
|||
|
|
|
|||
|
|
def _candidate_paths_for_controller(repo_root: Path, controller_fqcn: str) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
Probeer Controller-bestand + views te vinden vanuit FQCN zoals App\\Http\\Controllers\\Foo\\BarController.
|
|||
|
|
"""
|
|||
|
|
rels = []
|
|||
|
|
# controller pad
|
|||
|
|
base = controller_fqcn.replace("\\\\","/").replace("\\","/")
|
|||
|
|
name = base.split("/")[-1]
|
|||
|
|
ctrl_guess = [
|
|||
|
|
f"app/Http/Controllers/{base}.php",
|
|||
|
|
f"app/Http/Controllers/{name}.php"
|
|||
|
|
]
|
|||
|
|
for g in ctrl_guess:
|
|||
|
|
if (repo_root / g).exists():
|
|||
|
|
rels.append(g)
|
|||
|
|
# view dir guesses (resource-achtig)
|
|||
|
|
view_roots = ["resources/views", "resources/views/livewire", "resources/views/components"]
|
|||
|
|
stem = re.sub(r"Controller$", "", name, flags=re.I)
|
|||
|
|
for vr in view_roots:
|
|||
|
|
for hint in [stem, stem.lower()]:
|
|||
|
|
dp = repo_root / f"{vr}/{hint}"
|
|||
|
|
if dp.exists() and dp.is_dir():
|
|||
|
|
for bp in dp.rglob("*.blade.php"):
|
|||
|
|
if bp.stat().st_size < 500000:
|
|||
|
|
rels.append(str(bp.relative_to(repo_root)))
|
|||
|
|
return list(dict.fromkeys(rels))[:8]
|
|||
|
|
|
|||
|
|
def laravel_signal_candidates(repo_root: Path, user_goal: str, all_files: list[str], max_out: int = 6) -> list[str]:
    """Heuristic preselection of files for Laravel 'create/new' use-cases.

    Strategy:
    - score routes whose uri/name/controller/method contain create-ish hints
    - project high-scoring routes onto controller files + Blade views
    - additionally pick views whose directory matches a hint noun and whose
      filename contains 'create' or 'form'

    Returns at most *max_out* repo-relative paths; [] when the repo does not
    look like Laravel or nothing scores.
    """
    # Fast exit when there are no Laravel markers at all.
    if not (repo_root / "artisan").exists() and not (repo_root / "composer.json").exists():
        return []

    # NOTE: the previous version also lowercased user_goal into a local that
    # was never read; that dead local has been removed. Scoring is based on
    # hint-word occurrences in the route metadata itself.
    verbs = _LARAVEL_CREATE_HINTS["verbs"]
    nouns = _LARAVEL_CREATE_HINTS["nouns"]

    def _goal_hits(s: str) -> int:
        # Count hint words in *s*; verbs weigh double.
        lo = s.lower()
        v = sum(1 for w in verbs if w in lo)
        n = sum(1 for w in nouns if w in lo)
        return v * 2 + n

    routes = laravel_scan_routes(repo_root)
    scored = []
    for r in routes:
        base_s = f"{r.get('uri','')} {r.get('name','')} {r.get('controller','') or ''} {r.get('method','') or ''}"
        score = _goal_hits(base_s)
        # Bonus for explicit create/store handlers.
        if (r.get("method") or "").lower() in ("create", "store"):
            score += 3
        if r.get("verb") == "resource":
            # resource routes implicitly include create/store endpoints
            score += 2
        if score > 0:
            scored.append((score, r))

    if not scored:
        return []

    scored.sort(key=lambda x: x[0], reverse=True)
    picks: list[str] = []
    for _score, r in scored[:8]:
        # Controller file + its likely views.
        if r.get("controller"):
            for rel in _candidate_paths_for_controller(repo_root, r["controller"]):
                if rel in all_files and rel not in picks:
                    picks.append(rel)
    # View guess by path name, e.g. "melding*/create.blade.php".
    for rel in all_files:
        name = os.path.basename(rel).lower()
        dirname = os.path.dirname(rel).lower()
        if any(n in dirname for n in nouns) and ("create" in name or "form" in name):
            if rel not in picks:
                picks.append(rel)
        if len(picks) >= max_out:
            break
    return picks[:max_out]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _detect_stack_summary(repo_root: Path) -> dict:
    """Heuristic repo profile: languages, likely framework, routes/migrations/DB hints.

    Returns a dict with keys: languages (ext -> count, top 8), framework,
    entrypoints, routes, db, notable_dirs. Purely read-only filesystem scan.
    """
    summary = {
        "languages": {},
        "framework": [],
        "entrypoints": [],
        "routes": [],
        "db": [],
        "notable_dirs": [],
    }
    # Count file extensions across the whole repo listing.
    ext_map = {}
    for rel in list_repo_files(repo_root):
        ext = os.path.splitext(rel)[1].lower()
        ext_map[ext] = ext_map.get(ext, 0) + 1
    summary["languages"] = dict(sorted(ext_map.items(), key=lambda x: x[1], reverse=True)[:8])

    # PHP/Laravel hints via composer.json dependencies.
    comp = repo_root / "composer.json"
    if comp.exists():
        try:
            import json as _json
            js = _json.loads(comp.read_text(encoding="utf-8", errors="ignore"))
            # Merge require + require-dev (dict union, Python 3.9+).
            req = (js.get("require") or {}) | (js.get("require-dev") or {})
            if any("laravel/framework" in k for k in req.keys()):
                summary["framework"].append("Laravel")
        except Exception:
            # best-effort: a broken composer.json simply yields no hint
            pass
    if (repo_root / "artisan").exists():
        summary["entrypoints"].append("artisan (Laravel CLI)")
    # Node/frontend hints via package.json dependencies.
    pkg = repo_root / "package.json"
    if pkg.exists():
        try:
            import json as _json
            js = _json.loads(pkg.read_text(encoding="utf-8", errors="ignore"))
            deps = list((js.get("dependencies") or {}).keys()) + list((js.get("devDependencies") or {}).keys())
            if any(x in deps for x in ["next", "nuxt", "react", "vue", "vite"]):
                summary["framework"].append("Node/Frontend")
        except Exception:
            pass

    # Routes (Laravel) — simple verb/uri extraction, no groups/closures.
    for rp in ["routes/web.php", "routes/api.php"]:
        p = repo_root / rp
        if p.exists():
            txt = _read_text_file(p) or ""
            for m in re.finditer(r"Route::(get|post|put|patch|delete)\s*\(\s*['\"]([^'\"]+)['\"]", txt):
                summary["routes"].append(f"{rp}: {m.group(1).upper()} {m.group(2)}")
    # DB hints (Laravel / vanilla PHP config files).
    for rp in ["config/database.php", ".env", ".env.example", "app/config/database.php"]:
        p = repo_root / rp
        if p.exists():
            txt = _read_text_file(p) or ""
            if "DB_" in txt or "mysql" in txt or "sqlite" in txt or "pgsql" in txt:
                # NOTE(review): the stored snippet may contain secrets from .env;
                # downstream formatting only shows the path, but the raw snippet
                # stays in this dict — confirm that is acceptable.
                snippet = txt[:800].replace("\r"," ")
                summary["db"].append(f"{rp}: {snippet}")
    # Well-known directories that exist in this checkout.
    for d in ["app", "app/admin", "app/public", "public", "resources", "storage", "config", "routes", "src", "docs", "tests"]:
        if (repo_root / d).exists():
            summary["notable_dirs"].append(d)
    return summary
|
|||
|
|
|
|||
|
|
def _format_stack_summary_text(s: dict) -> str:
|
|||
|
|
lines = []
|
|||
|
|
if s.get("framework"):
|
|||
|
|
lines.append("Frameworks (heuristiek): " + ", ".join(sorted(set(s["framework"]))))
|
|||
|
|
if s.get("languages"):
|
|||
|
|
langs = ", ".join([f"{k or '∅'}×{v}" for k,v in s["languages"].items()])
|
|||
|
|
lines.append("Talen (bestandext): " + langs)
|
|||
|
|
if s.get("notable_dirs"):
|
|||
|
|
lines.append("Mappen: " + ", ".join(s["notable_dirs"]))
|
|||
|
|
if s.get("entrypoints"):
|
|||
|
|
lines.append("Entrypoints: " + ", ".join(s["entrypoints"]))
|
|||
|
|
if s.get("routes"):
|
|||
|
|
sample = "; ".join(s["routes"][:8])
|
|||
|
|
lines.append("Routes (sample): " + sample)
|
|||
|
|
if s.get("db"):
|
|||
|
|
# toon alleen paden, geen volledige secrets
|
|||
|
|
lines.append("DB-config aanwezig in: " + ", ".join([d.split(":")[0] for d in s["db"]]))
|
|||
|
|
return "\n".join(lines)
|
|||
|
|
|
|||
|
|
def _collect_repo_context(repo_root: Path, owner_repo: Optional[str], branch: str, question: str, n_ctx: int = 8) -> list[dict]:
|
|||
|
|
"""Kies relevante paden + snippets via hybrid RAG/keywords, voor QA."""
|
|||
|
|
# Deze sync helper is bewust niet geïmplementeerd om misbruik te voorkomen.
|
|||
|
|
# Gebruik altijd de async-variant: _collect_repo_context_async(...)
|
|||
|
|
raise NotImplementedError("_collect_repo_context is niet beschikbaar; gebruik _collect_repo_context_async")
|
|||
|
|
all_files = list_repo_files(repo_root)
|
|||
|
|
# explicit paths uit vraag
|
|||
|
|
picked: List[str] = []
|
|||
|
|
for pth in extract_explicit_paths(question):
|
|||
|
|
if pth in all_files and pth not in picked:
|
|||
|
|
picked.append(pth)
|
|||
|
|
else:
|
|||
|
|
best = best_path_by_basename(all_files, pth)
|
|||
|
|
if best and best not in picked: picked.append(best)
|
|||
|
|
# hybrid rag
|
|||
|
|
loop = asyncio.get_event_loop()
|
|||
|
|
# NB: call hybrag via run_until_complete buiten async? we zitten al in async in hoofdhandler; hier helper sync → laat caller het async deel doen
|
|||
|
|
return [] # placeholder; deze helper niet direct gebruiken buiten async
|
|||
|
|
|
|||
|
|
async def _collect_repo_context_async(repo_root: Path, owner_repo: Optional[str], branch: str, question: str, n_ctx: int = 8) -> list[dict]:
    """Collect up to *n_ctx* relevant {path, snippet} dicts for a QA question.

    Selection order: explicit paths named in the question → DB-artefact seeds
    (only for DB-flavored questions) → hybrid RAG selection → keyword-search
    fallback. Snippets are focused around quotes/word hints from the question.
    """
    all_files = list_repo_files(repo_root)
    picked: List[str] = []
    # Paths the user literally mentioned; fall back to basename matching.
    for pth in extract_explicit_paths(question):
        if pth in all_files and pth not in picked:
            picked.append(pth)
        else:
            best = best_path_by_basename(all_files, pth)
            if best and best not in picked: picked.append(best)

    # DB questions: seed with known DB artefacts first so recall is good immediately.
    def _db_seed_paths() -> list[str]:
        prefer: list[str] = []
        # 1) direct, well-known locations
        for rel in [
            ".env", ".env.example", "config/database.php", "config/database.yml",
            "database/database.sqlite"
        ]:
            if (repo_root / rel).exists() and rel in all_files:
                prefer.append(rel)
        # 2) migrations / seeders / models
        for rel in all_files:
            lo = rel.lower()
            if lo.startswith("database/migrations/") or lo.startswith("database/seeders/"):
                prefer.append(rel)
            # NOTE(review): lo is lowercased, so the "app/Models/" tuple entry
            # can never match and is redundant (covered by "app/models/").
            elif lo.startswith(("app/models/", "app/model/", "app/Models/")) and lo.endswith(".php"):
                prefer.append(rel)
            elif lo.endswith(".sql"):
                prefer.append(rel)
        # 3) rough content heuristic: files containing Schema::/DB::/SQL keywords
        hits = []
        for rel in all_files:
            try:
                txt = _read_text_file(repo_root / rel) or ""
            except Exception:
                continue
            tlo = txt.lower()
            if any(x in tlo for x in ["schema::create(", "schema::table(", "db::table(", "db::select(", "select ", "insert into ", "create table "]):
                hits.append(rel)
        # dedupe (order-preserving) and cap at n_ctx
        seen = set(); out = []
        for rel in prefer + hits:
            if rel not in seen:
                seen.add(rel); out.append(rel)
            if len(out) >= n_ctx:
                break
        return out

    if _db_intent(question):
        for p in _db_seed_paths():
            if p in all_files and p not in picked:
                picked.append(p)

    # Hybrid RAG selection (vector + lexical), appended after explicit/seeded picks.
    hybrid = await hybrid_rag_select_paths(repo_root, owner_repo, branch, question, all_files, max_out=n_ctx)

    for p in hybrid:
        if p not in picked: picked.append(p)
    # Keyword fallback when we still have fewer than n_ctx paths.
    if len(picked) < n_ctx:
        for rel, _s in simple_keyword_search(repo_root, all_files, question, limit=n_ctx):
            if rel not in picked: picked.append(rel)
    # Build focused snippets for the final pick list.
    quotes = extract_quotes(question)
    hints = extract_word_hints(question)
    out = []
    for rel in picked[:n_ctx]:
        txt = _read_text_file(repo_root / rel) or ""
        snippet = extract_focus_snippets(txt, (quotes + hints)[:6], window=320, max_snippets=2)
        out.append({"path": rel, "snippet": snippet})
    return out
|
|||
|
|
|
|||
|
|
def _trim_text_to_tokens(text: str, max_tokens: int, tok_len=approx_token_count) -> str:
    """Clip *text* so tok_len(text) fits within max_tokens (≈4 chars/token heuristic)."""
    if tok_len(text) > max_tokens:
        # crude character slice based on the 4-chars-per-token approximation
        return text[:max(200, max_tokens * 4)]
    return text
|
|||
|
|
|
|||
|
|
def _jaccard_tokens(a: str, b: str) -> float:
|
|||
|
|
ta = set(re.findall(r"[A-Za-z0-9_]+", (a or "").lower()))
|
|||
|
|
tb = set(re.findall(r"[A-Za-z0-9_]+", (b or "").lower()))
|
|||
|
|
if not ta or not tb:
|
|||
|
|
return 0.0
|
|||
|
|
return len(ta & tb) / max(1, len(ta | tb))
|
|||
|
|
|
|||
|
|
def _db_intent(text: str) -> bool:
|
|||
|
|
"""Detecteer of de vraag over DB-verbindingen/schema/queries gaat."""
|
|||
|
|
t = (text or "").lower()
|
|||
|
|
keys = [
|
|||
|
|
"database", "sql", "microsoft sql", "ms sql", "mssql", "sql server",
|
|||
|
|
"schema", "tabel", "tabellen", "migratie", "migrations",
|
|||
|
|
"query", "queries", "select", "insert", "update", "delete",
|
|||
|
|
"db_", "connection string", "dsn", "driver", "host", "poort", "poortnummer",
|
|||
|
|
"database.php", ".env"
|
|||
|
|
]
|
|||
|
|
return any(k in t for k in keys)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _prepare_contexts_under_budget(
    contexts: List[dict],
    question: str,
    stack_summary_text: str,
    *,
    budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
    tok_len=approx_token_count
) -> List[dict]:
    """
    Smart token-budget allocator for QA context snippets:
    - dedup (exact path) & near-dedup (Jaccard on snippet text)
    - novelty weighting relative to already-chosen snippets
    - adaptive allocation with a min/max token budget per snippet
    Returns a new list of {path, snippet} dicts trimmed to fit *budget_tokens*.
    """
    if not contexts:
        return contexts

    # Tunables (defaults kept somewhat conservative):
    MIN_PER = int(os.getenv("QA_MIN_PER_SNIPPET", "180"))  # hard minimum tokens per snippet
    MAX_PER = int(os.getenv("QA_MAX_PER_SNIPPET", "900"))  # hard maximum tokens per snippet
    KEEP_TOP = int(os.getenv("QA_KEEP_TOP_K", "8"))  # cap on number of snippets
    NOVELTY_THRESH = float(os.getenv("QA_NOVELTY_DROP", "0.25"))  # below this novelty → heavily penalized
    DEDUP_THRESH = float(os.getenv("QA_DEDUP_JACCARD", "0.85"))  # near-identical overlap → drop

    # 0) cap the number of snippets up front (caller supplied them pre-ranked)
    contexts = contexts[:KEEP_TOP]

    # 1) brute dedup on path + near-dup on text (Jaccard)
    unique: List[dict] = []
    seen_paths = set()
    for c in contexts:
        p = c.get("path","")
        s = str(c.get("snippet",""))
        if p in seen_paths:
            continue
        # near-duplicate check against the snippets already kept
        is_dup = False
        for u in unique:
            if _jaccard_tokens(u["snippet"], s) >= DEDUP_THRESH:
                is_dup = True
                break
        if not is_dup:
            unique.append({"path": p, "snippet": s})
            seen_paths.add(p)
    contexts = unique

    if not contexts:
        return contexts

    # Overhead estimate: fixed headers + question + stack summary (mirrors
    # the prompt template built in _llm_qa_answer).
    header = (
        "Beantwoord de vraag over deze codebase. Wees concreet, kort, en noem bronnen (padnamen).\n"
        "Als zekerheid laag is, stel max 2 verduidelijkingsvragen.\n\n"
        f"VRAAG:\n{question}\n\n"
        f"REPO SAMENVATTING:\n{stack_summary_text or '(geen)'}\n\n"
        "RELEVANTE FRAGMENTEN:\n"
    )
    frag_headers = "\n\n".join([f"{i+1}) PATH: {c['path']}\nFRAGMENT:\n" for i, c in enumerate(contexts)])
    overhead_tokens = tok_len(header) + tok_len(frag_headers) + 200  # +200 safety margin

    # Tokens available for the actual snippet bodies (never below 300).
    remain = max(300, budget_tokens - overhead_tokens)
    n = len(contexts)

    # 2) "relevance proxy" = token overlap between question and snippet
    def rel(sn: str) -> float:
        return _jaccard_tokens(question, sn)

    # 3) greedy novelty: score each snippet higher for information not yet covered
    chosen_text = ""  # cumulative "coverage" text
    scores = []
    for i, c in enumerate(contexts):
        s = c["snippet"]
        r = rel(s)
        # novelty = 1 - overlap with the text already chosen
        nov = 1.0 - _jaccard_tokens(chosen_text, s) if chosen_text else 1.0
        # filter extremely low novelty: helps prune noise
        if nov < NOVELTY_THRESH and i > 0:
            # mark as weak; give it a very low score (may be dropped later)
            scores.append((i, r * 0.05, nov))
        else:
            # after 3 snippets, weigh novelty more heavily
            if i >= 3:
                scores.append((i, r * (0.35 + 0.65 * nov), nov))
            else:
                scores.append((i, r * (0.5 + 0.5 * nov), nov))
        # coarse coverage update: append (bounded) tokens to avoid drift
        if tok_len(chosen_text) < 4000:
            chosen_text += "\n" + s[:1200]

    # 4) if the sum of minima already exceeds the budget → cut the tail
    total_min = n * MIN_PER
    if total_min > remain:
        # sort by score descending, keep as many as fit at MIN_PER each
        ranked_idx = sorted(range(n), key=lambda i: scores[i][1], reverse=True)
        keep_idx = ranked_idx[: max(1, remain // MIN_PER)]
        contexts = [contexts[i] for i in keep_idx]
        scores = [scores[i] for i in keep_idx]
        n = len(keep_idx)

    # 5) distribute budget: everyone gets MIN_PER, remainder proportional to score; cap at MAX_PER
    base = n * MIN_PER
    extra = max(0, remain - base)
    # normalize score weights
    raw = [max(0.0, sc) for (_i, sc, _nov) in scores]
    ssum = sum(raw) or 1.0
    weights = [x / ssum for x in raw]

    alloc = [MIN_PER + int(extra * w) for w in weights]
    # enforce MAX_PER; redistribute the overshoot roughly
    overshoot = 0
    for i in range(n):
        if alloc[i] > MAX_PER:
            overshoot += alloc[i] - MAX_PER
            alloc[i] = MAX_PER
    if overshoot > 0:
        # hand the overshoot to snippets still below MAX_PER
        holes = [i for i in range(n) if alloc[i] < MAX_PER]
        if holes:
            plus = overshoot // len(holes)
            for i in holes:
                alloc[i] = min(MAX_PER, alloc[i] + plus)

    # 6) trim each snippet's text to its allocated token budget
    trimmed = []
    for i, c in enumerate(contexts):
        sn = str(c.get("snippet",""))
        sn = _trim_text_to_tokens(sn, alloc[i], tok_len)
        trimmed.append({"path": c["path"], "snippet": sn})
    return trimmed
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def _llm_qa_answer(question: str, stack_summary_text: str, contexts: List[dict]) -> str:
    """
    Ask the LLM for a concise answer with source references.
    - Answer in Dutch (enforced by the system prompt)
    - Cites file paths as sources
    - Asks at most 2 clarification questions when information is missing
    Returns the stripped message content ("" when the response is empty/malformed).
    """
    # Trim contexts to the configured token budget before prompting.
    contexts = _prepare_contexts_under_budget(
        contexts, question, stack_summary_text,
        budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
        tok_len=approx_token_count
    )

    ctx_blocks = []
    for i, c in enumerate(contexts, 1):
        # keep the 1200-char cap as a second safety net; _prepare_contexts_under_budget already trims
        ctx_blocks.append(f"{i}) PATH: {c['path']}\nFRAGMENT:\n{c['snippet'][:1200]}")
    USER = (
        "Beantwoord de vraag over deze codebase. Wees concreet, kort, en noem bronnen (padnamen).\n"
        "Als zekerheid laag is, stel max 2 verduidelijkingsvragen.\n\n"
        f"VRAAG:\n{question}\n\n"
        "REPO SAMENVATTING:\n" + (stack_summary_text or "(geen)") + "\n\n"
        "RELEVANTE FRAGMENTEN:\n" + ("\n\n".join(ctx_blocks) if ctx_blocks else "(geen)") + "\n\n"
        "FORMAT:\n"
        "- Antwoord (kort en feitelijk)\n"
        "- Bronnen: lijst van paden die je gebruikt hebt\n"
        "- (optioneel) Vervolgvragen als iets onduidelijk is\n"
    )
    resp = await _llm_call(
        [{"role":"system","content":"Je bent een zeer precieze, nuchtere code-assistent. Antwoord in het Nederlands."},
         {"role":"user","content": USER}],
        stream=False, temperature=0.2, top_p=0.9, max_tokens=900
    )
    # Defensive extraction: tolerate missing choices/message keys.
    return resp.get("choices",[{}])[0].get("message",{}).get("content","").strip()
|
|||
|
|
|
|||
|
|
# heuristics: iets kleinere chunks voor Laravel/Blade/Routes, anders iets groter
|
|||
|
|
def _chunk_params_for_repo(root: Path) -> tuple[int,int]:
|
|||
|
|
# simpele stack detectie:
|
|||
|
|
is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
|
|||
|
|
if is_laravel:
|
|||
|
|
return int(os.getenv("CHUNK_CHARS_LARAVEL","1800")), int(os.getenv("CHUNK_OVERLAP_LARAVEL","300"))
|
|||
|
|
return int(os.getenv("CHUNK_CHARS_DEFAULT","2600")), int(os.getenv("CHUNK_OVERLAP_DEFAULT","350"))
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- QA repo agent ----------
|
|||
|
|
async def repo_qa_answer(repo_hint: str, question: str, branch: str = "main", n_ctx: int = 8) -> str:
    """
    High-level QA over one specific repository:
    - resolve the repo hint + clone/update the checkout
    - (re)index the RAG collection
    - build a stack summary
    - collect relevant context snippets
    - produce an LLM answer with source references

    Returns either the answer text or a human-readable (Dutch) error message.
    """
    meta, _reason = resolve_repo(repo_hint)
    if not meta:
        # If the hint looks like owner/repo: check existence right away.
        if re.match(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$", repo_hint):
            owner, name = repo_hint.split("/", 1)
            if not gitea_repo_exists(owner, name):
                return f"Repo `{repo_hint}` niet gevonden of geen rechten. Controleer naam/URL/token."
        return f"Kon repo niet vinden voor hint: {repo_hint}"

    repo_url = meta.get("clone_url") or repo_hint
    owner_repo = meta.get("full_name")

    # clone/checkout (bounded by the global clone semaphore)
    try:
        async with _CLONE_SEMA:
            repo_path = await _call_get_git_repo(repo_url, branch)
    except Exception as e:
        # fallback: retry on 'master' when the requested branch fails
        branch = "master"
        try:
            async with _CLONE_SEMA:
                repo_path = await _call_get_git_repo(repo_url, branch)
        except Exception as e:
            return (f"Clonen mislukte voor `{owner_repo or repo_hint}`: {e}. "
                    "Controleer repo-naam/URL of je toegangsrechten.")
    root = Path(repo_path)

    # (re)index the RAG collection for this repo/branch
    collection = repo_collection_name(owner_repo, branch)
    chunk_chars, overlap = _chunk_params_for_repo(Path(repo_path))
    try:
        await _rag_index_repo_internal(
            repo_url=repo_url, branch=branch, profile="auto",
            include="", exclude_dirs="", chunk_chars=chunk_chars, overlap=overlap,
            collection_name=collection
        )
    except Exception as e:
        logger.warning("WARN:agent_repo:rag_index for QA failed (%s), fallback 'code_docs': %s", collection, e)
        collection = "code_docs"
        # NOTE(review): this fallback reindex is not guarded — if it raises,
        # the exception propagates to the caller. Confirm that is intended.
        await _rag_index_repo_internal(
            repo_url=repo_url, branch=branch, profile="auto",
            include="", exclude_dirs="", chunk_chars=chunk_chars, overlap=overlap,
            collection_name=collection
        )

    # heuristic stack summary for the prompt
    stack = _detect_stack_summary(root)
    stack_txt = _format_stack_summary_text(stack)

    # symbol index is best-effort; QA still works without it
    try:
        symbol_index_repo(root, owner_repo, branch)
    except Exception as e:
        logger.warning("WARN:agent_repo:symbol index build (QA) failed: %s", e)

    # collect context snippets
    contexts = await _collect_repo_context_async(root, owner_repo, branch, question, n_ctx=n_ctx)

    # final answer
    answer = await _llm_qa_answer(question, stack_txt, contexts)
    return answer
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- Dry-run voorstel ----------
|
|||
|
|
async def propose_patches_without_apply(repo_path: str, candidates: List[str], user_goal: str) -> Tuple[Dict[str,str], Dict[str,str], Dict[str,str]]:
    """Dry-run patch proposal: compute changes for *candidates* without applying them.

    For each candidate path, try progressively heavier strategies:
      0) targeted fallback-literal replacement (near ??, ?:, ||, 'or')
      1) HTML-scoped literal replacement when the prompt names tags
      3) generic quoted-literal replacement
      4) focused-snippet LLM edit plan (with anti-destruction guard)
      5) guarded full-file LLM rewrite (blocked for non-view/lang files on
         ui_label_change tasks)
    Finally, a syntax guard drops proposals that fail PHP lint / Blade balance.

    Returns:
        (proposed, diffs, reasons) — all keyed by repo-relative path;
        reasons[path] holds a short rationale for the chosen change.
    """
    proposed, diffs, reasons = {}, {}, {}
    root = Path(repo_path)
    quotes = extract_quotes(user_goal)
    hints = extract_word_hints(user_goal)
    # (removed: unused locals `token_steps` and `old_new` from the previous version)

    # Determine the task type locally (lightweight, single LLM call; framework heuristic).
    is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
    try:
        _route = await _llm_task_route(user_goal, framework=("laravel" if is_laravel else "generic"))
        _task_type = (_route.get("task_type") or "").lower()
    except Exception:
        _task_type = ""

    def _is_view_or_lang(path: str) -> bool:
        # Blade templates and translation files are the only safe targets for
        # full rewrites during UI-label tasks.
        return path.endswith(".blade.php") or path.startswith("resources/lang/")

    for rel in candidates:
        p = root / rel
        # If the path does not exist yet, try a create-proposal instead.
        if not p.exists():
            content, because = await propose_new_file(root, rel, user_goal)
            if content:
                proposed[rel] = content
                diffs[rel] = make_new_file_diff(rel, content, max_lines=300)
                reasons[rel] = because
            else:
                logger.info("INFO:agent_repo:no create-proposal for missing file %s", rel)
            continue

        try:
            original = _read_text_file(p)
        except Exception:
            original = ""
        if not original:
            logger.info("INFO:agent_repo:skip unreadable/empty %s", rel)
            continue

        # 0) Targeted, safe fallback-literal replace (only for old->new goals).
        old, new, why_pair = deduce_old_new_literals(user_goal, original)
        if old and new:
            tmp, ok, because = targeted_fallback_replace(original, old, new)
            if ok and tmp != original:
                # no anti-destruction check needed: minimal single replacement
                proposed[rel] = tmp
                diffs[rel] = make_diffs(original, tmp, rel, max_lines=200)
                reasons[rel] = f"{because}. ({why_pair})"
                continue

        # 1) HTML scope when the prompt names specific tags.
        ctx = extract_context_hints_from_prompt(user_goal)
        if old and new and ctx["tag_names"]:
            scoped, ok, because = html_scoped_literal_replace(original, old, new, ctx["tag_names"])
            if ok and scoped != original and not is_destructive(original, scoped, allow_destructive=False):
                proposed[rel] = scoped
                diffs[rel] = make_diffs(original, scoped, rel, max_lines=200)
                reasons[rel] = (because + (f" ({why_pair})" if why_pair else ""))
                continue

        # (step 2 — generic fallback-literal replace — is already covered by step 0)

        # 3) Generic quoted-literal replacement (language-agnostic, minimal).
        if old and new:
            qrep, ok, because = quoted_literal_replace(original, old, new, max_occurrences=2)
            if ok and qrep != original and not is_destructive(original, qrep, allow_destructive=False):
                proposed[rel] = qrep
                diffs[rel] = make_diffs(original, qrep, rel, max_lines=200)
                reasons[rel] = (because + (f" ({why_pair})" if why_pair else ""))
                continue

        # 4) Focus snippets + LLM edit plan.
        needles = []
        if quotes: needles += quotes
        if hints: needles += hints[:6]
        focus = extract_focus_snippets(original, needles, window=240, max_snippets=3)

        # Tree hint on by default: compact directory tree for the edit prompt.
        try:
            globals()["_LLM_EDIT_TREE_HINT"] = _make_local_tree_hint(root, rel, max_siblings=14)
        except Exception:
            globals()["_LLM_EDIT_TREE_HINT"] = ""
        plan = await llm_plan_edits_for_file(user_goal, rel, focus)
        if plan:
            patched, change_count, explains, allow_destructive = apply_edit_plan(original, plan)
            if change_count > 0 and patched.strip() != original.strip():
                if is_destructive(original, patched, allow_destructive):
                    logger.warning("WARN:agent_repo:destructive patch blocked for %s", rel)
                else:
                    proposed[rel] = patched
                    diffs[rel] = make_diffs(original, patched, rel, max_lines=200)
                    reasons[rel] = "LLM edit-plan: " + "; ".join(explains[:4])
                    continue

        # 5) Full-rewrite fallback (guarded).
        # For UI-label tasks we forbid full rewrites on non-view/lang files.
        if _task_type == "ui_label_change" and not _is_view_or_lang(rel):
            logger.info("INFO:agent_repo:skip full rewrite for non-view/lang during ui_label_change: %s", rel)
            continue
        last_err = None
        for mx in [1024]:
            try:
                messages = [
                    {"role":"system","content":"Voer exact de gevraagde wijziging uit. GEEN extra refactors/best practices. Lever de volledige, werkende bestandinformatie als 1 codeblok."},
                    {"role":"user","content": f"Doel:\n{user_goal}\n\nBestand ({rel}) huidige inhoud:\n```\n{original}\n```"}
                ]
                resp = await _llm_call(messages, stream=False, temperature=0.2, top_p=0.9, max_tokens=mx)
                newc = _extract_code_block(resp.get("choices",[{}])[0].get("message",{}).get("content","")) or original
                if newc.strip() != original.strip():
                    if is_destructive(original, newc, allow_destructive=False):
                        logger.warning("WARN:agent_repo:destructive rewrite blocked for %s (ratio>%.2f)", rel, AGENT_DESTRUCTIVE_RATIO)
                        break  # early exit: no further attempts
                    proposed[rel] = newc
                    diffs[rel] = make_diffs(original, newc, rel, max_lines=200)
                    reasons[rel] = "Full rewrite (guarded): minimale aanpassing om het doel te halen."
                    break
            except Exception as e:
                last_err = e
                logger.warning("WARN:agent_repo:LLM rewrite fail %s mx=%d: %s", rel, mx, repr(e))
        if rel not in proposed and last_err:
            logger.error("ERROR:agent_repo:give up on %s after retries: %s", rel, repr(last_err))

    # --- Syntax guard filtering (final step) ---
    # FIX: check .blade.php BEFORE the generic .php check; blade paths also end
    # in ".php", so the previous order made the Blade-balance branch unreachable.
    drop: List[str] = []
    for rel, content in proposed.items():
        try:
            if rel.endswith(".blade.php"):
                if not _blade_balance_ok(content):
                    reasons[rel] = (reasons.get(rel,"") + " [Blade balance failed]").strip()
                    drop.append(rel)
            elif rel.endswith(".php"):
                tmp = _write_tmp(content, ".php")
                ok = _php_lint_ok(tmp)
                try: tmp.unlink(missing_ok=True)
                except Exception: pass
                if not ok:
                    reasons[rel] = (reasons.get(rel,"") + " [PHP lint failed]").strip()
                    drop.append(rel)
        except Exception:
            # when in doubt: let the patch through (fail-open); logged upstream
            pass
    for rel in drop:
        proposed.pop(rel, None); diffs.pop(rel, None)
    return proposed, diffs, reasons
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ---------- Agent state ----------
|
|||
|
|
@dataclass
class AgentState:
    """Per-session state for the repo-agent conversation flow.

    Tracks the dialogue stage, the selected repository, retrieval
    artifacts and the proposed patch set for one chat session.
    """
    # Conversation stage machine: TRIAGE -> ASK -> CONFIRM_REPO -> SELECT_REPO -> ...
    stage: str = "TRIAGE"
    # Number of clarification questions asked so far.
    questions_asked: int = 0
    # User goal text (possibly LLM-refined).
    user_goal: str = ""
    # Free-form repo hint (owner/repo or clone URL) supplied by the user.
    repo_hint: str = ""
    # Metadata dict of the resolved repository, when known.
    selected_repo: Optional[dict] = None
    # Clone URL of the selected repository.
    repo_url: str = ""
    # Base branch to work from (module-level default).
    branch_base: str = AGENT_DEFAULT_BRANCH
    # Local filesystem path of the checked-out repository.
    repo_path: str = ""
    # "owner/repo" identifier, when known.
    owner_repo: Optional[str] = None
    # Name of the RAG collection used for this repo/branch.
    collection_name: str = ""
    # Candidate file paths selected for editing.
    candidate_paths: list[str] = field(default_factory=list)
    # Mapping rel-path -> proposed new file content.
    proposed_patches: dict[str, str] = field(default_factory=dict)
    # Mapping rel-path -> human-readable reason for the proposal.
    reasons: dict[str, str] = field(default_factory=dict)
    # Name of the branch that will receive the changes.
    new_branch: str = ""
    # When True, only preview changes (no writes/pushes).
    dry_run: bool = True
    # Repo discovery candidates (dicts with full_name/clone_url/score).
    repo_candidates: list[dict] = field(default_factory=list)
    # Cached "smart RAG" analysis preview text.
    smart_preview: str = ""
    # Whether an automatic recovery pass has already been attempted.
    recovery_attempted: bool = False
|
|||
|
|
|
|||
|
|
# --- bootstrap op echte repo-inhoud ------------------------------------------------
|
|||
|
|
async def _detect_repo_url(text: str) -> str | None:
|
|||
|
|
m = re.search(r"(https?://\S+?\.git)\b", text or "")
|
|||
|
|
return m.group(1) if m else None
|
|||
|
|
|
|||
|
|
async def _ensure_indexed(repo_url: str, *, branch: str = "main", profile: str = "auto",
|
|||
|
|
rag_index_repo_internal_fn=None, get_git_repo_fn=None):
|
|||
|
|
# clone/update (best-effort) om failures vroeg te vangen
|
|||
|
|
if get_git_repo_fn:
|
|||
|
|
try:
|
|||
|
|
loop = asyncio.get_running_loop()
|
|||
|
|
await loop.run_in_executor(None, get_git_repo_fn, repo_url, branch)
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
if rag_index_repo_internal_fn:
|
|||
|
|
await rag_index_repo_internal_fn(
|
|||
|
|
repo_url=repo_url, branch=branch, profile=profile,
|
|||
|
|
include="", exclude_dirs="",
|
|||
|
|
chunk_chars=int(os.getenv("RAG_CHUNK_CHARS","3000")),
|
|||
|
|
overlap=int(os.getenv("RAG_CHUNK_OVERLAP","400")),
|
|||
|
|
collection_name=os.getenv("RAG_COLLECTION","code_docs"),
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
async def _bootstrap_overview(repo_url: str, rag_query_internal_fn, *, collection="code_docs") -> str:
|
|||
|
|
"""Haalt echte passages op en maakt een compacte context."""
|
|||
|
|
# Bij per-repo collections is een extra repo-filter contraproductief.
|
|||
|
|
# Gebruik daarom repo=None zodra we een collection doorgeven.
|
|||
|
|
owner, name = owner_repo_from_url(repo_url)
|
|||
|
|
repo_full = f"{owner}/{name}" if (owner and name) else None
|
|||
|
|
wants = [
|
|||
|
|
{"q": "project overview readme", "path_contains": "README"},
|
|||
|
|
{"q": "install setup configuration", "path_contains": "README"},
|
|||
|
|
{"q": "composer dependencies autoload", "path_contains": "composer.json"},
|
|||
|
|
{"q": "npm dependencies scripts", "path_contains": "package.json"},
|
|||
|
|
{"q": "routes definitions", "path_contains": "routes"},
|
|||
|
|
{"q": "controllers overview", "path_contains": "app/Http/Controllers"},
|
|||
|
|
{"q": "views templates blade", "path_contains": "resources/views"},
|
|||
|
|
{"q": "env example", "path_contains": ".env"},
|
|||
|
|
]
|
|||
|
|
chunks = []
|
|||
|
|
for w in wants:
|
|||
|
|
res = await rag_query_internal_fn(
|
|||
|
|
query=w["q"], n_results=3,
|
|||
|
|
collection_name=collection, # per-repo collectie al gebruikt
|
|||
|
|
repo=None, # voorkom dubbele/te strikte scoping
|
|||
|
|
path_contains=w["path_contains"], profile=None
|
|||
|
|
)
|
|||
|
|
chunks.extend((res or {}).get("results", []))
|
|||
|
|
|
|||
|
|
seen = set(); buf = []
|
|||
|
|
for r in chunks[:18]:
|
|||
|
|
meta = r.get("metadata") or {}
|
|||
|
|
key = (meta.get("path",""), meta.get("chunk_index"))
|
|||
|
|
if key in seen:
|
|||
|
|
continue
|
|||
|
|
seen.add(key)
|
|||
|
|
body = (r.get("document") or "").strip()[:1200]
|
|||
|
|
buf.append(f"### {meta.get('path','')}\n{body}")
|
|||
|
|
return "\n\n".join(buf[:8]).strip()
|
|||
|
|
|
|||
|
|
def _extract_explicit_paths_robust(text: str) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
Haalt bestands-paden uit vrije tekst robuust op.
|
|||
|
|
Herkent tokens met minimaal één '/' en één '.' (extensie),
|
|||
|
|
negeert trailing leestekens.
|
|||
|
|
"""
|
|||
|
|
if not text:
|
|||
|
|
return []
|
|||
|
|
pats = re.findall(r"[A-Za-z0-9_./\\-]+\\.[A-Za-z0-9_.-]+", text)
|
|||
|
|
out = []
|
|||
|
|
for p in pats:
|
|||
|
|
# normaliseer Windows backslashes → unix
|
|||
|
|
p = p.replace("\\", "/")
|
|||
|
|
# strip algemene trailing chars
|
|||
|
|
p = p.strip().strip(",.;:)]}>'\"")
|
|||
|
|
if "/" in p and "." in p:
|
|||
|
|
out.append(p)
|
|||
|
|
# de-dup behoud volgorde
|
|||
|
|
seen = set(); uniq = []
|
|||
|
|
for p in out:
|
|||
|
|
if p not in seen:
|
|||
|
|
uniq.append(p); seen.add(p)
|
|||
|
|
return uniq
|
|||
|
|
|
|||
|
|
def _grep_repo_for_literal(root: Path, needle: str, limit: int = 12) -> list[str]:
    """Very fast, crude literal search over text files in the repo.

    Returns up to ``limit`` relative paths of files whose content contains
    ``needle``. Excluded directories, files over ~500 kB and non-text
    extensions (per ``allowed_file``) are skipped. Any unexpected error
    aborts the scan silently — this is a best-effort helper.
    """
    if not needle or len(needle) < 2:
        return []
    matches: list[str] = []
    try:
        for path in root.rglob("*"):
            if path.is_dir():
                continue
            # honour excluded directories
            if any(part in _PROFILE_EXCLUDE_DIRS for part in path.parts):
                continue
            # size cap: skip oversized or unstat-able files
            try:
                oversized = path.stat().st_size > 500_000
            except Exception:
                continue
            if oversized:
                continue
            # only text-like extensions according to allowed_file()
            if not allowed_file(path):
                continue
            # read as text: utf-8 first, latin-1 as best-effort fallback
            content = None
            for enc in ("utf-8", "latin-1"):
                try:
                    content = path.read_text(encoding=enc, errors="ignore")
                    break
                except Exception:
                    continue
            if content is None or needle not in content:
                continue
            try:
                rel = str(path.relative_to(root))
            except Exception:
                rel = str(path)
            matches.append(rel)
            if len(matches) >= limit:
                break
    except Exception:
        pass
    return matches
|
|||
|
|
|
|||
|
|
def _laravel_priors_from_prompt(user_goal: str, root: Path, all_files: list[str], max_k: int = 8) -> list[str]:
|
|||
|
|
"""
|
|||
|
|
Geef een lijst met waarschijnlijke Laravel-bestanden op basis van conventies + prompt-keywords.
|
|||
|
|
Neem ALLEEN paden op die daadwerkelijk bestaan in de repo (all_files).
|
|||
|
|
"""
|
|||
|
|
text = (user_goal or "").lower()
|
|||
|
|
exists = set(all_files)
|
|||
|
|
priors: list[str] = []
|
|||
|
|
|
|||
|
|
def add_if_present(paths: list[str]):
|
|||
|
|
for p in paths:
|
|||
|
|
if p in exists and p not in priors:
|
|||
|
|
priors.append(p)
|
|||
|
|
|
|||
|
|
# Altijd nuttige ankerpunten in Laravel repos
|
|||
|
|
add_if_present([
|
|||
|
|
"routes/web.php",
|
|||
|
|
"routes/api.php",
|
|||
|
|
"config/app.php",
|
|||
|
|
"config/database.php",
|
|||
|
|
".env",
|
|||
|
|
".env.example",
|
|||
|
|
"resources/lang/en.json",
|
|||
|
|
"resources/lang/nl.json",
|
|||
|
|
])
|
|||
|
|
|
|||
|
|
# Prompt-gestuurde hints
|
|||
|
|
if any(k in text for k in ("api ", "endpoint", "jwt", "sanctum", "api-route")):
|
|||
|
|
add_if_present(["routes/api.php"])
|
|||
|
|
if any(k in text for k in ("route", "router", "web", "pagina", "page", "url ")):
|
|||
|
|
add_if_present(["routes/web.php"])
|
|||
|
|
if any(k in text for k in ("controller", "actie", "action", "handler", "store(", "update(", "create(", "edit(")):
|
|||
|
|
# neem de meest voorkomende controllers-map mee
|
|||
|
|
# (geen directory listing; we kiezen alleen de indexerende anchor-files)
|
|||
|
|
for p in exists:
|
|||
|
|
if p.startswith("app/Http/Controllers/") and p.endswith(".php"):
|
|||
|
|
priors.append(p)
|
|||
|
|
if len(priors) >= max_k:
|
|||
|
|
break
|
|||
|
|
if any(k in text for k in ("view", "blade", "template", "pagina", "page", "formulier", "form")):
|
|||
|
|
# bekende view-locaties
|
|||
|
|
add_if_present([
|
|||
|
|
"resources/views/layouts/app.blade.php",
|
|||
|
|
"resources/views/welcome.blade.php",
|
|||
|
|
"resources/views/dashboard.blade.php",
|
|||
|
|
])
|
|||
|
|
# heuristisch: als prompt een padfragment noemt (b.v. 'log/create'), pak views daaronder
|
|||
|
|
m = re.search(r"resources/views/([A-Za-z0-9_/\-]+)/", user_goal)
|
|||
|
|
if m:
|
|||
|
|
base = f"resources/views/{m.group(1).strip('/')}/"
|
|||
|
|
for p in exists:
|
|||
|
|
if p.startswith(base) and p.endswith(".blade.php") and p not in priors:
|
|||
|
|
priors.append(p)
|
|||
|
|
if len(priors) >= max_k:
|
|||
|
|
break
|
|||
|
|
if any(k in text for k in ("validatie", "validation", "formrequest", "request class", "rules(")):
|
|||
|
|
# vaak custom FormRequest classes
|
|||
|
|
for p in exists:
|
|||
|
|
if p.startswith("app/Http/Requests/") and p.endswith(".php"):
|
|||
|
|
priors.append(p)
|
|||
|
|
if len(priors) >= max_k:
|
|||
|
|
break
|
|||
|
|
if any(k in text for k in ("database", "db", "sql", "sqlserver", "mssql", "mysql", "pgsql", "connection", "migratie", "migration", "schema")):
|
|||
|
|
add_if_present(["config/database.php", ".env", ".env.example"])
|
|||
|
|
# migrations en models zijn vaak relevant
|
|||
|
|
for p in exists:
|
|||
|
|
if (p.startswith("database/migrations/") and p.endswith(".php")) or \
|
|||
|
|
(p.startswith("app/Models/") and p.endswith(".php")):
|
|||
|
|
priors.append(p)
|
|||
|
|
if len(priors) >= max_k:
|
|||
|
|
break
|
|||
|
|
if any(k in text for k in ("taal", "language", "vertaling", "translation", "lang", "i18n")):
|
|||
|
|
# neem json én php lang packs mee
|
|||
|
|
for p in exists:
|
|||
|
|
if p.startswith("resources/lang/") and (p.endswith(".json") or p.endswith(".php")):
|
|||
|
|
priors.append(p)
|
|||
|
|
if len(priors) >= max_k:
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
# dedupe + cap
|
|||
|
|
uniq: list[str] = []
|
|||
|
|
seen = set()
|
|||
|
|
for p in priors:
|
|||
|
|
if p not in seen:
|
|||
|
|
uniq.append(p); seen.add(p)
|
|||
|
|
if len(uniq) >= max_k:
|
|||
|
|
break
|
|||
|
|
return uniq
|
|||
|
|
|
|||
|
|
async def _llm_framework_priors(user_goal: str, all_files: list[str], framework: str = "laravel", max_k: int = 10) -> list[str]:
    """
    Let the LLM propose likely EXISTING files/globs based on framework conventions.

    - The model output MUST be JSON: {"files":[...]} with relative paths or simple globs.
    - Results are filtered against really-existing paths (matched against
      ``all_files``); globs are allowed and expanded via fnmatch.
    - One small LLM call; any failure returns an empty list.
    """
    text = (user_goal or "").strip()
    if not text:
        return []
    # Modest token budget for the navigation call.
    sys = ("You are a precise code navigator. Output ONLY compact JSON with likely file paths for the task.\n"
           "Rules:\n- Return: {\"files\":[\"relative/path/or/glob\", ...]}\n"
           "- Use framework conventions (e.g., Laravel routes/controllers/views, config, .env, migrations, lang).\n"
           "- Do NOT invent files that cannot exist; prefer generic globs (e.g., resources/views/**/create*.blade.php).\n"
           "- No explanations, no prose.")
    usr = (f"Framework: {framework}\n"
           f"Task/prompt:\n{text}\n"
           "Return at most 15 items.\n"
           "Examples for Laravel (if applicable): routes/web.php, app/Http/Controllers/**.php, "
           "resources/views/**.blade.php, config/database.php, .env, database/migrations/**.php, resources/lang/**")
    try:
        resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":usr}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=300
        )
        raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","").strip()
    except Exception:
        return []
    # Strip any ```json fences by grabbing the first {...} span.
    m = re.search(r"\{[\s\S]*\}", raw)
    if not m:
        return []
    try:
        obj = json.loads(m.group(0))
    except Exception:
        return []
    items = obj.get("files") or []
    if not isinstance(items, list):
        return []
    # Glob -> concrete files; keep only paths that exist in the repo.
    exists = set(all_files)
    out: list[str] = []
    def _match(pat: str) -> list[str]:
        # simple glob: **, *, ?. Matched against all_files.
        try:
            pat_norm = pat.strip().lstrip("./")
            return [f for f in all_files if fnmatch.fnmatch(f, pat_norm)]
        except Exception:
            return []
    for it in items:
        if not isinstance(it, str) or not it.strip():
            continue
        it = it.strip().lstrip("./")
        if it in exists:
            # NOTE(review): this direct-path branch does not enforce max_k
            # here; the final slice below caps the result anyway.
            if it not in out:
                out.append(it)
        else:
            for hit in _match(it):
                if hit not in out:
                    out.append(hit)
                if len(out) >= max_k:
                    break
    return out[:max_k]
|
|||
|
|
|
|||
|
|
async def _llm_task_route(user_goal: str, framework: str = "laravel") -> dict:
    """
    Let the LLM explicitly choose: {task_type, categories[], hints[]}.

    Example task_type values:
    - "ui_label_change", "db_credentials", "db_queries", "routes_to_views",
      "config_env", "generic_code_change"
    categories: which folders/artifacts are relevant
      (e.g. ["views","controllers","routes","migrations","config",".env"])
    hints: short keywords or view/controller names.

    Returns {} for an empty goal; falls back to a generic route on any
    LLM/parse error.
    """
    if not (user_goal or "").strip():
        return {}
    sys = ("You are a precise task router. Return ONLY compact JSON.\n"
           "Schema: {\"task_type\":str, \"categories\":[str,...], \"hints\":[str,...]}\n"
           "Use framework conventions (e.g., Laravel). No explanations.")
    usr = f"Framework: {framework}\nUser goal:\n{user_goal}\nReturn at most 6 categories and 8 hints."
    try:
        resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":usr}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=250
        )
        raw = (resp.get('choices',[{}])[0].get('message',{}) or {}).get('content','')
        m = re.search(r"\{[\s\S]*\}", raw or "")
        obj = json.loads(m.group(0)) if m else {}
        # sanitize: clamp types and lengths of the LLM output
        obj["task_type"] = (obj.get("task_type") or "generic_code_change")[:64]
        obj["categories"] = [str(x)[:32] for x in (obj.get("categories") or [])][:8]
        obj["hints"] = [str(x)[:64] for x in (obj.get("hints") or [])][:8]
        return obj
    except Exception:
        return {"task_type":"generic_code_change","categories":[],"hints":[]}
|
|||
|
|
|
|||
|
|
# ---------- Hoofd-handler ----------
|
|||
|
|
async def handle_repo_agent(messages: List[dict], request) -> str:
|
|||
|
|
sid = _get_session_id(messages, request)
|
|||
|
|
st = _app.state.AGENT_SESSIONS.get(sid) or AgentState()
|
|||
|
|
_app.state.AGENT_SESSIONS[sid] = st
|
|||
|
|
user_last = next((m["content"] for m in reversed(messages) if m.get("role")=="user"), "").strip()
|
|||
|
|
user_last_lower = user_last.lower()
|
|||
|
|
logger.info("INFO:agent_repo:[%s] stage=%s", sid, st.stage)
|
|||
|
|
from smart_rag import enrich_intent, expand_queries, hybrid_retrieve
|
|||
|
|
# Als user een .git URL meegeeft: zet state en ga via de state-machine verder
|
|||
|
|
user_txt = next((m.get("content","") for m in reversed(messages) if m.get("role")=="user"), "")
|
|||
|
|
repo_url = await _detect_repo_url(user_txt)
|
|||
|
|
|
|||
|
|
if repo_url:
|
|||
|
|
st.repo_hint = repo_url
|
|||
|
|
st.stage = "SELECT_REPO"
|
|||
|
|
logger.info("INFO:agent_repo:[%s] direct SELECT_REPO via .git url: %s", sid, repo_url)
|
|||
|
|
# LET OP: geen vroegtijdige return hier; de SELECT_REPO tak hieronder handelt DISCOVER/INDEX etc. af.
|
|||
|
|
|
|||
|
|
|
|||
|
|
# === SMART-RAG: opt-in pad (alleen als er nog GEEN repo is) ===
|
|||
|
|
smart_enabled = str(os.getenv("REPO_AGENT_SMART","1")).lower() not in ("0","false")
|
|||
|
|
if smart_enabled and not st.repo_hint and st.stage in ("TRIAGE","ASK"):
|
|||
|
|
# 1) intent → plan
|
|||
|
|
spec = await enrich_intent(_llm_call, messages)
|
|||
|
|
task = spec.get("task","").strip()
|
|||
|
|
file_hints = spec.get("file_hints") or []
|
|||
|
|
keywords = spec.get("keywords") or []
|
|||
|
|
constraints= spec.get("constraints") or []
|
|||
|
|
acceptance = spec.get("acceptance") or []
|
|||
|
|
ask = spec.get("ask")
|
|||
|
|
|
|||
|
|
# 2) query expansion (kort) en hybride retrieval
|
|||
|
|
variants = await expand_queries(_llm_call, task, k=int(os.getenv("RAG_EXPAND_K","3")))
|
|||
|
|
merged: list[dict] = []
|
|||
|
|
for i, qv in enumerate(variants):
|
|||
|
|
partial = await hybrid_retrieve(
|
|||
|
|
_rag_query_internal,
|
|||
|
|
qv,
|
|||
|
|
repo= None,
|
|||
|
|
profile= None,
|
|||
|
|
path_contains=(file_hints[0] if file_hints else None),
|
|||
|
|
per_query_k=int(os.getenv("RAG_PER_QUERY_K","30")),
|
|||
|
|
n_results=int(os.getenv("RAG_N_RESULTS","18")),
|
|||
|
|
alpha=float(os.getenv("RAG_EMB_WEIGHT","0.6")),
|
|||
|
|
)
|
|||
|
|
merged.extend(partial)
|
|||
|
|
# dedupe op path+chunk
|
|||
|
|
seen = set(); uniq = []
|
|||
|
|
for r in sorted(merged, key=lambda x: x["score"], reverse=True):
|
|||
|
|
meta = r.get("metadata") or {}
|
|||
|
|
key = (meta.get("path",""), meta.get("chunk_index",""))
|
|||
|
|
if key in seen: continue
|
|||
|
|
seen.add(key); uniq.append(r)
|
|||
|
|
|
|||
|
|
# 3) context + confidence
|
|||
|
|
ctx_text, top_score = assemble_context(uniq, max_chars=int(os.getenv("REPO_AGENT_CONTEXT_CHARS","640000")))
|
|||
|
|
# heel simpele confidence: als top_score erg laag is en vragen toegestaan → stel 1 verhelderingsvraag
|
|||
|
|
if ask and float(os.getenv("REPO_AGENT_ASK_CLARIFY","1")) and top_score < float(os.getenv("REPO_AGENT_ASK_THRESHOLD","0.35")):
|
|||
|
|
return f"Snelle check: {ask}"
|
|||
|
|
|
|||
|
|
# 4) finale prompt samenstellen
|
|||
|
|
sys = (
|
|||
|
|
"Je bent een senior code-assistent. "
|
|||
|
|
"Lees de contextfragmenten (met padheaders). "
|
|||
|
|
"Beantwoord taakgericht, concreet en veilig. "
|
|||
|
|
"Als je verbeteringen doet, geef dan eerst een kort plan en daarna exacte, toepasbare wijzigingen."
|
|||
|
|
)
|
|||
|
|
user = (
|
|||
|
|
f"TAKEN:\n{task}\n\n"
|
|||
|
|
f"CONSTRAINTS: {', '.join(constraints) or '-'}\n"
|
|||
|
|
f"ACCEPTANCE: {', '.join(acceptance) or '-'}\n"
|
|||
|
|
f"KEYWORDS: {', '.join(keywords) or '-'}\n"
|
|||
|
|
f"FILE HINTS: {', '.join(file_hints) or '-'}\n\n"
|
|||
|
|
f"--- CONTEXT (gedeeltelijk) ---\n{ctx_text}\n--- EINDE CONTEXT ---\n\n"
|
|||
|
|
"Geef eerst een kort, puntsgewijs plan (max 6 bullets). "
|
|||
|
|
"Daarna de concrete wijzigingen per bestand met codeblokken. "
|
|||
|
|
"Geen herhaling van hele bestanden als dat niet nodig is."
|
|||
|
|
)
|
|||
|
|
llm_resp = await _llm_call(
|
|||
|
|
[{"role":"system","content":sys},{"role":"user","content":user}],
|
|||
|
|
stream=False, temperature=0.2, top_p=0.9, max_tokens=1536
|
|||
|
|
)
|
|||
|
|
out = (llm_resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
|
|||
|
|
if out.strip():
|
|||
|
|
# niet returnen — maar bijvoorbeeld loggen of meesturen als “quick analysis”
|
|||
|
|
st.smart_preview = out
|
|||
|
|
logger.info("SMART-RAG preview gemaakt (geen vroegtijdige exit)")
|
|||
|
|
# === /SMART-RAG ===
|
|||
|
|
|
|||
|
|
|
|||
|
|
if any(k in user_last_lower for k in ["dry-run","dryrun","preview"]): st.dry_run = True
|
|||
|
|
if "apply" in user_last_lower and ("akkoord" in user_last_lower or "ga door" in user_last_lower): st.dry_run = False
|
|||
|
|
|
|||
|
|
if st.stage == "TRIAGE":
|
|||
|
|
logger.info("Stage TRIAGE")
|
|||
|
|
st.user_goal = user_last
|
|||
|
|
# Optioneel: intent refine + verduidelijkingsvragen
|
|||
|
|
if AGENT_ENABLE_GOAL_REFINE and st.user_goal:
|
|||
|
|
try:
|
|||
|
|
refined, questions, conf = await llm_refine_goal(st.user_goal)
|
|||
|
|
if refined and refined != st.user_goal:
|
|||
|
|
st.user_goal = refined
|
|||
|
|
if questions and conf < AGENT_CLARIFY_THRESHOLD:
|
|||
|
|
st.stage = "ASK"
|
|||
|
|
qtxt = "\n".join([f"- {q}" for q in questions])
|
|||
|
|
return ("Om zeker de juiste bestanden te kiezen, beantwoord kort:\n" + qtxt)
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
st.stage = "ASK"
|
|||
|
|
base = ("Ik verken de code en doe een voorstel. Geef de repo (bv. `admin/image-viewing-website` of "
|
|||
|
|
"`http://localhost:3080/admin/image-viewing-website.git`). "
|
|||
|
|
"Of zeg: **'zoek repo'** als ik zelf moet zoeken.")
|
|||
|
|
return _with_preview(base, st)
|
|||
|
|
|
|||
|
|
if st.stage == "ASK":
|
|||
|
|
logger.info("Stage ASK ")
|
|||
|
|
# 1) check of er een repo-hint in de zin zit
|
|||
|
|
hint = None
|
|||
|
|
m = re.search(r"(https?://\S+)", user_last)
|
|||
|
|
if m: hint = m.group(1)
|
|||
|
|
elif "/" in user_last:
|
|||
|
|
for p in user_last.split():
|
|||
|
|
if re.match(r"^[A-Za-z0-9_.\-]+/[A-Za-z0-9_.\-]+$", p): hint = p; break
|
|||
|
|
# 2) Als expliciete vraag om repo te zoeken óf geen hint → auto-discovery
|
|||
|
|
if (not hint) and ("zoek repo" in user_last_lower):
|
|||
|
|
# Probeer auto-discovery
|
|||
|
|
st.repo_candidates = await discover_candidate_repos(st.user_goal)
|
|||
|
|
if not st.repo_candidates:
|
|||
|
|
st.questions_asked += 1
|
|||
|
|
return _with_preview("Ik kon geen repos vinden. Geef de Gitea repo (owner/repo) of volledige .git-URL.", st)
|
|||
|
|
# Normalize scores naar 0..1
|
|||
|
|
maxs = max((c.get("score",0.0) for c in st.repo_candidates), default=0.0) or 1.0
|
|||
|
|
for c in st.repo_candidates:
|
|||
|
|
c["score"] = min(1.0, c["score"]/maxs) if maxs else 0.0
|
|||
|
|
best = st.repo_candidates[0]
|
|||
|
|
# Als hoogste score duidelijk is, auto-select
|
|||
|
|
if best.get("score",0.0) >= AGENT_AUTOSELECT_THRESHOLD and best.get("clone_url"):
|
|||
|
|
st.repo_hint = best["clone_url"]
|
|||
|
|
st.stage = "SELECT_REPO"
|
|||
|
|
return _with_preview(f"Repo automatisch gekozen: **{best['full_name']}** (score {best['score']:.2f}).", st)
|
|||
|
|
# Anders: laat top-3 zien en vraag keuze
|
|||
|
|
st.stage = "CONFIRM_REPO"
|
|||
|
|
lines = []
|
|||
|
|
for i, c in enumerate(st.repo_candidates[:3], 1):
|
|||
|
|
lines.append(f"{i}. {c['full_name']} — score {c.get('score',0.0):.2f}")
|
|||
|
|
base = "Ik vond deze passende repos:\n" + "\n".join(lines) + "\nKies een nummer, of typ de naam/URL."
|
|||
|
|
return _with_preview(base, st)
|
|||
|
|
|
|||
|
|
# 3) Er is wel een hint - ga door
|
|||
|
|
if hint:
|
|||
|
|
st.repo_hint = hint
|
|||
|
|
st.stage = "SELECT_REPO"
|
|||
|
|
else:
|
|||
|
|
st.questions_asked += 1
|
|||
|
|
if st.questions_asked <= AGENT_MAX_QUESTIONS:
|
|||
|
|
return _with_preview("Graag de Gitea repo (owner/repo) of volledige .git-URL.", st)
|
|||
|
|
return _with_preview("Ik heb de repo-naam of URL nodig om verder te gaan.", st)
|
|||
|
|
|
|||
|
|
|
|||
|
|
if st.stage == "CONFIRM_REPO":
|
|||
|
|
logger.info("Stage CONFIRM_REPO")
|
|||
|
|
# parse keuze
|
|||
|
|
pick = None
|
|||
|
|
m = re.match(r"^\s*([1-5])\s*$", user_last)
|
|||
|
|
if m:
|
|||
|
|
idx = int(m.group(1)) - 1
|
|||
|
|
if 0 <= idx < len(st.repo_candidates):
|
|||
|
|
pick = st.repo_candidates[idx]
|
|||
|
|
if not pick:
|
|||
|
|
# probeer naam match
|
|||
|
|
for c in st.repo_candidates:
|
|||
|
|
if c["full_name"].lower() in user_last_lower or (c.get("clone_url","") and c["clone_url"] in user_last):
|
|||
|
|
pick = c; break
|
|||
|
|
if not pick:
|
|||
|
|
return _with_preview("Typ een nummer (1..3) of de naam/URL van de repo.", st)
|
|||
|
|
|
|||
|
|
st.repo_hint = pick.get("clone_url") or (f"{GITEA_URL}/{pick['full_name']}.git")
|
|||
|
|
st.stage = "SELECT_REPO"
|
|||
|
|
return _with_preview(f"Repo gekozen: **{pick['full_name']}**.", st)
|
|||
|
|
|
|||
|
|
if st.stage == "SELECT_REPO":
|
|||
|
|
logger.info("Stage SELECT_REPO")
|
|||
|
|
repo_meta, reason = resolve_repo(st.repo_hint)
|
|||
|
|
if not repo_meta:
|
|||
|
|
return (f"Geen repo gevonden voor “{st.repo_hint}”. Probeer volledige URL: {GITEA_URL}/<owner>/<repo>.git")
|
|||
|
|
st.selected_repo = repo_meta
|
|||
|
|
st.repo_url = repo_meta.get("clone_url") or ""
|
|||
|
|
st.owner_repo = repo_meta.get("full_name")
|
|||
|
|
if not st.repo_url:
|
|||
|
|
return f"Geen clone URL voor “{st.repo_hint}”."
|
|||
|
|
progress = [f"Repo ({reason}): {st.owner_repo or st.repo_url}"]
|
|||
|
|
|
|||
|
|
# DISCOVER
|
|||
|
|
logger.info("DISCOVER")
|
|||
|
|
try:
|
|||
|
|
try:
|
|||
|
|
st.repo_path = await _call_get_git_repo(st.repo_url, st.branch_base)
|
|||
|
|
except Exception as e_main:
|
|||
|
|
logger.warning("WARN:agent_repo:get_git_repo %s failed: %s; fallback master", st.branch_base, e_main)
|
|||
|
|
st.branch_base = "master"
|
|||
|
|
st.repo_path = await _call_get_git_repo(st.repo_url, st.branch_base)
|
|||
|
|
|
|||
|
|
|
|||
|
|
st.collection_name = repo_collection_name(st.owner_repo, st.branch_base)
|
|||
|
|
chunk_chars, overlap = _chunk_params_for_repo(Path(st.repo_path))
|
|||
|
|
|
|||
|
|
# ── Fast-path: check HEAD en sla index over als ongewijzigd ──
|
|||
|
|
try:
|
|||
|
|
import git
|
|||
|
|
head_sha = await run_in_threadpool(lambda: git.Repo(st.repo_path).head.commit.hexsha)
|
|||
|
|
except Exception:
|
|||
|
|
head_sha = ""
|
|||
|
|
#memo_key = f"{st.repo_url}|{st.branch_base}|{st.collection_name}"
|
|||
|
|
# ‘Brede’ key (repo+branch) voorkomt dubbele index runs bij dezelfde HEAD,
|
|||
|
|
# ook als collection_name varieert.
|
|||
|
|
memo_key = f"{st.repo_url}|{st.branch_base}"
|
|||
|
|
|
|||
|
|
if _INDEX_HEAD_MEMO.get(memo_key) == head_sha and head_sha:
|
|||
|
|
progress.append(f"Index overslaan: HEAD ongewijzigd ({head_sha[:7]}).")
|
|||
|
|
else:
|
|||
|
|
try:
|
|||
|
|
res = await _rag_index_repo_internal(
|
|||
|
|
repo_url=st.repo_url, branch=st.branch_base,
|
|||
|
|
profile="auto", include="", exclude_dirs="",
|
|||
|
|
chunk_chars=chunk_chars, overlap=overlap, collection_name=st.collection_name
|
|||
|
|
)
|
|||
|
|
# alleen updaten als index call succesvol was
|
|||
|
|
_INDEX_HEAD_MEMO[memo_key] = head_sha or _INDEX_HEAD_MEMO.get(memo_key, "")
|
|||
|
|
|
|||
|
|
if isinstance(res, dict) and res.get("status") == "skipped":
|
|||
|
|
progress.append(f"Index: skip (cache) — HEAD {head_sha[:7]}.")
|
|||
|
|
else:
|
|||
|
|
progress.append("Index: bijgewerkt.")
|
|||
|
|
except Exception as e_idx:
|
|||
|
|
logger.warning("WARN:agent_repo:rag index failed '%s': %s; fallback 'code_docs'", st.collection_name, e_idx)
|
|||
|
|
st.collection_name = "code_docs"
|
|||
|
|
res = await _rag_index_repo_internal(
|
|||
|
|
repo_url=st.repo_url, branch=st.branch_base,
|
|||
|
|
profile="auto", include="", exclude_dirs="",
|
|||
|
|
chunk_chars=chunk_chars, overlap=overlap, collection_name=st.collection_name
|
|||
|
|
)
|
|||
|
|
_INDEX_HEAD_MEMO[memo_key] = head_sha or _INDEX_HEAD_MEMO.get(memo_key, "")
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
# na succesvolle _rag_index_repo_internal(...) en meili/bm25:
|
|||
|
|
logger.info("Symbol index repo")
|
|||
|
|
try:
|
|||
|
|
symbol_index_repo(Path(st.repo_path), st.owner_repo, st.branch_base)
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.warning("WARN:agent_repo:symbol index build failed: %s", e)
|
|||
|
|
|
|||
|
|
|
|||
|
|
logger.info("Meili part")
|
|||
|
|
if MEILI_URL:
|
|||
|
|
try:
|
|||
|
|
# Skip Meili herindex als HEAD ongewijzigd
|
|||
|
|
if _MEILI_HEAD_MEMO.get(memo_key) == head_sha and head_sha:
|
|||
|
|
progress.append("Meili: overslaan (HEAD ongewijzigd).")
|
|||
|
|
else:
|
|||
|
|
await run_cpu_blocking(meili_index_repo, Path(st.repo_path), st.owner_repo, st.branch_base)
|
|||
|
|
_MEILI_HEAD_MEMO[memo_key] = head_sha or _MEILI_HEAD_MEMO.get(memo_key, "")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.warning("WARN:agent_repo:meili_index_repo failed: %s", e)
|
|||
|
|
else:
|
|||
|
|
try:
|
|||
|
|
if _BM25_HEAD_MEMO.get(memo_key) == head_sha and head_sha:
|
|||
|
|
progress.append("BM25: overslaan (HEAD ongewijzigd).")
|
|||
|
|
else:
|
|||
|
|
await run_cpu_blocking(bm25_build_index, Path(st.repo_path), st.owner_repo, st.branch_base)
|
|||
|
|
_BM25_HEAD_MEMO[memo_key] = head_sha or _BM25_HEAD_MEMO.get(memo_key, "")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.warning("WARN:agent_repo:bm25_build_index failed: %s", e)
|
|||
|
|
|
|||
|
|
|
|||
|
|
progress.append("DISCOVER klaar.")
|
|||
|
|
logger.info("DISCOVER klaar.")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.exception("ERROR:agent_repo:DISCOVER failed")
|
|||
|
|
st.stage = "ASK"
|
|||
|
|
return _with_preview("\n".join(progress + [f"DISCOVER mislukte: {e}"]), st)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# RANK via hybrid RAG
|
|||
|
|
logger.info("RANK via hybrid RAG")
|
|||
|
|
root = Path(st.repo_path)
|
|||
|
|
all_files = list_repo_files(root)
|
|||
|
|
# Precompute graph + tree (per HEAD) voor ranking-boost en explain
|
|||
|
|
graph = _get_graph_cached(root, memo_key=(f"{st.repo_url}|{st.branch_base}|{head_sha or ''}"))
|
|||
|
|
tree_summ = _get_tree_cached(root, memo_key=(f"{st.repo_url}|{st.branch_base}|{head_sha or ''}"), all_files=all_files)
|
|||
|
|
|
|||
|
|
|
|||
|
|
picked: List[str] = []
|
|||
|
|
# 1) expliciete paden uit de prompt (bestaande extractor)
|
|||
|
|
explicit = list(extract_explicit_paths(st.user_goal) or [])
|
|||
|
|
# 2) robuuste fallback extractor
|
|||
|
|
robust = _extract_explicit_paths_robust(st.user_goal)
|
|||
|
|
for pth in explicit + [p for p in robust if p not in explicit]:
|
|||
|
|
norm = pth.replace("\\", "/").strip()
|
|||
|
|
if norm in all_files and norm not in picked:
|
|||
|
|
picked.append(norm)
|
|||
|
|
continue
|
|||
|
|
best = best_path_by_basename(all_files, norm)
|
|||
|
|
if best and best not in picked:
|
|||
|
|
picked.append(best)
|
|||
|
|
continue
|
|||
|
|
# Als het niet bestaat: toch opnemen (voor create-flow)
|
|||
|
|
if norm not in picked:
|
|||
|
|
picked.append(norm)
|
|||
|
|
|
|||
|
|
# Laravel priors (alleen bestaande paden), vóór RAG
|
|||
|
|
try:
|
|||
|
|
is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
|
|||
|
|
except Exception:
|
|||
|
|
is_laravel = False
|
|||
|
|
if is_laravel:
|
|||
|
|
priors = _laravel_priors_from_prompt(st.user_goal, root, all_files, max_k=int(os.getenv("LARAVEL_PRIORS_K","8")))
|
|||
|
|
for p in priors:
|
|||
|
|
if p not in picked:
|
|||
|
|
picked.append(p)
|
|||
|
|
|
|||
|
|
# ---- LLM-PRIORS (optioneel via env, standaard aan) ----
|
|||
|
|
use_llm_priors = os.getenv("LLM_PRIORS_ENABLE", "1").lower() not in ("0","false","no")
|
|||
|
|
if use_llm_priors:
|
|||
|
|
try:
|
|||
|
|
# Hint framework adhv repo
|
|||
|
|
is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
|
|||
|
|
except Exception:
|
|||
|
|
is_laravel = False
|
|||
|
|
fw = "laravel" if is_laravel else "generic"
|
|||
|
|
llm_hits = await _llm_framework_priors(st.user_goal, all_files, framework=fw, max_k=int(os.getenv("LLM_PRIORS_K","12")))
|
|||
|
|
for p in llm_hits:
|
|||
|
|
if p not in picked:
|
|||
|
|
picked.append(p)
|
|||
|
|
|
|||
|
|
# ---- Rules fallback (alleen als nog mager) ----
|
|||
|
|
try:
|
|||
|
|
is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
|
|||
|
|
except Exception:
|
|||
|
|
is_laravel = False
|
|||
|
|
if is_laravel and len(picked) < max(4, int(os.getenv("LLM_PRIORS_MIN_BEFORE_RAG","4"))):
|
|||
|
|
priors = _laravel_priors_from_prompt(st.user_goal, root, all_files, max_k=int(os.getenv("LARAVEL_PRIORS_K","8")))
|
|||
|
|
for p in priors:
|
|||
|
|
if p not in picked:
|
|||
|
|
picked.append(p)
|
|||
|
|
|
|||
|
|
# --- LLM Task Router ---
|
|||
|
|
is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
|
|||
|
|
route = await _llm_task_route(st.user_goal, framework=("laravel" if is_laravel else "generic"))
|
|||
|
|
st.reasons["task_route"] = json.dumps(route, ensure_ascii=False)
|
|||
|
|
task_type = (route.get("task_type") or "").lower()
|
|||
|
|
|
|||
|
|
# --- LLM zoekpatronen → deterministische scan ---
|
|||
|
|
if os.getenv("LLM_PATTERN_SCAN","1").lower() not in ("0","false","no"):
|
|||
|
|
specs = await _llm_make_search_specs(st.user_goal, framework=("laravel" if is_laravel else "generic"))
|
|||
|
|
scan_hits = _scan_repo_for_patterns(root, all_files, specs, max_hits=int(os.getenv("LLM_PATTERN_MAX_HITS","24")))
|
|||
|
|
for f in scan_hits:
|
|||
|
|
if f not in picked:
|
|||
|
|
picked.append(f)
|
|||
|
|
|
|||
|
|
# --- VIEW/LANG bias voor UI-label wijzigingen ---
|
|||
|
|
if task_type == "ui_label_change":
|
|||
|
|
# Probeer de 'oude' literal uit de prompt te halen (voor gerichter filteren)
|
|||
|
|
try:
|
|||
|
|
old_lit, _new_lit, _why = deduce_old_new_literals(st.user_goal, "")
|
|||
|
|
except Exception:
|
|||
|
|
old_lit = None
|
|||
|
|
|
|||
|
|
def _contains_old(rel: str) -> bool:
|
|||
|
|
if not old_lit:
|
|||
|
|
return True
|
|||
|
|
try:
|
|||
|
|
txt = _read_text_file(Path(st.repo_path)/rel) or ""
|
|||
|
|
return old_lit in txt
|
|||
|
|
except Exception:
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
view_files = [f for f in all_files
|
|||
|
|
if f.startswith("resources/views/") and f.endswith(".blade.php")]
|
|||
|
|
lang_files = [f for f in all_files
|
|||
|
|
if f.startswith("resources/lang/") and (f.endswith(".json") or f.endswith(".php"))]
|
|||
|
|
|
|||
|
|
# Als we de oude literal kennen: eerst de files waar die echt in staat
|
|||
|
|
if old_lit:
|
|||
|
|
view_hits = [f for f in view_files if _contains_old(f)]
|
|||
|
|
lang_hits = [f for f in lang_files if _contains_old(f)]
|
|||
|
|
else:
|
|||
|
|
view_hits = view_files
|
|||
|
|
lang_hits = lang_files
|
|||
|
|
|
|||
|
|
# Zet de meest waarschijnlijke kandidaten vóóraan, behoud verder huidige volgorde
|
|||
|
|
front = []
|
|||
|
|
for lst in (view_hits, lang_hits):
|
|||
|
|
for f in lst:
|
|||
|
|
if f in all_files and f not in front:
|
|||
|
|
front.append(f)
|
|||
|
|
picked = list(dict.fromkeys(front + picked))[:MAX_FILES_DRYRUN]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# --- (optioneel) priors op basis van framework (je eerdere patch A/B) ---
|
|||
|
|
# LLM priors + rule-based priors kun je hier behouden zoals je eerder hebt toegevoegd.
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
# --- NIEUW: Smart-RAG path selectie op repo-collectie ---
|
|||
|
|
|
|||
|
|
# 1) intent (voor file_hints) + query-expansion
|
|||
|
|
logger.info("Smart RAG path select. 1) intent")
|
|||
|
|
spec = await enrich_intent(_llm_call, [{"role":"user","content": st.user_goal}])
|
|||
|
|
file_hints = (spec.get("file_hints") or [])
|
|||
|
|
variants = await expand_queries(_llm_call, spec.get("task") or st.user_goal, k=2)
|
|||
|
|
|
|||
|
|
# 2) retrieval per variant met repo-filter & collectie van deze repo
|
|||
|
|
logger.info("Smart RAG path select. 2) retrieval")
|
|||
|
|
merged = []
|
|||
|
|
for qv in variants:
|
|||
|
|
part = await hybrid_retrieve(
|
|||
|
|
_rag_query_internal,
|
|||
|
|
qv,
|
|||
|
|
repo=st.owner_repo, # <<< repo-scope
|
|||
|
|
profile=None,
|
|||
|
|
path_contains=(file_hints[0] if file_hints else None),
|
|||
|
|
per_query_k=int(os.getenv("RAG_PER_QUERY_K","30")),
|
|||
|
|
n_results=int(os.getenv("RAG_N_RESULTS","18")),
|
|||
|
|
alpha=float(os.getenv("RAG_EMB_WEIGHT","0.6")),
|
|||
|
|
collection_name=st.collection_name # <<< repo-collection
|
|||
|
|
)
|
|||
|
|
merged.extend(part)
|
|||
|
|
|
|||
|
|
# 3) naar unieke paden + sort op score
|
|||
|
|
logger.info("Smart RAG path select. 3) unieke paden sort op score")
|
|||
|
|
seen=set()
|
|||
|
|
for r in sorted(merged, key=lambda x: x.get("score",0.0), reverse=True):
|
|||
|
|
meta = r.get("metadata") or {}
|
|||
|
|
rel = meta.get("path","")
|
|||
|
|
if not rel or rel in seen:
|
|||
|
|
continue
|
|||
|
|
seen.add(rel)
|
|||
|
|
if rel not in picked:
|
|||
|
|
picked.append(rel)
|
|||
|
|
# 4) Laravel neighbors (klein zetje, opt-in via env)
|
|||
|
|
logger.info("Smart RAG path select. 4) Laravel neighbors")
|
|||
|
|
if os.getenv("RAG_NEIGHBORS", "1").lower() not in ("0","false"):
|
|||
|
|
add = []
|
|||
|
|
for rel in picked[:8]:
|
|||
|
|
# routes -> controllers
|
|||
|
|
if rel in ("routes/web.php","routes/api.php"):
|
|||
|
|
txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
|
|||
|
|
try:
|
|||
|
|
from app import _laravel_pairs_from_route_text # of waar je helper staat
|
|||
|
|
except Exception:
|
|||
|
|
_laravel_pairs_from_route_text = None
|
|||
|
|
if _laravel_pairs_from_route_text:
|
|||
|
|
for ctrl_path,_m in _laravel_pairs_from_route_text(txt):
|
|||
|
|
if ctrl_path and ctrl_path not in picked and ctrl_path not in add:
|
|||
|
|
add.append(ctrl_path)
|
|||
|
|
# controllers -> views
|
|||
|
|
if rel.startswith("app/Http/Controllers/") and rel.endswith(".php"):
|
|||
|
|
txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
|
|||
|
|
try:
|
|||
|
|
from app import _laravel_guess_view_paths_from_text
|
|||
|
|
except Exception:
|
|||
|
|
_laravel_guess_view_paths_from_text = None
|
|||
|
|
if _laravel_guess_view_paths_from_text:
|
|||
|
|
for v in _laravel_guess_view_paths_from_text(txt):
|
|||
|
|
if v and v not in picked and v not in add:
|
|||
|
|
add.append(v)
|
|||
|
|
# Extra: neem kleine nabije partials/layouts mee (zelfde dir, ≤40KB)
|
|||
|
|
more = []
|
|||
|
|
for rel in (picked + add)[:8]:
|
|||
|
|
if rel.endswith(".blade.php"):
|
|||
|
|
d = (Path(st.repo_path) / rel).parent
|
|||
|
|
try:
|
|||
|
|
for bp in d.glob("*.blade.php"):
|
|||
|
|
if bp.name == os.path.basename(rel):
|
|||
|
|
continue
|
|||
|
|
if bp.stat().st_size <= 40_000:
|
|||
|
|
cand = str(bp.relative_to(Path(st.repo_path)))
|
|||
|
|
if cand not in picked and cand not in add and cand not in more:
|
|||
|
|
more.append(cand)
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
picked = (picked + add + more)[:MAX_FILES_DRYRUN]
|
|||
|
|
# 5) Literal-grep fallback: als de user een oud->nieuw wijziging impliceert, zoek de 'old' literal repo-breed
|
|||
|
|
try:
|
|||
|
|
old, new, _why_pair = deduce_old_new_literals(st.user_goal, "")
|
|||
|
|
except Exception:
|
|||
|
|
old, new = None, None
|
|||
|
|
if old and isinstance(old, str) and old.strip():
|
|||
|
|
grep_hits = _grep_repo_for_literal(Path(st.repo_path), old.strip(), limit=16)
|
|||
|
|
for rel in grep_hits:
|
|||
|
|
if rel in all_files and rel not in picked:
|
|||
|
|
picked.append(rel)
|
|||
|
|
|
|||
|
|
# Keyword fallback alleen als we nog te weinig zeker zijn
|
|||
|
|
top_conf = 0.0
|
|||
|
|
try:
|
|||
|
|
top_conf = max([r.get("score",0.0) for r in merged]) if merged else 0.0
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
if len(picked) < MAX_FILES_DRYRUN and top_conf < float(os.getenv("RAG_FALLBACK_THRESHOLD","0.42")):
|
|||
|
|
|
|||
|
|
for rel, _s in simple_keyword_search(root, all_files, st.user_goal, limit=MAX_FILES_DRYRUN):
|
|||
|
|
if rel not in picked: picked.append(rel)
|
|||
|
|
# --- Gewogen her-ranking (Meili/embeddings/heuristiek/explicit) ---
|
|||
|
|
explicit_all = extract_explicit_paths(st.user_goal) + _extract_explicit_paths_robust(st.user_goal)
|
|||
|
|
explicit_all = [p.replace("\\","/").strip() for p in explicit_all]
|
|||
|
|
# 1) verzamel meili/embeddings scores vanuit 'merged'
|
|||
|
|
meili_scores = {}
|
|||
|
|
for r in merged:
|
|||
|
|
meta = (r or {}).get("metadata") or {}
|
|||
|
|
rel = meta.get("path","")
|
|||
|
|
if rel:
|
|||
|
|
try:
|
|||
|
|
sc = float(r.get("score", 0.0))
|
|||
|
|
except Exception:
|
|||
|
|
sc = 0.0
|
|||
|
|
meili_scores[rel] = max(meili_scores.get(rel, 0.0), sc)
|
|||
|
|
# 2) weeg en motiveer
|
|||
|
|
cand_scores = {}
|
|||
|
|
cand_why = {}
|
|||
|
|
def _boost(rel: str, amt: float, why: str):
|
|||
|
|
cand_scores[rel] = cand_scores.get(rel, 0.0) + float(amt)
|
|||
|
|
if amt > 0:
|
|||
|
|
cand_why[rel] = (cand_why.get(rel, "") + f"{why}; ").strip()
|
|||
|
|
for rel in picked:
|
|||
|
|
# Meili/embeddings top-hit
|
|||
|
|
if rel in meili_scores:
|
|||
|
|
_boost(rel, 0.55 * meili_scores[rel], "meili")
|
|||
|
|
# pad-heuristiek
|
|||
|
|
lo = rel.lower()
|
|||
|
|
if lo.startswith("routes/"): _boost(rel, 0.08, "routes")
|
|||
|
|
if lo.startswith("app/http/controllers/"): _boost(rel, 0.06, "controller")
|
|||
|
|
if lo.startswith("resources/views/"): _boost(rel, 0.06, "view")
|
|||
|
|
if lo.startswith("resources/lang/"): _boost(rel, 0.05, "lang")
|
|||
|
|
# expliciet genoemd door user
|
|||
|
|
if rel in explicit_all: _boost(rel, 0.20, "explicit")
|
|||
|
|
|
|||
|
|
# 2b) Graph-boost: BFS vanaf expliciete seeds (en evt. route-bestanden)
|
|||
|
|
try:
|
|||
|
|
seeds = [p for p in picked if p in explicit_all]
|
|||
|
|
# heuristisch: als gebruiker over "route" praat, neem routes/web.php als seed
|
|||
|
|
if any(k in st.user_goal.lower() for k in [" route", "routes", "/"]):
|
|||
|
|
for rp in ["routes/web.php","routes/api.php"]:
|
|||
|
|
if rp in picked and rp not in seeds:
|
|||
|
|
seeds.append(rp)
|
|||
|
|
if graph and seeds:
|
|||
|
|
bfs = _graph_bfs_boosts(graph, seeds, max_depth=int(os.getenv("AGENT_GRAPH_MAX_DEPTH","3")))
|
|||
|
|
for rel in picked:
|
|||
|
|
if rel in bfs:
|
|||
|
|
d, via = bfs[rel]
|
|||
|
|
# afstand → boost: 0:0.08, 1:0.06, 2:0.03, 3:0.01
|
|||
|
|
boost_map = {0:0.08, 1:0.06, 2:0.03, 3:0.01}
|
|||
|
|
b = boost_map.get(min(d,3), 0.0)
|
|||
|
|
if b > 0:
|
|||
|
|
_boost(rel, b, f"graph:d={d} via {via}")
|
|||
|
|
st.reasons[f"graph::{rel}"] = f"d={d}, via {via}"
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
# 2c) Tree-summary boost: hits van prompt-keywords in samenvatting
|
|||
|
|
try:
|
|||
|
|
hints = extract_word_hints(st.user_goal) or []
|
|||
|
|
if hints and tree_summ:
|
|||
|
|
lo_hints = [h.lower() for h in hints[:8]]
|
|||
|
|
for rel in picked:
|
|||
|
|
s = (tree_summ.get(rel) or "").lower()
|
|||
|
|
if not s:
|
|||
|
|
continue
|
|||
|
|
hits = sum(1 for h in lo_hints if h in s)
|
|||
|
|
if hits:
|
|||
|
|
_boost(rel, min(0.04, 0.01 * hits), f"tree:{hits}hit")
|
|||
|
|
if hits >= 2:
|
|||
|
|
st.reasons[f"tree::{rel}"] = tree_summ.get(rel, "")[:200]
|
|||
|
|
except Exception:
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
# 3) sorteer op totale score (desc)
|
|||
|
|
picked.sort(key=lambda p: cand_scores.get(p, 0.0), reverse=True)
|
|||
|
|
# 4) leg motivatie vast voor UI/preview
|
|||
|
|
for rel in picked[:MAX_FILES_DRYRUN]:
|
|||
|
|
if cand_scores.get(rel, 0.0) > 0:
|
|||
|
|
st.reasons[f"rank::{rel}"] = f"{cand_scores[rel]:.2f} via {cand_why.get(rel,'')}"
|
|||
|
|
st.candidate_paths = picked[:MAX_FILES_DRYRUN]
|
|||
|
|
logger.info("CANDIDATES (explicit first, capped=%d): %s", MAX_FILES_DRYRUN, st.candidate_paths)
|
|||
|
|
if not len(st.candidate_paths)>0:
|
|||
|
|
st.stage = "ASK"
|
|||
|
|
return _with_preview("\n".join(progress + ["Geen duidelijke kandidaten. Noem een pagina/onderdeel of (optioneel) bestandsnaam."]), st)
|
|||
|
|
|
|||
|
|
|
|||
|
|
progress.append("Kandidaten:\n" + "\n".join([f"- {rel}" for rel in st.candidate_paths]))
|
|||
|
|
logger.info("Kandidaten gevonden!")
|
|||
|
|
|
|||
|
|
# DRY-RUN
|
|||
|
|
logger.info("dry-run")
|
|||
|
|
try:
|
|||
|
|
proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
|
|||
|
|
if not proposed:
|
|||
|
|
# ---- T3: automatische recovery (éénmalig) ----
|
|||
|
|
if not st.recovery_attempted:
|
|||
|
|
st.recovery_attempted = True
|
|||
|
|
try:
|
|||
|
|
new_list, dbg = await _recovery_expand_candidates(
|
|||
|
|
Path(st.repo_path), list_repo_files(Path(st.repo_path)),
|
|||
|
|
st.user_goal, st.candidate_paths, last_reason="no_proposal_after_dryrun"
|
|||
|
|
)
|
|||
|
|
st.candidate_paths = new_list
|
|||
|
|
st.reasons["recovery_note"] = dbg.get("recovery_plan",{}).get("note","")
|
|||
|
|
# opnieuw proberen
|
|||
|
|
proposed2, diffs2, reasons2 = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
|
|||
|
|
if proposed2:
|
|||
|
|
st.proposed_patches = proposed2
|
|||
|
|
st.reasons.update(reasons2 or {})
|
|||
|
|
st.stage = "APPLY"
|
|||
|
|
preview = []
|
|||
|
|
for rel in list(diffs2.keys())[:3]:
|
|||
|
|
why = st.reasons.get(rel, "")
|
|||
|
|
preview.append(f"### {rel}\n```\n{diffs2[rel]}\n```\n**Waarom**: {why}")
|
|||
|
|
more = "" if len(diffs2) <= 3 else f"\n(Plus {len(diffs2)-3} extra diff(s).)"
|
|||
|
|
base = "\n".join(progress + [
|
|||
|
|
"**Dry-run voorstel (na recovery):**",
|
|||
|
|
"\n\n".join(preview) + more,
|
|||
|
|
"\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback."
|
|||
|
|
])
|
|||
|
|
return _with_preview(base, st, header="--- SMART-RAG + recovery notities ---")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.warning("WARN:agent_repo:recovery attempt failed: %s", e)
|
|||
|
|
# geen succes → val terug op bestaande melding
|
|||
|
|
st.stage = "PROPOSE_DIFF_DRYRUN"
|
|||
|
|
return "\n".join(progress + ["Dry-run: geen bruikbaar voorstel met deze kandidaten. Geef extra hint (pagina/ term)."])
|
|||
|
|
|
|||
|
|
st.proposed_patches = proposed
|
|||
|
|
st.reasons = reasons
|
|||
|
|
st.stage = "APPLY"
|
|||
|
|
preview = []
|
|||
|
|
for rel in list(diffs.keys())[:3]:
|
|||
|
|
why = reasons.get(rel, "")
|
|||
|
|
preview.append(f"### {rel}\n```\n{diffs[rel]}\n```\n**Waarom**: {why}")
|
|||
|
|
more = "" if len(diffs) <= 3 else f"\n(Plus {len(diffs)-3} extra diff(s).)"
|
|||
|
|
base= "\n".join(progress + [
|
|||
|
|
"**Dry-run voorstel (geen writes):**",
|
|||
|
|
"\n\n".join(preview) + more,
|
|||
|
|
"\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback."
|
|||
|
|
])
|
|||
|
|
return _with_preview(base, st, header="--- SMART-RAG contextnotities ---")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.exception("ERROR:agent_repo:PROPOSE_DIFF_DRYRUN failed")
|
|||
|
|
st.stage = "PROPOSE_DIFF_DRYRUN"
|
|||
|
|
return "\n".join(progress + [f"Dry-run mislukte: {e}"])
|
|||
|
|
|
|||
|
|
if st.stage == "PROPOSE_DIFF_DRYRUN":
|
|||
|
|
logger.info("Stage PROPOSE_DIFF_DRYRUN")
|
|||
|
|
root = Path(st.repo_path)
|
|||
|
|
all_files = list_repo_files(root)
|
|||
|
|
added = []
|
|||
|
|
for pth in extract_explicit_paths(user_last):
|
|||
|
|
if pth in all_files and pth not in st.candidate_paths:
|
|||
|
|
added.append(pth)
|
|||
|
|
else:
|
|||
|
|
best = best_path_by_basename(all_files, pth)
|
|||
|
|
if best and best not in st.candidate_paths: added.append(best)
|
|||
|
|
st.candidate_paths = (added + st.candidate_paths)[:MAX_FILES_DRYRUN]
|
|||
|
|
# extra: grep op 'old' literal uit user_goal om kandidaten te verrijken
|
|||
|
|
try:
|
|||
|
|
old, new, _why_pair = deduce_old_new_literals(st.user_goal, "")
|
|||
|
|
except Exception:
|
|||
|
|
old = None
|
|||
|
|
if old:
|
|||
|
|
for rel in _grep_repo_for_literal(root, old, limit=16):
|
|||
|
|
if rel in all_files and rel not in st.candidate_paths:
|
|||
|
|
st.candidate_paths.append(rel)
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
|
|||
|
|
if not proposed:
|
|||
|
|
if not st.recovery_attempted:
|
|||
|
|
st.recovery_attempted = True
|
|||
|
|
try:
|
|||
|
|
new_list, dbg = await _recovery_expand_candidates(
|
|||
|
|
Path(st.repo_path), list_repo_files(Path(st.repo_path)),
|
|||
|
|
st.user_goal, st.candidate_paths, last_reason="no_proposal_in_propose_diff"
|
|||
|
|
)
|
|||
|
|
st.candidate_paths = new_list
|
|||
|
|
st.reasons["recovery_note"] = dbg.get("recovery_plan",{}).get("note","")
|
|||
|
|
# direct nog een poging
|
|||
|
|
proposed2, diffs2, reasons2 = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
|
|||
|
|
if proposed2:
|
|||
|
|
st.proposed_patches = proposed2
|
|||
|
|
st.reasons.update(reasons2 or {})
|
|||
|
|
st.stage = "APPLY"
|
|||
|
|
preview = []
|
|||
|
|
for rel in list(diffs2.keys())[:3]:
|
|||
|
|
why = st.reasons.get(rel, "")
|
|||
|
|
preview.append(f"### {rel}\n```\n{diffs2[rel]}\n```\n**Waarom**: {why}")
|
|||
|
|
more = "" if len(diffs2) <= 3 else f"\n(Plus {len(diffs2)-3} extra diff(s).)"
|
|||
|
|
base = ("**Dry-run voorstel (na recovery):**\n" +
|
|||
|
|
"\n\n".join(preview) + more +
|
|||
|
|
"\n\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback.")
|
|||
|
|
return _with_preview(base, st, header="--- SMART-RAG + recovery notities ---")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.warning("WARN:agent_repo:recovery in PROPOSE_DIFF failed: %s", e)
|
|||
|
|
return _with_preview("Nog geen bruikbaar voorstel. Noem exact bestand/pagina of plak relevante code.", st)
|
|||
|
|
|
|||
|
|
st.proposed_patches = proposed
|
|||
|
|
st.reasons = reasons
|
|||
|
|
st.stage = "APPLY"
|
|||
|
|
preview = []
|
|||
|
|
for rel in list(diffs.keys())[:3]:
|
|||
|
|
why = reasons.get(rel, "")
|
|||
|
|
preview.append(f"### {rel}\n```\n{diffs[rel]}\n```\n**Waarom**: {why}")
|
|||
|
|
more = "" if len(diffs) <= 3 else f"\n(Plus {len(diffs)-3} extra diff(s).)"
|
|||
|
|
base = ("**Dry-run voorstel (geen writes):**\n" +
|
|||
|
|
"\n\n".join(preview) + more +
|
|||
|
|
"\n\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback.")
|
|||
|
|
return _with_preview(base, st, header="--- SMART-RAG contextnotities ---")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.exception("ERROR:agent_repo:PROPOSE_DIFF_DRYRUN retry failed")
|
|||
|
|
return _with_preview(f"Dry-run mislukte: {e}", st)
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _apply():
    """Write the approved dry-run patches to a new branch and push it.

    Acts only when the user's last message (pre-lowered by the caller)
    contains both 'akkoord' and 'apply'. On success the session stage
    becomes DONE; on any failure it falls back to PROPOSE_DIFF_DRYRUN so
    the user can retry. Returns a user-facing (Dutch) status message.
    """
    # Explicit confirmation gate: both keywords must be present.
    if ("akkoord" not in user_last_lower) or ("apply" not in user_last_lower):
        return "Typ **'Akkoord apply'** om de dry-run wijzigingen te schrijven & pushen."
    try:
        clone_dir = _get_git_repo(st.repo_url, st.branch_base)
        import git
        repo = git.Repo(clone_dir)

        # Branch name: slugified goal (capped at 40 chars) plus a timestamp
        # so repeated runs never collide.
        slug = re.sub(r'[^a-z0-9\-]+', '-', st.user_goal.lower()).strip("-")
        stamp = time.strftime('%Y%m%d-%H%M%S')
        st.new_branch = f"task/{slug[:40]}-{stamp}"
        repo.git.checkout("-b", st.new_branch)

        # Materialize every proposed patch on disk, creating parent
        # directories as needed.
        written = []
        for rel, content in st.proposed_patches.items():
            target = Path(clone_dir) / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(content, encoding="utf-8")
            written.append(str(target))
        if not written:
            return "Er waren geen wijzigingen om te commiten."

        repo.index.add(written)

        # Commit message: goal, touched files, then the collected per-file
        # rationale from the dry-run stage.
        scope_block = "\n".join(f"- {Path(p).relative_to(clone_dir)}" for p in written)
        rationale_block = "\n".join(f"- {k}: {v}" for k, v in st.reasons.items())
        msg = (
            f"feat: {st.user_goal}\n\nScope:\n"
            + scope_block
            + "\n\nRationale (samengevat):\n"
            + rationale_block
            + "\n\nCo-authored-by: repo-agent\n"
        )
        repo.index.commit(msg)
        repo.remotes.origin.push(refspec=f"{st.new_branch}:{st.new_branch}")
        st.stage = "DONE"
        return f"✅ Branch aangemaakt en gepusht: `{st.new_branch}`. Maak nu je PR in Gitea."
    except Exception as e:
        logger.exception("ERROR:agent_repo:APPLY failed")
        st.stage = "PROPOSE_DIFF_DRYRUN"
        return f"Apply/push mislukte: {e}"
|
|||
|
|
if st.stage == "APPLY":
|
|||
|
|
logger.info("Stage APPLY")
|
|||
|
|
return await run_in_threadpool(_apply)
|
|||
|
|
|
|||
|
|
if st.stage == "DONE":
|
|||
|
|
logger.info("Stage DONE")
|
|||
|
|
st.smart_preview = ""
|
|||
|
|
return f"Klaar. Branch: `{st.new_branch}`."
|
|||
|
|
return "Interne status onduidelijk; begin opnieuw of herformuleer je doel."
|
|||
|
|
|
|||
|
|
|