# agent_repo.py
# =====================================================================
# Hybrid RAG + LLM edit plans with: safe fallback, anti-destruction guard,
# and an EXPLICIT EXPLANATION per diff.
# =====================================================================
from __future__ import annotations
|
||
from smart_rag import enrich_intent, expand_queries, hybrid_retrieve, assemble_context
|
||
import os, re, time, uuid, difflib, hashlib, logging, json, fnmatch
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
from typing import Dict, List, Tuple, Optional, Any
|
||
from urllib.parse import urlparse, urlunparse
|
||
import requests
|
||
import base64
|
||
from windowing_utils import approx_token_count
|
||
from starlette.concurrency import run_in_threadpool
|
||
import asyncio
|
||
from collections import defaultdict
|
||
from llm_client import _llm_call
|
||
|
||
# --- Async I/O executors (prevent event-loop blocking) ---
from concurrent.futures import ThreadPoolExecutor

# Dedicated thread pools: one for blocking I/O, a smaller one for CPU-heavier
# work (e.g. index builds); sizes are tunable via environment variables.
_IO_POOL = ThreadPoolExecutor(max_workers=int(os.getenv("AGENT_IO_WORKERS", "8")))
_CPU_POOL = ThreadPoolExecutor(max_workers=int(os.getenv("AGENT_CPU_WORKERS", "2")))
# Caps the number of git clones running at the same time.
_CLONE_SEMA = asyncio.Semaphore(int(os.getenv("AGENT_MAX_CONCURRENT_CLONES", "2")))

# Selected vector backend: "CHROMA" (default) or "QDRANT".
BACKEND = (os.getenv("VECTOR_BACKEND") or "CHROMA").upper().strip()
|
||
|
||
# Matches file-like paths (optional ./ or ../, zero or more directory
# segments, then a filename with an extension) while skipping anything
# preceded by http:// or https://. The in-pattern comments are Dutch and are
# kept verbatim (re.VERBOSE ignores them; the literal is runtime behavior).
PATH_RE = re.compile(
    r'''
    (?<!http://)(?<!https://) # niet voorafgegaan door http:// of https://
    (?:^|(?<=\s)|(?<=[\'"\[])) # begin van string, whitespace of na ", ', [
    ( # ---------- capture group ----------
    (?:\.{1,2}/)? # optioneel ./ of ../
    (?:[\w.-]+/)* # 0 of meer map‑segmenten
    [\w.-]+\.[\w.-]+ # bestandsnaam + extensie
    )
    ''',
    re.VERBOSE | re.IGNORECASE,
)
|
||
|
||
# Debounce: remember the last-indexed HEAD per (repo_url|branch), in-memory only.
_INDEX_HEAD_MEMO: dict[str, str] = {}
_MEILI_HEAD_MEMO: dict[str, str] = {}
_BM25_HEAD_MEMO: dict[str, str] = {}

# Injection registry and optional collaborators; populated by initialize_agent().
DEF_INJECTS = {}
_search_candidates_fn = None
_repo_summary_get_fn = None
_meili_search_fn = None

# --- caches for graph and tree summaries (keyed per HEAD) ---
_GRAPH_CACHE: dict[str, dict[str, set[str]]] = {}
_TREE_SUM_CACHE: dict[str, dict[str, str]] = {}

# ---------------------------------------------------------
# Fast-path helpers: explicit paths + replacement pair (old->new)
# ---------------------------------------------------------
# Character class covering straight, "smart", and backtick quotes.
_Q = r"[\"'“”‘’`]"
# Laravel-centric path patterns (quoted and bare variants) used by the
# fast-path extractor below.
_PATH_PATS = [
    r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"”']",
    r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
    r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"”']",
    r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
]
# Laravel translation wrappers: __('...'), @lang('...'), trans('...').
_TRANS_WRAPPERS = [
    r"__\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
    r"@lang\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
    r"trans\(\s*{q}(.+?){q}\s*\)".format(q=_Q),
]
|
||
|
||
def _clean_repo_arg(x):
|
||
"""Zet lege/sentinel repo-waarden om naar None (geen filter)."""
|
||
if x is None:
|
||
return None
|
||
s = str(x).strip().lower()
|
||
return None if s in ("", "-", "none") else x
|
||
|
||
|
||
def _extract_repo_branch_from_text(txt: str) -> Tuple[Optional[str], str]:
|
||
repo_url, branch = None, "main"
|
||
m = re.search(r"\bRepo\s*:\s*(\S+)", txt, flags=re.I)
|
||
if m: repo_url = m.group(1).strip()
|
||
mb = re.search(r"\bbranch\s*:\s*([A-Za-z0-9._/-]+)", txt, flags=re.I)
|
||
if mb: branch = mb.group(1).strip()
|
||
return repo_url, branch
|
||
|
||
def _extract_explicit_paths(txt: str) -> List[str]:
    """Collect unique fast-path file paths matched by _PATH_PATS, preserving first-seen order."""
    seen: set = set()
    ordered: List[str] = []
    for pattern in _PATH_PATS:
        for match in re.finditer(pattern, txt):
            candidate = match.group(1)
            if candidate and candidate not in seen:
                seen.add(candidate)
                ordered.append(candidate)
    return ordered
|
||
|
||
def _extract_replace_pair(txt: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract an (old, new) replacement pair from NL/EN instructions such as
    'Vervang de tekst "a" ... in "b"' or 'Replace the text "a" with "b"'.
    Smart quotes are handled via the shared _Q quote class.

    Returns:
        (old, new) strings, or (None, None) when no pair is found.
    """
    # Specific NL/EN phrasings first (most precise).
    pats = [
        rf"Vervang\s+de\s+tekst\s*{_Q}(.+?){_Q}[^.\n]*?(?:in|naar|verander(?:en)?\s+in)\s*{_Q}(.+?){_Q}",
        rf"Replace(?:\s+the)?\s+text\s*{_Q}(.+?){_Q}\s*(?:to|with)\s*{_Q}(.+?){_Q}",
    ]
    for p in pats:
        m = re.search(p, txt, flags=re.I | re.S)
        if m:
            return m.group(1), m.group(2)
    # Loose fallback: 'Vervang'/'Replace' followed by two quoted fragments.
    # BUGFIX: the original built this pattern from non-raw strings containing
    # "[\s\S]*?", i.e. invalid escape sequences (SyntaxWarning on Python 3.12,
    # an error in the future). Raw strings keep the regex byte-identical.
    mm = re.search(
        r"(Vervang|Replace)[\s\S]*?" + _Q + r"(.+?)" + _Q + r"[\s\S]*?" + _Q + r"(.+?)" + _Q,
        txt, flags=re.I,
    )
    if mm:
        return mm.group(2), mm.group(3)
    return None, None
|
||
|
||
def _looks_like_unified_diff_request(txt: str) -> bool:
|
||
if re.search(r"\bunified\s+diff\b", txt, flags=re.I): return True
|
||
if re.search(r"\b(diff|patch)\b", txt, flags=re.I) and _extract_explicit_paths(txt):
|
||
return True
|
||
return False
|
||
|
||
# keep this close to the other module consts
async def _call_get_git_repo(repo_url: str, branch: str):
    """
    Safe dispatcher for the injected _get_git_repo: supports both sync and
    async implementations.
    """
    if not asyncio.iscoroutinefunction(_get_git_repo):
        # Synchronous implementation: run it on the I/O pool to keep the loop free.
        return await run_io_blocking(_get_git_repo, repo_url, branch)
    return await _get_git_repo(repo_url, branch)
|
||
|
||
|
||
async def run_io_blocking(func, *args, pool=None, **kwargs):
    """Run a blocking I/O callable on a thread pool so the event loop stays responsive."""
    executor = pool or _IO_POOL
    loop = asyncio.get_running_loop()
    call = lambda: func(*args, **kwargs)
    return await loop.run_in_executor(executor, call)
|
||
|
||
async def run_cpu_blocking(func, *args, pool=None, **kwargs):
    """Run CPU-heavier work (e.g. index builds) on the CPU thread pool."""
    executor = pool or _CPU_POOL
    loop = asyncio.get_running_loop()
    call = lambda: func(*args, **kwargs)
    return await loop.run_in_executor(executor, call)
|
||
|
||
# Lazy imports: only the configured VECTOR_BACKEND is imported, and any
# import failure degrades to None so this module stays importable without
# the optional dependency installed.
_chroma = None
_qdrant = None
_qdrant_models = None
try:
    if BACKEND == "CHROMA":
        import chromadb  # type: ignore
        _chroma = chromadb
except Exception:
    _chroma = None
try:
    if BACKEND == "QDRANT":
        from qdrant_client import QdrantClient  # type: ignore
        from qdrant_client.http.models import Filter, FieldCondition, MatchValue  # type: ignore
        _qdrant = QdrantClient
        _qdrant_models = (Filter, FieldCondition, MatchValue)
except Exception:
    _qdrant = None
    _qdrant_models = None


# Optional: BM25-based features are disabled when rank_bm25 is absent.
try:
    from rank_bm25 import BM25Okapi
except Exception:
    BM25Okapi = None
|
||
|
||
logger = logging.getLogger("agent_repo")

# ---------- Environment / Config ----------
# SECURITY NOTE(review): the hardcoded fallback credentials below
# (GITEA_TOKEN, MEILI_KEY, GITEA_HTTP_TOKEN) are committed secrets — they
# should be rotated and the defaults removed so only env vars are used.
GITEA_URL = os.environ.get("GITEA_URL", "http://localhost:3080").rstrip("/")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d")
GITEA_API = os.environ.get("GITEA_API", f"{GITEA_URL}/api/v1").rstrip("/")
AGENT_DEFAULT_BRANCH = os.environ.get("AGENT_DEFAULT_BRANCH", "main")
AGENT_MAX_QUESTIONS = int(os.environ.get("AGENT_MAX_QUESTIONS", "3"))
MAX_FILES_DRYRUN = int(os.environ.get("AGENT_MAX_FILES_DRYRUN", "27"))
RAG_TOPK = int(os.environ.get("AGENT_RAG_TOPK", "24"))  # a larger candidate pool helps the reranker
AGENT_DISCOVER_MAX_REPOS = int(os.environ.get("AGENT_DISCOVER_MAX_REPOS", "200"))
AGENT_AUTOSELECT_THRESHOLD = float(os.environ.get("AGENT_AUTOSELECT_THRESHOLD", "0.80"))  # 0..1
REPO_CATALOG_MEILI_INDEX = os.environ.get("REPO_CATALOG_MEILI_INDEX", "repo-catalog")
AGENT_ENABLE_GOAL_REFINE = os.environ.get("AGENT_ENABLE_GOAL_REFINE", "1").lower() in ("1","true","yes")
AGENT_CLARIFY_THRESHOLD = float(os.environ.get("AGENT_CLARIFY_THRESHOLD", "0.6"))


# Meilisearch (optional)
MEILI_URL = os.environ.get("MEILI_URL", "http://localhost:7700").strip()
MEILI_KEY = os.environ.get("MEILI_KEY", "0xipOmfgi_zMgdFplSdv7L8mlx0RPMQCNxVTNJc54lQ").strip()
MEILI_INDEX_PREFIX = os.environ.get("MEILI_INDEX_PREFIX", "code").strip()

# Optional: basic-auth injection for HTTP clones (private repos).
GITEA_HTTP_USER = os.environ.get("GITEA_HTTP_USER", "Mistral-llm")
GITEA_HTTP_TOKEN = os.environ.get("GITEA_HTTP_TOKEN", "8bdbe18dd2ec93ecbf9cd0a8f01a6eadf9cfa87d")

# No destructive edits: guard against removing (most of) a file's content.
AGENT_DESTRUCTIVE_RATIO = float(os.environ.get("AGENT_DESTRUCTIVE_RATIO", "0.50"))

# Only relevant code/text extensions (no binaries/caches).
ALLOWED_EXTS = {
    ".php",".blade.php",".vue",".js",".ts",".jsx",".tsx",".css",".scss",
    ".html",".htm",".json",".md",".ini",".cfg",".yml",".yaml",".toml",
    ".py",".go",".rb",".java",".cs",".txt"
}
# Directories always excluded from repo walks.
INTERNAL_EXCLUDE_DIRS = {
    ".git",".npm","node_modules","vendor","storage","dist","build",".next",
    "__pycache__",".venv","venv",".mypy_cache",".pytest_cache",
    "target","bin","obj","logs","cache","temp",".cache"
}
# path -> (timestamp, files): short-lived cache for repo file listings.
_LIST_FILES_CACHE: dict[str, tuple[float, List[str]]] = {}
|
||
# ---------- Injection from app.py ----------
# All of these are populated by initialize_agent(); None until then.
# NOTE(review): `_llm_call = None` here shadows the `from llm_client import
# _llm_call` at the top of the file — the injected/wrapped callable is the
# one actually used after initialization; confirm the top import is needed.
_app = None
_get_git_repo = None
_rag_index_repo_internal = None
_rag_query_internal = None
_llm_call = None
_extract_code_block = None
_read_text_file = None
_client_ip = None
_PROFILE_EXCLUDE_DIRS: set[str] = set()
_get_chroma_collection = None
_embed_query_fn = None
_embed_documents = None
||
|
||
# === SMART LLM WRAPPER: budget + clean ending + auto-continue ===
# Fits within the GPU cap (typically 13027 tokens of total context).
# Non-invasive: keeps the same response shape as _llm_call.

# Hard total-context cap of the Mistral-LLM docker deployment.
_MODEL_BUDGET = int(os.getenv("LLM_TOTAL_TOKEN_BUDGET", "13027"))
# Safety margin for headers/EOS/estimation error in the token estimate.
_BUDGET_SAFETY = int(os.getenv("LLM_BUDGET_SAFETY_TOKENS", "512"))
# Max number of follow-up calls when output looks truncated.
_MAX_AUTO_CONTINUES = int(os.getenv("LLM_MAX_AUTO_CONTINUES", "2"))
|
||
|
||
def _est_tokens(text: str) -> int:
|
||
# Ruwe schatting: ~4 chars/token (conservatief genoeg voor budgettering)
|
||
if not text: return 0
|
||
return max(1, len(text) // 4)
|
||
|
||
def _concat_messages_text(messages: list[dict]) -> str:
|
||
parts = []
|
||
for m in messages or []:
|
||
c = m.get("content")
|
||
if isinstance(c, str): parts.append(c)
|
||
return "\n".join(parts)
|
||
|
||
def _ends_neatly(s: str) -> bool:
|
||
if not s: return False
|
||
t = s.rstrip()
|
||
return t.endswith((".", "!", "?", "…", "”", "’"))
|
||
|
||
def _append_assistant_and_continue_prompt(base_messages: list[dict], prev_text: str) -> list[dict]:
|
||
"""
|
||
Bouw een minimale vervolgprompt zonder opnieuw de hele context te sturen.
|
||
Dit beperkt prompt_tokens en voorkomt dat we opnieuw de cap raken.
|
||
"""
|
||
tail_words = " ".join(prev_text.split()[-60:]) # laatste ±60 woorden als anker
|
||
cont_user = (
|
||
"Ga verder waar je stopte. Herhaal niets. "
|
||
"Vervolg direct de laatste zin met hetzelfde formaat.\n\n"
|
||
"Vorige woorden:\n" + tail_words
|
||
)
|
||
# We sturen *niet* de volledige history opnieuw; alleen een korte instructie
|
||
return [
|
||
{"role": "system", "content": "Vervolg exact en beknopt; geen herhaling van eerder gegenereerde tekst."},
|
||
{"role": "user", "content": cont_user},
|
||
]
|
||
|
||
def _merge_choice_text(resp_a: dict, resp_b: dict) -> dict:
|
||
"""
|
||
Plak de content van choices[0] aan elkaar zodat callsites één 'content' blijven lezen.
|
||
"""
|
||
a = (((resp_a or {}).get("choices") or [{}])[0].get("message") or {}).get("content","")
|
||
b = (((resp_b or {}).get("choices") or [{}])[0].get("message") or {}).get("content","")
|
||
merged = (a or "") + (b or "")
|
||
out = resp_a.copy()
|
||
if "choices" in out and out["choices"]:
|
||
out["choices"] = [{
|
||
"index": 0,
|
||
"finish_reason": "length" if (out.get("choices",[{}])[0].get("finish_reason") in (None, "length")) else out.get("choices",[{}])[0].get("finish_reason"),
|
||
"message": {"role":"assistant","content": merged}
|
||
}]
|
||
return out
|
||
|
||
# Example: Chroma client/init — replace with your own client
# from chromadb import Client
# chroma = Client(...)
|
||
def _build_where_filter(repo: Optional[str], path_contains: Optional[str], profile: Optional[str]) -> Dict[str, Any]:
|
||
"""
|
||
Bouw een simpele metadata-filter voor de vector-DB. Pas aan naar jouw DB.
|
||
"""
|
||
where: Dict[str, Any] = {}
|
||
if repo:
|
||
where["repo"] = repo
|
||
if profile:
|
||
where["profile"] = profile
|
||
if path_contains:
|
||
# Als je DB geen 'contains' ondersteunt: filter achteraf (post-filter)
|
||
where["path_contains"] = path_contains
|
||
return where
|
||
|
||
def _to_distance_from_similarity(x: Optional[float]) -> float:
|
||
"""
|
||
Converteer een 'similarity' (1=identiek, 0=ver weg) naar distance (lager = beter).
|
||
"""
|
||
if x is None:
|
||
return 1.0
|
||
try:
|
||
xv = float(x)
|
||
except Exception:
|
||
return 1.0
|
||
# Veiligheids-net: clamp
|
||
if xv > 1.0 or xv < 0.0:
|
||
# Sommige backends geven cosine distance al (0=identiek). Als >1, treat as distance passthrough.
|
||
return max(0.0, xv)
|
||
# Standaard: cosine similarity → distance
|
||
return 1.0 - xv
|
||
|
||
def _post_filter_path_contains(items: List[Dict[str,Any]], path_contains: Optional[str]) -> List[Dict[str,Any]]:
|
||
if not path_contains:
|
||
return items
|
||
key = (path_contains or "").lower()
|
||
out = []
|
||
for it in items:
|
||
p = ((it.get("metadata") or {}).get("path") or "").lower()
|
||
if key in p:
|
||
out.append(it)
|
||
return out
|
||
|
||
def _chroma_query(collection_name: str, query: str, n_results: int, where: Dict[str,Any]) -> Dict[str,Any]:
    """
    Query a Chroma collection and normalize hits into
    {"results": [{"document", "metadata", "distance"}, ...]}.

    Raises:
        RuntimeError: when the chromadb module is not available.
    """
    global _chroma
    if _chroma is None:
        raise RuntimeError("Chroma backend niet beschikbaar (module niet geïnstalleerd).")
    # Reuse the indexer's collection factory when injected, so the
    # version/suffix stays consistent with indexing.
    if _get_chroma_collection is not None:
        coll = _get_chroma_collection(collection_name)
    else:
        coll = _chroma.Client().get_or_create_collection(collection_name)
    # Chroma's 'where' only handles exact-match fields: keep repo/profile.
    exact_filter = {key: val for key, val in where.items() if key in ("repo", "profile")}
    qr = coll.query(
        query_texts=[query],
        n_results=max(1, n_results),
        where=exact_filter,
        include=["documents", "metadatas", "distances"],
    )
    docs = qr.get("documents", [[]])[0] or []
    metas = qr.get("metadatas", [[]])[0] or []
    dists = qr.get("distances", [[]])[0] or []
    # Chroma distances: lower = better; a missing distance falls back to 1.0.
    records: List[Dict[str, Any]] = [
        {
            "document": doc,
            "metadata": {
                "repo": meta.get("repo", ""),
                "path": meta.get("path", ""),
                "chunk_index": meta.get("chunk_index", 0),
                "symbols": meta.get("symbols", []),
                "profile": meta.get("profile", ""),
            },
            "distance": float(dist) if dist is not None else 1.0,
        }
        for doc, meta, dist in zip(docs, metas, dists)
    ]
    return {"results": records}
|
||
|
||
def _qdrant_query(collection_name: str, query: str, n_results: int, where: Dict[str,Any]) -> Dict[str,Any]:
    """
    Query a Qdrant collection and normalize hits into
    {"results": [{"document", "metadata", "distance"}, ...]}.

    NOTE: a client-side embedder is *also* needed here in general. This
    skeleton expects server-side search-by-text to be configured; otherwise
    the search raises and the caller's mock fallback takes over.

    Raises:
        RuntimeError: when the qdrant client is unavailable or search fails.
    """
    global _qdrant, _qdrant_models
    if _qdrant is None or _qdrant_models is None:
        raise RuntimeError("Qdrant backend niet beschikbaar (module niet geïnstalleerd).")
    Filter, FieldCondition, MatchValue = _qdrant_models
    # A fresh client per call; host/port come from the environment.
    client = _qdrant(host=os.getenv("QDRANT_HOST","localhost"), port=int(os.getenv("QDRANT_PORT","6333")))
    try:
        # Exact-match filters for repo/profile only.
        must: List[Any] = []
        if where.get("repo"):
            must.append(FieldCondition(key="repo", match=MatchValue(value=where["repo"])))
        if where.get("profile"):
            must.append(FieldCondition(key="profile", match=MatchValue(value=where["profile"])))
        flt = Filter(must=must) if must else None
        # NB: Qdrant 'score' is usually cosine similarity (high = good);
        # it is converted to a distance below.
        res = client.search(
            collection_name=collection_name,
            query=query,
            limit=max(1, n_results),
            query_filter=flt,
            with_payload=True,
        )
    except Exception as e:
        raise RuntimeError(f"Qdrant text search niet geconfigureerd: {e}")

    items: List[Dict[str,Any]] = []
    for p in res:
        meta = (p.payload or {})
        sim = getattr(p, "score", None)
        items.append({
            "document": meta.get("document",""),
            "metadata": {
                "repo": meta.get("repo",""),
                "path": meta.get("path",""),
                "chunk_index": meta.get("chunk_index", 0),
                "symbols": meta.get("symbols", []),
                "profile": meta.get("profile",""),
            },
            "distance": _to_distance_from_similarity(sim),
        })
    return {"results": items}
|
||
|
||
async def rag_query_internal_fn(
    *, query: str, n_results: int, collection_name: str,
    repo: Optional[str], path_contains: Optional[str], profile: Optional[str]
) -> Dict[str, Any]:
    """
    Adapter that searches the vector DB and returns *exactly* this shape:
    {
      "results": [
        {"document": str, "metadata": {...}, "distance": float}
      ]
    }
    Any backend failure falls back to a single mock result so the app stays
    usable without a DB connection.
    """
    # 1) Build the where/filter (backend-dependent; may be post-filtered below).
    where = _build_where_filter(repo, path_contains, profile)

    # 2) Route to the configured backend.
    try:
        if BACKEND == "CHROMA":
            res = _chroma_query(collection_name, query, n_results, where)
        elif BACKEND == "QDRANT":
            res = _qdrant_query(collection_name, query, n_results, where)
        else:
            raise RuntimeError(f"Onbekende VECTOR_BACKEND={BACKEND}")

    except Exception as e:
        # Mock fallback so the app remains usable.
        qr = {
            "documents": [["(mock) no DB connected"]],
            "metadatas": [[{"repo": repo or "", "path": "README.md", "chunk_index": 0, "symbols": []}]],
            "distances": [[0.99]],
        }
        docs = qr.get("documents", [[]])[0] or []
        metas = qr.get("metadatas", [[]])[0] or []
        dists = qr.get("distances", [[]])[0] or []

        items: List[Dict[str, Any]] = []
        for doc, meta, dist in zip(docs, metas, dists):
            # Post-filter on path_contains when the DB cannot do it itself.
            if path_contains:
                p = (meta.get("path") or "").lower()
                if (path_contains or "").lower() not in p:
                    continue
            items.append({
                "document": doc,
                "metadata": {
                    "repo": meta.get("repo",""),
                    "path": meta.get("path",""),
                    "chunk_index": meta.get("chunk_index", 0),
                    "symbols": meta.get("symbols", []),
                    "profile": meta.get("profile",""),
                },
                "distance": float(dist) if dist is not None else 1.0,
            })
        res = {"results": items[:max(1, n_results)]}
    # 3) Post-filter path_contains (no-op when the backend already filtered).
    res["results"] = _post_filter_path_contains(res.get("results", []), path_contains)
    # 4) Trim to the requested size.
    res["results"] = res.get("results", [])[:max(1, n_results)]
    return res
|
||
|
||
async def _smart_llm_call_base(
    llm_call_fn,
    messages: list[dict],
    *,
    stop: list[str] | None = None,
    max_tokens: int | None = None,
    temperature: float = 0.2,
    top_p: float = 0.9,
    stream: bool = False,
    **kwargs
):
    """
    Budget-aware wrapper around an LLM call:
    1) Clamps max_tokens so prompt + output stays within the total budget.
    2) Adds mild stop sequences for a clean ending.
    3) Auto-continues (up to _MAX_AUTO_CONTINUES) when output looks truncated.

    Returns the (possibly merged) response dict in the same shape llm_call_fn
    produces, so callsites keep reading a single choices[0] content.
    """
    # 1) Budget based on the current prompt size (rough char-based estimate).
    prompt_tokens = _est_tokens(_concat_messages_text(messages))
    room = max(128, _MODEL_BUDGET - prompt_tokens - _BUDGET_SAFETY)
    eff_max_tokens = max(1, min(int(max_tokens or 900), room))

    # 2) Mild stop sequences (deduped, order preserved; not restrictive for code).
    default_stops = ["\n\n", "###"]
    stops = list(dict.fromkeys((stop or []) + default_stops))

    # First call; retry without 'stop' when the backend rejects the kwarg.
    # NOTE(review): this also catches TypeErrors raised *inside* llm_call_fn.
    try:
        resp = await llm_call_fn(
            messages, stream=stream, temperature=temperature, top_p=top_p,
            max_tokens=eff_max_tokens, stop=stops, **kwargs
        )
    except TypeError:
        resp = await llm_call_fn(
            messages, stream=stream, temperature=temperature, top_p=top_p,
            max_tokens=eff_max_tokens, **kwargs
        )

    text = (((resp or {}).get("choices") or [{}])[0].get("message") or {}).get("content", "")
    # Heuristic: almost at the cap AND not ending neatly → probably truncated.
    near_cap = _est_tokens(text) >= int(0.92 * eff_max_tokens)
    needs_more = near_cap and not _ends_neatly(text)

    continues = 0
    merged = resp
    while needs_more and continues < _MAX_AUTO_CONTINUES:
        continues += 1
        cont_msgs = _append_assistant_and_continue_prompt(messages, text)
        # Recompute the budget for the follow-up (its prompt is much smaller).
        cont_prompt_tokens = _est_tokens(_concat_messages_text(cont_msgs))
        cont_room = max(128, _MODEL_BUDGET - cont_prompt_tokens - _BUDGET_SAFETY)
        cont_max = max(1, min(int(max_tokens or 900), cont_room))
        try:
            cont_resp = await llm_call_fn(
                cont_msgs, stream=False, temperature=temperature, top_p=top_p,
                max_tokens=cont_max, stop=stops, **kwargs
            )
        except TypeError:
            cont_resp = await llm_call_fn(
                cont_msgs, stream=False, temperature=temperature, top_p=top_p,
                max_tokens=cont_max, **kwargs
            )
        merged = _merge_choice_text(merged, cont_resp)
        text = (((merged or {}).get("choices") or [{}])[0].get("message") or {}).get("content", "")
        # BUGFIX: the original passed a *list* of words (text.split()[-800:])
        # into _est_tokens, which then measured len(list)//4 — i.e. word count,
        # not character length — making the near-cap check almost never fire.
        # Join the tail back into a string before estimating.
        tail = " ".join(text.split()[-800:])
        near_cap = _est_tokens(tail) >= int(0.9 * cont_max)
        needs_more = near_cap and not _ends_neatly(text)

    return merged
|
||
|
||
def initialize_agent(*, app, get_git_repo_fn, rag_index_repo_internal_fn, rag_query_internal_fn,
                     llm_call_fn, extract_code_block_fn, read_text_file_fn, client_ip_fn,
                     profile_exclude_dirs, chroma_get_collection_fn, embed_query_fn, embed_documents_fn,
                     search_candidates_fn=None, repo_summary_get_fn=None, meili_search_fn=None):
    """
    Inject app.py's collaborators into this module's globals.

    Must be called once at startup before any other function in this module.
    Wraps the injected LLM callable with the budget-aware auto-continue
    wrapper (_smart_llm_call_base) and ensures the per-app AGENT_SESSIONS
    store exists.
    """
    global DEF_INJECTS
    DEF_INJECTS.update({
        "app": app,
        "get_git_repo_fn": get_git_repo_fn,
        "rag_index_repo_internal_fn": rag_index_repo_internal_fn,
        "rag_query_internal_fn": rag_query_internal_fn,
        "llm_call_fn": llm_call_fn,
        "extract_code_block_fn": extract_code_block_fn,
        "read_text_file_fn": read_text_file_fn,
        "client_ip_fn": client_ip_fn,
        "profile_exclude_dirs": profile_exclude_dirs,
        "chroma_get_collection_fn": chroma_get_collection_fn,
        "embed_query_fn": embed_query_fn,
        "embed_documents_fn": embed_documents_fn,
    })
    global _search_candidates_fn, _repo_summary_get_fn, _meili_search_fn
    _search_candidates_fn = search_candidates_fn
    _repo_summary_get_fn = repo_summary_get_fn
    _meili_search_fn = meili_search_fn
    global _get_chroma_collection, _embed_query_fn
    global _app, _get_git_repo, _rag_index_repo_internal, _rag_query_internal, _llm_call
    # BUGFIX: `_embed_documents` was assigned below without a `global`
    # declaration, so the injection landed in a local variable and the module
    # global silently stayed None.
    global _extract_code_block, _read_text_file, _client_ip, _PROFILE_EXCLUDE_DIRS, _embed_documents
    _app = app
    _get_git_repo = get_git_repo_fn
    _rag_index_repo_internal = rag_index_repo_internal_fn
    _rag_query_internal = rag_query_internal_fn
    # Keep the original callable and wrap it with budget + auto-continue.
    _llm_call_original = llm_call_fn
    async def _wrapped_llm_call(messages, **kwargs):
        return await _smart_llm_call_base(_llm_call_original, messages, **kwargs)
    globals()["_llm_call"] = _wrapped_llm_call
    _extract_code_block = extract_code_block_fn
    _read_text_file = read_text_file_fn
    _client_ip = client_ip_fn
    _PROFILE_EXCLUDE_DIRS = set(profile_exclude_dirs) | INTERNAL_EXCLUDE_DIRS
    _get_chroma_collection = chroma_get_collection_fn
    _embed_query_fn = embed_query_fn
    _embed_documents = embed_documents_fn
    if not hasattr(_app.state, "AGENT_SESSIONS"):
        _app.state.AGENT_SESSIONS: Dict[str, AgentState] = {}
    logger.info("INFO:agent_repo:init GITEA_URL=%s GITEA_API=%s MEILI_URL=%s", GITEA_URL, GITEA_API, MEILI_URL or "-")
|
||
|
||
# ---------- Helpers ----------
|
||
def extract_explicit_paths(text: str) -> List[str]:
    """
    Robust path extractor:
    - ignores http/https URLs
    - requires at least one '/' and a file extension
    - deduplicates while preserving the original order
    """
    if not text:
        return []
    # Normalize "smart" quotes to plain quotes and backslashes to slashes
    # before matching.
    normalized = (
        (text or "")
        .replace("“", "\"")
        .replace("”", "\"")
        .replace("’", "'")
        .replace("\\", "/")
        .strip()
    )
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    unique = list(dict.fromkeys(PATH_RE.findall(normalized)))
    logger.info("EXPLICIT PATHS parsed: %s", unique)
    return unique
|
||
|
||
async def _llm_recovery_plan(user_goal: str, observed_candidates: list[str], last_reason: str = "") -> dict:
    """
    Ask the LLM for targeted recovery search patterns and keywords after we
    got 'no proposal'.
    Output JSON: { "patterns":[{"glob"|"regex": str},...], "keywords":[str,...], "note": str }
    All fields are sanitized and length-capped before being returned; on any
    failure an empty plan is returned.
    """
    sys = ("Return ONLY compact JSON. Schema:\n"
           "{\"patterns\":[{\"glob\":str}|{\"regex\":str},...],\"keywords\":[str,...],\"note\":str}\n"
           "Prefer Laravel-centric paths (resources/views/**.blade.php, routes/*.php, app/Http/Controllers/**.php, "
           "config/*.php, .env, database/migrations/**.php). Max 12 patterns, 8 keywords.")
    usr = (f"User goal:\n{user_goal}\n\n"
           f"Candidates we tried (may be irrelevant):\n{json.dumps(observed_candidates[-12:], ensure_ascii=False)}\n\n"
           f"Failure reason (if any): {last_reason or '(none)'}\n"
           "Propose minimal extra patterns/keywords to find the exact files.")
    try:
        resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":usr}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=280
        )
        raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
        # Grab the first {...} blob; the model sometimes wraps JSON in prose.
        m = re.search(r"\{[\s\S]*\}", raw or "")
        obj = json.loads(m.group(0)) if m else {}
    except Exception:
        obj = {}
    # Sanitize: cap pattern/keyword/note lengths and counts.
    pats = []
    for it in (obj.get("patterns") or []):
        if isinstance(it, dict):
            if "glob" in it and isinstance(it["glob"], str) and it["glob"].strip():
                pats.append({"glob": it["glob"].strip()[:200]})
            elif "regex" in it and isinstance(it["regex"], str) and it["regex"].strip():
                pats.append({"regex": it["regex"].strip()[:200]})
        if len(pats) >= 16: break
    kws = [str(x).strip()[:64] for x in (obj.get("keywords") or []) if str(x).strip()][:8]
    note = str(obj.get("note",""))[:400]
    return {"patterns": pats, "keywords": kws, "note": note}
|
||
|
||
def _extend_candidates_with_keywords(root: Path, all_files: list[str], keywords: list[str], cap: int = 24) -> list[str]:
    """
    Deterministic, lightweight keyword scan over repo files using the shared
    text loader; returns up to `cap` unique matching paths in file order.
    """
    needles = [k.lower() for k in keywords if k]
    if not needles:
        return []
    matches: list[str] = []
    for rel in all_files:
        if len(matches) >= cap:
            break
        try:
            content = _read_text_file(Path(root) / rel)
        except Exception:
            content = ""
        if not content:
            continue
        haystack = content.lower()
        if any(n in haystack for n in needles) and rel not in matches:
            matches.append(rel)
    return matches
|
||
|
||
async def _recovery_expand_candidates(root: Path, all_files: list[str], user_goal: str,
                                      current: list[str], *, last_reason: str = "") -> tuple[list[str], dict]:
    """
    Expand the candidate-file list after a failed attempt:
    1) ask the LLM for a recovery plan → patterns + keywords
    2) scan deterministically with _scan_repo_for_patterns
    3) keyword scan as a second track (within the remaining budget)
    Returns (new_candidate_list, debug_info); the list is capped at
    MAX_FILES_DRYRUN.
    """
    plan = await _llm_recovery_plan(user_goal, current, last_reason=last_reason)
    added: list[str] = []
    # patterns → scan
    if plan.get("patterns"):
        hits = _scan_repo_for_patterns(root, all_files, plan["patterns"], max_hits=int(os.getenv("LLM_RECOVERY_MAX_HITS","24")))
        for h in hits:
            if h not in current and h not in added:
                added.append(h)
    # keywords → scan (only up to the remaining hit budget)
    if len(added) < int(os.getenv("LLM_RECOVERY_MAX_HITS","24")) and plan.get("keywords"):
        khits = _extend_candidates_with_keywords(root, all_files, plan["keywords"],
                                                 cap=int(os.getenv("LLM_RECOVERY_MAX_HITS","24")) - len(added))
        for h in khits:
            if h not in current and h not in added:
                added.append(h)
    new_list = (current + added)[:MAX_FILES_DRYRUN]
    debug = {"recovery_plan": plan, "added": added[:12]}
    return new_list, debug
|
||
|
||
def _scan_repo_for_patterns(root: Path, all_files: list[str], patterns: list[dict], max_hits: int = 40) -> list[str]:
|
||
"""
|
||
patterns: [{"glob": "resources/views/**.blade.php"}, {"regex": "Truebeam\\s*foutcode"}, ...]
|
||
Retourneert unieke bestands-paden met 1+ hits. Deterministisch (geen LLM).
|
||
"""
|
||
hits: list[str] = []
|
||
seen: set[str] = set()
|
||
def _match_glob(pat: str) -> list[str]:
|
||
try:
|
||
pat = pat.strip().lstrip("./")
|
||
return [f for f in all_files if fnmatch.fnmatch(f, pat)]
|
||
except Exception:
|
||
return []
|
||
for spec in patterns or []:
|
||
if len(hits) >= max_hits: break
|
||
if "glob" in spec and isinstance(spec["glob"], str):
|
||
for f in _match_glob(spec["glob"]):
|
||
if f not in seen:
|
||
seen.add(f); hits.append(f)
|
||
if len(hits) >= max_hits: break
|
||
elif "regex" in spec and isinstance(spec["regex"], str):
|
||
try:
|
||
rx = re.compile(spec["regex"], re.I|re.M)
|
||
except Exception:
|
||
continue
|
||
for f in all_files:
|
||
if f in seen: continue
|
||
try:
|
||
txt = _read_text_file(Path(root)/f)
|
||
if rx.search(txt or ""):
|
||
seen.add(f); hits.append(f)
|
||
if len(hits) >= max_hits: break
|
||
except Exception:
|
||
continue
|
||
return hits
|
||
|
||
async def _llm_make_search_specs(user_goal: str, framework: str = "laravel") -> list[dict]:
    """
    Ask the LLM to propose globs/regexes. Output ONLY JSON:
    {patterns:[{glob|regex: str},...]}
    A deterministic scan with _scan_repo_for_patterns runs afterwards.
    Returns at most 16 sanitized, length-capped specs; empty list on any
    failure or empty goal.
    """
    if not (user_goal or "").strip():
        return []
    sys = ("Return ONLY JSON matching: {\"patterns\":[{\"glob\":str}|{\"regex\":str}, ...]}\n"
           "For Laravel, prefer globs like resources/views/**.blade.php, routes/*.php, app/Http/Controllers/**.php, "
           "config/*.php, .env, database/migrations/**.php. Keep regexes simple and safe.")
    usr = f"Framework: {framework}\nUser goal:\n{user_goal}\nReturn ≤ 12 items."
    try:
        resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":usr}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=280
        )
        raw = (resp.get('choices',[{}])[0].get('message',{}) or {}).get('content','')
        # Grab the first {...} blob; the model sometimes wraps JSON in prose.
        m = re.search(r"\{[\s\S]*\}", raw or "")
        obj = json.loads(m.group(0)) if m else {}
        arr = obj.get("patterns") or []
        out = []
        for it in arr:
            if isinstance(it, dict):
                if "glob" in it and isinstance(it["glob"], str) and it["glob"].strip():
                    out.append({"glob": it["glob"].strip()[:200]})
                elif "regex" in it and isinstance(it["regex"], str) and it["regex"].strip():
                    out.append({"regex": it["regex"].strip()[:200]})
            if len(out) >= 16: break
        return out
    except Exception:
        return []
|
||
|
||
def _with_preview(text: str, st: "AgentState", *, limit: int = 1200, header: str = "--- SMART-RAG quick scan (preview) ---") -> str:
    """Append a compact SMART-RAG preview to the answer, when the state carries one."""
    preview = (getattr(st, "smart_preview", "") or "").strip()
    if not preview:
        return text
    if 0 < limit < len(preview):
        preview = preview[:limit].rstrip() + "\n…"
    return f"{text}\n\n{header}\n{preview}"
|
||
|
||
|
||
def _now() -> int:
    """Return the current Unix timestamp, truncated to whole seconds."""
    seconds = time.time()
    return int(seconds)
|
||
|
||
def _gitea_headers():
    """Authorization header for the Gitea API; empty dict when no token is configured."""
    if not GITEA_TOKEN:
        return {}
    return {"Authorization": f"token {GITEA_TOKEN}"}
|
||
|
||
def add_auth_to_url(url: str, user: str | None = None, token: str | None = None) -> str:
|
||
if not url or not (user and token):
|
||
return url
|
||
u = urlparse(url)
|
||
if u.scheme not in ("http", "https") or "@" in u.netloc:
|
||
return url
|
||
netloc = f"{user}:{token}@{u.netloc}"
|
||
return urlunparse((u.scheme, netloc, u.path, u.params, u.query, u.fragment))
|
||
|
||
def ensure_git_suffix(url: str) -> str:
    """Append a ".git" suffix to a repo URL path when it is missing.

    API URLs (containing "/api/") and URLs already ending in ".git" pass
    through unchanged; any parse error also returns the input as-is.
    """
    try:
        parts = urlparse(url)
        needs_suffix = not parts.path.endswith(".git") and "/api/" not in parts.path
        if not needs_suffix:
            return url
        new_path = parts.path.rstrip("/") + ".git"
        return urlunparse(parts._replace(path=new_path))
    except Exception:
        return url
|
||
|
||
def parse_owner_repo(hint: str) -> tuple[str | None, str | None]:
|
||
m = re.match(r"^([A-Za-z0-9_.\-]+)/([A-Za-z0-9_.\-]+)$", (hint or "").strip())
|
||
if not m:
|
||
return None, None
|
||
return m.group(1), m.group(2)
|
||
|
||
def gitea_get_repo(owner: str, repo: str) -> dict | None:
    """Fetch repository metadata from the Gitea API.

    Returns the parsed JSON dict, or None when the repo does not exist
    (404) or any HTTP/parse error occurs (logged as a warning).
    """
    try:
        r = requests.get(f"{GITEA_API}/repos/{owner}/{repo}", headers=_gitea_headers(), timeout=10)
        if r.status_code == 404:
            return None
        r.raise_for_status()
        return r.json()
    except Exception as e:
        logger.warning("WARN:agent_repo:gitea_get_repo %s/%s failed: %s", owner, repo, e)
        return None
|
||
|
||
def gitea_search_repos(q: str, limit: int = 5) -> List[dict]:
    """Search Gitea for repositories matching *q* via /repos/search.

    Tolerates both response shapes seen in the wild: a bare list, or an
    envelope dict carrying the hits under "data". Any HTTP/parse error is
    logged and degrades to an empty list.
    """
    try:
        r = requests.get(f"{GITEA_API}/repos/search",
                         params={"q": q, "limit": limit},
                         headers=_gitea_headers(), timeout=10)
        r.raise_for_status()
        data = r.json() or {}
        if isinstance(data, dict) and "data" in data: return data["data"]
        if isinstance(data, list): return data
        # Fix: removed the former third branch checking {"ok": ..., "data": ...};
        # it was unreachable because the first check already returns for any
        # dict that has a "data" key.
        return []
    except Exception as e:
        logger.warning("WARN:agent_repo:/repos/search failed: %s", e)
        return []
|
||
|
||
def resolve_repo(hint: str) -> tuple[dict | None, str | None]:
    """Resolve a user-supplied repo hint to (metadata dict, strategy label).

    Strategies, tried in order:
      1. "direct-url"          — hint already is an http(s) URL.
      2. "owner-repo"          — "owner/repo" confirmed via the Gitea API.
      3. "owner-repo-fallback" — "owner/repo" assumed to live on GITEA_URL.
      4. "search"              — first hit of a Gitea repo search.
    Returns (None, "not-found") when everything fails. Every resolved
    clone URL gets basic-auth credentials and a ".git" suffix embedded.
    """
    hint = (hint or "").strip()
    logger.info("INFO:agent_repo:resolve_repo hint=%s", hint)
    if hint.startswith("http://") or hint.startswith("https://"):
        url = add_auth_to_url(ensure_git_suffix(hint), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
        owner, repo = owner_repo_from_url(url)
        rd = {"full_name": f"{owner}/{repo}" if owner and repo else None, "clone_url": url}
        logger.info("INFO:agent_repo:resolved direct-url %s", rd.get("full_name"))
        return rd, "direct-url"
    owner, repo = parse_owner_repo(hint)
    if owner and repo:
        meta = gitea_get_repo(owner, repo)
        if meta:
            url = meta.get("clone_url") or f"{GITEA_URL}/{owner}/{repo}.git"
            url = add_auth_to_url(ensure_git_suffix(url), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
            meta["clone_url"] = url
            logger.info("INFO:agent_repo:resolved owner-repo %s", meta.get("full_name"))
            return meta, "owner-repo"
        # API lookup failed: optimistically assume the repo exists at the canonical URL.
        url = add_auth_to_url(ensure_git_suffix(f"{GITEA_URL}/{owner}/{repo}.git"), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
        rd = {"full_name": f"{owner}/{repo}", "clone_url": url}
        logger.info("INFO:agent_repo:resolved owner-repo-fallback %s", rd.get("full_name"))
        return rd, "owner-repo-fallback"
    found = gitea_search_repos(hint, limit=5)
    if found:
        found[0]["clone_url"] = add_auth_to_url(ensure_git_suffix(found[0].get("clone_url") or ""), GITEA_HTTP_USER, GITEA_HTTP_TOKEN)
        logger.info("INFO:agent_repo:resolved search %s", found[0].get("full_name"))
        return found[0], "search"
    logger.error("ERROR:agent_repo:repo not found for hint=%s", hint)
    return None, "not-found"
|
||
|
||
def extract_context_hints_from_prompt(user_goal: str) -> dict:
    """Pull dynamic markup hints out of the prompt.

    Returns {"tag_names": set, "attr_names": set}: any HTML/XML tag names
    written as <tag>, and any well-known HTML attribute names mentioned
    (value, placeholder, title, aria-*, alt, label).
    """
    tag_names = {m.group(1).lower()
                 for m in re.finditer(r"<\s*([A-Za-z][A-Za-z0-9:_-]*)\s*>", user_goal)}
    attr_names = {m.group(1).lower()
                  for m in re.finditer(r"\b(value|placeholder|title|aria-[a-z-]+|alt|label)\b",
                                       user_goal, flags=re.IGNORECASE)}
    return {"tag_names": tag_names, "attr_names": attr_names}
|
||
|
||
def gitea_list_all_repos(limit: int = AGENT_DISCOVER_MAX_REPOS) -> List[dict]:
    """
    Fetch as many repositories as the configured token can see.

    Pages through /repos/search; on any error the results collected so far
    are kept (best effort). Each entry is normalized to a stable record
    (full_name/name/owner/description/language/topics/default_branch/
    clone_url, with auth credentials and ".git" embedded in the clone URL);
    entries without a full_name are dropped.
    """
    out = []
    page = 1
    per_page = 50
    try:
        while len(out) < limit:
            r = requests.get(
                f"{GITEA_API}/repos/search",
                params={"q":"", "limit": per_page, "page": page},
                headers=_gitea_headers(), timeout=10
            )
            r.raise_for_status()
            data = r.json()
            items = data.get("data") if isinstance(data, dict) else (data if isinstance(data, list) else [])
            if not items:
                break
            out.extend(items)
            if len(items) < per_page:
                # Short page → this was the last one.
                break
            page += 1
    except Exception as e:
        logger.warning("WARN:agent_repo:gitea_list_all_repos failed: %s", e)
    # Normalize fields to a consistent record shape.
    norm = []
    for it in out[:limit]:
        full = it.get("full_name") or (f"{it.get('owner',{}).get('login','')}/{it.get('name','')}".strip("/"))
        clone = it.get("clone_url") or (f"{GITEA_URL}/{full}.git" if full else None)
        default_branch = it.get("default_branch") or "main"
        norm.append({
            "full_name": full,
            "name": it.get("name"),
            "owner": (it.get("owner") or {}).get("login"),
            "description": it.get("description") or "",
            "language": it.get("language") or "",
            "topics": it.get("topics") or [],
            "default_branch": default_branch,
            "clone_url": add_auth_to_url(ensure_git_suffix(clone), GITEA_HTTP_USER, GITEA_HTTP_TOKEN) if clone else None,
        })
    return [n for n in norm if n.get("full_name")]
|
||
|
||
def gitea_fetch_readme(owner: str, repo: str, ref: str = "main") -> str:
    """Fetch a repo's README text via the Gitea API.

    Tries several endpoint variants (/readme plus common /contents/ paths),
    decodes base64 "content" payloads, and falls back to a "download_url"
    when present. Returns "" when nothing works.
    """
    candidates = [
        f"{GITEA_API}/repos/{owner}/{repo}/readme",
        f"{GITEA_API}/repos/{owner}/{repo}/contents/README.md",
        f"{GITEA_API}/repos/{owner}/{repo}/contents/README",
        f"{GITEA_API}/repos/{owner}/{repo}/contents/readme.md",
    ]
    for url in candidates:
        try:
            r = requests.get(url, params={"ref": ref}, headers=_gitea_headers(), timeout=10)
            if r.status_code == 404:
                continue
            r.raise_for_status()
            js = r.json()
            # Inline content is base64-encoded by the API.
            if isinstance(js, dict) and "content" in js:
                try:
                    return base64.b64decode(js["content"]).decode("utf-8", errors="ignore")
                except Exception:
                    pass
            # Some Gitea versions expose a 'download_url' instead.
            dl = js.get("download_url") if isinstance(js, dict) else None
            if dl:
                rr = requests.get(dl, timeout=10, headers=_gitea_headers())
                rr.raise_for_status()
                return rr.text
        except Exception:
            continue
    return ""
|
||
|
||
def gitea_repo_exists(owner: str, name: str) -> bool:
    """Check via the Gitea API whether owner/name exists (and the token can see it)."""
    url = f"{GITEA_API}/repos/{owner}/{name}"
    try:
        resp = requests.get(url, headers=_gitea_headers(), timeout=5)
    except Exception:
        return False
    return resp.status_code == 200
|
||
|
||
def owner_repo_from_url(url: str) -> tuple[str|None, str|None]:
|
||
"""
|
||
Probeer owner/repo uit een http(s) .git URL te halen.
|
||
Voorbeeld: http://host:3080/owner/repo.git -> ('owner', 'repo')
|
||
"""
|
||
try:
|
||
from urllib.parse import urlparse
|
||
p = urlparse(url)
|
||
parts = [x for x in (p.path or "").split("/") if x]
|
||
if len(parts) >= 2:
|
||
repo = parts[-1]
|
||
if repo.endswith(".git"):
|
||
repo = repo[:-4]
|
||
owner = parts[-2]
|
||
return owner, repo
|
||
except Exception:
|
||
pass
|
||
return None, None
|
||
|
||
|
||
# === Index the repo catalog into Meili (optional) and Chroma ===
|
||
def meili_get_index(name: str):
    """Get (or lazily create) the Meili index *name*; None when Meili is unavailable."""
    cli = get_meili()
    if not cli: return None
    try:
        return cli.index(name)
    except Exception:
        # Getting a handle failed → try creating the index instead.
        try:
            return cli.create_index(uid=name, options={"primaryKey":"id"})
        except Exception:
            return None
|
||
|
||
def meili_catalog_upsert(docs: List[dict]):
    """Best-effort upsert of repo-catalog documents into Meili, plus attribute config."""
    idx = meili_get_index(REPO_CATALOG_MEILI_INDEX)
    if not idx or not docs: return
    try:
        idx.add_documents(docs)
        try:
            # Attribute settings are idempotent; failures here are non-fatal.
            idx.update_searchable_attributes(["full_name","name","description","readme","topics","language"])
            idx.update_filterable_attributes(["full_name","owner","language","topics"])
        except Exception:
            pass
    except Exception as e:
        logger.warning("WARN:agent_repo:meili_catalog_upsert: %s", e)
|
||
|
||
def meili_catalog_search(q: str, limit: int = 10) -> List[dict]:
    """Full-text search over the repo catalog index; [] on error or when unavailable."""
    idx = meili_get_index(REPO_CATALOG_MEILI_INDEX)
    if not idx: return []
    try:
        res = idx.search(q, {"limit": limit})
        # TODO(review): an injected meili_search_fn (with a repo/branch filter)
        # was sketched here previously — confirm whether catalog searches
        # should be routed through that injection instead of the raw client.
        return res.get("hits", [])
    except Exception as e:
        logger.warning("WARN:agent_repo:meili_catalog_search: %s", e)
        return []
|
||
|
||
def chroma_catalog_upsert(docs: List[dict]):
    """Upsert the repo catalog into Chroma via the INJECTED embedding helpers.

    In HTTP mode the embeddings are computed client-side (_embed_documents)
    and sent along; without an embedder, Chroma's own embedding_function is
    relied on. Best-effort: all failures are logged, never raised.
    """
    try:
        if not docs or _get_chroma_collection is None:
            return
        col = _get_chroma_collection("repo_catalog")  # name is suffixed in app.py with __<slug>__v<ver>
        ids = [d["id"] for d in docs]
        texts = [d["doc"] for d in docs]
        metas = [d["meta"] for d in docs]
        # Remove stale copies first, best-effort.
        try:
            col.delete(ids=ids)
        except Exception:
            pass
        if _embed_documents:
            embs = _embed_documents(texts)
            col.add(ids=ids, documents=texts, embeddings=embs, metadatas=metas)
        else:
            col.add(ids=ids, documents=texts, metadatas=metas)
    except Exception as e:
        logger.warning("WARN:agent_repo:chroma_catalog_upsert: %s", e)
|
||
|
||
def chroma_catalog_search(q: str, n: int = 8) -> List[dict]:
    """Semantic search over the repo catalog in Chroma.

    Embeds *q* with the injected _embed_query_fn and returns
    [{"full_name", "score", "preview"}, ...]; [] when the Chroma helpers
    are not injected or on any error.
    """
    try:
        if _get_chroma_collection is None or _embed_query_fn is None:
            return []
        col = _get_chroma_collection("repo_catalog")
        q_emb = _embed_query_fn(q)
        res = col.query(query_embeddings=[q_emb], n_results=n, include=["documents","metadatas","distances"])
        docs = (res.get("documents") or [[]])[0]
        metas = (res.get("metadatas") or [[]])[0]
        dists = (res.get("distances") or [[]])[0]
        out = []
        for doc, meta, dist in zip(docs, metas, dists):
            if isinstance(meta, dict):
                sim = 1.0 / (1.0 + float(dist or 0.0))  # simple distance → similarity
                out.append({"full_name": meta.get("full_name"), "score": float(sim), "preview": doc})
        return out
    except Exception as e:
        logger.warning("WARN:agent_repo:chroma_catalog_search: %s", e)
        return []
|
||
|
||
|
||
# === Documenten maken voor catalogus ===
|
||
def build_repo_catalog_doc(meta: dict, readme: str) -> dict:
    """Build one catalog record (id / searchable doc text / metadata) for a repo."""
    full_name = meta.get("full_name","")
    name = meta.get("name","")
    desc = meta.get("description","")
    lang = meta.get("language","")
    topics = " ".join(meta.get("topics") or [])
    preview = (readme or "")[:2000]
    lines = [
        full_name,
        name,
        desc,
        f"language: {lang}",
        f"topics: {topics}",
        f"README:\n{preview}",
    ]
    return {
        "id": f"repo:{full_name}",
        "doc": "\n".join(lines),
        "meta": {
            "full_name": full_name,
            "name": name,
            "description": desc,
            "language": lang,
            "topics": topics,
        },
    }
|
||
|
||
# === Heuristic (lexical) score as a fallback ===
|
||
def lexical_repo_score(q: str, meta: dict, readme: str) -> float:
    """Cheap lexical relevance of a repo for query *q*: term counts plus a name bonus."""
    tokens = set(re.findall(r"[A-Za-z0-9_]{2,}", q.lower()))
    haystack = " ".join([
        meta.get("full_name",""),
        meta.get("name",""),
        meta.get("description",""),
        " ".join(meta.get("topics") or []),
        (readme or "")[:4000],
    ]).lower()
    if not tokens or not haystack:
        return 0.0
    total = sum(haystack.count(tok) for tok in tokens)
    # Small bonus for query terms that appear directly in the repo name.
    repo_name = (meta.get("name") or "").lower()
    total += sum(2 for tok in tokens if tok in repo_name)
    return float(total)
|
||
|
||
# === LLM rerank for repos (reuses the existing reranker) ===
|
||
async def llm_rerank_repos(user_goal: str, candidates: List[dict], topk: int = 5) -> List[dict]:
    """Rerank candidate repos with the LLM.

    Packs up to 12 candidates (name/description/README preview), asks for a
    JSON array [{"full_name": ..., "score": 0-100}], normalizes the scores
    to 0..1 onto copies of the candidates, and returns the best *topk*.
    On any failure the first *topk* candidates are returned unchanged.
    """
    if not candidates:
        return []
    pack = []
    for i, c in enumerate(candidates[:12], 1):
        pv = c.get("preview","")[:700]
        pack.append(f"{i}. REPO: {c['full_name']}\nDESC: {c.get('description','')}\nPREVIEW:\n{pv}")
    prompt = (
        "Rangschik onderstaande repositories op geschiktheid voor het doel. "
        "Geef een geldige JSON-array met objecten: {\"full_name\":\"...\",\"score\":0-100}.\n\n"
        "DOEL:\n" + user_goal + "\n\nCANDIDATES:\n" + "\n\n".join(pack)
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":"Alleen geldige JSON."},
             {"role":"user","content":prompt}],
            stream=False, temperature=0.0, top_p=0.9, max_tokens=600
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","")
        arr = safe_json_loads(raw)
        if not isinstance(arr, list):
            return candidates[:topk]
        # Build full_name → score map, skipping malformed entries.
        smap = {}
        for d in (arr or []):
            if not isinstance(d, dict):
                continue
            fn = d.get("full_name"); sc = d.get("score")
            try:
                if isinstance(fn, str):
                    smap[fn] = float(sc)
            except Exception:
                continue

        # Attach normalized scores (0..1); unseen candidates score 0.0.
        resc = []
        for c in candidates:
            resc.append({**c, "score": smap.get(c["full_name"], 0.0)/100.0})
        resc.sort(key=lambda x: x.get("score",0.0), reverse=True)
        return resc[:topk]
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_rerank_repos failed: %s", e)
        return candidates[:topk]
|
||
|
||
# --- Intent/goal refine ---
|
||
async def llm_refine_goal(raw_goal: str) -> tuple[str, List[str], float]:
    """
    Have the LLM produce a compact, concrete 'refined_goal' plus at most
    two short clarifying questions.

    Returns (refined_goal, clarifying_questions, confidence in [0, 1]);
    falls back to (raw_goal, [], 0.0) on any LLM or parse failure.
    """
    SYSTEM = "Geef uitsluitend geldige JSON; geen uitleg."
    USER = (
        "Vat de bedoeling van deze opdracht ultra-kort en concreet samen als 'refined_goal'. "
        "Als er kritieke onduidelijkheden zijn: geef max 2 korte 'clarifying_questions'. "
        "Geef ook 'confidence' (0..1). JSON:\n"
        "{ \"refined_goal\": \"...\", \"clarifying_questions\": [\"...\"], \"confidence\": 0.0 }\n\n"
        f"RAW_GOAL:\n{raw_goal}"
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":SYSTEM},{"role":"user","content":USER}],
            stream=False, temperature=0.0, top_p=0.9, max_tokens=300
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","")
        js = safe_json_loads(raw) or {}
        rg = (js.get("refined_goal") or "").strip() or raw_goal
        qs = [q.strip() for q in (js.get("clarifying_questions") or []) if isinstance(q, str) and q.strip()][:2]
        cf = float(js.get("confidence", 0.0) or 0.0)
        cf = max(0.0, min(1.0, cf))  # clamp to [0, 1]
        return rg, qs, cf
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_refine_goal failed: %s", e)
        return raw_goal, [], 0.0
|
||
|
||
|
||
# === Discovery pipeline ===
|
||
async def discover_candidate_repos(user_goal: str) -> List[dict]:
    """Find suitable repos purely from the question (no repo hint given).

    Pipeline: list all visible repos → fetch READMEs concurrently → index
    the catalog into Meili/Chroma (best effort) → expand the goal into
    multiple queries → combine lexical, Meili and Chroma scores → LLM
    rerank of the 8 best. Returns up to 5 candidates with a 0..1 score.
    """
    repos = await run_io_blocking(gitea_list_all_repos, limit=AGENT_DISCOVER_MAX_REPOS)
    if not repos:
        return []

    # Concurrent README fetches (parallelism lightly capped for stability).
    sem = asyncio.Semaphore(int(os.getenv("AGENT_DISCOVER_README_CONCURRENCY", "8")))

    async def _fetch_readme(m):
        async with sem:
            return await run_io_blocking(
                gitea_fetch_readme,
                m.get("owner",""), m.get("name",""), m.get("default_branch","main")
            )

    readmes = await asyncio.gather(*[_fetch_readme(m) for m in repos], return_exceptions=True)

    # Collect (shortened) READMEs and build catalog documents.
    docs_meili = []
    docs_chroma = []
    cands = []
    for i, m in enumerate(repos):
        # A failed fetch simply counts as "no README".
        readme = "" if isinstance(readmes[i], Exception) else (readmes[i] or "")
        doc = build_repo_catalog_doc(m, readme)
        docs_chroma.append(doc)
        # NOTE(review): Meili document ids only allow [A-Za-z0-9_-]; a
        # full_name like "owner/repo" contains "/" and may be rejected by
        # add_documents — confirm against the Meilisearch version in use.
        docs_meili.append({
            "id": m["full_name"],
            "full_name": m["full_name"],
            "name": m.get("name",""),
            "owner": m.get("owner",""),
            "description": m.get("description",""),
            "language": m.get("language",""),
            "topics": " ".join(m.get("topics") or []),
            "readme": (readme or "")[:5000],
        })
        cands.append({
            "full_name": m["full_name"],
            "description": m.get("description",""),
            "clone_url": m.get("clone_url"),
            "preview": (readme or "")[:1200],
            "base_score": 0.0,  # filled in below
        })

    # Index the catalog (best effort).
    if MEILI_URL:
        meili_catalog_upsert(docs_meili)
    chroma_catalog_upsert(docs_chroma)

    # Multi-query expansion of the goal.
    queries = await llm_expand_queries(user_goal, extract_quotes(user_goal), extract_word_hints(user_goal), k=5)

    # Heuristic score plus Meili/Chroma boosts.
    score_map: Dict[str, float] = {c["full_name"]: 0.0 for c in cands}
    for q in queries:
        # Lexical score.
        for i, m in enumerate(repos):
            score_map[m["full_name"]] += 0.2 * lexical_repo_score(q, m, (docs_meili[i].get("readme") if i < len(docs_meili) else ""))

        # Meili boost.
        if MEILI_URL:
            hits = meili_catalog_search(q, limit=10)
            for h in hits:
                fn = h.get("full_name")
                if fn in score_map:
                    score_map[fn] += 2.0

        # Chroma boost.
        chroma_hits = chroma_catalog_search(q, n=6)
        for h in chroma_hits:
            fn = h.get("full_name")
            if fn in score_map:
                score_map[fn] += 1.2

    # Merge scores into the candidate records.
    for c in cands:
        c["score"] = score_map.get(c["full_name"], 0.0)

    # Quick preselection.
    cands.sort(key=lambda x: x["score"], reverse=True)
    pre = cands[:8]

    # LLM rerank of the shortlist.
    top = await llm_rerank_repos(user_goal, pre, topk=5)
    return top
|
||
|
||
|
||
# ---------- Chroma collection name ----------
|
||
def sanitize_collection_name(s: str) -> str:
    """Normalize to a safe collection name: [A-Za-z0-9._-] only, max 128 chars."""
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", s).strip("-")[:128]
    return cleaned if cleaned else "code_docs"
|
||
|
||
def repo_collection_name(owner_repo: str | None, branch: str) -> str:
    """Chroma collection name for a repo/branch pair."""
    label = owner_repo if owner_repo else "repo"
    return sanitize_collection_name(f"code_docs-{label}-{branch}")
|
||
|
||
def _get_session_id(messages: List[dict], request) -> str:
    """Derive a stable session id for this chat.

    Prefers an explicit system message of the form "session:<id>";
    otherwise hashes the first message's content together with the
    client IP (16 hex chars of SHA-256).
    NOTE(review): the fallback indexes messages[0] — an empty list would
    raise IndexError; confirm callers guarantee non-empty messages.
    """
    for m in messages:
        if m.get("role") == "system" and str(m.get("content","")).startswith("session:"):
            return str(m["content"]).split("session:",1)[1].strip()
    key = (messages[0].get("content","") + "|" + _client_ip(request)).encode("utf-8", errors="ignore")
    return hashlib.sha256(key).hexdigest()[:16]
|
||
|
||
# ---------- Files & filters ----------
|
||
def allowed_file(p: Path) -> bool:
    """True when the file name ends in one of the indexable extensions (ALLOWED_EXTS)."""
    name = p.name.lower()
    return name.endswith(tuple(ALLOWED_EXTS))
|
||
|
||
def list_repo_files(repo_root: Path) -> List[str]:
    """List indexable files under *repo_root* as repo-relative paths.

    Skips excluded directories (_PROFILE_EXCLUDE_DIRS), files over 2 MB,
    unstat-able files and disallowed extensions. Results sit in a small
    TTL cache to avoid repeated rglob/IO during multi-query retrieval.
    """
    # Light TTL cache to limit repeated rglob/IO (faster for multi-queries).
    ttl = float(os.getenv("AGENT_LIST_CACHE_TTL", "20"))
    key = str(repo_root.resolve())
    now = time.time()
    if key in _LIST_FILES_CACHE:
        ts, cached = _LIST_FILES_CACHE[key]
        if now - ts <= ttl:
            # Return a copy so callers cannot mutate the cached list.
            return list(cached)

    files: List[str] = []
    for p in repo_root.rglob("*"):
        if p.is_dir(): continue
        if any(part in _PROFILE_EXCLUDE_DIRS for part in p.parts): continue
        try:
            if p.stat().st_size > 2_000_000: continue
        except Exception:
            continue
        if not allowed_file(p): continue
        files.append(str(p.relative_to(repo_root)))
    _LIST_FILES_CACHE[key] = (now, files)
    return files
|
||
|
||
# ---------- Query parsing ----------
|
||
def extract_quotes(text: str) -> List[str]:
    """Return substrings enclosed in matching single or double quotes.

    Curly quotes are first normalized to their ASCII equivalents. The
    pattern uses a backreference so the opening and closing quote
    characters must match (previously a span like "abc' — one double,
    one single quote — was wrongly treated as quoted).
    """
    if not text:
        return []
    t = (text or "").replace("“","\"").replace("”","\"").replace("’","'").strip()
    return [m.group(2) for m in re.finditer(r"(['\"])([^'\"]{2,})\1", t)]
|
||
|
||
|
||
def extract_word_hints(text: str) -> List[str]:
    """Extract identifier-like words (≥2 chars) from *text*, minus NL/EN stopwords.

    Deduplicates while preserving first-occurrence order. (The previous
    version iterated a set, so the order of hints was nondeterministic
    across runs — bad for reproducible retrieval.)
    """
    if not text:
        return []
    blacklist = {"de","het","een","and","the","voor","naar","op","in","of","to","is","are","van","met","die","dat"}
    seen: set = set()
    out: List[str] = []
    for w in re.findall(r"[A-Za-z_][A-Za-z0-9_]{1,}", text):
        if w.lower() in blacklist or w in seen:
            continue
        seen.add(w)
        out.append(w)
    return out
|
||
|
||
# ---------- SAFE JSON loader ----------
|
||
def safe_json_loads(s: str):
    """Parse JSON leniently: strips an optional ``` / ```json fence; None on failure."""
    if not s:
        return None
    body = s.strip()
    if body.startswith("```"):
        body = re.sub(r"^```(?:json)?", "", body, count=1).strip()
        if body.endswith("```"):
            body = body[:-3].strip()
    try:
        return json.loads(body)
    except Exception:
        return None
|
||
|
||
# ---------- Meilisearch (optional) ----------
# Lazily-created Meilisearch client singleton; see get_meili().
_meili_client = None
|
||
def get_meili():
    """Return the memoized Meilisearch client, or None when unconfigured/unavailable."""
    global _meili_client
    if _meili_client is not None:
        return _meili_client
    if not MEILI_URL:
        return None
    try:
        from meilisearch import Client
        _meili_client = Client(MEILI_URL, MEILI_KEY or None)
    except Exception as exc:
        logger.warning("WARN:agent_repo:Meilisearch not available: %s", exc)
        return None
    return _meili_client
|
||
|
||
def meili_index_name(owner_repo: Optional[str], branch: str) -> str:
    """Meilisearch index name for a repo/branch (prefix + sanitized slug)."""
    slug = sanitize_collection_name(f"{owner_repo or 'repo'}-{branch}")
    return sanitize_collection_name(f"{MEILI_INDEX_PREFIX}-{slug}")
|
||
|
||
# --- Smarter, language-aware chunker ---
|
||
|
||
# Extension → coarse language tag used by the chunker.
_LANG_BY_EXT = {
    ".php": "php", ".blade.php": "blade", ".js": "js", ".ts": "ts",
    ".jsx": "js", ".tsx": "ts", ".py": "py", ".go": "go",
    ".rb": "rb", ".java": "java", ".cs": "cs",
    ".css": "css", ".scss": "css",
    ".html": "html", ".htm": "html", ".md": "md",
    ".yml": "yaml", ".yaml": "yaml", ".toml": "toml", ".ini": "ini",
    ".json": "json",
}

def _detect_lang_from_path(path: str) -> str:
    """Map a file path to a coarse language tag via its extension.

    Extensions are tested longest-first so compound suffixes win over
    their tails. Fix: the previous insertion-order scan matched ".php"
    before ".blade.php", so Blade templates were misclassified as plain
    PHP and never got Blade-aware breakpoints.
    """
    lo = path.lower()
    for ext in sorted(_LANG_BY_EXT, key=len, reverse=True):
        if lo.endswith(ext):
            return _LANG_BY_EXT[ext]
    return "txt"
|
||
|
||
def _find_breakpoints(text: str, lang: str) -> list[int]:
    """
    Return a sorted list of 'nice' break positions (char indices) for
    cutting chunks.

    Deliberately conservative: false positives are fine, since the caller
    simply picks the breakpoint closest to its target size.
    """
    bps = set()
    # Always: blank-line blocks / paragraph boundaries.
    for m in re.finditer(r"\n\s*\n\s*", text):
        bps.add(m.end())

    if lang in ("php", "js", "ts", "java", "cs", "go", "rb", "py"):
        # Function/class boundaries.
        pats = [
            r"\n\s*(class|interface|trait)\s+[A-Za-z_][A-Za-z0-9_]*\b",
            r"\n\s*(public|private|protected|static|\s)*\s*function\b",
            r"\n\s*def\s+[A-Za-z_][A-Za-z0-9_]*\s*\(",  # py
            r"\n\s*func\s+[A-Za-z_][A-Za-z0-9_]*\s*\(",  # go
            r"\n\s*[A-Za-z0-9_<>\[\]]+\s+[A-Za-z_][A-Za-z0-9_]*\s*\(",  # java/cs method-ish
            r"\n\}",  # closing brace at column 0 → good end point
        ]
        for p in pats:
            for m in re.finditer(p, text):
                bps.add(m.start())

    if lang == "blade":
        for p in [r"\n\s*@section\b", r"\n\s*@endsection\b", r"\n\s*@if\b", r"\n\s*@endif\b", r"\n\s*<\w"]:
            for m in re.finditer(p, text, flags=re.I):
                bps.add(m.start())

    if lang in ("html", "css"):
        for p in [r"\n\s*<\w", r"\n\s*</\w", r"\n\s*}\s*\n"]:
            for m in re.finditer(p, text):
                bps.add(m.start())

    if lang in ("md",):
        # Headings, horizontal rules, list items.
        for p in [r"\n#+\s", r"\n\-{3,}\n", r"\n\*\s", r"\n\d+\.\s"]:
            for m in re.finditer(p, text):
                bps.add(m.start())

    if lang in ("yaml", "toml", "ini"):
        # Sections/keys at column 0.
        for m in re.finditer(r"\n[A-Za-z0-9_\-]+\s*[:=]", text):
            bps.add(m.start())

    # JSON: split on object/array boundaries (conservative: { or [ near column 0).
    if lang == "json":
        for m in re.finditer(r"\n\s*[\{\[]\s*\n", text):
            bps.add(m.start())

    # Always: plain line boundaries.
    for m in re.finditer(r"\n", text):
        bps.add(m.start()+1)

    # Sort and keep only strictly in-range positions.
    out = sorted([bp for bp in bps if 0 < bp < len(text)])
    return out
|
||
|
||
def smart_chunk_text(text: str, path_hint: str, target_chars: int = 1800,
                     hard_max: int = 2600, min_chunk: int = 800) -> List[str]:
    """
    Chunk at ~target_chars, breaking at the nearest semantic breakpoint.

    - Without a usable breakpoint in range: cut at the target position.
    - Adaptive overlap: 200 chars after a clean break, 350 after a rough one.
    """
    if not text:
        return []
    lang = _detect_lang_from_path(path_hint or "")
    bps = _find_breakpoints(text, lang)
    if not bps:
        # Fallback: fixed-size steps with overlap.
        chunks = []
        i, n = 0, len(text)
        step = max(min_chunk, target_chars - 300)
        while i < n:
            j = min(n, i + target_chars)
            chunks.append(text[i:j])
            i = min(n, i + step)
        return chunks

    chunks = []
    i, n = 0, len(text)
    while i < n:
        # Aim for i+target_chars, but look for 'nice' breakpoints within
        # [i+min_chunk, i+hard_max].
        ideal = i + target_chars
        lo = i + min_chunk
        hi = min(n, i + hard_max)
        # Candidates = breakpoints inside the window.
        candidates = [bp for bp in bps if lo <= bp <= hi]
        if not candidates:
            # No nice one; cut roughly at the ideal size (or end of text).
            j = min(n, ideal)
            chunk = text[i:j]
            chunks.append(chunk)
            # Rough cut → larger overlap (guard against going backwards).
            i = j - 350 if j - 350 > i else j
            continue
        # Pick the breakpoint closest to the ideal size.
        j = min(candidates, key=lambda bp: abs(bp - ideal))
        chunk = text[i:j]
        chunks.append(chunk)
        # Clean break → small overlap.
        i = j - 200 if j - 200 > i else j

    # Drop empty / whitespace-only tails.
    out = [c for c in chunks if c and c.strip()]
    return out
|
||
|
||
|
||
def meili_index_repo(repo_root: Path, owner_repo: Optional[str], branch: str):
    """Chunk every indexable repo file into Meili and build the local BM25 cache.

    Documents are flushed to Meili in batches of 1000; searchable and
    filterable attributes are configured best-effort. The same chunk list
    also feeds a BM25Okapi index stored in _BM25_CACHE under the same name.
    """
    cli = get_meili()
    if not cli: return
    idx_name = meili_index_name(owner_repo, branch)
    try:
        idx = cli.index(idx_name)
    except Exception:
        idx = cli.create_index(uid=idx_name, options={"primaryKey":"id"})
    docs = []
    bm25_docs = []  # collected separately for the BM25 build below
    count = 0
    for rel in list_repo_files(repo_root):
        p = repo_root / rel
        try:
            txt = _read_text_file(p) or ""
        except Exception:
            continue
        for ci, chunk in enumerate(smart_chunk_text(txt, rel, target_chars=int(os.getenv("CHUNK_TARGET_CHARS","1800")),hard_max=int(os.getenv("CHUNK_HARD_MAX","2600")),min_chunk=int(os.getenv("CHUNK_MIN_CHARS","800")))):
            # NOTE(review): Meili document ids only allow [A-Za-z0-9_-];
            # this id contains ":" and possibly "/" — confirm the running
            # Meilisearch version accepts it.
            doc_id = f"{owner_repo}:{branch}:{rel}:{ci}"
            item = {"id": doc_id, "path": rel, "repo": owner_repo, "branch": branch, "content": chunk}
            docs.append(item)
            bm25_docs.append(item)  # keep a full copy for BM25
            count += 1
            if len(docs) >= 1000:
                idx.add_documents(docs); docs.clear()
    if docs:
        idx.add_documents(docs)
    try:
        idx.update_searchable_attributes(["content","path","repo","branch"])
        idx.update_filterable_attributes(["repo","branch","path"])
    except Exception:
        pass
    logger.info("INFO:agent_repo:meili indexed ~%d chunks into %s", count, idx_name)

    # Build the local BM25 cache from bm25_docs (not from docs, which may
    # have been cleared by the batch flush above).
    try:
        if BM25Okapi and bm25_docs:
            toks = [re.findall(r"[A-Za-z0-9_]+", d["content"].lower()) for d in bm25_docs]
            bm = BM25Okapi(toks) if toks else None
            if bm:
                _BM25_CACHE[idx_name] = {"bm25": bm, "docs": bm25_docs}
    except Exception as e:
        logger.warning("WARN:agent_repo:bm25 build failed: %s", e)
|
||
|
||
|
||
def meili_search(owner_repo: Optional[str], branch: str, q: str, limit: int = 10) -> List[dict]:
    """Full-text search in this repo/branch's Meili index; [] on error/unavailable."""
    cli = get_meili()
    if not cli: return []
    try:
        idx = cli.index(meili_index_name(owner_repo, branch))
        res = idx.search(q, {"limit": limit})
        # TODO(review): an injected meili_search_fn with a repo/branch filter
        # was sketched here previously — confirm whether searches should be
        # routed through that injection instead of the raw client.
        return res.get("hits", [])
    except Exception as e:
        logger.warning("WARN:agent_repo:meili_search failed: %s", e)
        return []
|
||
|
||
# ---------- BM25 fallback ----------
# idx_name → {"bm25": BM25Okapi, "docs": [chunk dicts]}; filled by
# meili_index_repo and bm25_build_index.
_BM25_CACHE: Dict[str, dict] = {}

# module-scope
# NOTE(review): appears unused in this module — possibly a leftover from an
# earlier BM25 cache layout; confirm before removing.
_BM25_BY_REPO: dict[str, tuple[BM25Okapi, list[dict]]] = {}
|
||
def _tok(s: str) -> list[str]:
    """Split *s* into lowercase word tokens (runs of [A-Za-z0-9_])."""
    lowered = s.lower()
    return re.findall(r"[A-Za-z0-9_]+", lowered)
|
||
|
||
# --- Lightweight symbol index (in-memory, per repo collection) ---
# Structure: { collection_name: { symbol_lower: { path: count } } }
_SYMBOL_INDEX: dict[str, dict[str, dict[str, int]]] = {}
|
||
|
||
|
||
def bm25_index_name(owner_repo: Optional[str], branch: str) -> str:
    """Key for the local BM25 cache; same string as the Meili index name (separate cache)."""
    cache_key = meili_index_name(owner_repo, branch)
    return cache_key
|
||
|
||
def bm25_build_index(repo_root: Path, owner_repo: Optional[str], branch: str):
    """Build and cache a BM25 index over all repo chunks (no-op without rank_bm25).

    Re-chunks the repo locally — duplicating meili_index_repo's IO for the
    sake of independence — and stores {"bm25", "docs"} in _BM25_CACHE.
    """
    if not BM25Okapi:
        return
    idx_name = bm25_index_name(owner_repo, branch)
    docs = []
    for rel in list_repo_files(repo_root):
        p = repo_root / rel
        try:
            txt = _read_text_file(p) or ""
        except Exception:
            continue
        for ci, chunk in enumerate(smart_chunk_text(txt, rel,
                                                    target_chars=int(os.getenv("CHUNK_TARGET_CHARS","1800")),
                                                    hard_max=int(os.getenv("CHUNK_HARD_MAX","2600")),
                                                    min_chunk=int(os.getenv("CHUNK_MIN_CHARS","800")))):
            docs.append({"id": f"{owner_repo}:{branch}:{rel}:{ci}", "path": rel, "repo": owner_repo, "branch": branch, "content": chunk})
    toks = [re.findall(r"[A-Za-z0-9_]+", d["content"].lower()) for d in docs]
    if toks:
        _BM25_CACHE[idx_name] = {"bm25": BM25Okapi(toks), "docs": docs}
||
|
||
def bm25_search(owner_repo: Optional[str], branch: str, q: str, limit: int = 10) -> List[dict]:
    """Query the cached BM25 index; up to *limit* chunk dicts, [] when no index/tokens."""
    idx = _BM25_CACHE.get(bm25_index_name(owner_repo, branch))
    if not idx:
        return []
    bm = idx.get("bm25"); docs = idx.get("docs") or []
    if not bm:
        return []
    toks = re.findall(r"[A-Za-z0-9_]+", (q or "").lower())
    if not toks:
        return []
    scores = bm.get_scores(toks)
    # Indices of the highest-scoring chunks first.
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:limit]
    return [docs[i] for i in order]
|
||
|
||
def _extract_symbols_generic(path: str, text: str) -> list[str]:
|
||
"""
|
||
Ultra-simpele symbol scraper (taal-agnostisch):
|
||
- class/interface/trait namen
|
||
- function foo(...), Foo::bar, "Controller@method"
|
||
- Laravel: ->name('route.name')
|
||
- React-ish: function Foo(...) { return ( ... ) }, export default function Foo(...)
|
||
- Blade-ish: @section('...'), @component('...'), <x-foo-bar>
|
||
- Basename van file als pseudo-symbool
|
||
"""
|
||
if not text:
|
||
return []
|
||
syms = set()
|
||
|
||
for m in re.finditer(r"\b(class|interface|trait)\s+([A-Za-z_][A-Za-z0-9_\\]*)", text):
|
||
syms.add(m.group(2))
|
||
|
||
for m in re.finditer(r"\bfunction\s+([A-Za-z_][A-Za-z0-9_]*)\s*\(", text):
|
||
syms.add(m.group(1))
|
||
|
||
for m in re.finditer(r"([A-Za-z_][A-Za-z0-9_\\]*)::([A-Za-z_][A-Za-z0-9_]*)", text):
|
||
syms.add(m.group(1) + "::" + m.group(2))
|
||
|
||
for m in re.finditer(r"['\"]([A-Za-z0-9_\\]+)@([A-Za-z0-9_]+)['\"]", text):
|
||
syms.add(m.group(1) + "@" + m.group(2))
|
||
|
||
for m in re.finditer(r"->\s*name\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", text):
|
||
syms.add(m.group(1))
|
||
|
||
for m in re.finditer(r"\bfunction\s+([A-Z][A-Za-z0-9_]*)\s*\(", text):
|
||
syms.add(m.group(1))
|
||
|
||
for m in re.finditer(r"export\s+default\s+function\s+([A-Za-z_][A-Za-z0-9_]*)\s*\(", text):
|
||
syms.add(m.group(1))
|
||
|
||
for m in re.finditer(r"@\s*(section|component|slot)\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", text):
|
||
syms.add(m.group(2))
|
||
for m in re.finditer(r"<\s*x-([a-z0-9\-:]+)", text, flags=re.IGNORECASE):
|
||
syms.add("x-" + m.group(1).lower())
|
||
|
||
base = os.path.basename(path)
|
||
if base:
|
||
syms.add(base)
|
||
|
||
return list(syms)
|
||
|
||
def _symbol_index_name(owner_repo: Optional[str], branch: str) -> str:
    """Symbol-index key: identical to the repo's vector-collection name."""
    key = repo_collection_name(owner_repo, branch)
    return key
|
||
|
||
def symbol_index_repo(repo_root: Path, owner_repo: Optional[str], branch: str):
    """Best-effort (re)build of the in-memory symbol index for repo/branch.

    Files larger than 500 KB or unreadable files are skipped. Any failure is
    logged as a warning and leaves the previously stored index untouched.
    """
    try:
        coll = _symbol_index_name(owner_repo, branch)
        index: dict[str, dict[str, int]] = {}
        for rel in list_repo_files(repo_root):
            full = repo_root / rel
            try:
                if full.stat().st_size > 500_000:
                    continue
                content = _read_text_file(full) or ""
            except Exception:
                continue
            for symbol in _extract_symbols_generic(rel, content):
                key = symbol.strip().lower()
                if not key:
                    continue
                per_path = index.setdefault(key, {})
                per_path[rel] = per_path.get(rel, 0) + 1
        _SYMBOL_INDEX[coll] = index
    except Exception as e:
        logger.warning("WARN:agent_repo:symbol_index_repo: %s", e)
|
||
|
||
def symbol_search(owner_repo: Optional[str], branch: str, q: str, limit: int = 10) -> list[tuple[str, int]]:
    """Simple symbol lookup against _SYMBOL_INDEX -> [(path, score)].

    Scoring: an exact symbol match adds 3 per occurrence; a substring match
    against any indexed symbol adds 1 per occurrence. Only the first 12
    unique query tokens are considered.
    """
    coll = _symbol_index_name(owner_repo, branch)
    idx = _SYMBOL_INDEX.get(coll) or {}
    if not idx or not q:
        return []
    # Quoted phrases first, then bare word-ish tokens (allows :, \, ., -).
    quoted = re.findall(r"['\"]([^'\"]{2,})['\"]", q)
    words = re.findall(r"[A-Za-z0-9_:\\.\-]{2,}", q)
    # Dedupe case-insensitively while preserving first-seen order.
    seen = set(); tokens = []
    for t in quoted + words:
        tl = t.lower()
        if tl not in seen:
            seen.add(tl); tokens.append(tl)

    scores: dict[str, int] = {}
    # exact match (strong signal: 3 points per indexed occurrence)
    for t in tokens[:12]:
        if t in idx:
            for path, c in idx[t].items():
                scores[path] = scores.get(path, 0) + 3 * c
        # soft substring match over ALL symbols (1 point per occurrence);
        # note this runs for every token, so cost is O(tokens * symbols)
        for sym, paths in idx.items():
            if t in sym:
                for path, c in paths.items():
                    scores[path] = scores.get(path, 0) + 1

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]
|
||
|
||
|
||
# ---------- Signal-first scan ----------
|
||
def glob_match(rel: str, patterns: List[str]) -> bool:
    """Return True when *rel* matches at least one glob in *patterns*."""
    return any(fnmatch.fnmatch(rel, pattern) for pattern in patterns or [])
|
||
|
||
def scan_with_signals(repo_root: Path, files: List[str], sig: dict, phrase_boosts: List[str], hint_boosts: List[str], limit: int = 20) -> List[Tuple[str,int,dict]]:
    """Score repo files against an LLM-produced signal spec.

    *sig* may contain: file_globs, must_substrings, maybe_substrings,
    regexes, path_hints, exclude_dirs. Returns up to *limit* tuples of
    (path, score, hit-counters), best score first. Files missing any
    non-empty must-substring are dropped entirely.
    """
    file_globs = sig.get("file_globs") or []
    must = [s.lower() for s in (sig.get("must_substrings") or [])]
    maybe = [s.lower() for s in (sig.get("maybe_substrings") or [])]
    regexes = sig.get("regexes") or []
    path_hints = [s.lower() for s in (sig.get("path_hints") or [])]
    exclude_dirs = set(sig.get("exclude_dirs") or [])

    # Merge caller boosts into the soft signals, capped at 20 each.
    maybe = list(set(maybe + [p.lower() for p in phrase_boosts]))[:20]
    path_hints = list(set(path_hints + [h.lower() for h in hint_boosts]))[:20]

    scored: List[Tuple[str,int,dict]] = []
    for rel in files:
        # Hard filters: excluded directory parts and (optional) glob allowlist.
        if any(part in exclude_dirs for part in Path(rel).parts): continue
        if file_globs and not glob_match(rel, file_globs): continue
        score = 0
        meta = {"must_hits":0,"maybe_hits":0,"regex_hits":0,"path_hits":0,"phrase_hits":0}
        rel_lo = rel.lower()
        # Path hints are cheap: +1 per hint present in the path.
        for h in path_hints:
            if h and h in rel_lo: meta["path_hits"] += 1; score += 1
        try:
            txt = _read_text_file(repo_root / rel) or ""
        except Exception:
            continue
        txt_lo = txt.lower()
        # Hard content filter: every non-empty must-substring is required.
        if any(m and (m not in txt_lo) for m in must):
            continue
        # Weights: must hits 3x, maybe hits 1x, regex hits 2x, phrases 2x.
        meta["must_hits"] = len([m for m in must if m and m in txt_lo]); score += 3*meta["must_hits"]
        meta["maybe_hits"] = len([m for m in maybe if m and m in txt_lo]); score += meta["maybe_hits"]
        for rp in regexes:
            try:
                if re.search(rp, txt, flags=re.IGNORECASE|re.DOTALL):
                    meta["regex_hits"] += 1; score += 2
            except re.error:
                # LLM-produced regexes may be invalid; ignore those.
                pass
        phrase_hits = 0
        for ph in phrase_boosts:
            if ph and ph.lower() in txt_lo:
                phrase_hits += 1
        if phrase_hits:
            meta["phrase_hits"] = phrase_hits
            score += 2*phrase_hits
        if score > 0:
            scored.append((rel, score, meta))
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:limit]
|
||
|
||
# ---------- Simple keyword fallback ----------
|
||
def simple_keyword_search(repo_root: Path, files: List[str], query: str, limit: int = 8) -> List[Tuple[str,int]]:
    """Rank files by query-token hits in the path, else in the body.

    Path hits count 1 per token. Only when a path scores zero is the file
    body read (best effort) and every token occurrence counted instead.
    """
    tokens = set(re.findall(r"[A-Za-z0-9_]{2,}", (query or "").lower()))
    ranked: List[Tuple[str, int]] = []
    for rel in files:
        rel_lo = rel.lower()
        pts = sum(1 for tok in tokens if tok in rel_lo)
        if pts == 0:
            # Path gave nothing: fall back to scanning the file contents.
            try:
                body_lo = (_read_text_file(Path(repo_root) / rel) or "").lower()
                pts += sum(body_lo.count(tok) for tok in tokens)
            except Exception:
                pass
        if pts > 0:
            ranked.append((rel, pts))
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:limit]
|
||
|
||
# ---------- Expliciete paden ----------
|
||
|
||
|
||
def best_path_by_basename(all_files: List[str], hint: str) -> str | None:
    """Pick the file whose basename equals *hint*'s basename.

    Ties are broken by how many word tokens of *hint* occur in the
    candidate's full path; returns None when nothing matches or the hint
    has no basename.
    """
    wanted = os.path.basename(hint).lower()
    if not wanted:
        return None
    tokens = set(re.findall(r"[A-Za-z0-9_]+", hint.lower()))
    best = None  # (points, path)
    for candidate in all_files:
        if os.path.basename(candidate).lower() != wanted:
            continue
        cand_lo = candidate.lower()
        pts = 1 + sum(1 for tok in tokens if tok in cand_lo)
        if best is None or pts > best[0]:
            best = (pts, candidate)
    return best[1] if best is not None else None
|
||
|
||
# ---------- Hybrid RAG ----------
|
||
def _append_ctx_preview(answer: str, chunks: list[dict], limit: int = 12) -> str:
|
||
paths = []
|
||
for h in chunks:
|
||
meta = h.get("metadata") or {}
|
||
p = meta.get("path");
|
||
if p and p not in paths: paths.append(p)
|
||
if not paths: return answer
|
||
head = paths[:limit]
|
||
return answer + "\n\n--- context (paths) ---\n" + "\n".join(f"- {p}" for p in head)
|
||
|
||
async def smart_rag_answer(messages: list[dict], *, n_ctx: int = 8,
                           owner_repo: Optional[str] = None,
                           branch: Optional[str] = None,
                           collection_name: Optional[str] = None,
                           add_preview: bool = True) -> str:
    """Answer a chat with hybrid RAG: intent -> query variants -> retrieve -> LLM.

    Collection resolution: explicit *collection_name* wins, then
    (owner_repo, branch), then the default "code_docs" collection.
    Returns a (Dutch) error string when no task or no context is found.
    """
    # 1) intent extraction via the enrichment helper
    spec = await enrich_intent(_llm_call, messages)
    task = (spec.get("task") or "").strip()
    if not task:
        return "Geen vraag gedetecteerd."

    # 2) query expansion (k=3 variants of the task)
    variants = await expand_queries(_llm_call, task, k=3)

    # 3) hybrid retrieve per variant; 'code_docs' is already versioned via
    # _collection_versioned in app.py, so pass the resolved name explicitly
    coll = collection_name or (repo_collection_name(owner_repo, branch or AGENT_DEFAULT_BRANCH) if owner_repo else "code_docs")
    all_hits = []
    for q in variants:
        hits = await hybrid_retrieve(
            _rag_query_internal,
            q,
            n_results=n_ctx,
            per_query_k=max(30, n_ctx * 6),
            alpha=0.6,
            # pass explicitly so we hit the version-suffixed collection:
            collection_name=coll,
        )
        all_hits.extend(hits)

    # dedupe on (path, chunk_index), keeping the highest-scored hits first
    seen = set()
    uniq = []
    for h in sorted(all_hits, key=lambda x: x.get("score", 0), reverse=True):
        meta = h.get("metadata") or {}
        key = (meta.get("path"), meta.get("chunk_index"))
        if key in seen:
            continue
        seen.add(key)
        uniq.append(h)
        if len(uniq) >= n_ctx:
            break

    # 4) assemble the bounded context window
    ctx, top = assemble_context(uniq, max_chars=int(os.getenv("REPO_AGENT_CONTEXT_CHARS","640000")))
    if not ctx:
        return "Geen context gevonden."

    # 5) let the LLM answer against the assembled context
    sys = "Beantwoord concreet en kort. Citeer relevante paths. Als iets onzeker is: zeg dat."
    usr = f"Vraag: {task}\n\n--- CONTEXT ---\n{ctx}"
    resp = await _llm_call(
        [{"role":"system","content":sys},{"role":"user","content":usr}],
        stream=False, temperature=0.2, top_p=0.9, max_tokens=700
    )
    ans = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
    # path preview appended unless disabled via flag or REPO_AGENT_PREVIEW env
    return _append_ctx_preview(ans, uniq) if (add_preview and os.getenv("REPO_AGENT_PREVIEW","1") not in ("0","false")) else ans
|
||
|
||
|
||
|
||
|
||
|
||
|
||
async def llm_expand_queries(user_goal: str, quotes: List[str], hints: List[str], k: int = 5, extra_seeds: Optional[List[str]] = None) -> List[str]: # already defined above
    # (duplicate name kept intentionally — Python allows redef; using the latest one)
    """Ask the LLM for up to *k* alternative search queries.

    Seeds the prompt with quotes, hints and extra seeds (deduped, max 8).
    Always returns the original goal first; falls back to [user_goal] on
    any LLM/JSON failure.
    """
    seed = []
    if quotes: seed += quotes
    if hints: seed += hints[:6]
    if extra_seeds: seed += extra_seeds[:6]
    # dict.fromkeys dedupes while preserving order
    seed = list(dict.fromkeys(seed))[:8]
    prompt = (
        f"Maak {k} alternatieve zoekqueries (kort, divers). Mix NL/EN, synoniemen, veldnamen."
        " Alleen geldige JSON-array met strings.\n"
        f"Doel:\n{user_goal}\n\nHints:\n" + ", ".join(seed)
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":"Alleen geldige JSON, geen uitleg."},
             {"role":"user","content":prompt}],
            stream=False, temperature=0.3, top_p=0.9, max_tokens=400
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","")
        arr = safe_json_loads(raw)
        # the original goal is always query #1
        base = [user_goal]
        if isinstance(arr, list):
            base += [s for s in arr if isinstance(s, str) and s.strip()]
        # normalize whitespace and dedupe, capped at 1 + k results
        out = []
        for q in base:
            qn = re.sub(r"\s+", " ", q.strip())
            if qn and qn not in out: out.append(qn)
        return out[:1+k]
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_expand_queries failed: %s", e)
        return [user_goal]
|
||
|
||
def get_file_preview(repo_root: Path, rel: str, terms: List[str], window: int = 180) -> str:
    """Return a text window around the first term that occurs in the file.

    Falls back to the head of the file (2 * window chars) when no term
    matches or no terms are given; returns "" when the file is unreadable
    or empty.
    """
    try:
        content = _read_text_file(repo_root / rel) or ""
    except Exception:
        return ""
    if not content:
        return ""
    if not terms:
        return content[:window * 2]
    content_lo = content.lower()
    for term in terms:
        hit = content_lo.find(term.lower())
        if hit < 0:
            continue
        start = max(0, hit - window)
        end = min(len(content), hit + len(term) + window)
        return content[start:end]
    return content[:window * 2]
|
||
|
||
async def llm_rerank_candidates(user_goal: str, candidates: List[dict], topk: int = 8) -> List[dict]:
    """Let the LLM rerank candidate files by relevance to *user_goal*.

    Each candidate dict needs "path" (and optionally "preview"). At most
    20 candidates are shown to the LLM; on any failure or malformed JSON
    the first *topk* candidates are returned unchanged.
    """
    if not candidates: return []
    pack = []
    for i, c in enumerate(candidates[:20], 1):
        pv = c.get("preview","")[:600]
        pth = c["path"]
        base = os.path.basename(pth)
        dr = os.path.dirname(pth)
        pack.append(f"{i}. PATH: {pth}\nDIR: {dr}\nBASENAME: {base}\nPREVIEW:\n{pv}")

    prompt = (
        "Rangschik de onderstaande codefragmenten op relevantie om het doel te behalen. "
        "Geef een JSON-array met objecten: {\"path\":\"...\",\"score\":0-100}."
        "\n\nDOEL:\n" + user_goal + "\n\nFRAGMENTEN:\n" + "\n\n".join(pack)
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":"Alleen geldige JSON zonder uitleg."},
             {"role":"user","content":prompt}],
            stream=False, temperature=0.0, top_p=0.9, max_tokens=600
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","")
        arr = safe_json_loads(raw)
        if not isinstance(arr, list):
            return candidates[:topk]
        # map LLM scores back onto candidates; unscored paths get 0.0
        score_map = {d.get("path"): float(d.get("score",0)) for d in arr if isinstance(d, dict) and "path" in d}
        rescored = []
        for c in candidates:
            rescored.append({**c, "score": score_map.get(c["path"], 0.0)})
        rescored.sort(key=lambda x: x.get("score",0.0), reverse=True)
        return rescored[:topk]
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_rerank_candidates failed: %s", e)
        return candidates[:topk]
|
||
|
||
def _rrf_fuse_paths(*ordered_lists: List[str], k: int = int(os.getenv("RRF_K","60"))) -> List[str]:
|
||
"""
|
||
Neem meerdere geordende padlijsten (beste eerst) en geef een RRF-fusie.
|
||
"""
|
||
acc = defaultdict(float)
|
||
for lst in ordered_lists:
|
||
for i, p in enumerate(lst):
|
||
acc[p] += 1.0 / (k + i + 1)
|
||
# path prior
|
||
def _prior(p: str) -> float:
|
||
return (
|
||
(0.35 if p.lower().startswith("routes/") else 0.0) +
|
||
(0.30 if p.lower().startswith("app/http/controllers/") else 0.0) +
|
||
(0.25 if p.lower().startswith("resources/views/") or p.lower().endswith(".blade.php") else 0.0) +
|
||
(0.12 if p.lower().startswith(("src/","app/","lib/","pages/","components/")) else 0.0) +
|
||
(0.05 if p.lower().endswith((".php",".ts",".tsx",".js",".jsx",".py",".go",".rb",".java",".cs",".vue",".html",".md")) else 0.0) -
|
||
(0.10 if ("/tests/" in p.lower() or p.lower().startswith(("tests/","test/"))) else 0.0) -
|
||
(0.10 if p.lower().endswith((".lock",".map",".min.js",".min.css")) else 0.0)
|
||
)
|
||
for p in list(acc.keys()):
|
||
acc[p] += float(os.getenv("RRF_PATH_PRIOR_WEIGHT","0.25")) * _prior(p)
|
||
return [p for p,_ in sorted(acc.items(), key=lambda t: t[1], reverse=True)]
|
||
|
||
async def hybrid_rag_select_paths(repo_root: Path,
                                  owner_repo: Optional[str],
                                  branch: str,
                                  user_goal: str,
                                  all_files: List[str],
                                  max_out: int = 8) -> List[str]:
    """Select up to *max_out* repo paths relevant to *user_goal*.

    Pipeline: LLM signal scan (lenient + strict) -> explicit path hints ->
    query expansion -> Chroma + Meili/BM25 retrieval -> Laravel heuristics ->
    symbol search -> optional RRF fusion -> LLM rerank -> light symbol boost.
    Every returned path is guaranteed to be a member of *all_files*.
    """
    quotes = extract_quotes(user_goal)
    hints = extract_word_hints(user_goal)
    # ask the LLM for a compact search-signal spec (globs/must/maybe/...)
    sig_messages = [
        {"role":"system","content":"Produceer alleen geldige JSON zonder uitleg."},
        {"role":"user","content":(
            "Bedenk een compacte zoekstrategie als JSON om relevante bestanden te vinden (globs/must/maybe/regex/path_hints/excludes). Wijziging:\n"
            + user_goal
        )}
    ]
    try:
        resp = await _llm_call(sig_messages, stream=False, temperature=0.1, top_p=0.9, max_tokens=384)
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","").strip()
        sig = safe_json_loads(raw) or {}
    except Exception as e:
        logger.warning("WARN:agent_repo:signals LLM failed: %s", e)
        sig = {}
    # Two passes: first lenient (recall, no hard filters), then strict (precision)
    sig_lenient = dict(sig or {})
    sig_lenient["must_substrings"] = []
    sig_lenient["regexes"] = []
    scan_hits_lenient = scan_with_signals(
        repo_root, all_files, sig_lenient,
        phrase_boosts=quotes, hint_boosts=hints, limit=24
    )
    scan_hits_strict = scan_with_signals(
        repo_root, all_files, sig,
        phrase_boosts=quotes, hint_boosts=hints, limit=20
    )
    # combine, preferring strict hits (they come first in the dedupe)
    seen_paths_local = set()
    prepicked = []
    for rel, _sc, _m in scan_hits_strict + scan_hits_lenient:
        if rel not in seen_paths_local:
            seen_paths_local.add(rel); prepicked.append(rel)

    # --- explicit path hints from the user prompt take precedence ---
    try:
        explicit = extract_explicit_paths(user_goal)
    except Exception:
        explicit = []
    explicit_resolved: List[str] = []
    for ep in explicit:
        if ep in all_files:
            explicit_resolved.append(ep)
        else:
            # not an exact repo path: fall back to basename matching
            bp = best_path_by_basename(all_files, ep)
            if bp: explicit_resolved.append(bp)
    # prepend explicit paths (reversed so the first hint ends up first), deduped
    for ep in reversed(explicit_resolved):
        if ep not in seen_paths_local:
            prepicked.insert(0, ep); seen_paths_local.add(ep)

    # light stack-detection seeds for query expansion
    seeds = []
    if (repo_root / "artisan").exists() or (repo_root / "composer.json").exists():
        # Laravel-ish repo
        seeds += ["Route::get", "Controller", "blade", "resources/views", "routes/web.php", "app/Http/Controllers"]
    if (repo_root / "package.json").exists():
        # JS/React-ish repo
        seeds += ["component", "pages", "src/components", "useState", "useEffect"]
    queries = await llm_expand_queries(user_goal, quotes, hints, k=5, extra_seeds=seeds)

    # vector channel: query Chroma per expanded query
    chroma_paths: List[str] = []
    for q in queries:
        try:
            rag_res = await _rag_query_internal(
                query=q, n_results=RAG_TOPK,
                # search the version-consistent collection:
                collection_name=repo_collection_name(owner_repo, branch),
                repo=None, path_contains=None, profile=None
            )
            for item in rag_res.get("results", []):
                meta = item.get("metadata") or {}
                pth = meta.get("path")
                if pth and pth in all_files:
                    chroma_paths.append(pth)
        except Exception as e:
            logger.warning("WARN:agent_repo:Chroma query failed: %s", e)

    # keyword channel: Meilisearch when configured, otherwise local BM25
    meili_paths: List[str] = []
    if MEILI_URL:
        for q in queries:
            hits = meili_search(owner_repo, branch, q, limit=RAG_TOPK)
            for h in hits:
                p = h.get("path")
                if p and p in all_files:
                    meili_paths.append(p)
    else:
        # BM25 fallback when Meili is disabled;
        # make sure a (one-time) index exists
        try:
            if bm25_index_name(owner_repo, branch) not in _BM25_CACHE:
                bm25_build_index(repo_root, owner_repo, branch)
        except Exception:
            pass
        for q in queries:
            hits = bm25_search(owner_repo, branch, q, limit=RAG_TOPK)
            for h in hits:
                p = h.get("path")
                if p and p in all_files:
                    meili_paths.append(p)

    # framework-specific heuristics (best effort)
    try:
        laravel_picks = laravel_signal_candidates(repo_root, user_goal, all_files, max_out=6)
    except Exception:
        laravel_picks = []

    # --- symbol-driven candidates ---
    sym_hits = symbol_search(owner_repo, branch, user_goal, limit=12)
    sym_paths = [p for p, _sc in sym_hits if p in all_files]

    # (legacy single-call fusion, kept for reference)
    #fused = _rrf_fuse_paths(prepicked, chroma_paths, meili_paths, laravel_picks)

    # --- optional RRF fusion of channels (enabled by default via RRF_ENABLE=1) ---
    use_rrf = str(os.getenv("RRF_ENABLE", "1")).lower() in ("1","true","yes")
    if use_rrf:
        k = int(os.getenv("RRF_K", "30"))
        # simple per-channel weights (tunable via env)
        w_signals = float(os.getenv("RRF_W_SIGNALS", "1.0"))
        w_chroma = float(os.getenv("RRF_W_CHROMA", "1.0"))
        w_meili = float(os.getenv("RRF_W_MEILI", "0.8"))
        w_sym = float(os.getenv("RRF_W_SYMBOLS", "1.3"))
        w_lara = float(os.getenv("RRF_W_LARAVEL", "1.2"))

        sources = [
            ("signals", prepicked, w_signals),
            ("chroma", chroma_paths, w_chroma),
            ("meili", meili_paths, w_meili),
            ("symbols", sym_paths, w_sym),
            ("laravel", laravel_picks,w_lara),
        ]

        rrf_scores: dict[str, float] = {}
        seen_any = set()
        for _name, paths, w in sources:
            for rank, p in enumerate(paths, start=1):
                if p not in all_files:
                    continue
                seen_any.add(p)
                # weighted reciprocal-rank contribution per channel
                rrf_scores[p] = rrf_scores.get(p, 0.0) + (w * (1.0 / (k + rank)))

        # pick top by RRF; fall back to the plain union when empty
        fused_paths = [p for p, _ in sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)]
        base_pool = fused_paths[: max_out*3] if fused_paths else []

        # build the pool (deduped) and top up with the old ordering if needed
        pool, seen = [], set()
        def add(p):
            if p not in seen and p in all_files:
                seen.add(p); pool.append(p)

        for p in base_pool: add(p)
        if len(pool) < max_out:
            for lst in (prepicked, chroma_paths, meili_paths, sym_paths, laravel_picks):
                for p in lst:
                    add(p)
    else:
        # legacy path: plain channel union without RRF
        pool, seen = [], set()
        def add(p):
            if p not in seen and p in all_files:
                seen.add(p); pool.append(p)
        for lst in (prepicked, chroma_paths, meili_paths, sym_paths, laravel_picks):
            for p in lst:
                add(p)

    # LLM rerank stays identical:
    cands = [{"path": p, "preview": get_file_preview(repo_root, p, quotes+hints)} for p in pool[:20]]
    ranked = await llm_rerank_candidates(user_goal, cands, topk=max_out)

    # light symbol boost AFTER the LLM rerank (unchanged)
    sym_map = {p: sc for p, sc in sym_hits}
    boost = float(os.getenv("SYMBOL_LIGHT_BOOST", "0.15"))
    rescored = []
    for c in ranked:
        base = float(c.get("score", 0.0))
        s = sym_map.get(c["path"], 0)
        adj = base + (boost if s > 0 else 0.0)
        rescored.append({**c, "score": adj})
    rescored.sort(key=lambda x: x["score"], reverse=True)
    return [c["path"] for c in rescored[:max_out]]
|
||
|
||
# ---------- Focus-snippets ----------
|
||
def extract_focus_snippets(text: str, needles: List[str], window: int = 240, max_snippets: int = 3) -> str:
    """Collect up to *max_snippets* windows of *text* around needle hits.

    Each needle contributes at most four case-insensitive occurrences;
    mutually containing windows are de-duplicated. Falls back to the head
    of the text (2 * window chars) when nothing matches.
    """
    if not text or not needles:
        return text[:window * 2] if text else ""
    haystack = text.lower()
    windows: list[str] = []
    for needle in needles:
        probe = (needle or "").lower()
        if not probe:
            continue
        cursor = 0
        remaining = 4  # cap occurrences per needle
        while remaining:
            pos = haystack.find(probe, cursor)
            if pos < 0:
                break
            lo = max(0, pos - window)
            hi = min(len(text), pos + len(probe) + window)
            windows.append(text[lo:hi])
            cursor = pos + len(probe)
            remaining -= 1
    kept: list[str] = []
    for candidate in windows:
        # mutual-containment dedupe (avoid overlapping/nested snippets)
        if all(candidate not in other and other not in candidate for other in kept):
            kept.append(candidate)
        if len(kept) >= max_snippets:
            break
    return "\n----- CONTEXT SPLIT -----\n".join(kept) if kept else text[:window * 2]
|
||
|
||
# ---------- LLM edit-plan ----------
|
||
async def llm_plan_edits_for_file(user_goal: str, rel: str, focus_snippet: str) -> dict | None:
    """Ask the LLM for a JSON edit plan for file *rel*.

    The plan schema supports anchored inserts, string/regex replaces and
    between-anchor edits (max 4 edits, each with an 'explain'). Returns
    the parsed plan dict, or None on LLM failure / malformed JSON.
    """
    SYSTEM = "Produceer uitsluitend geldige JSON; geen verdere uitleg. Minimaliseer edits; raak zo min mogelijk regels."
    # (optional) short tree hint in the prompt — AGENT_TREE_PROMPT toggles it
    # Tree hint is on by default: brief folder overview + nearby-file summaries
    tree_block = globals().get("_LLM_EDIT_TREE_HINT", "")
    tree_hint = os.getenv("AGENT_TREE_PROMPT","1").lower() not in ("0","false")
    try:
        if tree_hint:
            # NB: simple local context only (siblings + folder info) to save tokens
            # (normally needs repo_root; when unavailable, a placeholder is used)
            if not tree_block:
                tree_block = "\n(Tree-overzicht niet beschikbaar in deze context)\n"
    except Exception:
        pass
    USER = (
        "Doel:\n" + user_goal + "\n\n" +
        f"Bestand: {rel}\n" +
        "Relevante contextfragmenten:\n----- BEGIN SNIPPETS -----\n" +
        focus_snippet + "\n----- EIND SNIPPETS -----\n\n" +
        ("Korte tree-hint:\n" + tree_block + "\n") +
        "JSON schema:\n" +
        "{ \"allow_destructive\": false, \"edits\": [\n" +
        " {\"type\":\"regex_replace\",\"pattern\":\"...\",\"replacement\":\"...\",\"flags\":\"ims\",\"count\":1,\"explain\":\"...\"},\n" +
        " {\"type\":\"string_replace\",\"find\":\"...\",\"replace\":\"...\",\"count\":1,\"explain\":\"...\"},\n" +
        " {\"type\":\"insert_after\",\"anchor_regex\":\"...\",\"text\":\"...\",\"occur\":\"first|last\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        " {\"type\":\"insert_before\",\"anchor_regex\":\"...\",\"text\":\"...\",\"occur\":\"first|last\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        " {\"type\":\"replace_between_anchors\",\"start_regex\":\"...\",\"end_regex\":\"...\",\"replacement\":\"...\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        " {\"type\":\"delete_between_anchors\",\"start_regex\":\"...\",\"end_regex\":\"...\",\"keep_anchors\":false,\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        " {\"type\":\"conditional_insert\",\"absent_regex\":\"...\",\"anchor_regex\":\"...\",\"text\":\"...\",\"occur\":\"first|last\",\"flags\":\"ims\",\"explain\":\"...\"},\n" +
        " {\"type\":\"insert_at_top\",\"text\":\"...\",\"explain\":\"...\"},\n" +
        " {\"type\":\"insert_at_bottom\",\"text\":\"...\",\"explain\":\"...\"}\n" +
        "]}\n" +
        "Maximaal 4 edits. Geef bij elke edit een korte 'explain'."
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":SYSTEM},{"role":"user","content":USER}],
            stream=False, temperature=0.1, top_p=0.9, max_tokens=800
        )
        raw = resp.get("choices",[{}])[0].get("message",{}).get("content","").strip()
        plan = safe_json_loads(raw)
        # minimal shape validation: a dict with an "edits" list
        if isinstance(plan, dict) and isinstance(plan.get("edits"), list):
            return plan
        return None
    except Exception as e:
        logger.warning("WARN:agent_repo:llm_plan_edits_for_file failed for %s: %s", rel, e)
        return None
|
||
|
||
# ---------- Apply helpers ----------
|
||
def _regex_flags(flag_str: str) -> int:
|
||
flags = 0
|
||
if not flag_str: return flags
|
||
for ch in flag_str.lower():
|
||
if ch == 'i': flags |= re.IGNORECASE
|
||
if ch == 'm': flags |= re.MULTILINE
|
||
if ch == 's': flags |= re.DOTALL
|
||
return flags
|
||
|
||
def apply_edit_plan(original: str, plan: dict) -> tuple[str, int, List[str], bool]:
    """Apply an LLM-produced edit plan to *original* text.

    Returns: (modified, changes_count, explains[], allow_destructive)

    Each edit step is applied best-effort: a failing or non-matching step
    is skipped (failures are logged), so a partial plan can still land.
    Insert-type edits are idempotent via a ±200-char window check around
    the insertion point.
    """
    if not original or not plan or not isinstance(plan.get("edits"), list):
        return original, 0, [], False
    txt = original
    changes = 0
    explains: List[str] = []
    for ed in plan["edits"]:
        try:
            et = (ed.get("type") or "").lower()
            ex = ed.get("explain") or et
            if et == "string_replace":
                find = ed.get("find") or ""; rep = ed.get("replace") or ""
                # count <= 0 (or missing) is normalized to a single replacement
                cnt = int(ed.get("count") or 0) or 1
                if find:
                    new = txt.replace(find, rep, cnt)
                    if new != txt: changes += 1; txt = new; explains.append(f"string_replace: {ex}")
            elif et == "regex_replace":
                pat = ed.get("pattern") or ""; rep = ed.get("replacement") or ""
                flags = _regex_flags(ed.get("flags") or ""); cnt = int(ed.get("count") or 0) or 1
                if pat:
                    new, n = re.subn(pat, rep, txt, count=cnt, flags=flags)
                    if n > 0: changes += 1; txt = new; explains.append(f"regex_replace: {ex}")
            elif et in ("insert_after","insert_before"):
                anchor = ed.get("anchor_regex") or ""; ins = ed.get("text") or ""
                occur = (ed.get("occur") or "first").lower(); flags = _regex_flags(ed.get("flags") or "")
                if not anchor or not ins: continue
                matches = list(re.finditer(anchor, txt, flags))
                if not matches: continue
                # "last" picks the final anchor match; anything else means first
                m = matches[0] if occur != "last" else matches[-1]
                pos = m.end() if et == "insert_after" else m.start()
                # idempotence: skip when the text already sits near the anchor
                win_a, win_b = max(0, pos-200), min(len(txt), pos+200)
                if ins in txt[win_a:win_b]:
                    continue
                txt = txt[:pos] + ins + txt[pos:]; changes += 1; explains.append(f"{et}: {ex}")
            elif et in ("replace_between_anchors","delete_between_anchors"):
                srx = ed.get("start_regex") or ""; erx = ed.get("end_regex") or ""
                # replace_between_anchors always keeps the anchors themselves
                flags = _regex_flags(ed.get("flags") or ""); keep_anchors = bool(ed.get("keep_anchors")) if et == "delete_between_anchors" else True
                repl = ed.get("replacement") or ""
                if not srx or not erx: continue
                s_matches = list(re.finditer(srx, txt, flags))
                e_matches = list(re.finditer(erx, txt, flags))
                if not s_matches or not e_matches: continue
                s0 = s_matches[0]
                # pick the first end anchor AFTER the start anchor
                e0 = next((em for em in e_matches if em.start() >= s0.end()), None)
                if not e0: continue
                a = s0.end(); b = e0.start()
                if et == "replace_between_anchors":
                    txt = txt[:a] + repl + txt[b:]; changes += 1; explains.append(f"replace_between_anchors: {ex}")
                else:
                    # keep_anchors: remove only the interior; else remove anchors too
                    if keep_anchors: txt = txt[:a] + txt[b:]
                    else: txt = txt[:s0.start()] + txt[e0.end():]
                    changes += 1; explains.append(f"delete_between_anchors: {ex}")
            elif et == "conditional_insert":
                absent = ed.get("absent_regex") or ""; anchor = ed.get("anchor_regex") or ""
                occur = (ed.get("occur") or "first").lower(); ins = ed.get("text") or ""
                flags = _regex_flags(ed.get("flags") or "")
                if not anchor or not ins: continue
                # only insert when the absence-pattern does NOT match yet
                if absent and re.search(absent, txt, flags): continue
                matches = list(re.finditer(anchor, txt, flags))
                if not matches: continue
                m = matches[0] if occur != "last" else matches[-1]
                pos = m.end()
                # idempotence: local window check around the insertion point
                win_a, win_b = max(0, pos-200), min(len(txt), pos+200)
                if ins in txt[win_a:win_b]:
                    continue
                txt = txt[:pos] + ins + txt[pos:]; changes += 1; explains.append(f"conditional_insert: {ex}")
            elif et == "insert_at_top":
                ins = ed.get("text") or ""
                if ins: txt = ins + txt; changes += 1; explains.append(f"insert_at_top: {ex}")
            elif et == "insert_at_bottom":
                ins = ed.get("text") or ""
                if ins: txt = txt + ins; changes += 1; explains.append(f"insert_at_bottom: {ex}")
        except Exception as e:
            logger.warning("WARN:agent_repo:apply_edit_plan step failed: %s", e)
            continue
    allow_destructive = bool(plan.get("allow_destructive"))
    return txt, changes, explains, allow_destructive
|
||
|
||
# ==== BEGIN PATCH A: diff-based destructiveness check + threshold via env ====
# Safe default for AGENT_DESTRUCTIVE_RATIO (avoid NameError when the constant
# was not already defined earlier in the module / by configuration).
try:
    AGENT_DESTRUCTIVE_RATIO
except NameError:
    AGENT_DESTRUCTIVE_RATIO = float(os.getenv("AGENT_DESTRUCTIVE_RATIO", "0.45"))
|
||
|
||
def _deletion_ratio(original: str, modified: str) -> float:
|
||
"""Schat welk deel van de originele regels als deletions wegvalt."""
|
||
ol = original.splitlines()
|
||
ml = modified.splitlines()
|
||
if not ol:
|
||
return 0.0
|
||
# ndiff: regels met prefix '- ' tellen we als deletions
|
||
dels = 0
|
||
for line in difflib.ndiff(ol, ml):
|
||
if line.startswith("- "):
|
||
dels += 1
|
||
return dels / max(1, len(ol))
|
||
|
||
def is_destructive(original: str, modified: str, allow_destructive: bool) -> bool:
    """Decide whether an edit deletes too much of *original* to auto-apply.

    Explicit opt-in and very small files (<6 lines) are never blocked;
    otherwise the edit is blocked when the measured deletion ratio exceeds
    AGENT_DESTRUCTIVE_RATIO.
    """
    tiny_file = len(original.splitlines()) < 6
    if allow_destructive or tiny_file:
        return False
    return _deletion_ratio(original, modified) > AGENT_DESTRUCTIVE_RATIO
|
||
|
||
# ==== END PATCH A ====
|
||
|
||
def list_sibling_files(repo_root: Path, rel: str, limit: int = 12) -> List[str]:
    """List up to *limit* allowed, small (<500 KB) files next to *rel*.

    When rel's directory does not exist yet, walk up to the nearest
    existing ancestor so brand-new paths still get neighbour context.
    Names are sorted case-insensitively for stable output.
    """
    directory = (repo_root / rel).parent
    if not directory.exists():
        # directory may not exist yet: climb to the closest existing parent
        directory = repo_root / os.path.dirname(rel)
        while not directory.exists() and directory != repo_root:
            directory = directory.parent
    names: List[str] = []
    if directory.exists():
        for entry in directory.iterdir():
            if entry.is_file() and allowed_file(entry) and entry.stat().st_size < 500_000:
                names.append(str(entry.name))
    names.sort(key=str.lower)
    return names[:limit]
|
||
|
||
|
||
def read_snippet(p: Path, max_chars: int = 2000) -> str:
    """Best-effort read of at most *max_chars* characters from *p*.

    Returns "" on any read failure.
    """
    try:
        content = _read_text_file(p) or ""
    except Exception:
        return ""
    return content[:max_chars]
|
||
|
||
async def propose_new_file(repo_root: Path, rel: str, user_goal: str) -> tuple[Optional[str], str]:
    """
    Ask the LLM to generate a *complete new file* at path `rel`
    with minimal assumptions. Returns (content, reason); content is None
    when generation failed or the LLM returned nothing.
    """
    # NOTE(review): `ext` is computed but never used below — confirm whether
    # it was meant to steer the prompt.
    ext = os.path.splitext(rel)[1].lower()
    siblings = list_sibling_files(repo_root, rel)
    # Include up to 3 short sibling snippets as style/framework references.
    sibling_snippets = []
    for name in siblings[:3]:
        snippet = read_snippet(repo_root / os.path.join(os.path.dirname(rel), name), max_chars=1600)
        if snippet:
            sibling_snippets.append({"name": name, "snippet": snippet[:1600]})

    # Dutch prompts: the system message pins "exactly one complete file, no
    # extra refactors"; the user message carries goal, path, siblings and
    # the reference snippets gathered above.
    SYSTEM = "Je bent een zorgvuldige codegenerator. Lever exact één compleet bestand. Geen extra refactors."
    USER = (
        f"Doel (nieuwe file aanmaken):\n{user_goal}\n\n"
        f"Bestandspad: {rel}\n"
        f"Directory siblings: {', '.join(siblings) if siblings else '(geen)'}\n\n"
        "Enkele nabije referenties (indien aanwezig):\n" +
        "\n".join([f"--- {s['name']} ---\n{s['snippet']}" for s in sibling_snippets]) +
        "\n\nEisen:\n"
        "- Maak een minimal-werkende versie van dit bestand die past bij de context hierboven.\n"
        "- Raak geen andere paden aan; geen includes naar niet-bestaande bestanden.\n"
        "- Gebruik hetzelfde framework/stack als de referenties suggereren (indien duidelijk).\n"
        "- Output: alleen de VOLLEDIGE bestandinformatie in één codeblok, niets anders."
    )
    try:
        resp = await _llm_call(
            [{"role":"system","content":SYSTEM},{"role":"user","content":USER}],
            stream=False, temperature=0.2, top_p=0.9, max_tokens=2048
        )
        # Pull the first fenced code block out of the completion text.
        content = _extract_code_block(
            resp.get("choices",[{}])[0].get("message",{}).get("content","")
        ) or ""
        content = content.strip()
        if not content:
            return None, "LLM gaf geen inhoud terug."
        # Simple sanity limit on generated size.
        if len(content) > 200_000:
            content = content[:200_000]
        return content, "Nieuw bestand voorgesteld op basis van directory-context en doel."
    except Exception as e:
        logger.warning("WARN:agent_repo:propose_new_file failed for %s: %s", rel, e)
        return None, f"Kon geen nieuwe file genereren: {e}"
|
||
|
||
|
||
|
||
|
||
# ---------- Diff helper ----------
|
||
def make_diffs(original: str, modified: str, filename: str, max_lines: int = 200) -> str:
    """
    Render a unified diff between `original` and `modified`.

    Headers use the conventional a/<filename> and b/<filename> labels; the
    output is truncated to `max_lines` diff lines with a trailing marker.
    """
    diff = list(difflib.unified_diff(
        original.splitlines(keepends=True),
        modified.splitlines(keepends=True),
        # Bug fix: the previous version hard-coded the literal "(unknown)"
        # (an f-string without a placeholder) and ignored `filename` entirely.
        fromfile=f"a/{filename}",
        tofile=f"b/{filename}",
        lineterm=""
    ))
    if len(diff) > max_lines:
        return "".join(diff[:max_lines]) + "\n... (diff ingekort)"
    return "".join(diff)
|
||
|
||
def make_new_file_diff(filename: str, content: str, max_lines: int = 400) -> str:
    """
    Render a unified diff for a newly created file (/dev/null -> b/<filename>).

    Truncated to `max_lines` diff lines with a trailing marker.
    """
    new_lines = content.splitlines(keepends=True)
    diff = list(difflib.unified_diff(
        [], new_lines,
        fromfile="/dev/null",
        # Bug fix: previously an f-string without a placeholder emitted the
        # literal "b/(unknown)" and ignored `filename`.
        tofile=f"b/{filename}",
        lineterm=""
    ))
    if len(diff) > max_lines:
        return "".join(diff[:max_lines]) + "\n... (diff ingekort)"
    return "".join(diff)
|
||
|
||
# ---------- Lightweight Laravel Graph helpers ----------
|
||
def _view_name_to_path(repo_root: Path, view_name: str) -> Optional[str]:
|
||
"""
|
||
'users.index' -> resources/views/users/index.blade.php (als bestaand)
|
||
'users/index' -> idem. Return relatieve path of None als niet gevonden.
|
||
"""
|
||
if not view_name:
|
||
return None
|
||
cand = view_name.replace(".", "/").strip("/ ")
|
||
for ext in [".blade.php", ".php"]:
|
||
rel = f"resources/views/{cand}{ext}"
|
||
if (repo_root / rel).exists():
|
||
return rel
|
||
return None
|
||
|
||
def _controller_extract_views(text: str, repo_root: Path) -> list[str]:
    """
    Collect blade files referenced from a controller body.

    Matches `return view('x.y')`, `View::make('x.y')` and
    `Inertia::render('X/Y')` (the latter mapped to a blade path on a
    best-effort basis). Result is de-duplicated, first occurrence wins.
    """
    patterns = (
        r"(?:return\s+)?view\s*\(\s*['\"]([^'\"]+)['\"]",
        r"View::make\s*\(\s*['\"]([^'\"]+)['\"]",
        r"Inertia::render\s*\(\s*['\"]([^'\"]+)['\"]",
    )
    found: list[str] = []
    for rx in patterns:
        for m in re.finditer(rx, text, flags=re.I):
            rel = _view_name_to_path(repo_root, m.group(1))
            if rel and rel not in found:
                found.append(rel)
    return found
|
||
|
||
def _blade_extract_lang_keys(text: str) -> list[str]:
|
||
"""
|
||
Haal vertaalkeys uit Blade/PHP: __('x.y'), @lang('x.y'), trans('x.y')
|
||
"""
|
||
keys = []
|
||
for rx in [
|
||
r"__\(\s*['\"]([^'\"]+)['\"]\s*\)",
|
||
r"@lang\(\s*['\"]([^'\"]+)['\"]\s*\)",
|
||
r"trans\(\s*['\"]([^'\"]+)['\"]\s*\)"
|
||
]:
|
||
for m in re.finditer(rx, text):
|
||
keys.append(m.group(1))
|
||
# dedupe
|
||
seen=set(); out=[]
|
||
for k in keys:
|
||
if k not in seen:
|
||
out.append(k); seen.add(k)
|
||
return out
|
||
|
||
def _grep_lang_files_for_key(repo_root: Path, key: str, limit: int = 6) -> list[str]:
|
||
"""
|
||
Zoek in resources/lang/**/*.(json|php) naar KEY. Best-effort, klein limiet.
|
||
"""
|
||
base = repo_root / "resources/lang"
|
||
if not base.exists():
|
||
return []
|
||
hits=[]
|
||
try:
|
||
for p in base.rglob("*"):
|
||
if p.is_dir():
|
||
continue
|
||
if not (str(p).endswith(".json") or str(p).endswith(".php")):
|
||
continue
|
||
if p.stat().st_size > 300_000:
|
||
continue
|
||
txt = p.read_text(encoding="utf-8", errors="ignore")
|
||
if key in txt:
|
||
hits.append(str(p.relative_to(repo_root)))
|
||
if len(hits) >= limit:
|
||
break
|
||
except Exception:
|
||
pass
|
||
return hits
|
||
|
||
def _build_laravel_graph(repo_root: Path) -> dict[str, set[str]]:
    """
    Build a lightweight undirected graph:
      - routes/web.php|api.php <-> controller files
      - controller <-> views (via `return view(...)`)
      - view <-> lang files (for translation keys occurring in the view)
    Node labels are relative path names; edges are undirected (neighbours).
    """
    g: dict[str, set[str]] = {}

    def _add(a: str, b: str):
        # undirected edge: register both directions
        g.setdefault(a, set()).add(b)
        g.setdefault(b, set()).add(a)

    # NOTE(review): the indentation of this function was reconstructed; steps
    # 2 and 3 are assumed to be nested inside the controller/view loops (the
    # only nesting in which `cpath`/`vrel` are in scope) — confirm.
    # 1) routes -> controllers (reuse the existing route scanner)
    routes = laravel_scan_routes(repo_root)
    for r in routes:
        rp = r.get("file") or ""
        ctrl = r.get("controller") or ""
        if not ctrl:
            continue
        for cpath in _candidate_paths_for_controller(repo_root, ctrl):
            _add(rp, cpath)
            # 2) controllers -> views (parse the controller file)
            try:
                txt = _read_text_file(repo_root / cpath) or ""
            except Exception:
                txt = ""
            for vrel in _controller_extract_views(txt, repo_root):
                _add(cpath, vrel)
                # 3) views -> lang files (based on translation keys)
                try:
                    vtxt = _read_text_file(repo_root / vrel) or ""
                except Exception:
                    vtxt = ""
                for key in _blade_extract_lang_keys(vtxt):
                    for lrel in _grep_lang_files_for_key(repo_root, key, limit=4):
                        _add(vrel, lrel)
    return g
|
||
|
||
def _graph_bfs_boosts(graph: dict[str, set[str]], seeds: list[str], max_depth: int = 3) -> dict[str, tuple[int, str]]:
|
||
"""
|
||
BFS vanaf seed-nodes. Return: {node: (distance, via)} met via=eerste buur of route.
|
||
"""
|
||
from collections import deque
|
||
dist: dict[str, int] = {}
|
||
via: dict[str, str] = {}
|
||
q = deque()
|
||
for s in seeds:
|
||
if s in graph:
|
||
dist[s] = 0
|
||
via[s] = s
|
||
q.append(s)
|
||
while q:
|
||
cur = q.popleft()
|
||
if dist[cur] >= max_depth:
|
||
continue
|
||
for nb in graph.get(cur, ()):
|
||
if nb not in dist:
|
||
dist[nb] = dist[cur] + 1
|
||
via[nb] = cur if via.get(cur) == cur else via.get(cur, cur)
|
||
q.append(nb)
|
||
return {n: (d, via.get(n, "")) for n, d in dist.items()}
|
||
|
||
def _get_graph_cached(repo_root: Path, memo_key: str) -> dict[str, set[str]]:
    """Memoised access to the Laravel graph; honours AGENT_GRAPH_ENABLE."""
    if os.getenv("AGENT_GRAPH_ENABLE", "1").lower() in ("0", "false"):
        return {}
    cached = _GRAPH_CACHE.get(memo_key)
    if cached is not None:
        return cached
    try:
        graph = _build_laravel_graph(repo_root)
    except Exception:
        # graph building is best-effort; fall back to an empty graph
        graph = {}
    _GRAPH_CACHE[memo_key] = graph
    return graph
|
||
|
||
# ---------- Tree summaries (korte per-file beschrijving) ----------
|
||
def _summarize_file_for_tree(path: Path) -> str:
|
||
"""
|
||
Heuristische mini-samenvatting (<=160 chars):
|
||
- eerste docblock / commentregel / heading
|
||
- anders eerste niet-lege regel
|
||
"""
|
||
try:
|
||
txt = path.read_text(encoding="utf-8", errors="ignore")
|
||
except Exception:
|
||
return ""
|
||
head = txt[:1200]
|
||
# PHP docblock
|
||
m = re.search(r"/\*\*([\s\S]{0,400}?)\*/", head)
|
||
if m:
|
||
s = re.sub(r"[*\s]+", " ", m.group(1)).strip()
|
||
return (s[:160])
|
||
# single-line comments / headings
|
||
for rx in [r"^\s*//\s*(.+)$", r"^\s*#\s*(.+)$", r"^\s*<!--\s*(.+?)\s*-->", r"^\s*<h1[^>]*>([^<]+)</h1>", r"^\s*<title[^>]*>([^<]+)</title>"]:
|
||
mm = re.search(rx, head, flags=re.M|re.I)
|
||
if mm:
|
||
return mm.group(1).strip()[:160]
|
||
# first non-empty line
|
||
for line in head.splitlines():
|
||
ln = line.strip()
|
||
if ln:
|
||
return ln[:160]
|
||
return ""
|
||
|
||
def _build_tree_summaries(repo_root: Path, all_files: list[str], max_files: int = 2000) -> dict[str, str]:
    """Summarise up to `max_files` small files; returns {rel_path: summary}."""
    summaries: dict[str, str] = {}
    for rel in all_files:
        # cap on the number of *summarised* files, not files inspected
        if len(summaries) >= max_files:
            break
        full = repo_root / rel
        try:
            if full.stat().st_size > 200_000:
                continue
        except Exception:
            continue
        summary = _summarize_file_for_tree(full)
        if summary:
            summaries[rel] = summary
    return summaries
|
||
|
||
def _get_tree_cached(repo_root: Path, memo_key: str, all_files: list[str]) -> dict[str, str]:
    """Memoised access to the tree summaries; honours AGENT_TREE_ENABLE."""
    if os.getenv("AGENT_TREE_ENABLE", "1").lower() in ("0", "false"):
        return {}
    cached = _TREE_SUM_CACHE.get(memo_key)
    if cached is not None:
        return cached
    try:
        summaries = _build_tree_summaries(repo_root, all_files)
    except Exception:
        # best-effort feature: degrade to no summaries on any error
        summaries = {}
    _TREE_SUM_CACHE[memo_key] = summaries
    return summaries
|
||
|
||
# ---------- Mini tree-hint voor LLM edit-plannen ----------
|
||
def _make_local_tree_hint(repo_root: Path, rel: str, max_siblings: int = 14) -> str:
    """
    Build a compact overview of the directory of `rel` with 10-14 nearby files
    and short summaries. Kept short and predictable for the LLM.
    """
    try:
        base_dir = (repo_root / rel).parent
    except Exception:
        return ""
    try:
        folder = str(base_dir.relative_to(repo_root))
    except Exception:
        folder = base_dir.name
    lines = [f"Map: {folder or '.'}"]

    entries: list[str] = []
    try:
        for entry in sorted(base_dir.iterdir(), key=lambda x: x.name.lower()):
            if not entry.is_file():
                continue
            try:
                if not allowed_file(entry) or entry.stat().st_size > 200_000:
                    continue
            except Exception:
                continue
            summary = _summarize_file_for_tree(entry)
            if summary:
                entries.append(f"- {entry.name}: {summary[:120]}")
            else:
                entries.append(f"- {entry.name}")
            if len(entries) >= max_siblings:
                break
    except Exception:
        # best-effort: an unreadable directory yields just the header line
        pass
    lines.extend(entries)
    return "\n".join(lines)
|
||
|
||
# ---------- Basic syntax guards ----------
|
||
def _write_tmp(content: str, suffix: str) -> Path:
|
||
import tempfile
|
||
fd, path = tempfile.mkstemp(suffix=suffix)
|
||
os.close(fd)
|
||
p = Path(path)
|
||
p.write_text(content, encoding="utf-8")
|
||
return p
|
||
|
||
def _php_lint_ok(tmp_path: Path) -> bool:
|
||
# disable via AGENT_SYNTAX_GUARD=0
|
||
if os.getenv("AGENT_SYNTAX_GUARD","1").lower() in ("0","false"):
|
||
return True
|
||
try:
|
||
import subprocess
|
||
res = subprocess.run(["php","-l",str(tmp_path)], capture_output=True, text=True, timeout=8)
|
||
return res.returncode == 0
|
||
except Exception:
|
||
return True
|
||
|
||
def _blade_balance_ok(text: str) -> bool:
|
||
# Zeer conservatieve balans-check voor veelvoorkomende Blade directives
|
||
tl = (text or "").lower()
|
||
pairs = [("section","endsection"),("if","endif"),("foreach","endforeach"),("isset","endisset"),("php","endphp")]
|
||
for a,b in pairs:
|
||
if tl.count("@"+a) != tl.count("@"+b):
|
||
return False
|
||
return True
|
||
|
||
|
||
# ---------- Gerichte, veilige literal fallback ----------
|
||
# === PATCH: generieke HTML-scope vervanging ===
|
||
|
||
def html_scoped_literal_replace(html: str, old: str, new: str, tag_names: set[str]) -> tuple[str, bool, str]:
    """
    Try to replace `old` -> `new`, but ONLY inside the named tags.
    Works without external libs; uses a conservative regex (DOTALL).
    At most one replacement per tag block. Returns (modified, changed, rationale).
    """
    if not html or not old or not tag_names:
        return html, False, ""
    changed = False
    notes: list[str] = []
    result = html

    for tag in sorted(tag_names):
        # <tag ...> ... </tag> — one block at a time via a non-greedy DOTALL body
        block_re = re.compile(
            rf"(<\s*{re.escape(tag)}\b[^>]*>)(.*?)(</\s*{re.escape(tag)}\s*>)",
            flags=re.IGNORECASE | re.DOTALL,
        )

        def _replace_block(m, _tag=tag):
            nonlocal changed
            opening, body, closing = m.group(1), m.group(2), m.group(3)
            if old not in body:
                return m.group(0)
            # at most 1 replacement per tag block (per the docstring)
            replaced = body.replace(old, new, 1)
            if replaced == body:
                return m.group(0)
            changed = True
            notes.append(f"'{old}' vervangen binnen <{_tag}> (1x)")
            return opening + replaced + closing

        result = block_re.sub(_replace_block, result)

    return result, changed, "; ".join(notes) if changed else ""
|
||
|
||
# === PATCH: veilige, algemene string-literal vervanging ===
|
||
|
||
def quoted_literal_replace(original: str, old: str, new: str, max_occurrences: int = 2) -> tuple[str, bool, str]:
    """
    Replace 'old' or "old" as a quoted string literal, at most `max_occurrences`
    times. Language-agnostic: only string values change, never identifiers.
    Returns (modified, changed, rationale).
    """
    if not original or not old:
        return original, False, ""
    literal_re = re.compile(rf"(?P<q>['\"])({re.escape(old)})(?P=q)")
    replaced = 0

    def _swap(m):
        nonlocal replaced
        if replaced >= max_occurrences:
            return m.group(0)
        replaced += 1
        quote = m.group("q")
        return quote + new + quote

    result = literal_re.sub(_swap, original)
    if result != original and replaced > 0:
        return result, True, f"'{old}' → '{new}' als string-literal ({replaced}x, limiet {max_occurrences})"
    return original, False, ""
|
||
|
||
|
||
# ==== BEGIN PATCH B: per-bestand oud/nieuw bepalen + generieke fallback ====
|
||
def _literal_matches_with_context(src: str, needle: str, window: int = 160):
|
||
"""Vind alle posities waar 'needle' als literal voorkomt en geef de operator-context terug."""
|
||
escaped = re.escape(needle)
|
||
pat = re.compile(r"(?P<q>['\"])(" + escaped + r")(?P=q)")
|
||
for m in pat.finditer(src):
|
||
a, b = m.span()
|
||
before = src[max(0, a - window):a]
|
||
op = None
|
||
if re.search(r"\?\?\s*$", before):
|
||
op = "??"
|
||
elif re.search(r"\?\s*[^:\n]{0,120}:\s*$", before):
|
||
op = "?:"
|
||
elif re.search(r"\|\|\s*$", before):
|
||
op = "||"
|
||
elif re.search(r"\bor\b\s*$", before, flags=re.IGNORECASE):
|
||
op = "or"
|
||
yield (a, b, op)
|
||
|
||
def deduce_old_new_literals(user_goal: str, original: str) -> tuple[Optional[str], Optional[str], str]:
    """
    Pick OLD as the quoted string from the prompt that also occurs in the file
    and scores highest in fallback context (??, ?:, ||, or).
    Pick NEW as another quoted string from the prompt (preferably one that is
    not already present in the file). Returns (old, new, rationale).
    """
    quotes = extract_quotes(user_goal)
    if not quotes:
        return None, None, "Geen quoted strings in prompt gevonden."

    # Score OLD candidates: number of hits plus a bonus for operator context.
    ranked = []
    for candidate in quotes:
        hits = list(_literal_matches_with_context(original, candidate))
        if not hits:
            continue
        with_ctx = sum(1 for _, _, op in hits if op)
        ranked.append((candidate, 2 * with_ctx + len(hits), with_ctx))
    if not ranked:
        # None of the prompt quotes occurs in the file; no targeted fallback.
        return None, None, "Geen van de quotes uit prompt kwam in de file voor."

    ranked.sort(key=lambda item: (item[1], item[2]), reverse=True)
    old = ranked[0][0]

    # Pick NEW from the remaining quotes, preferring one absent from the file.
    remaining = [q for q in quotes if q != old]
    if not remaining:
        return old, None, f"OLD='{old}' gekozen; geen 'new' gevonden."
    absent = [q for q in remaining if q not in original]
    new = absent[0] if absent else remaining[0]
    return old, new, f"OLD='{old}' (meeste fallback-contexthits), NEW='{new}'."
|
||
|
||
def targeted_fallback_replace(original: str, old: str, new: str) -> tuple[str, bool, str]:
    """
    Replace only the literal OLD when it is clearly a fallback value next to
    ??, ?:, || or 'or'. At most one replacement is made (the first match).

    Returns (modified, changed, rationale).
    """
    if not original or not old:
        return original, False, ""
    # Consistency fix: reuse the shared context scanner instead of duplicating
    # its four operator-detection regexes inline — the two copies had already
    # started to drift apart and had to be kept in sync by hand.
    for start, end, op in _literal_matches_with_context(original, old, window=160):
        if not op:
            continue
        quote = original[start]  # the match begins with its quote character
        modified = original[:start] + quote + new + quote + original[end:]
        return modified, True, f"Gerichte vervanging van fallback-literal nabij operator '{op}'"
    return original, False, ""
|
||
|
||
# ==== END PATCH B ====
|
||
|
||
# === Repo-QA: vraag-antwoord over 1 specifieke repository ===
|
||
# Keyword hints (mixed Dutch/English) used to detect Laravel "create
# something new" intents: verbs describe the action, nouns the domain object.
_LARAVEL_CREATE_HINTS = {
    "verbs": ["create", "store", "new", "aanmaken", "aanmaak", "nieuw", "toevoegen", "add"],
    "nouns": ["melding", "incident", "ticket", "aanvraag", "report", "issue", "storingen", "storing"]
}
|
||
|
||
def _read_file_safe(p: Path) -> str:
    """Read a file via _read_text_file, returning "" on any failure."""
    try:
        content = _read_text_file(p) or ""
    except Exception:
        content = ""
    return content
|
||
|
||
def laravel_scan_routes(repo_root: Path) -> list[dict]:
    """Scan routes/web.php and routes/api.php for Laravel route definitions.

    Returns a list of dicts with keys: file, verb, uri, target, controller,
    method, name. Controller/method/name are best-effort regex extractions
    and may be None when the route syntax is not recognised.
    """
    out = []
    for rp in ["routes/web.php", "routes/api.php"]:
        p = repo_root / rp
        if not p.exists():
            continue
        txt = _read_file_safe(p)
        # Route::get('/uri', <target>) etc.
        for m in re.finditer(r"Route::(get|post|put|patch|delete|match)\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*([^)]+)\)", txt, flags=re.I):
            verb, uri, target = m.group(1).lower(), m.group(2), m.group(3)
            ctrl = None; method = None; name = None
            # 'Controller@method' style target
            m2 = re.search(r"['\"]([A-Za-z0-9_\\]+)@([A-Za-z0-9_]+)['\"]", target)
            if m2:
                ctrl, method = m2.group(1), m2.group(2)
            else:
                # ['Foo\\BarController::class', 'index'] or [Foo\\BarController::class, 'index']
                m2b = re.search(r"\[\s*([A-Za-z0-9_\\]+)::class\s*,\s*['\"]([A-Za-z0-9_]+)['\"]\s*\]", target)
                if m2b:
                    ctrl, method = m2b.group(1), m2b.group(2)
            # ->name('...') — only looked for in the 140 chars after the call
            tail = txt[m.end(): m.end()+140]
            m3 = re.search(r"->\s*name\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", tail)
            if m3: name = m3.group(1)
            out.append({"file": rp, "verb": verb, "uri": uri, "target": target, "controller": ctrl, "method": method, "name": name})
        # Route::resource('uri', 'Controller') — recorded with verb "resource"
        for m in re.finditer(r"Route::resource\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*['\"]([^'\"]+)['\"]\s*\)", txt, flags=re.I):
            res, ctrl = m.group(1), m.group(2)
            out.append({"file": rp, "verb": "resource", "uri": res, "target": ctrl, "controller": ctrl, "method": None, "name": None})
    return out
|
||
|
||
def _candidate_paths_for_controller(repo_root: Path, controller_fqcn: str) -> list[str]:
|
||
"""
|
||
Probeer Controller-bestand + views te vinden vanuit FQCN zoals App\\Http\\Controllers\\Foo\\BarController.
|
||
"""
|
||
rels = []
|
||
# controller pad
|
||
base = controller_fqcn.replace("\\\\","/").replace("\\","/")
|
||
name = base.split("/")[-1]
|
||
ctrl_guess = [
|
||
f"app/Http/Controllers/{base}.php",
|
||
f"app/Http/Controllers/{name}.php"
|
||
]
|
||
for g in ctrl_guess:
|
||
if (repo_root / g).exists():
|
||
rels.append(g)
|
||
# view dir guesses (resource-achtig)
|
||
view_roots = ["resources/views", "resources/views/livewire", "resources/views/components"]
|
||
stem = re.sub(r"Controller$", "", name, flags=re.I)
|
||
for vr in view_roots:
|
||
for hint in [stem, stem.lower()]:
|
||
dp = repo_root / f"{vr}/{hint}"
|
||
if dp.exists() and dp.is_dir():
|
||
for bp in dp.rglob("*.blade.php"):
|
||
if bp.stat().st_size < 500000:
|
||
rels.append(str(bp.relative_to(repo_root)))
|
||
return list(dict.fromkeys(rels))[:8]
|
||
|
||
def laravel_signal_candidates(repo_root: Path, user_goal: str, all_files: list[str], max_out: int = 6) -> list[str]:
    """
    Heuristic preselection for Laravel 'create/new' use cases:
      - searches routes for 'create|store' or semantic hints
      - projects the hits onto controllers + blade views
    Returns up to `max_out` relative paths from `all_files`.
    """
    # quick exit when no Laravel markers are present
    if not (repo_root / "artisan").exists() and not (repo_root / "composer.json").exists():
        return []

    # NOTE(review): `goal` is computed but not referenced below — confirm
    # whether _goal_hits was meant to use it.
    goal = (user_goal or "").lower()
    verbs = _LARAVEL_CREATE_HINTS["verbs"]
    nouns = _LARAVEL_CREATE_HINTS["nouns"]

    def _goal_hits(s: str) -> int:
        # verbs weigh slightly heavier than nouns
        lo = s.lower()
        v = sum(1 for w in verbs if w in lo)
        n = sum(1 for w in nouns if w in lo)
        return v*2 + n

    routes = laravel_scan_routes(repo_root)
    scored = []
    for r in routes:
        base_s = f"{r.get('uri','')} {r.get('name','')} {r.get('controller','') or ''} {r.get('method','') or ''}"
        score = _goal_hits(base_s)
        # bonus when the route explicitly targets create/store
        if (r.get("method") or "").lower() in ("create","store"):
            score += 3
        if r.get("verb") == "resource":
            # resource routes implicitly include create/store
            score += 2
        if score > 0:
            scored.append((score, r))

    if not scored:
        return []

    scored.sort(key=lambda x: x[0], reverse=True)
    picks: list[str] = []
    # NOTE(review): indentation reconstructed — the all_files scan below is
    # assumed to run once at function level, after the scored-routes loop.
    for _score, r in scored[:8]:
        # controller + its likely views
        if r.get("controller"):
            for rel in _candidate_paths_for_controller(repo_root, r["controller"]):
                if rel in all_files and rel not in picks:
                    picks.append(rel)
    # view guess on path names like "melding*create.blade.php"
    for rel in all_files:
        name = os.path.basename(rel).lower()
        dirname = os.path.dirname(rel).lower()
        if any(n in dirname for n in nouns) and ("create" in name or "form" in name):
            if rel not in picks:
                picks.append(rel)
        if len(picks) >= max_out:
            break
    return picks[:max_out]
|
||
|
||
|
||
def _detect_stack_summary(repo_root: Path) -> dict:
    """Heuristics: languages/likely framework, routes/migrations/DB hints."""
    summary = {
        "languages": {},
        "framework": [],
        "entrypoints": [],
        "routes": [],
        "db": [],
        "notable_dirs": [],
    }
    # count languages globally, by file extension
    ext_map = {}
    for rel in list_repo_files(repo_root):
        ext = os.path.splitext(rel)[1].lower()
        ext_map[ext] = ext_map.get(ext, 0) + 1
    summary["languages"] = dict(sorted(ext_map.items(), key=lambda x: x[1], reverse=True)[:8])

    # PHP/Laravel hints via composer.json requirements
    comp = repo_root / "composer.json"
    if comp.exists():
        try:
            import json as _json
            js = _json.loads(comp.read_text(encoding="utf-8", errors="ignore"))
            # dict union (|) requires Python 3.9+
            req = (js.get("require") or {}) | (js.get("require-dev") or {})
            if any("laravel/framework" in k for k in req.keys()):
                summary["framework"].append("Laravel")
        except Exception:
            pass
    if (repo_root / "artisan").exists():
        summary["entrypoints"].append("artisan (Laravel CLI)")
    # Node/frontend hints via package.json dependencies
    pkg = repo_root / "package.json"
    if pkg.exists():
        try:
            import json as _json
            js = _json.loads(pkg.read_text(encoding="utf-8", errors="ignore"))
            deps = list((js.get("dependencies") or {}).keys()) + list((js.get("devDependencies") or {}).keys())
            if any(x in deps for x in ["next", "nuxt", "react", "vue", "vite"]):
                summary["framework"].append("Node/Frontend")
        except Exception:
            pass

    # Routes (Laravel)
    for rp in ["routes/web.php", "routes/api.php"]:
        p = repo_root / rp
        if p.exists():
            txt = _read_text_file(p) or ""
            for m in re.finditer(r"Route::(get|post|put|patch|delete)\s*\(\s*['\"]([^'\"]+)['\"]", txt):
                summary["routes"].append(f"{rp}: {m.group(1).upper()} {m.group(2)}")
    # DB hints (Laravel/vanilla PHP)
    for rp in ["config/database.php", ".env", ".env.example", "app/config/database.php"]:
        p = repo_root / rp
        if p.exists():
            txt = _read_text_file(p) or ""
            if "DB_" in txt or "mysql" in txt or "sqlite" in txt or "pgsql" in txt:
                # NOTE: this stores the first 800 chars of the config, which
                # may contain secrets; _format_stack_summary_text only prints
                # the path part before the first ':'.
                snippet = txt[:800].replace("\r"," ")
                summary["db"].append(f"{rp}: {snippet}")
    # Notable directories that exist in the repo
    for d in ["app", "app/admin", "app/public", "public", "resources", "storage", "config", "routes", "src", "docs", "tests"]:
        if (repo_root / d).exists():
            summary["notable_dirs"].append(d)
    return summary
|
||
|
||
def _format_stack_summary_text(s: dict) -> str:
|
||
lines = []
|
||
if s.get("framework"):
|
||
lines.append("Frameworks (heuristiek): " + ", ".join(sorted(set(s["framework"]))))
|
||
if s.get("languages"):
|
||
langs = ", ".join([f"{k or '∅'}×{v}" for k,v in s["languages"].items()])
|
||
lines.append("Talen (bestandext): " + langs)
|
||
if s.get("notable_dirs"):
|
||
lines.append("Mappen: " + ", ".join(s["notable_dirs"]))
|
||
if s.get("entrypoints"):
|
||
lines.append("Entrypoints: " + ", ".join(s["entrypoints"]))
|
||
if s.get("routes"):
|
||
sample = "; ".join(s["routes"][:8])
|
||
lines.append("Routes (sample): " + sample)
|
||
if s.get("db"):
|
||
# toon alleen paden, geen volledige secrets
|
||
lines.append("DB-config aanwezig in: " + ", ".join([d.split(":")[0] for d in s["db"]]))
|
||
return "\n".join(lines)
|
||
|
||
def _collect_repo_context(repo_root: Path, owner_repo: Optional[str], branch: str, question: str, n_ctx: int = 8) -> list[dict]:
|
||
"""Kies relevante paden + snippets via hybrid RAG/keywords, voor QA."""
|
||
# Deze sync helper is bewust niet geïmplementeerd om misbruik te voorkomen.
|
||
# Gebruik altijd de async-variant: _collect_repo_context_async(...)
|
||
raise NotImplementedError("_collect_repo_context is niet beschikbaar; gebruik _collect_repo_context_async")
|
||
all_files = list_repo_files(repo_root)
|
||
# explicit paths uit vraag
|
||
picked: List[str] = []
|
||
for pth in extract_explicit_paths(question):
|
||
if pth in all_files and pth not in picked:
|
||
picked.append(pth)
|
||
else:
|
||
best = best_path_by_basename(all_files, pth)
|
||
if best and best not in picked: picked.append(best)
|
||
# hybrid rag
|
||
loop = asyncio.get_event_loop()
|
||
# NB: call hybrag via run_until_complete buiten async? we zitten al in async in hoofdhandler; hier helper sync → laat caller het async deel doen
|
||
return [] # placeholder; deze helper niet direct gebruiken buiten async
|
||
|
||
async def _collect_repo_context_async(repo_root: Path, owner_repo: Optional[str], branch: str, question: str, n_ctx: int = 8) -> list[dict]:
    """Async variant: pick relevant paths + snippets for repo QA.

    Combines explicit paths from the question, DB-artefact seeding, hybrid
    RAG selection and a keyword fallback. Returns up to `n_ctx` entries of
    the form {"path": rel, "snippet": str}.
    """
    all_files = list_repo_files(repo_root)
    picked: List[str] = []
    # explicit paths mentioned in the question (exact match or by basename)
    for pth in extract_explicit_paths(question):
        if pth in all_files and pth not in picked:
            picked.append(pth)
        else:
            best = best_path_by_basename(all_files, pth)
            if best and best not in picked: picked.append(best)

    # DB questions: seed with known DB artefacts first so recall is good immediately
    def _db_seed_paths() -> list[str]:
        prefer: list[str] = []
        # 1) direct, well-known locations
        for rel in [
            ".env", ".env.example", "config/database.php", "config/database.yml",
            "database/database.sqlite"
        ]:
            if (repo_root / rel).exists() and rel in all_files:
                prefer.append(rel)
        # 2) migrations / seeders / models
        for rel in all_files:
            lo = rel.lower()
            if lo.startswith("database/migrations/") or lo.startswith("database/seeders/"):
                prefer.append(rel)
            elif lo.startswith(("app/models/", "app/model/", "app/Models/")) and lo.endswith(".php"):
                # NOTE(review): "app/Models/" can never match `lo` (it was
                # already lowercased) — confirm whether that prefix is intended.
                prefer.append(rel)
            elif lo.endswith(".sql"):
                prefer.append(rel)
        # 3) rough heuristic: files containing Schema::, DB::, select/insert/update
        hits = []
        for rel in all_files:
            try:
                txt = _read_text_file(repo_root / rel) or ""
            except Exception:
                continue
            tlo = txt.lower()
            if any(x in tlo for x in ["schema::create(", "schema::table(", "db::table(", "db::select(", "select ", "insert into ", "create table "]):
                hits.append(rel)
        # de-duplicate and cap at n_ctx
        seen = set(); out = []
        for rel in prefer + hits:
            if rel not in seen:
                seen.add(rel); out.append(rel)
            if len(out) >= n_ctx:
                break
        return out

    if _db_intent(question):
        for p in _db_seed_paths():
            if p in all_files and p not in picked:
                picked.append(p)

    # hybrid RAG selection over the whole file list
    hybrid = await hybrid_rag_select_paths(repo_root, owner_repo, branch, question, all_files, max_out=n_ctx)
    for p in hybrid:
        if p not in picked: picked.append(p)
    # keyword fallback when still short of n_ctx paths
    if len(picked) < n_ctx:
        for rel, _s in simple_keyword_search(repo_root, all_files, question, limit=n_ctx):
            if rel not in picked: picked.append(rel)
    # build focused snippets around quotes/word hints from the question
    quotes = extract_quotes(question)
    hints = extract_word_hints(question)
    out = []
    for rel in picked[:n_ctx]:
        txt = _read_text_file(repo_root / rel) or ""
        snippet = extract_focus_snippets(txt, (quotes + hints)[:6], window=320, max_snippets=2)
        out.append({"path": rel, "snippet": snippet})
    return out
|
||
|
||
def _trim_text_to_tokens(text: str, max_tokens: int, tok_len=approx_token_count) -> str:
    """Trim `text` so its token count stays within `max_tokens` (rough cut)."""
    if tok_len(text) <= max_tokens:
        return text
    # crude character slice assuming ~4 chars per token, at least 200 chars
    return text[:max(200, max_tokens * 4)]
|
||
|
||
def _jaccard_tokens(a: str, b: str) -> float:
|
||
ta = set(re.findall(r"[A-Za-z0-9_]+", (a or "").lower()))
|
||
tb = set(re.findall(r"[A-Za-z0-9_]+", (b or "").lower()))
|
||
if not ta or not tb:
|
||
return 0.0
|
||
return len(ta & tb) / max(1, len(ta | tb))
|
||
|
||
def _db_intent(text: str) -> bool:
|
||
"""Detecteer of de vraag over DB-verbindingen/schema/queries gaat."""
|
||
t = (text or "").lower()
|
||
keys = [
|
||
"database", "sql", "microsoft sql", "ms sql", "mssql", "sql server",
|
||
"schema", "tabel", "tabellen", "migratie", "migrations",
|
||
"query", "queries", "select", "insert", "update", "delete",
|
||
"db_", "connection string", "dsn", "driver", "host", "poort", "poortnummer",
|
||
"database.php", ".env"
|
||
]
|
||
return any(k in t for k in keys)
|
||
|
||
|
||
def _prepare_contexts_under_budget(
    contexts: List[dict],
    question: str,
    stack_summary_text: str,
    *,
    budget_tokens: int = int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
    tok_len=approx_token_count
) -> List[dict]:
    """
    Smart budget allocator for QA context snippets:
    - dedup & near-dedup (by path and by Jaccard text overlap)
    - novelty weighting relative to already-chosen snippets
    - adaptive token allocation with a hard min/max per snippet

    Each input dict has keys "path" and "snippet"; the returned dicts have
    the same shape with the snippet text trimmed to its allocated budget.
    """
    if not contexts:
        return contexts

    # Tunables (defaults kept slightly conservative):
    MIN_PER = int(os.getenv("QA_MIN_PER_SNIPPET", "180"))          # hard minimum tokens per snippet
    MAX_PER = int(os.getenv("QA_MAX_PER_SNIPPET", "900"))          # hard maximum tokens per snippet
    KEEP_TOP = int(os.getenv("QA_KEEP_TOP_K", "8"))                # cap on number of snippets
    NOVELTY_THRESH = float(os.getenv("QA_NOVELTY_DROP", "0.25"))   # below this novelty we demote
    DEDUP_THRESH = float(os.getenv("QA_DEDUP_JACCARD", "0.85"))    # very high overlap => drop

    # 0) Cap the number of snippets up front (caller already ranked them).
    contexts = contexts[:KEEP_TOP]

    # 1) Brute dedup on path + near-dup on text (Jaccard).
    unique: List[dict] = []
    seen_paths = set()
    for c in contexts:
        p = c.get("path","")
        s = str(c.get("snippet",""))
        if p in seen_paths:
            continue
        # Near-dup check against the snippets already chosen.
        is_dup = False
        for u in unique:
            if _jaccard_tokens(u["snippet"], s) >= DEDUP_THRESH:
                is_dup = True
                break
        if not is_dup:
            unique.append({"path": p, "snippet": s})
            seen_paths.add(p)
    contexts = unique

    if not contexts:
        return contexts

    # Overhead estimate (headers + question + stack summary). The header text
    # mirrors the prompt built later in _llm_qa_answer so the estimate matches.
    header = (
        "Beantwoord de vraag over deze codebase. Wees concreet, kort, en noem bronnen (padnamen).\n"
        "Als zekerheid laag is, stel max 2 verduidelijkingsvragen.\n\n"
        f"VRAAG:\n{question}\n\n"
        f"REPO SAMENVATTING:\n{stack_summary_text or '(geen)'}\n\n"
        "RELEVANTE FRAGMENTEN:\n"
    )
    frag_headers = "\n\n".join([f"{i+1}) PATH: {c['path']}\nFRAGMENT:\n" for i, c in enumerate(contexts)])
    overhead_tokens = tok_len(header) + tok_len(frag_headers) + 200

    # Tokens remaining for actual snippet content (floor of 300).
    remain = max(300, budget_tokens - overhead_tokens)
    n = len(contexts)

    # 2) "Relevance proxy" = token overlap between the question and a snippet.
    def rel(sn: str) -> float:
        return _jaccard_tokens(question, sn)

    # 3) Greedy novelty: bonus score per snippet for info not yet covered.
    chosen_text = ""  # cumulative "coverage" text
    scores = []
    for i, c in enumerate(contexts):
        s = c["snippet"]
        r = rel(s)
        # novelty = 1 - overlap with the text chosen so far
        nov = 1.0 - _jaccard_tokens(chosen_text, s) if chosen_text else 1.0
        # Filter extremely low novelty: helps drop noise.
        if nov < NOVELTY_THRESH and i > 0:
            # Mark as weak; give it a very low score (may be dropped later).
            scores.append((i, r * 0.05, nov))
        else:
            # After 3 snippets, weigh novelty more heavily.
            if i >= 3:
                scores.append((i, r * (0.35 + 0.65 * nov), nov))
            else:
                scores.append((i, r * (0.5 + 0.5 * nov), nov))
        # Update coverage roughly: append (capped) text to avoid drift.
        if tok_len(chosen_text) < 4000:
            chosen_text += "\n" + s[:1200]

    # 4) If the sum of the minimums already exceeds the budget -> cut the tail.
    total_min = n * MIN_PER
    if total_min > remain:
        # Sort by score descending and keep as many as fit at MIN_PER each.
        ranked_idx = sorted(range(n), key=lambda i: scores[i][1], reverse=True)
        keep_idx = ranked_idx[: max(1, remain // MIN_PER)]
        contexts = [contexts[i] for i in keep_idx]
        scores = [scores[i] for i in keep_idx]
        n = len(keep_idx)

    # 5) Distribute the budget: everyone gets MIN_PER, the rest is spread
    #    proportionally to score; capped at MAX_PER.
    base = n * MIN_PER
    extra = max(0, remain - base)
    # Normalize the score weights.
    raw = [max(0.0, sc) for (_i, sc, _nov) in scores]
    ssum = sum(raw) or 1.0
    weights = [x / ssum for x in raw]

    alloc = [MIN_PER + int(extra * w) for w in weights]
    # Enforce MAX_PER; redistribute the surplus roughly.
    overshoot = 0
    for i in range(n):
        if alloc[i] > MAX_PER:
            overshoot += alloc[i] - MAX_PER
            alloc[i] = MAX_PER
    if overshoot > 0:
        # Spread the surplus over entries still below MAX_PER.
        holes = [i for i in range(n) if alloc[i] < MAX_PER]
        if holes:
            plus = overshoot // len(holes)
            for i in holes:
                alloc[i] = min(MAX_PER, alloc[i] + plus)

    # 6) Trim each snippet's text down to its allocated token budget.
    trimmed = []
    for i, c in enumerate(contexts):
        sn = str(c.get("snippet",""))
        sn = _trim_text_to_tokens(sn, alloc[i], tok_len)
        trimmed.append({"path": c["path"], "snippet": sn})
    return trimmed
async def _llm_qa_answer(question: str, stack_summary_text: str, contexts: List[dict]) -> str:
    """
    Ask the LLM for a concise answer with source references.

    - Answers in Dutch
    - Cites file paths as sources
    - Asks at most 2 clarifying questions when information is missing
    """
    # Trim the contexts to the token budget before building the prompt.
    contexts = _prepare_contexts_under_budget(
        contexts,
        question,
        stack_summary_text,
        budget_tokens=int(os.getenv("AGENT_QA_CTX_BUDGET_TOKENS", "6000")),
        tok_len=approx_token_count,
    )

    # Keep the 1200-char cap per fragment; the budget pass above already cuts earlier.
    fragment_blocks = [
        f"{idx}) PATH: {ctx['path']}\nFRAGMENT:\n{ctx['snippet'][:1200]}"
        for idx, ctx in enumerate(contexts, 1)
    ]
    fragments_text = "\n\n".join(fragment_blocks) if fragment_blocks else "(geen)"

    user_prompt = (
        "Beantwoord de vraag over deze codebase. Wees concreet, kort, en noem bronnen (padnamen).\n"
        "Als zekerheid laag is, stel max 2 verduidelijkingsvragen.\n\n"
        f"VRAAG:\n{question}\n\n"
        "REPO SAMENVATTING:\n" + (stack_summary_text or "(geen)") + "\n\n"
        "RELEVANTE FRAGMENTEN:\n" + fragments_text + "\n\n"
        "FORMAT:\n"
        "- Antwoord (kort en feitelijk)\n"
        "- Bronnen: lijst van paden die je gebruikt hebt\n"
        "- (optioneel) Vervolgvragen als iets onduidelijk is\n"
    )
    messages = [
        {"role": "system", "content": "Je bent een zeer precieze, nuchtere code-assistent. Antwoord in het Nederlands."},
        {"role": "user", "content": user_prompt},
    ]
    resp = await _llm_call(messages, stream=False, temperature=0.2, top_p=0.9, max_tokens=900)
    return resp.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
# heuristics: iets kleinere chunks voor Laravel/Blade/Routes, anders iets groter
|
||
def _chunk_params_for_repo(root: Path) -> tuple[int,int]:
|
||
# simpele stack detectie:
|
||
is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
|
||
if is_laravel:
|
||
return int(os.getenv("CHUNK_CHARS_LARAVEL","1800")), int(os.getenv("CHUNK_OVERLAP_LARAVEL","300"))
|
||
return int(os.getenv("CHUNK_CHARS_DEFAULT","2600")), int(os.getenv("CHUNK_OVERLAP_DEFAULT","350"))
|
||
|
||
|
||
# ---------- QA repo agent ----------
async def repo_qa_answer(repo_hint: str, question: str, branch: str = "main", n_ctx: int = 8) -> str:
    """
    High-level QA over a specific repo:
    - resolve + clone/update the repository
    - (re)index the per-repo RAG collection
    - build a stack summary
    - gather relevant context snippets
    - ask the LLM for an answer with source paths

    Returns the answer text, or a Dutch error message when the repo cannot
    be resolved or cloned.
    """
    meta, _reason = resolve_repo(repo_hint)
    if not meta:
        # If the hint looks like "owner/repo": verify existence right away.
        if re.match(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$", repo_hint):
            owner, name = repo_hint.split("/", 1)
            if not gitea_repo_exists(owner, name):
                return f"Repo `{repo_hint}` niet gevonden of geen rechten. Controleer naam/URL/token."
        return f"Kon repo niet vinden voor hint: {repo_hint}"

    repo_url = meta.get("clone_url") or repo_hint
    owner_repo = meta.get("full_name")

    # Clone/checkout (bounded by the clone semaphore).
    try:
        async with _CLONE_SEMA:
            repo_path = await _call_get_git_repo(repo_url, branch)
    except Exception as e:
        # Fall back to the legacy default branch name.
        branch = "master"
        try:
            async with _CLONE_SEMA:
                repo_path = await _call_get_git_repo(repo_url, branch)
        except Exception as e:
            return (f"Clonen mislukte voor `{owner_repo or repo_hint}`: {e}. "
                    "Controleer repo-naam/URL of je toegangsrechten.")
    root = Path(repo_path)

    # (Re)index the collection for this repo; fall back to the shared
    # 'code_docs' collection when the per-repo index fails.
    collection = repo_collection_name(owner_repo, branch)
    chunk_chars, overlap = _chunk_params_for_repo(Path(repo_path))
    try:
        await _rag_index_repo_internal(
            repo_url=repo_url, branch=branch, profile="auto",
            include="", exclude_dirs="", chunk_chars=chunk_chars, overlap=overlap,
            collection_name=collection
        )
    except Exception as e:
        logger.warning("WARN:agent_repo:rag_index for QA failed (%s), fallback 'code_docs': %s", collection, e)
        collection = "code_docs"
        await _rag_index_repo_internal(
            repo_url=repo_url, branch=branch, profile="auto",
            include="", exclude_dirs="", chunk_chars=chunk_chars, overlap=overlap,
            collection_name=collection
        )

    # Stack summary (framework/dependency overview used in the prompt).
    stack = _detect_stack_summary(root)
    stack_txt = _format_stack_summary_text(stack)

    # Symbol index is best-effort; QA still works without it.
    try:
        symbol_index_repo(root, owner_repo, branch)
    except Exception as e:
        logger.warning("WARN:agent_repo:symbol index build (QA) failed: %s", e)

    # Gather context snippets for the question.
    contexts = await _collect_repo_context_async(root, owner_repo, branch, question, n_ctx=n_ctx)

    # Produce the final answer.
    answer = await _llm_qa_answer(question, stack_txt, contexts)
    return answer
# ---------- Dry-run proposal ----------
async def propose_patches_without_apply(repo_path: str, candidates: List[str], user_goal: str) -> Tuple[Dict[str,str], Dict[str,str], Dict[str,str]]:
    """
    Propose patches for *candidates* without applying them.

    Returns: (proposed, diffs, reasons)
    - proposed[path]: full new file content
    - diffs[path]: rendered unified diff (truncated)
    - reasons[path]: short explanation of the change/choice

    Strategy per file (first match wins):
      0) targeted literal old->new replacement
      1) HTML-scoped replacement when the prompt names tags
      3) generic quoted-literal replacement
      4) focus snippets + LLM edit plan
      5) guarded full-rewrite fallback
    A final syntax guard drops proposals that fail PHP lint / Blade balance.
    """
    proposed, diffs, reasons = {}, {}, {}
    root = Path(repo_path)
    quotes = extract_quotes(user_goal)
    hints = extract_word_hints(user_goal)

    # Determine the task type locally (lightweight, single LLM call;
    # framework heuristic based on repo markers).
    is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
    try:
        _route = await _llm_task_route(user_goal, framework=("laravel" if is_laravel else "generic"))
        _task_type = (_route.get("task_type") or "").lower()
    except Exception:
        _task_type = ""

    def _is_view_or_lang(path: str) -> bool:
        # True for Blade templates and language files (the only targets a
        # ui_label_change may fully rewrite).
        return path.endswith(".blade.php") or path.startswith("resources/lang/")

    for rel in candidates:
        p = root / rel
        # If the path does not exist yet, try a create proposal.
        if not p.exists():
            content, because = await propose_new_file(root, rel, user_goal)
            if content:
                proposed[rel] = content
                diffs[rel] = make_new_file_diff(rel, content, max_lines=300)
                reasons[rel] = because
            else:
                logger.info("INFO:agent_repo:no create-proposal for missing file %s", rel)
            continue

        try:
            original = _read_text_file(p)
        except Exception:
            original = ""
        if not original:
            logger.info("INFO:agent_repo:skip unreadable/empty %s", rel)
            continue

        # 0) Targeted, safe fallback literal replace (only for old->new pairs).
        old, new, why_pair = deduce_old_new_literals(user_goal, original)
        if old and new:
            tmp, ok, because = targeted_fallback_replace(original, old, new)
            if ok and tmp != original:
                # No anti-destruction guard needed: minimal replacement.
                proposed[rel] = tmp
                diffs[rel] = make_diffs(original, tmp, rel, max_lines=200)
                reasons[rel] = f"{because}. ({why_pair})"
                continue

        # 1) HTML scope when the prompt mentions tag names.
        ctx = extract_context_hints_from_prompt(user_goal)
        if old and new and ctx["tag_names"]:
            scoped, ok, because = html_scoped_literal_replace(original, old, new, ctx["tag_names"])
            if ok and scoped != original and not is_destructive(original, scoped, allow_destructive=False):
                proposed[rel] = scoped
                diffs[rel] = make_diffs(original, scoped, rel, max_lines=200)
                reasons[rel] = (because + (f" ({why_pair})" if why_pair else ""))
                continue

        # 2) (fallback-literal replace is already covered by step 0)

        # 3) Generic quoted-literal replace (language-agnostic, keeps the edit minimal).
        if old and new:
            qrep, ok, because = quoted_literal_replace(original, old, new, max_occurrences=2)
            if ok and qrep != original and not is_destructive(original, qrep, allow_destructive=False):
                proposed[rel] = qrep
                diffs[rel] = make_diffs(original, qrep, rel, max_lines=200)
                reasons[rel] = (because + (f" ({why_pair})" if why_pair else ""))
                continue

        # 4) Focus snippets + LLM edit plan.
        needles = []
        if quotes: needles += quotes
        if hints: needles += hints[:6]
        focus = extract_focus_snippets(original, needles, window=240, max_snippets=3)

        # Tree hint on by default: build a compact directory tree and expose it
        # through a module global that the edit-plan prompt reads.
        try:
            globals()["_LLM_EDIT_TREE_HINT"] = _make_local_tree_hint(root, rel, max_siblings=14)
        except Exception:
            globals()["_LLM_EDIT_TREE_HINT"] = ""
        plan = await llm_plan_edits_for_file(user_goal, rel, focus)
        if plan:
            patched, change_count, explains, allow_destructive = apply_edit_plan(original, plan)
            if change_count > 0 and patched.strip() != original.strip():
                if is_destructive(original, patched, allow_destructive):
                    logger.warning("WARN:agent_repo:destructive patch blocked for %s", rel)
                else:
                    proposed[rel] = patched
                    diffs[rel] = make_diffs(original, patched, rel, max_lines=200)
                    reasons[rel] = "LLM edit-plan: " + "; ".join(explains[:4])
                    continue

        # 5) Full-rewrite fallback (guarded).
        # For UI label tasks we forbid full rewrites on non-view/lang files.
        if _task_type == "ui_label_change" and not _is_view_or_lang(rel):
            logger.info("INFO:agent_repo:skip full rewrite for non-view/lang during ui_label_change: %s", rel)
            continue
        last_err = None
        for mx in [1024]:
            try:
                messages = [
                    {"role":"system","content":"Voer exact de gevraagde wijziging uit. GEEN extra refactors/best practices. Lever de volledige, werkende bestandinformatie als 1 codeblok."},
                    {"role":"user","content": f"Doel:\n{user_goal}\n\nBestand ({rel}) huidige inhoud:\n```\n{original}\n```"}
                ]
                resp = await _llm_call(messages, stream=False, temperature=0.2, top_p=0.9, max_tokens=mx)
                newc = _extract_code_block(resp.get("choices",[{}])[0].get("message",{}).get("content","")) or original
                if newc.strip() != original.strip():
                    if is_destructive(original, newc, allow_destructive=False):
                        logger.warning("WARN:agent_repo:destructive rewrite blocked for %s (ratio>%.2f)", rel, AGENT_DESTRUCTIVE_RATIO)
                        break  # early exit: no further attempts
                    proposed[rel] = newc
                    diffs[rel] = make_diffs(original, newc, rel, max_lines=200)
                    reasons[rel] = "Full rewrite (guarded): minimale aanpassing om het doel te halen."
                    break
            except Exception as e:
                last_err = e
                logger.warning("WARN:agent_repo:LLM rewrite fail %s mx=%d: %s", rel, mx, repr(e))
        if rel not in proposed and last_err:
            logger.error("ERROR:agent_repo:give up on %s after retries: %s", rel, repr(last_err))

    # --- Syntax guard filtering (final step) ---
    drop: List[str] = []
    for rel, content in proposed.items():
        try:
            # BUGFIX: test ".blade.php" BEFORE ".php". Every Blade file also
            # ends with ".php", so the previous order made the Blade branch
            # unreachable and sent templates through the PHP linter instead
            # of the Blade balance check.
            if rel.endswith(".blade.php"):
                if not _blade_balance_ok(content):
                    reasons[rel] = (reasons.get(rel, "") + " [Blade balance failed]").strip()
                    drop.append(rel)
            elif rel.endswith(".php"):
                tmp = _write_tmp(content, ".php")
                ok = _php_lint_ok(tmp)
                try:
                    tmp.unlink(missing_ok=True)
                except Exception:
                    pass
                if not ok:
                    reasons[rel] = (reasons.get(rel, "") + " [PHP lint failed]").strip()
                    drop.append(rel)
        except Exception:
            # When in doubt: let the patch through (fail-open); logged upstream.
            pass
    for rel in drop:
        proposed.pop(rel, None); diffs.pop(rel, None)
    return proposed, diffs, reasons
# ---------- Agent state ----------
@dataclass
class AgentState:
    """Mutable state carried across the stages of one repo-agent conversation."""
    stage: str = "TRIAGE"                # current pipeline stage (starts at TRIAGE)
    questions_asked: int = 0             # number of clarification questions asked so far
    user_goal: str = ""                  # the user's requested change, in free text
    repo_hint: str = ""                  # raw repo hint from the user (name/URL)
    selected_repo: dict | None = None    # resolved repo metadata, once chosen
    repo_url: str = ""                   # clone URL of the selected repo
    branch_base: str = AGENT_DEFAULT_BRANCH  # base branch to work from
    repo_path: str = ""                  # local checkout path
    owner_repo: str | None = None        # "owner/name" form, when known
    collection_name: str = ""            # RAG collection used for this repo
    candidate_paths: List[str] = field(default_factory=list)        # files considered for edits
    proposed_patches: Dict[str, str] = field(default_factory=dict)  # path -> proposed new content
    reasons: Dict[str, str] = field(default_factory=dict)           # path -> explanation of the change
    new_branch: str = ""                 # presumably the branch the patch is pushed to — confirm against apply flow
    dry_run: bool = True                 # propose only; do not apply/push
    repo_candidates: List[dict] = field(default_factory=list)       # alternative repo matches for the hint
    smart_preview: str = ""              # preview text shown to the user
    recovery_attempted: bool = False     # whether a recovery pass was already tried
# --- bootstrap op echte repo-inhoud ------------------------------------------------
|
||
async def _detect_repo_url(text: str) -> str | None:
|
||
m = re.search(r"(https?://\S+?\.git)\b", text or "")
|
||
return m.group(1) if m else None
|
||
|
||
async def _ensure_indexed(repo_url: str, *, branch: str = "main", profile: str = "auto",
|
||
rag_index_repo_internal_fn=None, get_git_repo_fn=None):
|
||
# clone/update (best-effort) om failures vroeg te vangen
|
||
if get_git_repo_fn:
|
||
try:
|
||
loop = asyncio.get_running_loop()
|
||
await loop.run_in_executor(None, get_git_repo_fn, repo_url, branch)
|
||
except Exception:
|
||
pass
|
||
if rag_index_repo_internal_fn:
|
||
await rag_index_repo_internal_fn(
|
||
repo_url=repo_url, branch=branch, profile=profile,
|
||
include="", exclude_dirs="",
|
||
chunk_chars=int(os.getenv("RAG_CHUNK_CHARS","3000")),
|
||
overlap=int(os.getenv("RAG_CHUNK_OVERLAP","400")),
|
||
collection_name=os.getenv("RAG_COLLECTION","code_docs"),
|
||
)
|
||
|
||
async def _bootstrap_overview(repo_url: str, rag_query_internal_fn, *, collection="code_docs") -> str:
|
||
"""Haalt echte passages op en maakt een compacte context."""
|
||
# Bij per-repo collections is een extra repo-filter contraproductief.
|
||
# Gebruik daarom repo=None zodra we een collection doorgeven.
|
||
owner, name = owner_repo_from_url(repo_url)
|
||
repo_full = f"{owner}/{name}" if (owner and name) else None
|
||
wants = [
|
||
{"q": "project overview readme", "path_contains": "README"},
|
||
{"q": "install setup configuration", "path_contains": "README"},
|
||
{"q": "composer dependencies autoload", "path_contains": "composer.json"},
|
||
{"q": "npm dependencies scripts", "path_contains": "package.json"},
|
||
{"q": "routes definitions", "path_contains": "routes"},
|
||
{"q": "controllers overview", "path_contains": "app/Http/Controllers"},
|
||
{"q": "views templates blade", "path_contains": "resources/views"},
|
||
{"q": "env example", "path_contains": ".env"},
|
||
]
|
||
chunks = []
|
||
for w in wants:
|
||
res = await rag_query_internal_fn(
|
||
query=w["q"], n_results=3,
|
||
collection_name=collection, # per-repo collectie al gebruikt
|
||
repo=None, # voorkom dubbele/te strikte scoping
|
||
path_contains=w["path_contains"], profile=None
|
||
)
|
||
chunks.extend((res or {}).get("results", []))
|
||
|
||
seen = set(); buf = []
|
||
for r in chunks[:18]:
|
||
meta = r.get("metadata") or {}
|
||
key = (meta.get("path",""), meta.get("chunk_index"))
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
body = (r.get("document") or "").strip()[:1200]
|
||
buf.append(f"### {meta.get('path','')}\n{body}")
|
||
return "\n\n".join(buf[:8]).strip()
|
||
|
||
def _extract_explicit_paths_robust(text: str) -> list[str]:
|
||
"""
|
||
Haalt bestands-paden uit vrije tekst robuust op.
|
||
Herkent tokens met minimaal één '/' en één '.' (extensie),
|
||
negeert trailing leestekens.
|
||
"""
|
||
if not text:
|
||
return []
|
||
pats = re.findall(r"[A-Za-z0-9_./\\-]+\\.[A-Za-z0-9_.-]+", text)
|
||
out = []
|
||
for p in pats:
|
||
# normaliseer Windows backslashes → unix
|
||
p = p.replace("\\", "/")
|
||
# strip algemene trailing chars
|
||
p = p.strip().strip(",.;:)]}>'\"")
|
||
if "/" in p and "." in p:
|
||
out.append(p)
|
||
# de-dup behoud volgorde
|
||
seen = set(); uniq = []
|
||
for p in out:
|
||
if p not in seen:
|
||
uniq.append(p); seen.add(p)
|
||
return uniq
|
||
|
||
def _sanitize_path_hints(hints: list[str], all_files: list[str]) -> list[str]:
|
||
"""
|
||
Filter pseudo-paden zoals 'tool.list' weg. Sta alleen echte projectpaden of
|
||
bekende extensies toe en vereis een '/' om pure tokens te weren.
|
||
"""
|
||
if not hints:
|
||
return []
|
||
ALLOWED_SUFFIXES = (
|
||
".blade.php",".php",".js",".ts",".json",".yml",".yaml",".py",".md",".env",
|
||
".sql",".css",".vue",".jsx",".tsx"
|
||
)
|
||
BAD_BASENAMES = {"tool","tools","list","search","update","create","store","index"}
|
||
out, seen = [], set()
|
||
for h in hints:
|
||
if not h:
|
||
continue
|
||
h = h.strip().lstrip("./").replace("\\","/")
|
||
if "/" not in h:
|
||
continue
|
||
base = os.path.basename(h)
|
||
stem = base.split(".",1)[0].lower()
|
||
if h not in all_files and not any(h.endswith(suf) for suf in ALLOWED_SUFFIXES):
|
||
continue
|
||
if stem in BAD_BASENAMES and h not in all_files:
|
||
continue
|
||
if h not in seen:
|
||
seen.add(h); out.append(h)
|
||
return out
|
||
|
||
def _grep_repo_for_literal(root: Path, needle: str, limit: int = 12) -> list[str]:
    """
    Very fast, rough literal search across text files in the repo.

    Returns the relative paths of files containing *needle* (at most *limit*).
    Needles shorter than 2 characters are rejected; any walk error aborts the
    scan silently with the hits collected so far.
    """
    if not needle or len(needle) < 2:
        return []
    matches: list[str] = []
    try:
        for candidate in root.rglob("*"):
            if candidate.is_dir():
                continue
            # Honour excluded directories and the file-size limit.
            if any(part in _PROFILE_EXCLUDE_DIRS for part in candidate.parts):
                continue
            try:
                too_big = candidate.stat().st_size > 500_000
            except Exception:
                continue
            if too_big:
                continue
            # Only text-like extensions according to allowed_file().
            if not allowed_file(candidate):
                continue
            # Read as text, with a best-effort encoding fallback.
            body = None
            for enc in ("utf-8", "latin-1"):
                try:
                    body = candidate.read_text(encoding=enc, errors="ignore")
                    break
                except Exception:
                    continue
            if body is None or needle not in body:
                continue
            try:
                rel = str(candidate.relative_to(root))
            except Exception:
                rel = str(candidate)
            matches.append(rel)
            if len(matches) >= limit:
                break
    except Exception:
        pass
    return matches
def _laravel_priors_from_prompt(user_goal: str, root: Path, all_files: list[str], max_k: int = 8) -> list[str]:
|
||
"""
|
||
Geef een lijst met waarschijnlijke Laravel-bestanden op basis van conventies + prompt-keywords.
|
||
Neem ALLEEN paden op die daadwerkelijk bestaan in de repo (all_files).
|
||
"""
|
||
text = (user_goal or "").lower()
|
||
exists = set(all_files)
|
||
priors: list[str] = []
|
||
|
||
def add_if_present(paths: list[str]):
|
||
for p in paths:
|
||
if p in exists and p not in priors:
|
||
priors.append(p)
|
||
|
||
# Altijd nuttige ankerpunten in Laravel repos
|
||
add_if_present([
|
||
"routes/web.php",
|
||
"routes/api.php",
|
||
"config/app.php",
|
||
"config/database.php",
|
||
".env",
|
||
".env.example",
|
||
"resources/lang/en.json",
|
||
"resources/lang/nl.json",
|
||
])
|
||
|
||
# Prompt-gestuurde hints
|
||
if any(k in text for k in ("api ", "endpoint", "jwt", "sanctum", "api-route")):
|
||
add_if_present(["routes/api.php"])
|
||
if any(k in text for k in ("route", "router", "web", "pagina", "page", "url ")):
|
||
add_if_present(["routes/web.php"])
|
||
if any(k in text for k in ("controller", "actie", "action", "handler", "store(", "update(", "create(", "edit(")):
|
||
# neem de meest voorkomende controllers-map mee
|
||
# (geen directory listing; we kiezen alleen de indexerende anchor-files)
|
||
for p in exists:
|
||
if p.startswith("app/Http/Controllers/") and p.endswith(".php"):
|
||
priors.append(p)
|
||
if len(priors) >= max_k:
|
||
break
|
||
if any(k in text for k in ("view", "blade", "template", "pagina", "page", "formulier", "form")):
|
||
# bekende view-locaties
|
||
add_if_present([
|
||
"resources/views/layouts/app.blade.php",
|
||
"resources/views/welcome.blade.php",
|
||
"resources/views/dashboard.blade.php",
|
||
])
|
||
# heuristisch: als prompt een padfragment noemt (b.v. 'log/create'), pak views daaronder
|
||
m = re.search(r"resources/views/([A-Za-z0-9_/\-]+)/", user_goal)
|
||
if m:
|
||
base = f"resources/views/{m.group(1).strip('/')}/"
|
||
for p in exists:
|
||
if p.startswith(base) and p.endswith(".blade.php") and p not in priors:
|
||
priors.append(p)
|
||
if len(priors) >= max_k:
|
||
break
|
||
if any(k in text for k in ("validatie", "validation", "formrequest", "request class", "rules(")):
|
||
# vaak custom FormRequest classes
|
||
for p in exists:
|
||
if p.startswith("app/Http/Requests/") and p.endswith(".php"):
|
||
priors.append(p)
|
||
if len(priors) >= max_k:
|
||
break
|
||
if any(k in text for k in ("database", "db", "sql", "sqlserver", "mssql", "mysql", "pgsql", "connection", "migratie", "migration", "schema")):
|
||
add_if_present(["config/database.php", ".env", ".env.example"])
|
||
# migrations en models zijn vaak relevant
|
||
for p in exists:
|
||
if (p.startswith("database/migrations/") and p.endswith(".php")) or \
|
||
(p.startswith("app/Models/") and p.endswith(".php")):
|
||
priors.append(p)
|
||
if len(priors) >= max_k:
|
||
break
|
||
if any(k in text for k in ("taal", "language", "vertaling", "translation", "lang", "i18n")):
|
||
# neem json én php lang packs mee
|
||
for p in exists:
|
||
if p.startswith("resources/lang/") and (p.endswith(".json") or p.endswith(".php")):
|
||
priors.append(p)
|
||
if len(priors) >= max_k:
|
||
break
|
||
|
||
# dedupe + cap
|
||
uniq: list[str] = []
|
||
seen = set()
|
||
for p in priors:
|
||
if p not in seen:
|
||
uniq.append(p); seen.add(p)
|
||
if len(uniq) >= max_k:
|
||
break
|
||
return uniq
|
||
|
||
async def _llm_framework_priors(user_goal: str, all_files: list[str], framework: str = "laravel", max_k: int = 10) -> list[str]:
    """
    Have the LLM suggest plausible EXISTING files/globs based on framework conventions.
    - Output MUST be JSON: {"files":[...]} with relative paths or simple globs.
    - Results are filtered to really-existing paths (matched against all_files);
      globs are expanded against all_files.
    - No network I/O beyond one small LLM call.

    Returns at most *max_k* existing paths; empty list on any parse/LLM failure.
    """
    text = (user_goal or "").strip()
    if not text:
        return []
    # Modest token budget for the routing call.
    sys = ("You are a precise code navigator. Output ONLY compact JSON with likely file paths for the task.\n"
           "Rules:\n- Return: {\"files\":[\"relative/path/or/glob\", ...]}\n"
           "- Use framework conventions (e.g., Laravel routes/controllers/views, config, .env, migrations, lang).\n"
           "- Do NOT invent files that cannot exist; prefer generic globs (e.g., resources/views/**/create*.blade.php).\n"
           "- No explanations, no prose.")
    usr = (f"Framework: {framework}\n"
           f"Task/prompt:\n{text}\n"
           "Return at most 15 items.\n"
           "Examples for Laravel (if applicable): routes/web.php, app/Http/Controllers/**.php, "
           "resources/views/**.blade.php, config/database.php, .env, database/migrations/**.php, resources/lang/**")
    try:
        resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":usr}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=300
        )
        raw = (resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","").strip()
    except Exception:
        return []
    # Strip any ```json fences by grabbing the outermost {...} object.
    m = re.search(r"\{[\s\S]*\}", raw)
    if not m:
        return []
    try:
        obj = json.loads(m.group(0))
    except Exception:
        return []
    items = obj.get("files") or []
    if not isinstance(items, list):
        return []
    # Expand globs to concrete files; keep only paths that really exist.
    exists = set(all_files)
    out: list[str] = []
    def _match(pat: str) -> list[str]:
        # Simple glob support (**, *, ?), matched against all_files.
        try:
            pat_norm = pat.strip().lstrip("./")
            return [f for f in all_files if fnmatch.fnmatch(f, pat_norm)]
        except Exception:
            return []
    for it in items:
        if not isinstance(it, str) or not it.strip():
            continue
        it = it.strip().lstrip("./")
        if it in exists:
            # Exact existing path: take it directly.
            if it not in out:
                out.append(it)
        else:
            # Treat as a glob and expand against the file list.
            for hit in _match(it):
                if hit not in out:
                    out.append(hit)
        if len(out) >= max_k:
            break
    return out[:max_k]
async def _llm_task_route(user_goal: str, framework: str = "laravel") -> dict:
    """
    Have the LLM choose explicitly: {task_type, categories[], hints[]}.

    Example task_type values:
      "ui_label_change", "db_credentials", "db_queries", "routes_to_views",
      "config_env", "generic_code_change"
    categories: which folders/artifacts are relevant
      (e.g. ["views","controllers","routes","migrations","config",".env"])
    hints: short keywords or view/controller names.

    Returns {} for an empty goal; on any failure falls back to a generic route.
    """
    if not (user_goal or "").strip():
        return {}
    system_msg = ("You are a precise task router. Return ONLY compact JSON.\n"
                  "Schema: {\"task_type\":str, \"categories\":[str,...], \"hints\":[str,...]}\n"
                  "Use framework conventions (e.g., Laravel). No explanations.")
    user_msg = f"Framework: {framework}\nUser goal:\n{user_goal}\nReturn at most 6 categories and 8 hints."
    try:
        resp = await _llm_call(
            [{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
            stream=False, temperature=0.0, top_p=1.0, max_tokens=250
        )
        raw = (resp.get('choices', [{}])[0].get('message', {}) or {}).get('content', '')
        found = re.search(r"\{[\s\S]*\}", raw or "")
        obj = json.loads(found.group(0)) if found else {}
        # Sanitize: clamp field sizes and fill defaults.
        obj["task_type"] = (obj.get("task_type") or "generic_code_change")[:64]
        obj["categories"] = [str(c)[:32] for c in (obj.get("categories") or [])][:8]
        obj["hints"] = [str(h)[:64] for h in (obj.get("hints") or [])][:8]
        return obj
    except Exception:
        return {"task_type": "generic_code_change", "categories": [], "hints": []}
# ---------- Hoofd-handler ----------
|
||
async def handle_repo_agent(messages: List[dict], request) -> str:
    """Conversational entry point of the repo agent's state machine.

    Extension: a fast-path that emits unified diffs for explicitly named
    files with a literal old->new text replacement.  When the fast-path does
    not apply, control falls through to the staged flow:
    TRIAGE -> ASK -> (CONFIRM_REPO) -> SELECT_REPO (discover/index/rank/
    dry-run) -> PROPOSE_DIFF_DRYRUN -> APPLY -> DONE.

    Args:
        messages: OpenAI-style chat history (dicts with "role"/"content").
        request: incoming HTTP request; used to derive the session id.

    Returns:
        A user-facing (Dutch) markdown/plain-text reply for the current stage.
    """
    # 1) Combine user/system content into one text so the instruction can be parsed.
    try:
        full_txt = "\n".join([m.get("content","") for m in messages if m.get("role") in ("system","user")])
    except Exception:
        full_txt = ""

    # 2) Detect the fast-path: an explicit unified-diff request with named
    #    files and an old->new replacement pair.
    try_fast = _looks_like_unified_diff_request(full_txt)
    paths_fp = _extract_explicit_paths(full_txt) if try_fast else []
    old_txt, new_txt = _extract_replace_pair(full_txt) if try_fast else (None, None)

    # NB: we use the callables injected via initialize_agent:
    # - get_git_repo_fn (async)
    # - read_text_file_fn (sync)
    # These symbols are attached to globals() at the bottom of initialize_agent.
    get_git_repo_fn = globals().get("get_git_repo_fn")
    read_text_file_fn = globals().get("read_text_file_fn")

    if try_fast and paths_fp and old_txt and new_txt and callable(get_git_repo_fn) and callable(read_text_file_fn):
        # 3) Determine repo + branch from the prompt text.
        repo_url, branch = _extract_repo_branch_from_text(full_txt)
        if not repo_url:
            # Fallback: try the repo from earlier agent state (optional);
            # if that is also absent, the fast-path is skipped.
            repo_url = globals().get("_last_repo_url")
            branch = globals().get("_last_branch", "main")
        if repo_url:
            try:
                repo_root = await get_git_repo_fn(repo_url, branch or "main")
                root = Path(repo_root)
                # Laravel NL translation file; updated instead of the blade
                # template when the old text appears inside a translation wrapper.
                lang_path = root / "resources" / "lang" / "nl.json"
                lang_before = lang_path.read_text(encoding="utf-8", errors="ignore") if lang_path.exists() else "{}"
                lang_data = {}
                try:
                    lang_data = json.loads(lang_before or "{}")
                except Exception:
                    lang_data = {}

                diffs_out = []
                lang_changed = False

                def _make_udiff(a: str, b: str, rel: str) -> str:
                    # 3-line-context unified diff with git-style a/ b/ headers.
                    return "".join(difflib.unified_diff(
                        a.splitlines(keepends=True),
                        b.splitlines(keepends=True),
                        fromfile=f"a/{rel}", tofile=f"b/{rel}", n=3
                    ))

                # 4) Per file: either an inline replacement, or update the translation.
                for rel in paths_fp:
                    p = root / rel
                    if not p.exists():
                        continue
                    before = read_text_file_fn(p)
                    if not before:
                        continue
                    # If the 'old' text occurs INSIDE a translation wrapper,
                    # do NOT edit the blade file itself.
                    found_in_wrapper = False
                    for pat in _TRANS_WRAPPERS:
                        for m in re.finditer(pat, before):
                            inner = m.group(1)
                            if inner == old_txt:
                                found_in_wrapper = True
                                break
                        if found_in_wrapper:
                            break
                    if found_in_wrapper:
                        # Update nl.json instead: {"old": "new"}.
                        if lang_data.get(old_txt) != new_txt:
                            lang_data[old_txt] = new_txt
                            lang_changed = True
                        continue

                    # Otherwise: direct, exact replacement (conservative).
                    after = before.replace(old_txt, new_txt)
                    if after != before:
                        diff = _make_udiff(before, after, rel)
                        if diff.strip():
                            diffs_out.append(("blade", rel, diff))

                # 5) If the translation map changed: add a diff for nl.json too.
                if lang_changed:
                    new_lang = json.dumps(lang_data, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
                    diff_lang = _make_udiff(lang_before if isinstance(lang_before, str) else "", new_lang, "resources/lang/nl.json")
                    if diff_lang.strip():
                        diffs_out.append(("lang", "resources/lang/nl.json", diff_lang))
                if diffs_out:
                    parts = ["### Unified diffs"]
                    for kind, rel, d in diffs_out:
                        parts.append(f"**{rel}**")
                        parts.append("```diff\n" + d + "```")
                    return "\n\n".join(parts)
                else:
                    return "Dry-run: geen wijzigbare treffers gevonden in opgegeven bestanden (of reeds actueel)."
            except Exception as e:
                # Fast-path failed -> fall back to the regular discover/agent flow.
                pass

    # === No fast-path -> continue with the existing staged flow below ===

    # Session bookkeeping: one AgentState per session id.
    sid = _get_session_id(messages, request)
    st = _app.state.AGENT_SESSIONS.get(sid) or AgentState()
    _app.state.AGENT_SESSIONS[sid] = st
    user_last = next((m["content"] for m in reversed(messages) if m.get("role")=="user"), "").strip()
    user_last_lower = user_last.lower()
    logger.info("INFO:agent_repo:[%s] stage=%s", sid, st.stage)
    from smart_rag import (
        enrich_intent,
        expand_queries,
        hybrid_retrieve,
        _laravel_pairs_from_route_text,
        _laravel_guess_view_paths_from_text,
    )
    # If the user supplied a .git URL: set state and continue via the state machine.
    user_txt = next((m.get("content","") for m in reversed(messages) if m.get("role")=="user"), "")
    repo_url = await _detect_repo_url(user_txt)

    if repo_url:
        st.repo_hint = repo_url
        st.stage = "SELECT_REPO"
        logger.info("INFO:agent_repo:[%s] direct SELECT_REPO via .git url: %s", sid, repo_url)
        # NOTE: no early return here; the SELECT_REPO branch below handles
        # DISCOVER/INDEX etc.

    # === SMART-RAG: opt-in path (only while there is NO repo yet) ===
    smart_enabled = str(os.getenv("REPO_AGENT_SMART","1")).lower() not in ("0","false")
    if smart_enabled and not st.repo_hint and st.stage in ("TRIAGE","ASK"):
        # 1) Intent -> plan.
        spec = await enrich_intent(_llm_call, messages)
        task = spec.get("task","").strip()
        file_hints = spec.get("file_hints") or []
        keywords = spec.get("keywords") or []
        constraints= spec.get("constraints") or []
        acceptance = spec.get("acceptance") or []
        ask = spec.get("ask")

        # 2) Short query expansion and hybrid retrieval per variant.
        variants = await expand_queries(_llm_call, task, k=int(os.getenv("RAG_EXPAND_K","3")))
        merged: list[dict] = []
        for i, qv in enumerate(variants):
            partial = await hybrid_retrieve(
                _rag_query_internal,
                qv,
                repo= None,
                profile= None,
                path_contains=(file_hints[0] if file_hints else None),
                per_query_k=int(os.getenv("RAG_PER_QUERY_K","30")),
                n_results=int(os.getenv("RAG_N_RESULTS","18")),
                alpha=float(os.getenv("RAG_EMB_WEIGHT","0.6")),
            )
            merged.extend(partial)
        # Dedupe on (path, chunk_index), keeping the highest-scored hits first.
        seen = set(); uniq = []
        for r in sorted(merged, key=lambda x: x["score"], reverse=True):
            meta = r.get("metadata") or {}
            key = (meta.get("path",""), meta.get("chunk_index",""))
            if key in seen: continue
            seen.add(key); uniq.append(r)

        # 3) Context assembly + confidence.
        ctx_text, top_score = assemble_context(uniq, max_chars=int(os.getenv("REPO_AGENT_CONTEXT_CHARS","640000")))
        # Very simple confidence: if the top score is very low and questions
        # are allowed -> ask one clarifying question.
        if ask and float(os.getenv("REPO_AGENT_ASK_CLARIFY","1")) and top_score < float(os.getenv("REPO_AGENT_ASK_THRESHOLD","0.35")):
            return f"Snelle check: {ask}"

        # 4) Compose the final prompt.
        sys = (
            "Je bent een senior code-assistent. "
            "Lees de contextfragmenten (met padheaders). "
            "Beantwoord taakgericht, concreet en veilig. "
            "Als je verbeteringen doet, geef dan eerst een kort plan en daarna exacte, toepasbare wijzigingen."
        )
        user = (
            f"TAKEN:\n{task}\n\n"
            f"CONSTRAINTS: {', '.join(constraints) or '-'}\n"
            f"ACCEPTANCE: {', '.join(acceptance) or '-'}\n"
            f"KEYWORDS: {', '.join(keywords) or '-'}\n"
            f"FILE HINTS: {', '.join(file_hints) or '-'}\n\n"
            f"--- CONTEXT (gedeeltelijk) ---\n{ctx_text}\n--- EINDE CONTEXT ---\n\n"
            "Geef eerst een kort, puntsgewijs plan (max 6 bullets). "
            "Daarna de concrete wijzigingen per bestand met codeblokken. "
            "Geen herhaling van hele bestanden als dat niet nodig is."
        )
        llm_resp = await _llm_call(
            [{"role":"system","content":sys},{"role":"user","content":user}],
            stream=False, temperature=0.2, top_p=0.9, max_tokens=1536
        )
        out = (llm_resp.get("choices",[{}])[0].get("message",{}) or {}).get("content","")
        if out.strip():
            # Do NOT return here — store as a "quick analysis" preview instead.
            st.smart_preview = out
            logger.info("SMART-RAG preview gemaakt (geen vroegtijdige exit)")
    # === /SMART-RAG ===

    # Dry-run toggles driven by the user's last message.
    if any(k in user_last_lower for k in ["dry-run","dryrun","preview"]): st.dry_run = True
    if "apply" in user_last_lower and ("akkoord" in user_last_lower or "ga door" in user_last_lower): st.dry_run = False

    if st.stage == "TRIAGE":
        logger.info("Stage TRIAGE")
        st.user_goal = user_last
        # Optional: intent refinement + clarifying questions.
        if AGENT_ENABLE_GOAL_REFINE and st.user_goal:
            try:
                refined, questions, conf = await llm_refine_goal(st.user_goal)
                if refined and refined != st.user_goal:
                    st.user_goal = refined
                if questions and conf < AGENT_CLARIFY_THRESHOLD:
                    st.stage = "ASK"
                    qtxt = "\n".join([f"- {q}" for q in questions])
                    return ("Om zeker de juiste bestanden te kiezen, beantwoord kort:\n" + qtxt)
            except Exception:
                pass
        st.stage = "ASK"
        base = ("Ik verken de code en doe een voorstel. Geef de repo (bv. `admin/image-viewing-website` of "
                "`http://localhost:3080/admin/image-viewing-website.git`). "
                "Of zeg: **'zoek repo'** als ik zelf moet zoeken.")
        return _with_preview(base, st)

    if st.stage == "ASK":
        logger.info("Stage ASK ")
        # 1) Check whether the sentence contains a repo hint (URL or owner/repo).
        hint = None
        m = re.search(r"(https?://\S+)", user_last)
        if m: hint = m.group(1)
        elif "/" in user_last:
            for p in user_last.split():
                if re.match(r"^[A-Za-z0-9_.\-]+/[A-Za-z0-9_.\-]+$", p): hint = p; break
        # 2) Explicit request to search for a repo, and no hint -> auto-discovery.
        if (not hint) and ("zoek repo" in user_last_lower):
            # Try auto-discovery.
            st.repo_candidates = await discover_candidate_repos(st.user_goal)
            if not st.repo_candidates:
                st.questions_asked += 1
                return _with_preview("Ik kon geen repos vinden. Geef de Gitea repo (owner/repo) of volledige .git-URL.", st)
            # Normalize scores to 0..1.
            maxs = max((c.get("score",0.0) for c in st.repo_candidates), default=0.0) or 1.0
            for c in st.repo_candidates:
                c["score"] = min(1.0, c["score"]/maxs) if maxs else 0.0
            best = st.repo_candidates[0]
            # If the top score is decisive, auto-select.
            if best.get("score",0.0) >= AGENT_AUTOSELECT_THRESHOLD and best.get("clone_url"):
                st.repo_hint = best["clone_url"]
                st.stage = "SELECT_REPO"
                return _with_preview(f"Repo automatisch gekozen: **{best['full_name']}** (score {best['score']:.2f}).", st)
            # Otherwise: show the top 3 and ask the user to choose.
            st.stage = "CONFIRM_REPO"
            lines = []
            for i, c in enumerate(st.repo_candidates[:3], 1):
                lines.append(f"{i}. {c['full_name']} — score {c.get('score',0.0):.2f}")
            base = "Ik vond deze passende repos:\n" + "\n".join(lines) + "\nKies een nummer, of typ de naam/URL."
            return _with_preview(base, st)

        # 3) There IS a hint — continue.
        if hint:
            st.repo_hint = hint
            st.stage = "SELECT_REPO"
        else:
            st.questions_asked += 1
            if st.questions_asked <= AGENT_MAX_QUESTIONS:
                return _with_preview("Graag de Gitea repo (owner/repo) of volledige .git-URL.", st)
            return _with_preview("Ik heb de repo-naam of URL nodig om verder te gaan.", st)

    if st.stage == "CONFIRM_REPO":
        logger.info("Stage CONFIRM_REPO")
        # Parse the user's choice (number or name/URL).
        pick = None
        m = re.match(r"^\s*([1-5])\s*$", user_last)
        if m:
            idx = int(m.group(1)) - 1
            if 0 <= idx < len(st.repo_candidates):
                pick = st.repo_candidates[idx]
        if not pick:
            # Try a name match instead.
            for c in st.repo_candidates:
                if c["full_name"].lower() in user_last_lower or (c.get("clone_url","") and c["clone_url"] in user_last):
                    pick = c; break
        if not pick:
            return _with_preview("Typ een nummer (1..3) of de naam/URL van de repo.", st)

        st.repo_hint = pick.get("clone_url") or (f"{GITEA_URL}/{pick['full_name']}.git")
        st.stage = "SELECT_REPO"
        return _with_preview(f"Repo gekozen: **{pick['full_name']}**.", st)

    if st.stage == "SELECT_REPO":
        logger.info("Stage SELECT_REPO")
        repo_meta, reason = resolve_repo(st.repo_hint)
        if not repo_meta:
            return (f"Geen repo gevonden voor “{st.repo_hint}”. Probeer volledige URL: {GITEA_URL}/<owner>/<repo>.git")
        st.selected_repo = repo_meta
        st.repo_url = repo_meta.get("clone_url") or ""
        st.owner_repo = repo_meta.get("full_name")
        if not st.repo_url:
            return f"Geen clone URL voor “{st.repo_hint}”."
        progress = [f"Repo ({reason}): {st.owner_repo or st.repo_url}"]

        # DISCOVER: clone/checkout, then (re)build the various indexes.
        logger.info("DISCOVER")
        try:
            try:
                st.repo_path = await _call_get_git_repo(st.repo_url, st.branch_base)
            except Exception as e_main:
                # Primary branch failed -> retry on 'master'.
                logger.warning("WARN:agent_repo:get_git_repo %s failed: %s; fallback master", st.branch_base, e_main)
                st.branch_base = "master"
                st.repo_path = await _call_get_git_repo(st.repo_url, st.branch_base)

            st.collection_name = repo_collection_name(st.owner_repo, st.branch_base)
            chunk_chars, overlap = _chunk_params_for_repo(Path(st.repo_path))

            # ── Fast-path: check HEAD and skip indexing when unchanged ──
            try:
                import git
                head_sha = await run_in_threadpool(lambda: git.Repo(st.repo_path).head.commit.hexsha)
            except Exception:
                head_sha = ""
            #memo_key = f"{st.repo_url}|{st.branch_base}|{st.collection_name}"
            # 'Broad' key (repo+branch) avoids duplicate index runs at the same
            # HEAD, even when collection_name varies.
            memo_key = f"{st.repo_url}|{st.branch_base}"

            if _INDEX_HEAD_MEMO.get(memo_key) == head_sha and head_sha:
                progress.append(f"Index overslaan: HEAD ongewijzigd ({head_sha[:7]}).")
            else:
                try:
                    res = await _rag_index_repo_internal(
                        repo_url=st.repo_url, branch=st.branch_base,
                        profile="auto", include="", exclude_dirs="",
                        chunk_chars=chunk_chars, overlap=overlap, collection_name=st.collection_name
                    )
                    # Only update the memo when the index call succeeded.
                    _INDEX_HEAD_MEMO[memo_key] = head_sha or _INDEX_HEAD_MEMO.get(memo_key, "")

                    if isinstance(res, dict) and res.get("status") == "skipped":
                        progress.append(f"Index: skip (cache) — HEAD {head_sha[:7]}.")
                    else:
                        progress.append("Index: bijgewerkt.")
                except Exception as e_idx:
                    # Indexing into the per-repo collection failed -> retry
                    # into the shared 'code_docs' collection.
                    logger.warning("WARN:agent_repo:rag index failed '%s': %s; fallback 'code_docs'", st.collection_name, e_idx)
                    st.collection_name = "code_docs"
                    res = await _rag_index_repo_internal(
                        repo_url=st.repo_url, branch=st.branch_base,
                        profile="auto", include="", exclude_dirs="",
                        chunk_chars=chunk_chars, overlap=overlap, collection_name=st.collection_name
                    )
                    _INDEX_HEAD_MEMO[memo_key] = head_sha or _INDEX_HEAD_MEMO.get(memo_key, "")

            # After a successful _rag_index_repo_internal(...) and meili/bm25:
            # build the symbol index (best-effort).
            logger.info("Symbol index repo")
            try:
                symbol_index_repo(Path(st.repo_path), st.owner_repo, st.branch_base)
            except Exception as e:
                logger.warning("WARN:agent_repo:symbol index build failed: %s", e)

            # Keyword search backend: Meilisearch when configured, else BM25.
            logger.info("Meili part")
            if MEILI_URL:
                try:
                    # Skip the Meili re-index when HEAD is unchanged.
                    if _MEILI_HEAD_MEMO.get(memo_key) == head_sha and head_sha:
                        progress.append("Meili: overslaan (HEAD ongewijzigd).")
                    else:
                        await run_cpu_blocking(meili_index_repo, Path(st.repo_path), st.owner_repo, st.branch_base)
                        _MEILI_HEAD_MEMO[memo_key] = head_sha or _MEILI_HEAD_MEMO.get(memo_key, "")

                except Exception as e:
                    logger.warning("WARN:agent_repo:meili_index_repo failed: %s", e)
            else:
                try:
                    if _BM25_HEAD_MEMO.get(memo_key) == head_sha and head_sha:
                        progress.append("BM25: overslaan (HEAD ongewijzigd).")
                    else:
                        await run_cpu_blocking(bm25_build_index, Path(st.repo_path), st.owner_repo, st.branch_base)
                        _BM25_HEAD_MEMO[memo_key] = head_sha or _BM25_HEAD_MEMO.get(memo_key, "")
                except Exception as e:
                    logger.warning("WARN:agent_repo:bm25_build_index failed: %s", e)

            progress.append("DISCOVER klaar.")
            logger.info("DISCOVER klaar.")
        except Exception as e:
            logger.exception("ERROR:agent_repo:DISCOVER failed")
            st.stage = "ASK"
            return _with_preview("\n".join(progress + [f"DISCOVER mislukte: {e}"]), st)

        # RANK via hybrid RAG: build the candidate file list.
        logger.info("RANK via hybrid RAG")
        root = Path(st.repo_path)
        all_files = list_repo_files(root)
        # Precompute graph + tree (per HEAD) for ranking boosts and explanations.
        graph = _get_graph_cached(root, memo_key=(f"{st.repo_url}|{st.branch_base}|{head_sha or ''}"))
        tree_summ = _get_tree_cached(root, memo_key=(f"{st.repo_url}|{st.branch_base}|{head_sha or ''}"), all_files=all_files)

        picked: List[str] = []
        # 1) Explicit paths from the prompt (existing extractor).
        explicit = _sanitize_path_hints(list(extract_explicit_paths(st.user_goal) or []), all_files)
        # 2) Robust fallback extractor.
        robust = _sanitize_path_hints(_extract_explicit_paths_robust(st.user_goal), all_files)
        for pth in explicit + [p for p in robust if p not in explicit]:
            norm = pth.replace("\\", "/").strip()
            if norm in all_files and norm not in picked:
                picked.append(norm)
                continue
            best = best_path_by_basename(all_files, norm)
            if best and best not in picked:
                picked.append(best)
                continue
            # If the path does not exist: include it anyway (create-flow).
            if norm not in picked:
                picked.append(norm)

        # Laravel priors (existing paths only), applied BEFORE RAG.
        try:
            is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
        except Exception:
            is_laravel = False
        if is_laravel:
            priors = _laravel_priors_from_prompt(st.user_goal, root, all_files, max_k=int(os.getenv("LARAVEL_PRIORS_K","8")))
            for p in priors:
                if p not in picked:
                    picked.append(p)

        # ---- LLM priors (optional via env, on by default) ----
        use_llm_priors = os.getenv("LLM_PRIORS_ENABLE", "1").lower() not in ("0","false","no")
        if use_llm_priors:
            try:
                # Framework hint based on the repo layout.
                is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
            except Exception:
                is_laravel = False
            fw = "laravel" if is_laravel else "generic"
            llm_hits = await _llm_framework_priors(st.user_goal, all_files, framework=fw, max_k=int(os.getenv("LLM_PRIORS_K","12")))
            for p in llm_hits:
                if p not in picked:
                    picked.append(p)

        # ---- Rule-based fallback (only while the list is still thin) ----
        try:
            is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
        except Exception:
            is_laravel = False
        if is_laravel and len(picked) < max(4, int(os.getenv("LLM_PRIORS_MIN_BEFORE_RAG","4"))):
            priors = _laravel_priors_from_prompt(st.user_goal, root, all_files, max_k=int(os.getenv("LARAVEL_PRIORS_K","8")))
            for p in priors:
                if p not in picked:
                    picked.append(p)

        # --- LLM Task Router ---
        is_laravel = (root / "artisan").exists() or (root / "composer.json").exists()
        route = await _llm_task_route(st.user_goal, framework=("laravel" if is_laravel else "generic"))
        st.reasons["task_route"] = json.dumps(route, ensure_ascii=False)
        task_type = (route.get("task_type") or "").lower()

        # --- LLM search patterns -> deterministic repo scan ---
        if os.getenv("LLM_PATTERN_SCAN","1").lower() not in ("0","false","no"):
            specs = await _llm_make_search_specs(st.user_goal, framework=("laravel" if is_laravel else "generic"))
            scan_hits = _scan_repo_for_patterns(root, all_files, specs, max_hits=int(os.getenv("LLM_PATTERN_MAX_HITS","24")))
            for f in scan_hits:
                if f not in picked:
                    picked.append(f)

        # --- VIEW/LANG bias for UI label changes ---
        # Take the first quoted string in the prompt as the 'old' literal.
        qs = extract_quotes(st.user_goal) or []
        old_lit = qs[0] if qs else None

        def _contains_old(rel: str) -> bool:
            # True when the file contains the old literal (or when there is
            # no literal at all, in which case no filtering is applied).
            if not old_lit:
                return True
            try:
                txt = _read_text_file(Path(st.repo_path) / rel) or ""
                return old_lit in txt
            except Exception:
                return False

        view_files = [f for f in all_files
                      if f.startswith("resources/views/") and f.endswith(".blade.php")]
        lang_files = [f for f in all_files
                      if f.startswith("resources/lang/") and (f.endswith(".json") or f.endswith(".php"))]

        # If we know the old literal: prioritize the files that actually contain it.
        if old_lit:
            view_hits = [f for f in view_files if _contains_old(f)]
            lang_hits = [f for f in lang_files if _contains_old(f)]
        else:
            view_hits = view_files
            lang_hits = lang_files

        # Put the most likely candidates up front, keep current order otherwise.
        front = []
        for lst in (view_hits, lang_hits):
            for f in lst:
                if f in all_files and f not in front:
                    front.append(f)
        picked = list(dict.fromkeys(front + picked))[:MAX_FILES_DRYRUN]

        # --- (optional) framework-based priors (earlier patch A/B) ---
        # LLM priors + rule-based priors can be kept here as added before.

        # --- NEW: Smart-RAG path selection on this repo's collection ---

        # 1) Intent (for file_hints) + query expansion.
        logger.info("Smart RAG path select. 1) intent")
        spec = await enrich_intent(_llm_call, [{"role":"user","content": st.user_goal}])
        file_hints = (spec.get("file_hints") or [])
        variants = await expand_queries(_llm_call, spec.get("task") or st.user_goal, k=2)

        # 2) Retrieval per variant with repo filter & this repo's collection.
        logger.info("Smart RAG path select. 2) retrieval")
        merged = []
        for qv in variants:
            use_collection = bool(st.collection_name)
            part = await hybrid_retrieve(
                _rag_query_internal,
                qv,
                repo=_clean_repo_arg(st.owner_repo) if not use_collection else None,
                profile=None,
                path_contains=(file_hints[0] if file_hints else None),
                per_query_k=int(os.getenv("RAG_PER_QUERY_K","30")),
                n_results=int(os.getenv("RAG_N_RESULTS","18")),
                alpha=float(os.getenv("RAG_EMB_WEIGHT","0.6")),
                collection_name=(st.collection_name if use_collection else None)
            )
            merged.extend(part)

        # 3) Reduce to unique paths, sorted by score.
        logger.info("Smart RAG path select. 3) unieke paden sort op score")
        seen=set()
        for r in sorted(merged, key=lambda x: x.get("score",0.0), reverse=True):
            meta = r.get("metadata") or {}
            rel = meta.get("path","")
            if not rel or rel in seen:
                continue
            seen.add(rel)
            if rel not in picked:
                picked.append(rel)
        # 4) Laravel neighbors (small nudge, opt-in via env).
        logger.info("Smart RAG path select. 4) Laravel neighbors")
        if os.getenv("RAG_NEIGHBORS", "1").lower() not in ("0","false"):
            add = []
            for rel in picked[:8]:
                # routes -> controllers
                if rel in ("routes/web.php","routes/api.php"):
                    txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
                    for ctrl_path, _m in _laravel_pairs_from_route_text(txt):
                        if ctrl_path and ctrl_path not in picked and ctrl_path not in add:
                            add.append(ctrl_path)
                # controllers -> views
                if rel.startswith("app/Http/Controllers/") and rel.endswith(".php"):
                    txt = (Path(st.repo_path)/rel).read_text(encoding="utf-8", errors="ignore")
                    for v in _laravel_guess_view_paths_from_text(txt):
                        if v and v not in picked and v not in add:
                            add.append(v)
            # Extra: pull in small nearby partials/layouts (same dir, <=40KB).
            more = []
            for rel in (picked + add)[:8]:
                if rel.endswith(".blade.php"):
                    d = (Path(st.repo_path) / rel).parent
                    try:
                        for bp in d.glob("*.blade.php"):
                            if bp.name == os.path.basename(rel):
                                continue
                            if bp.stat().st_size <= 40_000:
                                cand = str(bp.relative_to(Path(st.repo_path)))
                                if cand not in picked and cand not in add and cand not in more:
                                    more.append(cand)
                    except Exception:
                        pass
            picked = (picked + add + more)[:MAX_FILES_DRYRUN]
        # 5) Literal-grep fallback: when the user implies an old->new change,
        # search the whole repo for the 'old' literal.
        qs = extract_quotes(st.user_goal) or []
        old = qs[0].strip() if qs and qs[0].strip() else None
        if old:
            grep_hits = _grep_repo_for_literal(Path(st.repo_path), old, limit=16)
            for rel in grep_hits:
                if rel in all_files and rel not in picked:
                    picked.append(rel)

        # Keyword fallback only when confidence is still low.
        top_conf = 0.0
        try:
            top_conf = max([r.get("score",0.0) for r in merged]) if merged else 0.0
        except Exception:
            pass
        if len(picked) < MAX_FILES_DRYRUN and top_conf < float(os.getenv("RAG_FALLBACK_THRESHOLD","0.42")):

            for rel, _s in simple_keyword_search(root, all_files, st.user_goal, limit=MAX_FILES_DRYRUN):
                if rel not in picked: picked.append(rel)
        # --- Weighted re-ranking (Meili/embeddings/heuristics/explicit) ---
        explicit_all = extract_explicit_paths(st.user_goal) + _extract_explicit_paths_robust(st.user_goal)
        explicit_all = [p.replace("\\","/").strip() for p in explicit_all]
        # 1) Collect meili/embedding scores from 'merged'.
        meili_scores = {}
        for r in merged:
            meta = (r or {}).get("metadata") or {}
            rel = meta.get("path","")
            if rel:
                try:
                    sc = float(r.get("score", 0.0))
                except Exception:
                    sc = 0.0
                meili_scores[rel] = max(meili_scores.get(rel, 0.0), sc)
        # 2) Weight and record the motivation per candidate.
        cand_scores = {}
        cand_why = {}
        def _boost(rel: str, amt: float, why: str):
            # Accumulate a score contribution and its human-readable reason.
            cand_scores[rel] = cand_scores.get(rel, 0.0) + float(amt)
            if amt > 0:
                cand_why[rel] = (cand_why.get(rel, "") + f"{why}; ").strip()
        for rel in picked:
            # Meili/embeddings top hit.
            if rel in meili_scores:
                _boost(rel, 0.55 * meili_scores[rel], "meili")
            # Path heuristics.
            lo = rel.lower()
            if lo.startswith("routes/"): _boost(rel, 0.08, "routes")
            if lo.startswith("app/http/controllers/"): _boost(rel, 0.06, "controller")
            if lo.startswith("resources/views/"): _boost(rel, 0.06, "view")
            if lo.startswith("resources/lang/"): _boost(rel, 0.05, "lang")
            # Explicitly named by the user.
            if rel in explicit_all: _boost(rel, 0.20, "explicit")

        # 2b) Graph boost: BFS from explicit seeds (and route files, if any).
        try:
            seeds = [p for p in picked if p in explicit_all]
            # Heuristic: when the user talks about routes, seed routes/web.php.
            if any(k in st.user_goal.lower() for k in [" route", "routes", "/"]):
                for rp in ["routes/web.php","routes/api.php"]:
                    if rp in picked and rp not in seeds:
                        seeds.append(rp)
            if graph and seeds:
                bfs = _graph_bfs_boosts(graph, seeds, max_depth=int(os.getenv("AGENT_GRAPH_MAX_DEPTH","3")))
                for rel in picked:
                    if rel in bfs:
                        d, via = bfs[rel]
                        # Distance -> boost: 0:0.08, 1:0.06, 2:0.03, 3:0.01.
                        boost_map = {0:0.08, 1:0.06, 2:0.03, 3:0.01}
                        b = boost_map.get(min(d,3), 0.0)
                        if b > 0:
                            _boost(rel, b, f"graph:d={d} via {via}")
                            st.reasons[f"graph::{rel}"] = f"d={d}, via {via}"
        except Exception:
            pass

        # 2c) Tree-summary boost: prompt-keyword hits in the file summaries.
        try:
            hints = extract_word_hints(st.user_goal) or []
            if hints and tree_summ:
                lo_hints = [h.lower() for h in hints[:8]]
                for rel in picked:
                    s = (tree_summ.get(rel) or "").lower()
                    if not s:
                        continue
                    hits = sum(1 for h in lo_hints if h in s)
                    if hits:
                        _boost(rel, min(0.04, 0.01 * hits), f"tree:{hits}hit")
                        if hits >= 2:
                            st.reasons[f"tree::{rel}"] = tree_summ.get(rel, "")[:200]
        except Exception:
            pass

        # 3) Sort by total score (descending).
        picked.sort(key=lambda p: cand_scores.get(p, 0.0), reverse=True)
        # 4) Record the motivation for the UI/preview.
        for rel in picked[:MAX_FILES_DRYRUN]:
            if cand_scores.get(rel, 0.0) > 0:
                st.reasons[f"rank::{rel}"] = f"{cand_scores[rel]:.2f} via {cand_why.get(rel,'')}"
        st.candidate_paths = picked[:MAX_FILES_DRYRUN]
        logger.info("CANDIDATES (explicit first, capped=%d): %s", MAX_FILES_DRYRUN, st.candidate_paths)
        if not len(st.candidate_paths)>0:
            st.stage = "ASK"
            return _with_preview("\n".join(progress + ["Geen duidelijke kandidaten. Noem een pagina/onderdeel of (optioneel) bestandsnaam."]), st)

        progress.append("Kandidaten:\n" + "\n".join([f"- {rel}" for rel in st.candidate_paths]))
        logger.info("Kandidaten gevonden!")

        # DRY-RUN: propose patches without writing anything.
        logger.info("dry-run")
        try:
            proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
            if not proposed:
                # ---- T3: automatic recovery (one attempt only) ----
                if not st.recovery_attempted:
                    st.recovery_attempted = True
                    try:
                        new_list, dbg = await _recovery_expand_candidates(
                            Path(st.repo_path), list_repo_files(Path(st.repo_path)),
                            st.user_goal, st.candidate_paths, last_reason="no_proposal_after_dryrun"
                        )
                        st.candidate_paths = new_list
                        st.reasons["recovery_note"] = dbg.get("recovery_plan",{}).get("note","")
                        # Retry with the expanded candidate list.
                        proposed2, diffs2, reasons2 = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
                        if proposed2:
                            st.proposed_patches = proposed2
                            st.reasons.update(reasons2 or {})
                            st.stage = "APPLY"
                            preview = []
                            for rel in list(diffs2.keys())[:3]:
                                why = st.reasons.get(rel, "")
                                preview.append(f"### {rel}\n```\n{diffs2[rel]}\n```\n**Waarom**: {why}")
                            more = "" if len(diffs2) <= 3 else f"\n(Plus {len(diffs2)-3} extra diff(s).)"
                            base = "\n".join(progress + [
                                "**Dry-run voorstel (na recovery):**",
                                "\n\n".join(preview) + more,
                                "\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback."
                            ])
                            return _with_preview(base, st, header="--- SMART-RAG + recovery notities ---")
                    except Exception as e:
                        logger.warning("WARN:agent_repo:recovery attempt failed: %s", e)
                # No success -> fall back to the existing message.
                st.stage = "PROPOSE_DIFF_DRYRUN"
                return "\n".join(progress + ["Dry-run: geen bruikbaar voorstel met deze kandidaten. Geef extra hint (pagina/ term)."])

            st.proposed_patches = proposed
            st.reasons = reasons
            st.stage = "APPLY"
            preview = []
            for rel in list(diffs.keys())[:3]:
                why = reasons.get(rel, "")
                preview.append(f"### {rel}\n```\n{diffs[rel]}\n```\n**Waarom**: {why}")
            more = "" if len(diffs) <= 3 else f"\n(Plus {len(diffs)-3} extra diff(s).)"
            base= "\n".join(progress + [
                "**Dry-run voorstel (geen writes):**",
                "\n\n".join(preview) + more,
                "\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback."
            ])
            return _with_preview(base, st, header="--- SMART-RAG contextnotities ---")
        except Exception as e:
            logger.exception("ERROR:agent_repo:PROPOSE_DIFF_DRYRUN failed")
            st.stage = "PROPOSE_DIFF_DRYRUN"
            return "\n".join(progress + [f"Dry-run mislukte: {e}"])

    if st.stage == "PROPOSE_DIFF_DRYRUN":
        logger.info("Stage PROPOSE_DIFF_DRYRUN")
        root = Path(st.repo_path)
        all_files = list_repo_files(root)
        # Merge any newly mentioned explicit paths into the candidate list.
        added = []
        for pth in extract_explicit_paths(user_last):
            if pth in all_files and pth not in st.candidate_paths:
                added.append(pth)
            else:
                best = best_path_by_basename(all_files, pth)
                if best and best not in st.candidate_paths: added.append(best)
        st.candidate_paths = (added + st.candidate_paths)[:MAX_FILES_DRYRUN]
        # Extra: grep for the 'old' literal from user_goal to enrich candidates.
        qs = extract_quotes(st.user_goal) or []
        old = qs[0].strip() if qs and qs[0].strip() else None
        if old:
            for rel in _grep_repo_for_literal(root, old, limit=16):
                if rel in all_files and rel not in st.candidate_paths:
                    st.candidate_paths.append(rel)

        try:
            proposed, diffs, reasons = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
            if not proposed:
                if not st.recovery_attempted:
                    st.recovery_attempted = True
                    try:
                        new_list, dbg = await _recovery_expand_candidates(
                            Path(st.repo_path), list_repo_files(Path(st.repo_path)),
                            st.user_goal, st.candidate_paths, last_reason="no_proposal_in_propose_diff"
                        )
                        st.candidate_paths = new_list
                        st.reasons["recovery_note"] = dbg.get("recovery_plan",{}).get("note","")
                        # Retry immediately after recovery.
                        proposed2, diffs2, reasons2 = await propose_patches_without_apply(st.repo_path, st.candidate_paths, st.user_goal)
                        if proposed2:
                            st.proposed_patches = proposed2
                            st.reasons.update(reasons2 or {})
                            st.stage = "APPLY"
                            preview = []
                            for rel in list(diffs2.keys())[:3]:
                                why = st.reasons.get(rel, "")
                                preview.append(f"### {rel}\n```\n{diffs2[rel]}\n```\n**Waarom**: {why}")
                            more = "" if len(diffs2) <= 3 else f"\n(Plus {len(diffs2)-3} extra diff(s).)"
                            base = ("**Dry-run voorstel (na recovery):**\n" +
                                    "\n\n".join(preview) + more +
                                    "\n\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback.")
                            return _with_preview(base, st, header="--- SMART-RAG + recovery notities ---")
                    except Exception as e:
                        logger.warning("WARN:agent_repo:recovery in PROPOSE_DIFF failed: %s", e)
                return _with_preview("Nog geen bruikbaar voorstel. Noem exact bestand/pagina of plak relevante code.", st)

            st.proposed_patches = proposed
            st.reasons = reasons
            st.stage = "APPLY"
            preview = []
            for rel in list(diffs.keys())[:3]:
                why = reasons.get(rel, "")
                preview.append(f"### {rel}\n```\n{diffs[rel]}\n```\n**Waarom**: {why}")
            more = "" if len(diffs) <= 3 else f"\n(Plus {len(diffs)-3} extra diff(s).)"
            base = ("**Dry-run voorstel (geen writes):**\n" +
                    "\n\n".join(preview) + more +
                    "\n\nTyp **'Akkoord apply'** om te schrijven & pushen, of geef feedback.")
            return _with_preview(base, st, header="--- SMART-RAG contextnotities ---")
        except Exception as e:
            logger.exception("ERROR:agent_repo:PROPOSE_DIFF_DRYRUN retry failed")
            return _with_preview(f"Dry-run mislukte: {e}", st)

    def _apply():
        # Synchronous worker (run via threadpool): writes the proposed patches
        # on a fresh task branch, commits with a rationale, and pushes.
        if not (("akkoord" in user_last_lower) and ("apply" in user_last_lower)):
            return "Typ **'Akkoord apply'** om de dry-run wijzigingen te schrijven & pushen."
        try:
            repo_path = _get_git_repo(st.repo_url, st.branch_base)
            import git
            repo = git.Repo(repo_path)
            # Branch name derived from the (slugified) user goal + timestamp.
            short = re.sub(r'[^a-z0-9\-]+','-', st.user_goal.lower()).strip("-")
            st.new_branch = f"task/{short[:40]}-{time.strftime('%Y%m%d-%H%M%S')}"
            repo.git.checkout("-b", st.new_branch)
            changed = []
            for rel, content in st.proposed_patches.items():
                f = Path(repo_path) / rel
                f.parent.mkdir(parents=True, exist_ok=True)
                f.write_text(content, encoding="utf-8")
                changed.append(str(f))
            if not changed:
                return "Er waren geen wijzigingen om te commiten."
            repo.index.add(changed)
            msg = (f"feat: {st.user_goal}\n\nScope:\n" +
                   "\n".join([f"- {Path(c).relative_to(repo_path)}" for c in changed]) +
                   "\n\nRationale (samengevat):\n" +
                   "\n".join([f"- {k}: {v}" for k,v in st.reasons.items()]) +
                   "\n\nCo-authored-by: repo-agent\n")
            repo.index.commit(msg)
            repo.remotes.origin.push(refspec=f"{st.new_branch}:{st.new_branch}")
            st.stage = "DONE"
            return f"✅ Branch aangemaakt en gepusht: `{st.new_branch}`. Maak nu je PR in Gitea."
        except Exception as e:
            # Apply/push failed -> return to the dry-run stage so the user
            # can retry or adjust.
            logger.exception("ERROR:agent_repo:APPLY failed")
            st.stage = "PROPOSE_DIFF_DRYRUN"
            return f"Apply/push mislukte: {e}"
    if st.stage == "APPLY":
        logger.info("Stage APPLY")
        # Git I/O is blocking; run it off the event loop.
        return await run_in_threadpool(_apply)

    if st.stage == "DONE":
        logger.info("Stage DONE")
        st.smart_preview = ""
        return f"Klaar. Branch: `{st.new_branch}`."
    return "Interne status onduidelijk; begin opnieuw of herformuleer je doel."
|
||
|
||
|