RAG improvement: better filename-based search (commit title, translated from Dutch: "RAG verbetering ivm filenaam zoeken")

This commit is contained in:
admin 2026-01-12 16:48:15 +01:00
parent 4e2ffac24c
commit 4c29600553

155
app.py
View File

@ -1932,6 +1932,17 @@ def _parse_validation_results(text: str) -> list[str]:
issues.append(line) issues.append(line)
return issues return issues
def _norm_collection_name(x: str | None, default="code_docs") -> str:
name = (x or "").strip()
return name or default
def _collection_effective(name: str) -> str:
name = (name or "code_docs").strip() or "code_docs"
# als je al versioned namen hebt, laat ze staan
if re.search(r"__v\d+$", name):
return name
return _collection_versioned(name)
async def _execute_tool(name: str, args: dict) -> dict: async def _execute_tool(name: str, args: dict) -> dict:
logger.info("toolcall: "+str(name)+" ("+str(args)+")") logger.info("toolcall: "+str(name)+" ("+str(args)+")")
if name == "repo_grep": if name == "repo_grep":
@ -1946,6 +1957,28 @@ async def _execute_tool(name: str, args: dict) -> dict:
hits = [] hits = []
qlow = query.lower() qlow = query.lower()
for p in root.rglob("*"): for p in root.rglob("*"):
#logger.info(p)
if qlow in str(p).lower() and str(p).split('.')[-1] in ['txt','md','htm','html','cpp','js','json','env','py','php','c','h']:
if qlow==str(p).lower().split("/")[-1]:
max_chars=55000
else:
max_chars=400
file_txt=_read_text_file(p)
if len(file_txt) > max_chars:
add_str=" so first " + str(max_chars) + " chars given in excerpt"
else:
add_str=" so file content given in excerpt"
hits.append({
"path": str(p.relative_to(root)),
"line": "-text found in filename"+str(add_str)+"- ",
"excerpt": str(file_txt)[:max_chars]
})
elif qlow in str(p).lower():
hits.append({
"path": str(p.relative_to(root)),
"line": "-text found in filename- ",
"excerpt": "-text found in filename"
})
if p.is_dir(): if p.is_dir():
continue continue
if set(p.parts) & PROFILE_EXCLUDE_DIRS: if set(p.parts) & PROFILE_EXCLUDE_DIRS:
@ -1989,7 +2022,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
out = await rag_query_api( out = await rag_query_api(
query=args.get("query",""), query=args.get("query",""),
n_results=int(args.get("n_results",5)), n_results=int(args.get("n_results",5)),
collection_name=args.get("collection_name","code_docs"), collection_name=_norm_collection_name(args.get("collection_name","code_docs"), "code_docs"),
repo=args.get("repo"), repo=args.get("repo"),
path_contains=args.get("path_contains"), path_contains=args.get("path_contains"),
profile=args.get("profile") profile=args.get("profile")
@ -2551,6 +2584,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
if mb: args["branch"] = mb.group(1) if mb: args["branch"] = mb.group(1)
elif fname == "rag_query": elif fname == "rag_query":
args["query"] = utext.strip() args["query"] = utext.strip()
m = re.search(r"\bcollection(?:_name)?\s*[:=]\s*([A-Za-z0-9_.-]+)", utext, re.I)
if m:
args["collection_name"] = m.group(1)
elif fname == "summarize_text": elif fname == "summarize_text":
m = re.search(r':\s*(.+)$', utext, re.S) m = re.search(r':\s*(.+)$', utext, re.S)
args["text"] = (m.group(1).strip() if m else utext.strip())[:16000] args["text"] = (m.group(1).strip() if m else utext.strip())[:16000]
@ -3327,6 +3363,11 @@ async def _search_first_candidates(repo_url: str, branch: str, query: str, expli
def _match_any(name: str, patterns: list[str]) -> bool: def _match_any(name: str, patterns: list[str]) -> bool:
return any(fnmatch.fnmatch(name, pat) for pat in patterns) return any(fnmatch.fnmatch(name, pat) for pat in patterns)
def _looks_like_filename(q: str) -> bool:
q = (q or "").lower()
return any(ext in q for ext in (".php", ".py", ".js", ".ts", ".blade.php", ".vue", ".json", ".yaml", ".yml", ".txt", ".cpp", ".html", ".htm", ".xlsx", ".docx"))
def _rag_index_repo_sync( def _rag_index_repo_sync(
*, *,
repo_url: str, repo_url: str,
@ -3351,7 +3392,6 @@ def _rag_index_repo_sync(
cache = _load_repo_index_cache() cache = _load_repo_index_cache()
repo_key = f"{os.path.basename(repo_url)}|{branch}|{_collection_versioned(collection_name)}|{_EMBEDDER.slug}" repo_key = f"{os.path.basename(repo_url)}|{branch}|{_collection_versioned(collection_name)}|{_EMBEDDER.slug}"
cached = cache.get(repo_key) cached = cache.get(repo_key)
if not force and cached and cached.get("head_sha") == head_sha: if not force and cached and cached.get("head_sha") == head_sha:
return { return {
"status": "skipped", "status": "skipped",
@ -3374,7 +3414,9 @@ def _rag_index_repo_sync(
if exclude_dirs.strip(): if exclude_dirs.strip():
exclude_set |= {d.strip() for d in exclude_dirs.split(",") if d.strip()} exclude_set |= {d.strip() for d in exclude_dirs.split(",") if d.strip()}
collection = _get_collection(collection_name) #collection = _get_collection(collection_name)
collection_name_eff = _collection_effective(collection_name)
collection = _get_collection(collection_name_eff)
# --- Slim chunking toggles (env of via profile='smart') --- # --- Slim chunking toggles (env of via profile='smart') ---
use_smart_chunk = ( use_smart_chunk = (
@ -3431,6 +3473,11 @@ def _rag_index_repo_sync(
return out return out
# precompute owner/repo once # precompute owner/repo once
repo_full_pre = _repo_owner_repo_from_url(repo_url) repo_full_pre = _repo_owner_repo_from_url(repo_url)
if force:
try:
collection.delete(where={"repo_full": repo_full_pre, "branch": branch})
except Exception:
pass
deleted_paths: set[str] = set() deleted_paths: set[str] = set()
for p in root.rglob("*"): for p in root.rglob("*"):
if p.is_dir(): if p.is_dir():
@ -3490,6 +3537,8 @@ def _rag_index_repo_sync(
"path": rel, "path": rel,
"chunk_index": idx, "chunk_index": idx,
"profile": profile, "profile": profile,
"basename": os.path.basename(rel).lower(),
"path_lc": rel.lower(),
} }
batch_documents.append(ch) batch_documents.append(ch)
docs_for_bm25.append({"text": ch, "path": rel}) docs_for_bm25.append({"text": ch, "path": rel})
@ -3663,7 +3712,19 @@ async def rag_query_api(
path_contains: Optional[str] = None, path_contains: Optional[str] = None,
profile: Optional[str] = None profile: Optional[str] = None
) -> dict: ) -> dict:
col = _get_collection(collection_name) branch="main"
repo_full = None
repo_base = None
if repo:
if "://" in repo: # repo is URL
repo_full = _repo_owner_repo_from_url(repo) # -> "admin/cluster"
else:
repo_full = repo # al "owner/repo"
repo_base = repo_full.rsplit("/", 1)[-1]
repo=repo_full
#col = _get_collection(collection_name)
collection_name_eff = _collection_effective(collection_name)
col = _get_collection(collection_name_eff)
q_emb = _EMBEDDER.embed_query(query) q_emb = _EMBEDDER.embed_query(query)
where = {} where = {}
if repo: if repo:
@ -3699,16 +3760,76 @@ async def rag_query_api(
# Herken ook gequote varianten en slimme quotes. # Herken ook gequote varianten en slimme quotes.
path_hints: set[str] = set() path_hints: set[str] = set()
PH_PATTERNS = [ PH_PATTERNS = [
r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"']", # 1) Relatief pad met directories + filename + (multi-dot) extensies
r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)", # Voorbeelden: src/foo/bar.py, app/Http/.../UserController.php, foo.tar.gz, index.blade.php
r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"']", r'["“”\']?((?:[A-Za-z0-9_.-]+[\\/])+[A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
r"\b([A-Za-z0-9_\/-]+\.blade\.php)\b", # 2) Alleen filename + (multi-dot) extensies
r"\b([A-Za-z0-9_\/-]+\.php)\b", # Voorbeelden: main.py, CMakeLists.txt (ook gedekt door specials), index.blade.php, foo.min.js
r'["“”\']?([A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
# 3) Speciale bekende bestandsnamen zonder extensie
r'\b(Dockerfile(?:\.[A-Za-z0-9_.-]+)?)\b', # Dockerfile, Dockerfile.dev
r'\b(Makefile)\b',
r'\b(CMakeLists\.txt)\b',
r'\b(README(?:\.[A-Za-z0-9]{1,12})?)\b', # README, README.md
r'\b(LICENSE(?:\.[A-Za-z0-9]{1,12})?)\b', # LICENSE, LICENSE.txt
] ]
for pat in PH_PATTERNS: for pat in PH_PATTERNS:
for m in re.finditer(pat, query): for m in re.finditer(pat, query, flags=re.IGNORECASE):
path_hints.add(m.group(1).strip()) path_hints.add(m.group(1).strip())
clean_hints = set()
for h in path_hints:
h = h.strip().strip('"\''"“”")
if len(h) > 200:
continue
# skip dingen die op URL lijken
if "://" in h:
continue
clean_hints.add(h)
path_hints = clean_hints
prefetch = []
if path_hints and repo and _looks_like_filename(query):
repo_base = repo.rsplit("/", 1)[-1]
# jij indexeert meta["repo_full"] als owner/repo; bij rag_query kan repo "owner/repo" zijn.
repo_full = repo
for hint in list(path_hints)[:10]:
h = hint.strip()
if not h:
continue
# 1) exact pad (als er / in zit of het lijkt op resources/... etc.)
if "/" in h:
where = {"$and": [
{"repo_full": {"$eq": repo_full}},
{"branch": {"$eq": branch}},
{"path": {"$eq": h}}
]
}
got = col.get(where=where, include=["documents","metadatas"])
else:
# 2) alleen bestandsnaam -> match op basename
where = {"$and": [
{"repo_full": {"$eq": repo_full}},
{"branch": {"$eq": branch}},
{"basename": {"$eq": h.lower()}}
]
}
got = col.get(where=where, include=["documents","metadatas"])
docs2 = got.get("documents") or []
metas2 = got.get("metadatas") or []
for d, m in zip(docs2, metas2):
prefetch.append({
"document": d or "",
"metadata": m or {},
"emb_sim": 1.0,
"distance": 0.0,
"score": 1.0,
})
res = col.query( res = col.query(
query_embeddings=[q_emb], query_embeddings=[q_emb],
@ -3922,7 +4043,17 @@ async def rag_query_api(
ranked = ranked_full[:n_results] ranked = ranked_full[:n_results]
else: else:
ranked = ranked_full[:n_results] ranked = ranked_full[:n_results]
if prefetch:
seen = set()
merged = []
for r in prefetch + ranked:
meta = r.get("metadata") or {}
key = f"{meta.get('path','')}::{meta.get('chunk_index','')}"
if key in seen:
continue
seen.add(key)
merged.append(r)
ranked = merged[:n_results]
return { return {
"count": len(ranked), "count": len(ranked),
"results": [{ "results": [{