RAG verbetering ivm filenaam zoeken
This commit is contained in:
parent
4e2ffac24c
commit
4c29600553
157
app.py
157
app.py
@ -1932,6 +1932,17 @@ def _parse_validation_results(text: str) -> list[str]:
|
|||||||
issues.append(line)
|
issues.append(line)
|
||||||
return issues
|
return issues
|
||||||
|
|
||||||
|
def _norm_collection_name(x: str | None, default="code_docs") -> str:
|
||||||
|
name = (x or "").strip()
|
||||||
|
return name or default
|
||||||
|
|
||||||
|
def _collection_effective(name: str) -> str:
    """Resolve a collection name to the name that is actually looked up.

    A falsy or whitespace-only *name* falls back to ``"code_docs"``. A name
    that already ends in a ``__v<digits>`` suffix is returned unchanged; any
    other name is passed through ``_collection_versioned``.
    """
    base = (name or "code_docs").strip()
    if not base:
        base = "code_docs"
    # Names that already carry a version suffix are left untouched.
    already_versioned = re.search(r"__v\d+$", base) is not None
    return base if already_versioned else _collection_versioned(base)
|
||||||
|
|
||||||
async def _execute_tool(name: str, args: dict) -> dict:
|
async def _execute_tool(name: str, args: dict) -> dict:
|
||||||
logger.info("toolcall: "+str(name)+" ("+str(args)+")")
|
logger.info("toolcall: "+str(name)+" ("+str(args)+")")
|
||||||
if name == "repo_grep":
|
if name == "repo_grep":
|
||||||
@ -1946,6 +1957,28 @@ async def _execute_tool(name: str, args: dict) -> dict:
|
|||||||
hits = []
|
hits = []
|
||||||
qlow = query.lower()
|
qlow = query.lower()
|
||||||
for p in root.rglob("*"):
|
for p in root.rglob("*"):
|
||||||
|
#logger.info(p)
|
||||||
|
if qlow in str(p).lower() and str(p).split('.')[-1] in ['txt','md','htm','html','cpp','js','json','env','py','php','c','h']:
|
||||||
|
if qlow==str(p).lower().split("/")[-1]:
|
||||||
|
max_chars=55000
|
||||||
|
else:
|
||||||
|
max_chars=400
|
||||||
|
file_txt=_read_text_file(p)
|
||||||
|
if len(file_txt) > max_chars:
|
||||||
|
add_str=" so first " + str(max_chars) + " chars given in excerpt"
|
||||||
|
else:
|
||||||
|
add_str=" so file content given in excerpt"
|
||||||
|
hits.append({
|
||||||
|
"path": str(p.relative_to(root)),
|
||||||
|
"line": "-text found in filename"+str(add_str)+"- ",
|
||||||
|
"excerpt": str(file_txt)[:max_chars]
|
||||||
|
})
|
||||||
|
elif qlow in str(p).lower():
|
||||||
|
hits.append({
|
||||||
|
"path": str(p.relative_to(root)),
|
||||||
|
"line": "-text found in filename- ",
|
||||||
|
"excerpt": "-text found in filename"
|
||||||
|
})
|
||||||
if p.is_dir():
|
if p.is_dir():
|
||||||
continue
|
continue
|
||||||
if set(p.parts) & PROFILE_EXCLUDE_DIRS:
|
if set(p.parts) & PROFILE_EXCLUDE_DIRS:
|
||||||
@ -1989,7 +2022,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
|
|||||||
out = await rag_query_api(
|
out = await rag_query_api(
|
||||||
query=args.get("query",""),
|
query=args.get("query",""),
|
||||||
n_results=int(args.get("n_results",5)),
|
n_results=int(args.get("n_results",5)),
|
||||||
collection_name=args.get("collection_name","code_docs"),
|
collection_name=_norm_collection_name(args.get("collection_name","code_docs"), "code_docs"),
|
||||||
repo=args.get("repo"),
|
repo=args.get("repo"),
|
||||||
path_contains=args.get("path_contains"),
|
path_contains=args.get("path_contains"),
|
||||||
profile=args.get("profile")
|
profile=args.get("profile")
|
||||||
@ -2551,6 +2584,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
|
|||||||
if mb: args["branch"] = mb.group(1)
|
if mb: args["branch"] = mb.group(1)
|
||||||
elif fname == "rag_query":
|
elif fname == "rag_query":
|
||||||
args["query"] = utext.strip()
|
args["query"] = utext.strip()
|
||||||
|
m = re.search(r"\bcollection(?:_name)?\s*[:=]\s*([A-Za-z0-9_.-]+)", utext, re.I)
|
||||||
|
if m:
|
||||||
|
args["collection_name"] = m.group(1)
|
||||||
elif fname == "summarize_text":
|
elif fname == "summarize_text":
|
||||||
m = re.search(r':\s*(.+)$', utext, re.S)
|
m = re.search(r':\s*(.+)$', utext, re.S)
|
||||||
args["text"] = (m.group(1).strip() if m else utext.strip())[:16000]
|
args["text"] = (m.group(1).strip() if m else utext.strip())[:16000]
|
||||||
@ -3326,6 +3362,11 @@ async def _search_first_candidates(repo_url: str, branch: str, query: str, expli
|
|||||||
|
|
||||||
def _match_any(name: str, patterns: list[str]) -> bool:
|
def _match_any(name: str, patterns: list[str]) -> bool:
|
||||||
return any(fnmatch.fnmatch(name, pat) for pat in patterns)
|
return any(fnmatch.fnmatch(name, pat) for pat in patterns)
|
||||||
|
|
||||||
|
def _looks_like_filename(q: str) -> bool:
|
||||||
|
q = (q or "").lower()
|
||||||
|
return any(ext in q for ext in (".php", ".py", ".js", ".ts", ".blade.php", ".vue", ".json", ".yaml", ".yml", ".txt", ".cpp", ".html", ".htm", ".xlsx", ".docx"))
|
||||||
|
|
||||||
|
|
||||||
def _rag_index_repo_sync(
|
def _rag_index_repo_sync(
|
||||||
*,
|
*,
|
||||||
@ -3351,7 +3392,6 @@ def _rag_index_repo_sync(
|
|||||||
cache = _load_repo_index_cache()
|
cache = _load_repo_index_cache()
|
||||||
repo_key = f"{os.path.basename(repo_url)}|{branch}|{_collection_versioned(collection_name)}|{_EMBEDDER.slug}"
|
repo_key = f"{os.path.basename(repo_url)}|{branch}|{_collection_versioned(collection_name)}|{_EMBEDDER.slug}"
|
||||||
cached = cache.get(repo_key)
|
cached = cache.get(repo_key)
|
||||||
|
|
||||||
if not force and cached and cached.get("head_sha") == head_sha:
|
if not force and cached and cached.get("head_sha") == head_sha:
|
||||||
return {
|
return {
|
||||||
"status": "skipped",
|
"status": "skipped",
|
||||||
@ -3374,8 +3414,10 @@ def _rag_index_repo_sync(
|
|||||||
if exclude_dirs.strip():
|
if exclude_dirs.strip():
|
||||||
exclude_set |= {d.strip() for d in exclude_dirs.split(",") if d.strip()}
|
exclude_set |= {d.strip() for d in exclude_dirs.split(",") if d.strip()}
|
||||||
|
|
||||||
collection = _get_collection(collection_name)
|
#collection = _get_collection(collection_name)
|
||||||
|
collection_name_eff = _collection_effective(collection_name)
|
||||||
|
collection = _get_collection(collection_name_eff)
|
||||||
|
|
||||||
# --- Slim chunking toggles (env of via profile='smart') ---
|
# --- Slim chunking toggles (env of via profile='smart') ---
|
||||||
use_smart_chunk = (
|
use_smart_chunk = (
|
||||||
(os.getenv("CHROMA_SMART_CHUNK","1").lower() not in ("0","false","no"))
|
(os.getenv("CHROMA_SMART_CHUNK","1").lower() not in ("0","false","no"))
|
||||||
@ -3431,6 +3473,11 @@ def _rag_index_repo_sync(
|
|||||||
return out
|
return out
|
||||||
# precompute owner/repo once
|
# precompute owner/repo once
|
||||||
repo_full_pre = _repo_owner_repo_from_url(repo_url)
|
repo_full_pre = _repo_owner_repo_from_url(repo_url)
|
||||||
|
if force:
|
||||||
|
try:
|
||||||
|
collection.delete(where={"repo_full": repo_full_pre, "branch": branch})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
deleted_paths: set[str] = set()
|
deleted_paths: set[str] = set()
|
||||||
for p in root.rglob("*"):
|
for p in root.rglob("*"):
|
||||||
if p.is_dir():
|
if p.is_dir():
|
||||||
@ -3490,6 +3537,8 @@ def _rag_index_repo_sync(
|
|||||||
"path": rel,
|
"path": rel,
|
||||||
"chunk_index": idx,
|
"chunk_index": idx,
|
||||||
"profile": profile,
|
"profile": profile,
|
||||||
|
"basename": os.path.basename(rel).lower(),
|
||||||
|
"path_lc": rel.lower(),
|
||||||
}
|
}
|
||||||
batch_documents.append(ch)
|
batch_documents.append(ch)
|
||||||
docs_for_bm25.append({"text": ch, "path": rel})
|
docs_for_bm25.append({"text": ch, "path": rel})
|
||||||
@ -3663,7 +3712,19 @@ async def rag_query_api(
|
|||||||
path_contains: Optional[str] = None,
|
path_contains: Optional[str] = None,
|
||||||
profile: Optional[str] = None
|
profile: Optional[str] = None
|
||||||
) -> dict:
|
) -> dict:
|
||||||
col = _get_collection(collection_name)
|
branch="main"
|
||||||
|
repo_full = None
|
||||||
|
repo_base = None
|
||||||
|
if repo:
|
||||||
|
if "://" in repo: # repo is URL
|
||||||
|
repo_full = _repo_owner_repo_from_url(repo) # -> "admin/cluster"
|
||||||
|
else:
|
||||||
|
repo_full = repo # al "owner/repo"
|
||||||
|
repo_base = repo_full.rsplit("/", 1)[-1]
|
||||||
|
repo=repo_full
|
||||||
|
#col = _get_collection(collection_name)
|
||||||
|
collection_name_eff = _collection_effective(collection_name)
|
||||||
|
col = _get_collection(collection_name_eff)
|
||||||
q_emb = _EMBEDDER.embed_query(query)
|
q_emb = _EMBEDDER.embed_query(query)
|
||||||
where = {}
|
where = {}
|
||||||
if repo:
|
if repo:
|
||||||
@ -3699,16 +3760,76 @@ async def rag_query_api(
|
|||||||
# Herken ook gequote varianten en slimme quotes.
|
# Herken ook gequote varianten en slimme quotes.
|
||||||
path_hints: set[str] = set()
|
path_hints: set[str] = set()
|
||||||
PH_PATTERNS = [
|
PH_PATTERNS = [
|
||||||
r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"”']",
|
# 1) Relatief pad met directories + filename + (multi-dot) extensies
|
||||||
r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
|
# Voorbeelden: src/foo/bar.py, app/Http/.../UserController.php, foo.tar.gz, index.blade.php
|
||||||
r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"”']",
|
r'["“”\']?((?:[A-Za-z0-9_.-]+[\\/])+[A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
|
||||||
r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
|
|
||||||
r"\b([A-Za-z0-9_\/-]+\.blade\.php)\b",
|
# 2) Alleen filename + (multi-dot) extensies
|
||||||
r"\b([A-Za-z0-9_\/-]+\.php)\b",
|
# Voorbeelden: main.py, CMakeLists.txt (ook gedekt door specials), index.blade.php, foo.min.js
|
||||||
|
r'["“”\']?([A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
|
||||||
|
|
||||||
|
# 3) Speciale bekende bestandsnamen zonder extensie
|
||||||
|
r'\b(Dockerfile(?:\.[A-Za-z0-9_.-]+)?)\b', # Dockerfile, Dockerfile.dev
|
||||||
|
r'\b(Makefile)\b',
|
||||||
|
r'\b(CMakeLists\.txt)\b',
|
||||||
|
r'\b(README(?:\.[A-Za-z0-9]{1,12})?)\b', # README, README.md
|
||||||
|
r'\b(LICENSE(?:\.[A-Za-z0-9]{1,12})?)\b', # LICENSE, LICENSE.txt
|
||||||
]
|
]
|
||||||
|
|
||||||
for pat in PH_PATTERNS:
|
for pat in PH_PATTERNS:
|
||||||
for m in re.finditer(pat, query):
|
for m in re.finditer(pat, query, flags=re.IGNORECASE):
|
||||||
path_hints.add(m.group(1).strip())
|
path_hints.add(m.group(1).strip())
|
||||||
|
clean_hints = set()
|
||||||
|
for h in path_hints:
|
||||||
|
h = h.strip().strip('"\''"“”")
|
||||||
|
if len(h) > 200:
|
||||||
|
continue
|
||||||
|
# skip dingen die op URL lijken
|
||||||
|
if "://" in h:
|
||||||
|
continue
|
||||||
|
clean_hints.add(h)
|
||||||
|
path_hints = clean_hints
|
||||||
|
|
||||||
|
prefetch = []
|
||||||
|
if path_hints and repo and _looks_like_filename(query):
|
||||||
|
repo_base = repo.rsplit("/", 1)[-1]
|
||||||
|
# jij indexeert meta["repo_full"] als owner/repo; bij rag_query kan repo "owner/repo" zijn.
|
||||||
|
repo_full = repo
|
||||||
|
|
||||||
|
for hint in list(path_hints)[:10]:
|
||||||
|
h = hint.strip()
|
||||||
|
if not h:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 1) exact pad (als er / in zit of het lijkt op resources/... etc.)
|
||||||
|
if "/" in h:
|
||||||
|
where = {"$and": [
|
||||||
|
{"repo_full": {"$eq": repo_full}},
|
||||||
|
{"branch": {"$eq": branch}},
|
||||||
|
{"path": {"$eq": h}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
got = col.get(where=where, include=["documents","metadatas"])
|
||||||
|
else:
|
||||||
|
# 2) alleen bestandsnaam -> match op basename
|
||||||
|
where = {"$and": [
|
||||||
|
{"repo_full": {"$eq": repo_full}},
|
||||||
|
{"branch": {"$eq": branch}},
|
||||||
|
{"basename": {"$eq": h.lower()}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
got = col.get(where=where, include=["documents","metadatas"])
|
||||||
|
|
||||||
|
docs2 = got.get("documents") or []
|
||||||
|
metas2 = got.get("metadatas") or []
|
||||||
|
for d, m in zip(docs2, metas2):
|
||||||
|
prefetch.append({
|
||||||
|
"document": d or "",
|
||||||
|
"metadata": m or {},
|
||||||
|
"emb_sim": 1.0,
|
||||||
|
"distance": 0.0,
|
||||||
|
"score": 1.0,
|
||||||
|
})
|
||||||
|
|
||||||
res = col.query(
|
res = col.query(
|
||||||
query_embeddings=[q_emb],
|
query_embeddings=[q_emb],
|
||||||
@ -3922,7 +4043,17 @@ async def rag_query_api(
|
|||||||
ranked = ranked_full[:n_results]
|
ranked = ranked_full[:n_results]
|
||||||
else:
|
else:
|
||||||
ranked = ranked_full[:n_results]
|
ranked = ranked_full[:n_results]
|
||||||
|
if prefetch:
|
||||||
|
seen = set()
|
||||||
|
merged = []
|
||||||
|
for r in prefetch + ranked:
|
||||||
|
meta = r.get("metadata") or {}
|
||||||
|
key = f"{meta.get('path','')}::{meta.get('chunk_index','')}"
|
||||||
|
if key in seen:
|
||||||
|
continue
|
||||||
|
seen.add(key)
|
||||||
|
merged.append(r)
|
||||||
|
ranked = merged[:n_results]
|
||||||
return {
|
return {
|
||||||
"count": len(ranked),
|
"count": len(ranked),
|
||||||
"results": [{
|
"results": [{
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user