RAG verbetering ivm filenaam zoeken
This commit is contained in:
parent
4e2ffac24c
commit
4c29600553
157
app.py
157
app.py
@ -1932,6 +1932,17 @@ def _parse_validation_results(text: str) -> list[str]:
|
||||
issues.append(line)
|
||||
return issues
|
||||
|
||||
def _norm_collection_name(x: str | None, default="code_docs") -> str:
|
||||
name = (x or "").strip()
|
||||
return name or default
|
||||
|
||||
def _collection_effective(name: str) -> str:
|
||||
name = (name or "code_docs").strip() or "code_docs"
|
||||
# als je al versioned namen hebt, laat ze staan
|
||||
if re.search(r"__v\d+$", name):
|
||||
return name
|
||||
return _collection_versioned(name)
|
||||
|
||||
async def _execute_tool(name: str, args: dict) -> dict:
|
||||
logger.info("toolcall: "+str(name)+" ("+str(args)+")")
|
||||
if name == "repo_grep":
|
||||
@ -1946,6 +1957,28 @@ async def _execute_tool(name: str, args: dict) -> dict:
|
||||
hits = []
|
||||
qlow = query.lower()
|
||||
for p in root.rglob("*"):
|
||||
#logger.info(p)
|
||||
if qlow in str(p).lower() and str(p).split('.')[-1] in ['txt','md','htm','html','cpp','js','json','env','py','php','c','h']:
|
||||
if qlow==str(p).lower().split("/")[-1]:
|
||||
max_chars=55000
|
||||
else:
|
||||
max_chars=400
|
||||
file_txt=_read_text_file(p)
|
||||
if len(file_txt) > max_chars:
|
||||
add_str=" so first " + str(max_chars) + " chars given in excerpt"
|
||||
else:
|
||||
add_str=" so file content given in excerpt"
|
||||
hits.append({
|
||||
"path": str(p.relative_to(root)),
|
||||
"line": "-text found in filename"+str(add_str)+"- ",
|
||||
"excerpt": str(file_txt)[:max_chars]
|
||||
})
|
||||
elif qlow in str(p).lower():
|
||||
hits.append({
|
||||
"path": str(p.relative_to(root)),
|
||||
"line": "-text found in filename- ",
|
||||
"excerpt": "-text found in filename"
|
||||
})
|
||||
if p.is_dir():
|
||||
continue
|
||||
if set(p.parts) & PROFILE_EXCLUDE_DIRS:
|
||||
@ -1989,7 +2022,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
|
||||
out = await rag_query_api(
|
||||
query=args.get("query",""),
|
||||
n_results=int(args.get("n_results",5)),
|
||||
collection_name=args.get("collection_name","code_docs"),
|
||||
collection_name=_norm_collection_name(args.get("collection_name","code_docs"), "code_docs"),
|
||||
repo=args.get("repo"),
|
||||
path_contains=args.get("path_contains"),
|
||||
profile=args.get("profile")
|
||||
@ -2551,6 +2584,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
|
||||
if mb: args["branch"] = mb.group(1)
|
||||
elif fname == "rag_query":
|
||||
args["query"] = utext.strip()
|
||||
m = re.search(r"\bcollection(?:_name)?\s*[:=]\s*([A-Za-z0-9_.-]+)", utext, re.I)
|
||||
if m:
|
||||
args["collection_name"] = m.group(1)
|
||||
elif fname == "summarize_text":
|
||||
m = re.search(r':\s*(.+)$', utext, re.S)
|
||||
args["text"] = (m.group(1).strip() if m else utext.strip())[:16000]
|
||||
@ -3326,6 +3362,11 @@ async def _search_first_candidates(repo_url: str, branch: str, query: str, expli
|
||||
|
||||
def _match_any(name: str, patterns: list[str]) -> bool:
|
||||
return any(fnmatch.fnmatch(name, pat) for pat in patterns)
|
||||
|
||||
def _looks_like_filename(q: str) -> bool:
|
||||
q = (q or "").lower()
|
||||
return any(ext in q for ext in (".php", ".py", ".js", ".ts", ".blade.php", ".vue", ".json", ".yaml", ".yml", ".txt", ".cpp", ".html", ".htm", ".xlsx", ".docx"))
|
||||
|
||||
|
||||
def _rag_index_repo_sync(
|
||||
*,
|
||||
@ -3351,7 +3392,6 @@ def _rag_index_repo_sync(
|
||||
cache = _load_repo_index_cache()
|
||||
repo_key = f"{os.path.basename(repo_url)}|{branch}|{_collection_versioned(collection_name)}|{_EMBEDDER.slug}"
|
||||
cached = cache.get(repo_key)
|
||||
|
||||
if not force and cached and cached.get("head_sha") == head_sha:
|
||||
return {
|
||||
"status": "skipped",
|
||||
@ -3374,8 +3414,10 @@ def _rag_index_repo_sync(
|
||||
if exclude_dirs.strip():
|
||||
exclude_set |= {d.strip() for d in exclude_dirs.split(",") if d.strip()}
|
||||
|
||||
collection = _get_collection(collection_name)
|
||||
|
||||
#collection = _get_collection(collection_name)
|
||||
collection_name_eff = _collection_effective(collection_name)
|
||||
collection = _get_collection(collection_name_eff)
|
||||
|
||||
# --- Slim chunking toggles (env of via profile='smart') ---
|
||||
use_smart_chunk = (
|
||||
(os.getenv("CHROMA_SMART_CHUNK","1").lower() not in ("0","false","no"))
|
||||
@ -3431,6 +3473,11 @@ def _rag_index_repo_sync(
|
||||
return out
|
||||
# precompute owner/repo once
|
||||
repo_full_pre = _repo_owner_repo_from_url(repo_url)
|
||||
if force:
|
||||
try:
|
||||
collection.delete(where={"repo_full": repo_full_pre, "branch": branch})
|
||||
except Exception:
|
||||
pass
|
||||
deleted_paths: set[str] = set()
|
||||
for p in root.rglob("*"):
|
||||
if p.is_dir():
|
||||
@ -3490,6 +3537,8 @@ def _rag_index_repo_sync(
|
||||
"path": rel,
|
||||
"chunk_index": idx,
|
||||
"profile": profile,
|
||||
"basename": os.path.basename(rel).lower(),
|
||||
"path_lc": rel.lower(),
|
||||
}
|
||||
batch_documents.append(ch)
|
||||
docs_for_bm25.append({"text": ch, "path": rel})
|
||||
@ -3663,7 +3712,19 @@ async def rag_query_api(
|
||||
path_contains: Optional[str] = None,
|
||||
profile: Optional[str] = None
|
||||
) -> dict:
|
||||
col = _get_collection(collection_name)
|
||||
branch="main"
|
||||
repo_full = None
|
||||
repo_base = None
|
||||
if repo:
|
||||
if "://" in repo: # repo is URL
|
||||
repo_full = _repo_owner_repo_from_url(repo) # -> "admin/cluster"
|
||||
else:
|
||||
repo_full = repo # al "owner/repo"
|
||||
repo_base = repo_full.rsplit("/", 1)[-1]
|
||||
repo=repo_full
|
||||
#col = _get_collection(collection_name)
|
||||
collection_name_eff = _collection_effective(collection_name)
|
||||
col = _get_collection(collection_name_eff)
|
||||
q_emb = _EMBEDDER.embed_query(query)
|
||||
where = {}
|
||||
if repo:
|
||||
@ -3699,16 +3760,76 @@ async def rag_query_api(
|
||||
# Herken ook gequote varianten en slimme quotes.
|
||||
path_hints: set[str] = set()
|
||||
PH_PATTERNS = [
|
||||
r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"”']",
|
||||
r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
|
||||
r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"”']",
|
||||
r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
|
||||
r"\b([A-Za-z0-9_\/-]+\.blade\.php)\b",
|
||||
r"\b([A-Za-z0-9_\/-]+\.php)\b",
|
||||
# 1) Relatief pad met directories + filename + (multi-dot) extensies
|
||||
# Voorbeelden: src/foo/bar.py, app/Http/.../UserController.php, foo.tar.gz, index.blade.php
|
||||
r'["“”\']?((?:[A-Za-z0-9_.-]+[\\/])+[A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
|
||||
|
||||
# 2) Alleen filename + (multi-dot) extensies
|
||||
# Voorbeelden: main.py, CMakeLists.txt (ook gedekt door specials), index.blade.php, foo.min.js
|
||||
r'["“”\']?([A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
|
||||
|
||||
# 3) Speciale bekende bestandsnamen zonder extensie
|
||||
r'\b(Dockerfile(?:\.[A-Za-z0-9_.-]+)?)\b', # Dockerfile, Dockerfile.dev
|
||||
r'\b(Makefile)\b',
|
||||
r'\b(CMakeLists\.txt)\b',
|
||||
r'\b(README(?:\.[A-Za-z0-9]{1,12})?)\b', # README, README.md
|
||||
r'\b(LICENSE(?:\.[A-Za-z0-9]{1,12})?)\b', # LICENSE, LICENSE.txt
|
||||
]
|
||||
|
||||
for pat in PH_PATTERNS:
|
||||
for m in re.finditer(pat, query):
|
||||
for m in re.finditer(pat, query, flags=re.IGNORECASE):
|
||||
path_hints.add(m.group(1).strip())
|
||||
clean_hints = set()
|
||||
for h in path_hints:
|
||||
h = h.strip().strip('"\''"“”")
|
||||
if len(h) > 200:
|
||||
continue
|
||||
# skip dingen die op URL lijken
|
||||
if "://" in h:
|
||||
continue
|
||||
clean_hints.add(h)
|
||||
path_hints = clean_hints
|
||||
|
||||
prefetch = []
|
||||
if path_hints and repo and _looks_like_filename(query):
|
||||
repo_base = repo.rsplit("/", 1)[-1]
|
||||
# jij indexeert meta["repo_full"] als owner/repo; bij rag_query kan repo "owner/repo" zijn.
|
||||
repo_full = repo
|
||||
|
||||
for hint in list(path_hints)[:10]:
|
||||
h = hint.strip()
|
||||
if not h:
|
||||
continue
|
||||
|
||||
# 1) exact pad (als er / in zit of het lijkt op resources/... etc.)
|
||||
if "/" in h:
|
||||
where = {"$and": [
|
||||
{"repo_full": {"$eq": repo_full}},
|
||||
{"branch": {"$eq": branch}},
|
||||
{"path": {"$eq": h}}
|
||||
]
|
||||
}
|
||||
got = col.get(where=where, include=["documents","metadatas"])
|
||||
else:
|
||||
# 2) alleen bestandsnaam -> match op basename
|
||||
where = {"$and": [
|
||||
{"repo_full": {"$eq": repo_full}},
|
||||
{"branch": {"$eq": branch}},
|
||||
{"basename": {"$eq": h.lower()}}
|
||||
]
|
||||
}
|
||||
got = col.get(where=where, include=["documents","metadatas"])
|
||||
|
||||
docs2 = got.get("documents") or []
|
||||
metas2 = got.get("metadatas") or []
|
||||
for d, m in zip(docs2, metas2):
|
||||
prefetch.append({
|
||||
"document": d or "",
|
||||
"metadata": m or {},
|
||||
"emb_sim": 1.0,
|
||||
"distance": 0.0,
|
||||
"score": 1.0,
|
||||
})
|
||||
|
||||
res = col.query(
|
||||
query_embeddings=[q_emb],
|
||||
@ -3922,7 +4043,17 @@ async def rag_query_api(
|
||||
ranked = ranked_full[:n_results]
|
||||
else:
|
||||
ranked = ranked_full[:n_results]
|
||||
|
||||
if prefetch:
|
||||
seen = set()
|
||||
merged = []
|
||||
for r in prefetch + ranked:
|
||||
meta = r.get("metadata") or {}
|
||||
key = f"{meta.get('path','')}::{meta.get('chunk_index','')}"
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
merged.append(r)
|
||||
ranked = merged[:n_results]
|
||||
return {
|
||||
"count": len(ranked),
|
||||
"results": [{
|
||||
|
||||
Loading…
Reference in New Issue
Block a user