diff --git a/app.py b/app.py
index d9d647c..ea276e7 100644
--- a/app.py
+++ b/app.py
@@ -1932,6 +1932,17 @@ def _parse_validation_results(text: str) -> list[str]:
             issues.append(line)
     return issues
 
+def _norm_collection_name(x: str | None, default="code_docs") -> str:
+    name = (x or "").strip()
+    return name or default
+
+def _collection_effective(name: str) -> str:
+    name = (name or "code_docs").strip() or "code_docs"
+    # if the name is already versioned (…__vN), keep it as-is
+    if re.search(r"__v\d+$", name):
+        return name
+    return _collection_versioned(name)
+
 async def _execute_tool(name: str, args: dict) -> dict:
     logger.info("toolcall: "+str(name)+" ("+str(args)+")")
     if name == "repo_grep":
@@ -1946,6 +1957,28 @@ async def _execute_tool(name: str, args: dict) -> dict:
         hits = []
         qlow = query.lower()
        for p in root.rglob("*"):
+            #logger.info(p)
+            if qlow in str(p).lower() and str(p).split('.')[-1] in ['txt','md','htm','html','cpp','js','json','env','py','php','c','h']:
+                if qlow==str(p).lower().split("/")[-1]:
+                    max_chars=55000
+                else:
+                    max_chars=400
+                file_txt=_read_text_file(p)
+                if len(file_txt) > max_chars:
+                    add_str=" so first " + str(max_chars) + " chars given in excerpt"
+                else:
+                    add_str=" so file content given in excerpt"
+                hits.append({
+                    "path": str(p.relative_to(root)),
+                    "line": "-text found in filename"+str(add_str)+"- ",
+                    "excerpt": str(file_txt)[:max_chars]
+                })
+            elif qlow in str(p).lower():
+                hits.append({
+                    "path": str(p.relative_to(root)),
+                    "line": "-text found in filename- ",
+                    "excerpt": "-text found in filename"
+                })
             if p.is_dir():
                 continue
             if set(p.parts) & PROFILE_EXCLUDE_DIRS:
@@ -1989,7 +2022,7 @@ async def _execute_tool(name: str, args: dict) -> dict:
         out = await rag_query_api(
             query=args.get("query",""),
             n_results=int(args.get("n_results",5)),
-            collection_name=args.get("collection_name","code_docs"),
+            collection_name=_norm_collection_name(args.get("collection_name","code_docs"), "code_docs"),
             repo=args.get("repo"),
             path_contains=args.get("path_contains"),
             profile=args.get("profile")
@@ -2551,6 +2584,9 @@ async def openai_chat_completions(body: dict = Body(...), request: Request = Non
             if mb: args["branch"] = mb.group(1)
         elif fname == "rag_query":
             args["query"] = utext.strip()
+            m = re.search(r"\bcollection(?:_name)?\s*[:=]\s*([A-Za-z0-9_.-]+)", utext, re.I)
+            if m:
+                args["collection_name"] = m.group(1)
         elif fname == "summarize_text":
             m = re.search(r':\s*(.+)$', utext, re.S)
             args["text"] = (m.group(1).strip() if m else utext.strip())[:16000]
@@ -3326,6 +3362,11 @@ async def _search_first_candidates(repo_url: str, branch: str, query: str, expli
 
 def _match_any(name: str, patterns: list[str]) -> bool:
     return any(fnmatch.fnmatch(name, pat) for pat in patterns)
+
+def _looks_like_filename(q: str) -> bool:
+    q = (q or "").lower()
+    return any(ext in q for ext in (".php", ".py", ".js", ".ts", ".blade.php", ".vue", ".json", ".yaml", ".yml", ".txt", ".cpp", ".html", ".htm", ".xlsx", ".docx"))
+
 
 def _rag_index_repo_sync(
     *,
@@ -3351,7 +3392,6 @@ def _rag_index_repo_sync(
     cache = _load_repo_index_cache()
     repo_key = f"{os.path.basename(repo_url)}|{branch}|{_collection_versioned(collection_name)}|{_EMBEDDER.slug}"
     cached = cache.get(repo_key)
-
     if not force and cached and cached.get("head_sha") == head_sha:
         return {
             "status": "skipped",
@@ -3374,8 +3414,10 @@ def _rag_index_repo_sync(
     if exclude_dirs.strip():
         exclude_set |= {d.strip() for d in exclude_dirs.split(",") if d.strip()}
 
-    collection = _get_collection(collection_name)
-
+    #collection = _get_collection(collection_name)
+    collection_name_eff = _collection_effective(collection_name)
+    collection = _get_collection(collection_name_eff)
+
     # --- Slim chunking toggles (env of via profile='smart') ---
     use_smart_chunk = (
         (os.getenv("CHROMA_SMART_CHUNK","1").lower() not in ("0","false","no"))
@@ -3431,6 +3473,11 @@ def _rag_index_repo_sync(
             return out
     # precompute owner/repo once
     repo_full_pre = _repo_owner_repo_from_url(repo_url)
+    if force:
+        try:
+            collection.delete(where={"repo_full": repo_full_pre, "branch": branch})
+        except Exception:
+            pass
     deleted_paths: set[str] = set()
     for p in root.rglob("*"):
         if p.is_dir():
@@ -3490,6 +3537,8 @@ def _rag_index_repo_sync(
                     "path": rel,
                     "chunk_index": idx,
                     "profile": profile,
+                    "basename": os.path.basename(rel).lower(),
+                    "path_lc": rel.lower(),
                 }
                 batch_documents.append(ch)
                 docs_for_bm25.append({"text": ch, "path": rel})
@@ -3663,7 +3712,19 @@ async def rag_query_api(
     path_contains: Optional[str] = None,
     profile: Optional[str] = None
 ) -> dict:
-    col = _get_collection(collection_name)
+    branch="main"
+    repo_full = None
+    repo_base = None
+    if repo:
+        if "://" in repo:  # repo is a URL
+            repo_full = _repo_owner_repo_from_url(repo)  # -> "admin/cluster"
+        else:
+            repo_full = repo  # already "owner/repo"
+        repo_base = repo_full.rsplit("/", 1)[-1]
+        repo=repo_full
+    #col = _get_collection(collection_name)
+    collection_name_eff = _collection_effective(collection_name)
+    col = _get_collection(collection_name_eff)
     q_emb = _EMBEDDER.embed_query(query)
     where = {}
     if repo:
@@ -3699,16 +3760,76 @@ async def rag_query_api(
     # Herken ook gequote varianten en slimme quotes.
     path_hints: set[str] = set()
     PH_PATTERNS = [
-        r"[\"“”'](resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)[\"”']",
-        r"(resources\/[A-Za-z0-9_\/\.-]+\.blade\.php)",
-        r"[\"“”'](app\/[A-Za-z0-9_\/\.-]+\.php)[\"”']",
-        r"(app\/[A-Za-z0-9_\/\.-]+\.php)",
-        r"\b([A-Za-z0-9_\/-]+\.blade\.php)\b",
-        r"\b([A-Za-z0-9_\/-]+\.php)\b",
+        # 1) Relative path with directories + filename + (multi-dot) extensions
+        #    Examples: src/foo/bar.py, app/Http/.../UserController.php, foo.tar.gz, index.blade.php
+        r'["“”\']?((?:[A-Za-z0-9_.-]+[\\/])+[A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
+
+        # 2) Filename only + (multi-dot) extensions
+        #    Examples: main.py, CMakeLists.txt (also covered by the specials), index.blade.php, foo.min.js
+        r'["“”\']?([A-Za-z0-9_.-]+(?:\.[A-Za-z0-9]{1,12})+)\b["“”\']?',
+
+        # 3) Well-known special file names without an extension
+        r'\b(Dockerfile(?:\.[A-Za-z0-9_.-]+)?)\b',  # Dockerfile, Dockerfile.dev
+        r'\b(Makefile)\b',
+        r'\b(CMakeLists\.txt)\b',
+        r'\b(README(?:\.[A-Za-z0-9]{1,12})?)\b',  # README, README.md
+        r'\b(LICENSE(?:\.[A-Za-z0-9]{1,12})?)\b',  # LICENSE, LICENSE.txt
     ]
+
     for pat in PH_PATTERNS:
-        for m in re.finditer(pat, query):
+        for m in re.finditer(pat, query, flags=re.IGNORECASE):
             path_hints.add(m.group(1).strip())
+    clean_hints = set()
+    for h in path_hints:
+        h = h.strip().strip('"\''"“”")
+        if len(h) > 200:
+            continue
+        # skip things that look like a URL
+        if "://" in h:
+            continue
+        clean_hints.add(h)
+    path_hints = clean_hints
+
+    prefetch = []
+    if path_hints and repo and _looks_like_filename(query):
+        repo_base = repo.rsplit("/", 1)[-1]
+        # meta["repo_full"] is indexed as owner/repo; at rag_query `repo` may already be "owner/repo".
+        repo_full = repo
+
+        for hint in list(path_hints)[:10]:
+            h = hint.strip()
+            if not h:
+                continue
+
+            # 1) exact path (when it contains a / or looks like resources/... etc.)
+            if "/" in h:
+                where = {"$and": [
+                    {"repo_full": {"$eq": repo_full}},
+                    {"branch": {"$eq": branch}},
+                    {"path": {"$eq": h}}
+                    ]
+                }
+                got = col.get(where=where, include=["documents","metadatas"])
+            else:
+                # 2) bare filename -> match on basename
+                where = {"$and": [
+                    {"repo_full": {"$eq": repo_full}},
+                    {"branch": {"$eq": branch}},
+                    {"basename": {"$eq": h.lower()}}
+                    ]
+                }
+                got = col.get(where=where, include=["documents","metadatas"])
+
+            docs2 = got.get("documents") or []
+            metas2 = got.get("metadatas") or []
+            for d, m in zip(docs2, metas2):
+                prefetch.append({
+                    "document": d or "",
+                    "metadata": m or {},
+                    "emb_sim": 1.0,
+                    "distance": 0.0,
+                    "score": 1.0,
+                })
 
     res = col.query(
         query_embeddings=[q_emb],
@@ -3922,7 +4043,17 @@ async def rag_query_api(
             ranked = ranked_full[:n_results]
     else:
         ranked = ranked_full[:n_results]
-
+    if prefetch:
+        seen = set()
+        merged = []
+        for r in prefetch + ranked:
+            meta = r.get("metadata") or {}
+            key = f"{meta.get('path','')}::{meta.get('chunk_index','')}"
+            if key in seen:
+                continue
+            seen.add(key)
+            merged.append(r)
+        ranked = merged[:n_results]
     return {
         "count": len(ranked),
         "results": [{