From 3d620d13b937132f8bf9ed1cadbcf9d1d139d508 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 23 Feb 2026 16:39:46 +0100 Subject: [PATCH] added PDF stuff --- README_PDF.md | 35 +++++ app.py | 414 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 449 insertions(+) create mode 100644 README_PDF.md diff --git a/README_PDF.md b/README_PDF.md new file mode 100644 index 0000000..a3b20f0 --- /dev/null +++ b/README_PDF.md @@ -0,0 +1,35 @@ +# PDF endpoints in toolserver + +This build adds PDF ingestion + page text + page rendering (+ optional embedded image extraction) and indexing into Chroma / Meilisearch. + +## Enable / config (env) + +- ENABLE_PDF=1 +- PDF_STORE_DIR=/data/pdf_store +- PDF_MAX_MB=80 +- MEILI_PDF_INDEX=pdf_docs (optional; used by /pdf/{doc_id}/index when add_meili=true) + +PyMuPDF is required: +- pip install pymupdf +- import name: `fitz` + +## Endpoints + +- POST /pdf/ingest (multipart/form-data, field: file) + -> {doc_id, n_pages, sha256} + +- GET /pdf/{doc_id}/text?page=1&mode=blocks|text|dict + -> blocks includes bbox + text + +- GET /pdf/{doc_id}/render?page=1&dpi=200 + -> image/png (cached on disk) + +- GET /pdf/{doc_id}/images?page=1 + -> list of embedded images (xref ids) + +- GET /pdf/{doc_id}/image/{xref} + -> download embedded image + +- POST /pdf/{doc_id}/index + body: PdfIndexRequest + -> chunks to Chroma + optional Meili, supports extra_text_by_page for vision captions. 
# (patch context) diff --git a/app.py b/app.py
# index e0facf3..428f3fc 100644 -- hunk @@ -24,6 +24,12 @@
import chromadb
import git
import base64

# --- Optional PDF backend (PyMuPDF) ---
# The import is optional so the server still boots in containers without the
# PDF extra installed; _pdf_require() turns the missing backend into a clean
# HTTP error at request time instead of an import crash at startup.
try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

from fastapi import FastAPI, APIRouter, UploadFile, File, Form, Request, HTTPException, Body
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
from fastapi.middleware.cors import CORSMiddleware

# (patch context) hunk @@ -5464,6 +5470,414 @@
async def _get_git_repo_async(repo_url: str, branch: str = "main") -> str:
    # gitpython does subprocess/IO -> always run it in the threadpool
    return await run_in_threadpool(get_git_repo, repo_url, branch)


# ============================
# PDF endpoints (toolserver)
# ============================
# Configuration (env vars):
#   ENABLE_PDF      - anything other than "0"/"false"/"no" enables the endpoints
#   PDF_STORE_DIR   - on-disk store: {doc_id}.pdf, {doc_id}.json metadata,
#                     and cached page renders {doc_id}.p{page}.dpi{dpi}.png
#   PDF_MAX_MB      - upload size limit in megabytes (enforced while streaming)
#   MEILI_PDF_INDEX - default Meilisearch index uid for /pdf/{doc_id}/index
PDF_ENABLED = (os.getenv("ENABLE_PDF", "1").lower() not in ("0", "false", "no"))
PDF_STORE_DIR = Path(os.getenv("PDF_STORE_DIR", "/data/pdf_store")).resolve()
PDF_MAX_MB = int(os.getenv("PDF_MAX_MB", "80"))
MEILI_PDF_INDEX = os.getenv("MEILI_PDF_INDEX", "pdf_docs")


def _pdf_require() -> None:
    """Raise an HTTPException if the PDF feature is disabled or PyMuPDF is absent.

    404 when ENABLE_PDF is off (the routes effectively do not exist),
    500 when the feature is on but the `fitz` import failed.
    """
    if not PDF_ENABLED:
        raise HTTPException(status_code=404, detail="PDF endpoints disabled (ENABLE_PDF=0)")
    if fitz is None:
        raise HTTPException(status_code=500, detail="PyMuPDF (fitz) not installed in this container")


def _pdf_safe_doc_id(doc_id: str) -> str:
    """Validate a doc_id (32 lowercase hex chars, i.e. uuid4().hex) and return it.

    The strict pattern doubles as path-traversal protection, since doc_id is
    interpolated into filenames under PDF_STORE_DIR.
    """
    doc_id = (doc_id or "").strip()
    if not re.fullmatch(r"[a-f0-9]{32}", doc_id):
        raise HTTPException(status_code=400, detail="Invalid doc_id")
    return doc_id


def _pdf_paths(doc_id: str) -> tuple[Path, Path]:
    """Return (pdf_path, meta_path) inside PDF_STORE_DIR for a validated doc_id."""
    doc_id = _pdf_safe_doc_id(doc_id)
    pdf_path = PDF_STORE_DIR / f"{doc_id}.pdf"
    meta_path = PDF_STORE_DIR / f"{doc_id}.json"
    return pdf_path, meta_path


def _pdf_load_meta(doc_id: str) -> dict:
    """Load the JSON metadata written at ingest time (doc_id, filename, n_pages, ...).

    Raises 404 when the doc_id is unknown, 500 when the file exists but is
    not valid JSON.
    """
    pdf_path, meta_path = _pdf_paths(doc_id)
    if not meta_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return json.loads(meta_path.read_text(encoding="utf-8"))
    except Exception:
        raise HTTPException(status_code=500, detail="corrupt metadata")


def _pdf_open(doc_id: str):
    """Open the stored PDF with PyMuPDF; caller is responsible for doc.close()."""
    _pdf_require()
    pdf_path, _ = _pdf_paths(doc_id)
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return fitz.open(str(pdf_path))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"failed to open pdf: {str(e)}")


def _ensure_pdf_store_dir() -> None:
    """Best-effort creation of PDF_STORE_DIR at import time."""
    try:
        PDF_STORE_DIR.mkdir(parents=True, exist_ok=True)
    except Exception:
        # if we cannot create, we fail at runtime on ingest anyway
        pass


_ensure_pdf_store_dir()


@app.post("/pdf/ingest")
async def pdf_ingest(file: UploadFile = File(...)):
    """Upload a PDF and return its doc_id + page count.

    Streams the upload to disk in 1 MiB chunks while hashing (sha256) and
    enforcing PDF_MAX_MB; on any failure the partial file is removed. The
    file is then opened once to count pages, and a JSON metadata sidecar is
    written next to it.
    """
    _pdf_require()
    if not file.filename:
        raise HTTPException(status_code=400, detail="missing filename")
    ct = (file.content_type or "").lower()
    if ct and ("pdf" not in ct):
        # do not hard-fail: some clients send a generic content-type
        logger.warning("pdf_ingest: suspicious content-type: %s", ct)

    # size guard (best-effort, we stream to disk)
    doc_id = uuid.uuid4().hex
    pdf_path, meta_path = _pdf_paths(doc_id)

    sha = hashlib.sha256()
    total = 0
    try:
        with open(pdf_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                total += len(chunk)
                if total > PDF_MAX_MB * 1024 * 1024:
                    raise HTTPException(status_code=413, detail=f"PDF too large (> {PDF_MAX_MB} MB)")
                sha.update(chunk)
                f.write(chunk)
    except HTTPException:
        # remove the partial file, then re-raise the original HTTP error (413)
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise
    except Exception as e:
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write pdf: {str(e)}")

    # Open to count pages; a file PyMuPDF cannot parse is rejected as 400.
    try:
        doc = fitz.open(str(pdf_path))
        n_pages = int(doc.page_count)
        doc.close()
    except Exception as e:
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=400, detail=f"invalid PDF: {str(e)}")

    meta = {
        "doc_id": doc_id,
        "filename": file.filename,
        "sha256": sha.hexdigest(),
        "bytes": total,
        "n_pages": n_pages,
        "created_utc": time.time(),
    }
    try:
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:
        # no metadata -> doc is unusable, so remove the PDF as well
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write metadata: {str(e)}")

    return {"doc_id": doc_id, "n_pages": n_pages, "sha256": meta["sha256"]}


@app.get("/pdf/{doc_id}/text")
async def pdf_text(doc_id: str, page: int = 1, mode: str = "blocks"):
    """Text of one page. mode=blocks (default) or mode=text or mode=dict.

    Pages are 1-based in the API; PyMuPDF's load_page is 0-based. Any mode
    value other than "text"/"dict" falls through to the blocks output.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")

    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        mode = (mode or "blocks").lower().strip()
        if mode == "text":
            txt = p.get_text("text") or ""
            return {"doc_id": doc_id, "page": page, "text": txt}
        if mode == "dict":
            d = p.get_text("dict")
            return {"doc_id": doc_id, "page": page, "dict": d}
        # blocks: per-block bbox + text; `texts` collects only non-empty text.
        # NOTE(review): indentation reconstructed from a mangled patch — blocks
        # with empty text (e.g. image blocks) are kept here; confirm against
        # the original whether they should be skipped instead.
        blocks_raw = p.get_text("blocks") or []
        blocks = []
        texts = []
        for b in blocks_raw:
            try:
                x0, y0, x1, y1, t, block_no, block_type = b[:7]
                t = (t or "").strip()
                if t:
                    texts.append(t)
                blocks.append({
                    "bbox": [float(x0), float(y0), float(x1), float(y1)],
                    "text": t,
                    "block_no": int(block_no),
                    "block_type": int(block_type),
                })
            except Exception:
                # tolerate malformed block tuples rather than failing the page
                continue
        return {"doc_id": doc_id, "page": page, "text": "\n\n".join(texts), "blocks": blocks}
    finally:
        with contextlib.suppress(Exception):
            doc.close()


@app.get("/pdf/{doc_id}/render")
async def pdf_render(doc_id: str, page: int = 1, dpi: int = 200):
    """Render a page to PNG.

    dpi is clamped to 72..600. Rendered bytes are cached on disk as
    {doc_id}.p{page}.dpi{dpi}.png; the cache write is best-effort.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    dpi = int(max(72, min(600, dpi)))
    cache_path = PDF_STORE_DIR / f"{doc_id}.p{page}.dpi{dpi}.png"
    if cache_path.exists():
        data = cache_path.read_bytes()
        return StreamingResponse(BytesIO(data), media_type="image/png")

    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        # PDF user space is 72 dpi, so the zoom factor is dpi/72 on both axes
        m = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        pix = p.get_pixmap(matrix=m, alpha=False)
        data = pix.tobytes("png")
        with contextlib.suppress(Exception):
            cache_path.write_bytes(data)
        return StreamingResponse(BytesIO(data), media_type="image/png")
    finally:
        with contextlib.suppress(Exception):
            doc.close()


@app.get("/pdf/{doc_id}/images")
async def pdf_images(doc_id: str, page: int = 1):
    """List embedded images on a page (xref ids), for use with /pdf/{doc_id}/image/{xref}."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")

    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        imgs = []
        for it in (p.get_images(full=True) or []):
            # tuple shape: (xref, smask, width, height, bpc, colorspace, alt, name, filter)
            try:
                xref = int(it[0])
                imgs.append({
                    "xref": xref,
                    "smask": int(it[1]) if it[1] is not None else 0,
                    "width": int(it[2]),
                    "height": int(it[3]),
                    "bpc": int(it[4]),
                    "colorspace": str(it[5]),
                    "name": str(it[7]) if len(it) > 7 else "",
                    "filter": str(it[8]) if len(it) > 8 else "",
                })
            except Exception:
                # skip malformed entries instead of failing the whole listing
                continue
        return {"doc_id": doc_id, "page": page, "images": imgs}
    finally:
        with contextlib.suppress(Exception):
            doc.close()


@app.get("/pdf/{doc_id}/image/{xref}")
async def pdf_image(doc_id: str, xref: int):
    """Download an embedded image by xref, as returned by /pdf/{doc_id}/images."""
    _pdf_require()
    _pdf_load_meta(doc_id)  # validate existence
    xref = int(xref)
    if xref <= 0:
        raise HTTPException(status_code=400, detail="invalid xref")
    doc = _pdf_open(doc_id)
    try:
        info = doc.extract_image(xref)
        if not info:
            raise HTTPException(status_code=404, detail="image not found")
        img_bytes = info.get("image") or b""
        # map PyMuPDF's "ext" to a media type; anything unknown is served raw
        ext = (info.get("ext") or "bin").lower()
        mt = "image/png" if ext == "png" else ("image/jpeg" if ext in ("jpg","jpeg") else "application/octet-stream")
        return StreamingResponse(BytesIO(img_bytes), media_type=mt)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        with contextlib.suppress(Exception):
            doc.close()


def _pdf_simple_chunks(text: str, target_chars: int = 1800, overlap: int = 200) -> list[str]:
    """Greedy paragraph-based chunking with character overlap between chunks.

    Paragraphs (split on blank lines) are packed into chunks of roughly
    target_chars; when a chunk closes, its last `overlap` characters seed the
    next chunk. Chunks still longer than ~1.7x target are hard-split by
    character position. Returns [] for empty input.
    """
    text = (text or "").strip()
    if not text:
        return []
    # normalize whitespace: unify CRLF/CR to LF
    text = re.sub(r"\r\n?", "\n", text)
    # Split on paragraph boundaries first
    paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
    chunks = []
    buf = ""
    for p in paras:
        if not buf:
            buf = p
            continue
        if len(buf) + 2 + len(p) <= target_chars:
            buf += "\n\n" + p
        else:
            chunks.append(buf)
            # overlap tail
            tail = buf[-overlap:] if overlap > 0 and len(buf) > overlap else ""
            buf = (tail + "\n\n" + p).strip()
    if buf:
        chunks.append(buf)
    # hard split very large chunks (a single paragraph can exceed the target)
    out=[]
    for c in chunks:
        if len(c) <= target_chars * 1.7:
            out.append(c)
        else:
            step = max(400, target_chars - overlap)
            for i in range(0, len(c), step):
                out.append(c[i:i+target_chars])
    return [c for c in out if c.strip()]


async def _ensure_meili_index_generic(uid: str, primary_key: str = "id") -> bool:
    """Best-effort creation of a Meilisearch index; True when MEILI is configured.

    Relies on the module-level MEILI_URL and _meili_req helpers defined
    elsewhere in this file. Creation errors (e.g. index already exists) are
    deliberately swallowed.
    """
    if not MEILI_URL:
        return False
    uid = (uid or "").strip()
    if not uid:
        return False
    try:
        # create index (ignore if exists)
        await _meili_req("POST", "/indexes", json_body={"uid": uid, "primaryKey": primary_key}, timeout=10.0)
    except Exception:
        pass
    return True


async def _meili_upsert_documents(index_uid: str, docs: list[dict]) -> dict:
    """Upsert documents into a Meilisearch index, creating the index if needed.

    Returns a status dict; "skipped" when MEILI_URL is unset or docs is empty.
    The response body is truncated to 4000 chars when it is not valid JSON.
    """
    if not MEILI_URL:
        return {"status": "skipped", "reason": "MEILI_URL not set"}
    if not docs:
        return {"status": "skipped", "reason": "no docs"}
    await _ensure_meili_index_generic(index_uid, primary_key="id")
    r = await _meili_req("POST", f"/indexes/{index_uid}/documents", json_body=docs, timeout=30.0)
    try:
        return {"status": "ok", "http": r.status_code, "body": r.json()}
    except Exception:
        return {"status": "ok", "http": r.status_code, "body": (r.text or "")[:4000]}


class PdfIndexRequest(BaseModel):
    # Request body for POST /pdf/{doc_id}/index.
    collection_name: str = "pdf_docs"          # Chroma collection (resolved via _collection_effective)
    meili_index: str = ""                      # Meili index uid; falls back to MEILI_PDF_INDEX
    pages: Union[str, List[int]] = "all"       # "all" or list of page numbers (1-based)
    chunk_target_chars: int = 1800
    chunk_overlap: int = 200
    extra_text_by_page: Dict[str, str] = {}    # e.g. {"1": "vision caption ..."}; keys are string page numbers
    add_meili: bool = True
    add_chroma: bool = True


@app.post("/pdf/{doc_id}/index")
async def pdf_index(doc_id: str, req: PdfIndexRequest = Body(default=PdfIndexRequest())):
    """Chunk + upsert into Chroma (and optionally Meili).

    Extra (vision) text can be supplied per page via extra_text_by_page; it is
    appended to the page text under a "[VISION]" marker before chunking. Chunk
    ids are "pdf|{doc_id}|p{page}|c{chunk}|{sha1-16}" so re-indexing the same
    content upserts rather than duplicates. Chroma writes go in batches of 64.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    pages = req.pages
    if isinstance(pages, str):
        if pages.lower().strip() == "all":
            pages_list = list(range(1, n_pages + 1))
        else:
            raise HTTPException(status_code=400, detail="pages must be 'all' or list[int]")
    else:
        pages_list = [int(p) for p in pages]
    # NOTE(review): indentation reconstructed from a mangled patch; this range
    # check is harmless for the "all" case since those pages are in range.
    for p in pages_list:
        if p < 1 or p > n_pages:
            raise HTTPException(status_code=400, detail=f"page out of range in pages: {p}")

    collection_name_eff = _collection_effective(req.collection_name or "pdf_docs")
    collection = _get_collection(collection_name_eff) if req.add_chroma else None

    batch_docs: list[str] = []
    batch_meta: list[dict] = []
    batch_ids: list[str] = []
    BATCH_SIZE = 64
    total_chunks = 0

    meili_docs: list[dict] = []

    doc = _pdf_open(doc_id)
    try:
        for pno in pages_list:
            page_obj = doc.load_page(pno - 1)
            base_txt = (page_obj.get_text("text") or "").strip()
            extra = (req.extra_text_by_page or {}).get(str(pno), "").strip()
            merged = base_txt
            if extra:
                merged = (merged + "\n\n[VISION]\n" + extra).strip()
            chunks = _pdf_simple_chunks(merged, target_chars=int(req.chunk_target_chars), overlap=int(req.chunk_overlap))
            for ci, chunk in enumerate(chunks):
                # short content hash keeps ids stable across re-indexing runs
                h = hashlib.sha1(chunk.encode("utf-8", errors="ignore")).hexdigest()[:16]
                _id = f"pdf|{doc_id}|p{pno}|c{ci}|{h}"
                md = {
                    "source": "pdf",
                    "doc_id": doc_id,
                    "page": int(pno),
                    "filename": meta.get("filename",""),
                    "sha256": meta.get("sha256",""),
                }
                if req.add_chroma and collection is not None:
                    batch_docs.append(chunk)
                    batch_meta.append(md)
                    batch_ids.append(_id)
                    if len(batch_docs) >= BATCH_SIZE:
                        _collection_add(collection, batch_docs, batch_meta, batch_ids)
                        total_chunks += len(batch_docs)
                        batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
                if req.add_meili and (req.meili_index or MEILI_PDF_INDEX):
                    meili_docs.append({
                        "id": _id,
                        "doc_id": doc_id,
                        "page": int(pno),
                        "filename": meta.get("filename",""),
                        "text": chunk,
                    })
        # flush chroma
        if req.add_chroma and collection is not None and batch_docs:
            _collection_add(collection, batch_docs, batch_meta, batch_ids)
            total_chunks += len(batch_docs)
            batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
    finally:
        with contextlib.suppress(Exception):
            doc.close()

    # Meili upsert happens after the PDF is closed, in one request
    meili_out = {"status": "skipped"}
    if req.add_meili and (req.meili_index or MEILI_PDF_INDEX) and meili_docs:
        meili_out = await _meili_upsert_documents((req.meili_index or MEILI_PDF_INDEX), meili_docs)

    return {
        "status": "ok",
        "doc_id": doc_id,
        "pages_indexed": len(pages_list),
        "chunks_added_chroma": int(total_chunks),
        "meili": meili_out,
        "collection_effective": collection_name_eff,
    }


# (patch context, truncated mid-statement in SOURCE) "Registreer injecties"
# (Dutch: "Register injections") followed by: initialize_agent( app=app,