added PDF stuff

This commit is contained in:
admin 2026-02-23 16:39:46 +01:00
parent cbfddf128e
commit 3d620d13b9
2 changed files with 449 additions and 0 deletions

35
README_PDF.md Normal file
View File

@ -0,0 +1,35 @@
# PDF endpoints in toolserver
This build adds PDF ingestion, per-page text extraction, page rendering (plus optional embedded-image extraction), and indexing into Chroma / Meilisearch.
## Enable / config (env)
- ENABLE_PDF=1
- PDF_STORE_DIR=/data/pdf_store
- PDF_MAX_MB=80
- MEILI_PDF_INDEX=pdf_docs (optional; used by /pdf/{doc_id}/index when add_meili=true)
PyMuPDF is required:
- pip install pymupdf
- import name: `fitz`
## Endpoints
- POST /pdf/ingest (multipart/form-data, field: file)
-> {doc_id, n_pages, sha256}
- GET /pdf/{doc_id}/text?page=1&mode=blocks|text|dict
-> blocks includes bbox + text
- GET /pdf/{doc_id}/render?page=1&dpi=200
-> image/png (cached on disk)
- GET /pdf/{doc_id}/images?page=1
-> list of embedded images (xref ids)
- GET /pdf/{doc_id}/image/{xref}
-> download embedded image
- POST /pdf/{doc_id}/index
body: PdfIndexRequest
-> chunks to Chroma + optional Meili, supports extra_text_by_page for vision captions.

414
app.py
View File

@ -24,6 +24,12 @@ import chromadb
import git
import base64
# --- Optional PDF backend (PyMuPDF) ---
try:
import fitz # PyMuPDF
except Exception:
fitz = None
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, Request, HTTPException, Body
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
from fastapi.middleware.cors import CORSMiddleware
@ -5464,6 +5470,414 @@ async def _get_git_repo_async(repo_url: str, branch: str = "main") -> str:
# gitpython doet subprocess/IO → altijd in threadpool
return await run_in_threadpool(get_git_repo, repo_url, branch)
# ============================
# PDF endpoints (toolserver)
# ============================
# Feature flag: any value other than "0"/"false"/"no" enables the PDF routes.
PDF_ENABLED = (os.getenv("ENABLE_PDF", "1").lower() not in ("0", "false", "no"))
# On-disk store for uploaded PDFs, their metadata JSON, and render caches.
PDF_STORE_DIR = Path(os.getenv("PDF_STORE_DIR", "/data/pdf_store")).resolve()
# Upload size cap in megabytes (enforced while streaming in pdf_ingest).
PDF_MAX_MB = int(os.getenv("PDF_MAX_MB", "80"))
# Default Meilisearch index uid used by /pdf/{doc_id}/index when none is given.
MEILI_PDF_INDEX = os.getenv("MEILI_PDF_INDEX", "pdf_docs")
def _pdf_require():
    """Guard for every PDF route: raise unless the feature is enabled and PyMuPDF imports.

    Raises 404 when disabled via ENABLE_PDF, 500 when fitz is missing.
    The 404 check deliberately comes first so a disabled feature never
    leaks whether the backend is installed.
    """
    if not PDF_ENABLED:
        raise HTTPException(status_code=404, detail="PDF endpoints disabled (ENABLE_PDF=0)")
    backend = fitz
    if backend is None:
        raise HTTPException(status_code=500, detail="PyMuPDF (fitz) not installed in this container")
def _pdf_safe_doc_id(doc_id: str) -> str:
doc_id = (doc_id or "").strip()
if not re.fullmatch(r"[a-f0-9]{32}", doc_id):
raise HTTPException(status_code=400, detail="Invalid doc_id")
return doc_id
def _pdf_paths(doc_id: str) -> tuple[Path, Path]:
    """Map a doc_id to its on-disk (pdf, metadata-json) pair inside PDF_STORE_DIR.

    Validates the id first, so only safe filenames are ever built.
    """
    safe_id = _pdf_safe_doc_id(doc_id)
    return PDF_STORE_DIR / f"{safe_id}.pdf", PDF_STORE_DIR / f"{safe_id}.json"
def _pdf_load_meta(doc_id: str) -> dict:
    """Load the stored metadata JSON for *doc_id*.

    Raises 404 when the document is unknown, 500 when the JSON cannot be
    read or parsed.
    """
    _pdf, meta_path = _pdf_paths(doc_id)
    if not meta_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        raw = meta_path.read_text(encoding="utf-8")
        return json.loads(raw)
    except Exception:
        raise HTTPException(status_code=500, detail="corrupt metadata")
def _pdf_open(doc_id: str):
    """Open the stored PDF for *doc_id* with PyMuPDF and return the document handle.

    Raises 404 when the file is missing and 500 when PyMuPDF fails to open it.
    Caller is responsible for closing the returned document.
    """
    _pdf_require()
    pdf_path, _meta = _pdf_paths(doc_id)
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return fitz.open(str(pdf_path))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"failed to open pdf: {str(exc)}")
def _ensure_pdf_store_dir():
    """Best-effort creation of PDF_STORE_DIR at import time."""
    try:
        PDF_STORE_DIR.mkdir(parents=True, exist_ok=True)
    except Exception:
        # if we cannot create, we fail at runtime on ingest anyway
        pass

# Run once at module import so ingest normally finds the directory in place.
_ensure_pdf_store_dir()
@app.post("/pdf/ingest")
async def pdf_ingest(file: UploadFile = File(...)):
    """Upload a PDF: stream it to disk, validate it, return doc_id + page count + sha256.

    Raises 400 (missing filename / invalid PDF), 413 (over PDF_MAX_MB),
    500 (disk or metadata write failure). Partial files are removed on error.
    """
    _pdf_require()
    if not file.filename:
        raise HTTPException(status_code=400, detail="missing filename")
    ct = (file.content_type or "").lower()
    if ct and ("pdf" not in ct):
        # Do not hard-fail: some clients send a generic content-type.
        logger.warning("pdf_ingest: suspicious content-type: %s", ct)
    # size guard (best-effort, we stream to disk)
    doc_id = uuid.uuid4().hex
    pdf_path, meta_path = _pdf_paths(doc_id)
    sha = hashlib.sha256()
    total = 0
    try:
        # Stream in 1 MiB chunks so large files never sit fully in memory.
        with open(pdf_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                total += len(chunk)
                if total > PDF_MAX_MB * 1024 * 1024:
                    raise HTTPException(status_code=413, detail=f"PDF too large (> {PDF_MAX_MB} MB)")
                sha.update(chunk)
                f.write(chunk)
    except HTTPException:
        # Remove the partial file, then re-raise the original HTTP error (e.g. 413).
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise
    except Exception as e:
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write pdf: {str(e)}")
    # Open to count pages
    try:
        doc = fitz.open(str(pdf_path))
        n_pages = int(doc.page_count)
        doc.close()
    except Exception as e:
        # Unparseable upload: discard the file and report a client error.
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=400, detail=f"invalid PDF: {str(e)}")
    meta = {
        "doc_id": doc_id,
        "filename": file.filename,
        "sha256": sha.hexdigest(),
        "bytes": total,
        "n_pages": n_pages,
        "created_utc": time.time(),
    }
    try:
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:
        # Without metadata the doc is unusable, so drop the PDF too.
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write metadata: {str(e)}")
    return {"doc_id": doc_id, "n_pages": n_pages, "sha256": meta["sha256"]}
@app.get("/pdf/{doc_id}/text")
async def pdf_text(doc_id: str, page: int = 1, mode: str = "blocks"):
    """Return the text of one page.

    mode=text   -> plain page text
    mode=dict   -> PyMuPDF "dict" structure
    mode=blocks -> (default, and fallback for unknown modes) text blocks
                   with bounding boxes plus the joined page text
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # API pages are 1-based; PyMuPDF page indices are 0-based below.
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        mode = (mode or "blocks").lower().strip()
        if mode == "text":
            txt = p.get_text("text") or ""
            return {"doc_id": doc_id, "page": page, "text": txt}
        if mode == "dict":
            d = p.get_text("dict")
            return {"doc_id": doc_id, "page": page, "dict": d}
        # blocks
        blocks_raw = p.get_text("blocks") or []
        blocks = []
        texts = []
        for b in blocks_raw:
            # PyMuPDF block tuple: (x0, y0, x1, y1, text, block_no, block_type)
            try:
                x0, y0, x1, y1, t, block_no, block_type = b[:7]
                t = (t or "").strip()
                if t:
                    texts.append(t)
                blocks.append({
                    "bbox": [float(x0), float(y0), float(x1), float(y1)],
                    "text": t,
                    "block_no": int(block_no),
                    "block_type": int(block_type),
                })
            except Exception:
                # Skip malformed block tuples rather than failing the page.
                continue
        return {"doc_id": doc_id, "page": page, "text": "\n\n".join(texts), "blocks": blocks}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/render")
async def pdf_render(doc_id: str, page: int = 1, dpi: int = 200):
    """Render one page to PNG; results are cached on disk per (doc, page, dpi)."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    # Clamp DPI to a sane range to bound rasterizer memory/CPU.
    dpi = int(max(72, min(600, dpi)))
    cache_path = PDF_STORE_DIR / f"{doc_id}.p{page}.dpi{dpi}.png"
    if cache_path.exists():
        data = cache_path.read_bytes()
        return StreamingResponse(BytesIO(data), media_type="image/png")
    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        # PDF user space is 72 dpi; the matrix scales to the requested dpi.
        m = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        pix = p.get_pixmap(matrix=m, alpha=False)
        data = pix.tobytes("png")
        # Cache write is best-effort; serving the image takes priority.
        with contextlib.suppress(Exception):
            cache_path.write_bytes(data)
        return StreamingResponse(BytesIO(data), media_type="image/png")
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/images")
async def pdf_images(doc_id: str, page: int = 1):
    """List embedded images on a page; xref ids feed /pdf/{doc_id}/image/{xref}."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        imgs = []
        for it in (p.get_images(full=True) or []):
            # tuple shape: (xref, smask, width, height, bpc, colorspace, alt, name, filter)
            try:
                xref = int(it[0])
                imgs.append({
                    "xref": xref,
                    "smask": int(it[1]) if it[1] is not None else 0,
                    "width": int(it[2]),
                    "height": int(it[3]),
                    "bpc": int(it[4]),
                    "colorspace": str(it[5]),
                    "name": str(it[7]) if len(it) > 7 else "",
                    "filter": str(it[8]) if len(it) > 8 else "",
                })
            except Exception:
                # Skip malformed tuples instead of failing the whole listing.
                continue
        return {"doc_id": doc_id, "page": page, "images": imgs}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/image/{xref}")
async def pdf_image(doc_id: str, xref: int):
    """Download an embedded image by its PDF xref id."""
    _pdf_require()
    _pdf_load_meta(doc_id)  # validate existence
    xref = int(xref)
    if xref <= 0:
        raise HTTPException(status_code=400, detail="invalid xref")
    doc = _pdf_open(doc_id)
    try:
        info = doc.extract_image(xref)
        if not info:
            raise HTTPException(status_code=404, detail="image not found")
        img_bytes = info.get("image") or b""
        # Only png/jpeg get a specific media type; everything else is opaque bytes.
        ext = (info.get("ext") or "bin").lower()
        mt = "image/png" if ext == "png" else ("image/jpeg" if ext in ("jpg","jpeg") else "application/octet-stream")
        return StreamingResponse(BytesIO(img_bytes), media_type=mt)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        with contextlib.suppress(Exception):
            doc.close()
def _pdf_simple_chunks(text: str, target_chars: int = 1800, overlap: int = 200) -> list[str]:
text = (text or "").strip()
if not text:
return []
# normalize whitespace
text = re.sub(r"\r\n?", "\n", text)
# Split on paragraph boundaries first
paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
chunks = []
buf = ""
for p in paras:
if not buf:
buf = p
continue
if len(buf) + 2 + len(p) <= target_chars:
buf += "\n\n" + p
else:
chunks.append(buf)
# overlap tail
tail = buf[-overlap:] if overlap > 0 and len(buf) > overlap else ""
buf = (tail + "\n\n" + p).strip()
if buf:
chunks.append(buf)
# hard split very large chunks
out=[]
for c in chunks:
if len(c) <= target_chars * 1.7:
out.append(c)
else:
step = max(400, target_chars - overlap)
for i in range(0, len(c), step):
out.append(c[i:i+target_chars])
return [c for c in out if c.strip()]
async def _ensure_meili_index_generic(uid: str, primary_key: str = "id") -> bool:
    """Best-effort creation of a Meilisearch index.

    Returns False when Meili is not configured or *uid* is blank, True
    otherwise — even when creation fails (the index may already exist).
    """
    if not MEILI_URL:
        return False
    index_uid = (uid or "").strip()
    if not index_uid:
        return False
    # Creating an already-existing index errors server-side; that (and any
    # transient failure) is deliberately swallowed — this is best-effort.
    with contextlib.suppress(Exception):
        await _meili_req("POST", "/indexes", json_body={"uid": index_uid, "primaryKey": primary_key}, timeout=10.0)
    return True
async def _meili_upsert_documents(index_uid: str, docs: list[dict]) -> dict:
    """Upsert *docs* into a Meilisearch index, creating the index first if needed.

    Returns a status dict; skipped when Meili is unconfigured or docs is empty.
    """
    if not MEILI_URL:
        return {"status": "skipped", "reason": "MEILI_URL not set"}
    if not docs:
        return {"status": "skipped", "reason": "no docs"}
    await _ensure_meili_index_generic(index_uid, primary_key="id")
    resp = await _meili_req("POST", f"/indexes/{index_uid}/documents", json_body=docs, timeout=30.0)
    # Prefer the parsed JSON body; fall back to truncated raw text.
    try:
        body = resp.json()
    except Exception:
        body = (resp.text or "")[:4000]
    return {"status": "ok", "http": resp.status_code, "body": body}
class PdfIndexRequest(BaseModel):
    """Options for POST /pdf/{doc_id}/index: targets, chunking, and extra per-page text."""
    # Chroma collection to upsert chunks into (resolved via _collection_effective).
    collection_name: str = "pdf_docs"
    # Meilisearch index uid; empty string falls back to MEILI_PDF_INDEX.
    meili_index: str = ""
    pages: Union[str, List[int]] = "all"  # "all" or list of page numbers (1-based)
    # Chunking knobs passed through to _pdf_simple_chunks.
    chunk_target_chars: int = 1800
    chunk_overlap: int = 200
    # Keys are 1-based page numbers as strings.
    extra_text_by_page: Dict[str, str] = {}  # e.g. {"1": "vision caption ..."}
    add_meili: bool = True
    add_chroma: bool = True
@app.post("/pdf/{doc_id}/index")
async def pdf_index(doc_id: str, req: PdfIndexRequest = Body(default=PdfIndexRequest())):
    """Chunk page text and upsert it to Chroma (and optionally Meilisearch).

    Extra (e.g. vision-caption) text can be supplied per page via
    req.extra_text_by_page; it is appended under a [VISION] marker before
    chunking. Returns per-target counts and the effective collection name.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # Resolve the page selection: "all" or an explicit 1-based list.
    pages = req.pages
    if isinstance(pages, str):
        if pages.lower().strip() == "all":
            pages_list = list(range(1, n_pages + 1))
        else:
            raise HTTPException(status_code=400, detail="pages must be 'all' or list[int]")
    else:
        pages_list = [int(p) for p in pages]
        for p in pages_list:
            if p < 1 or p > n_pages:
                raise HTTPException(status_code=400, detail=f"page out of range in pages: {p}")
    collection_name_eff = _collection_effective(req.collection_name or "pdf_docs")
    collection = _get_collection(collection_name_eff) if req.add_chroma else None
    batch_docs: list[str] = []
    batch_meta: list[dict] = []
    batch_ids: list[str] = []
    BATCH_SIZE = 64  # Chroma upsert batch size
    total_chunks = 0
    meili_docs: list[dict] = []
    doc = _pdf_open(doc_id)
    try:
        for pno in pages_list:
            page_obj = doc.load_page(pno - 1)
            base_txt = (page_obj.get_text("text") or "").strip()
            extra = (req.extra_text_by_page or {}).get(str(pno), "").strip()
            merged = base_txt
            if extra:
                merged = (merged + "\n\n[VISION]\n" + extra).strip()
            chunks = _pdf_simple_chunks(merged, target_chars=int(req.chunk_target_chars), overlap=int(req.chunk_overlap))
            for ci, chunk in enumerate(chunks):
                # Stable chunk id: doc, page, chunk index, short content hash.
                h = hashlib.sha1(chunk.encode("utf-8", errors="ignore")).hexdigest()[:16]
                _id = f"pdf|{doc_id}|p{pno}|c{ci}|{h}"
                md = {
                    "source": "pdf",
                    "doc_id": doc_id,
                    "page": int(pno),
                    "filename": meta.get("filename",""),
                    "sha256": meta.get("sha256",""),
                }
                if req.add_chroma and collection is not None:
                    batch_docs.append(chunk)
                    batch_meta.append(md)
                    batch_ids.append(_id)
                    if len(batch_docs) >= BATCH_SIZE:
                        _collection_add(collection, batch_docs, batch_meta, batch_ids)
                        total_chunks += len(batch_docs)
                        batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
                if req.add_meili and (req.meili_index or MEILI_PDF_INDEX):
                    # Meili docs are buffered and pushed once after the loop.
                    meili_docs.append({
                        "id": _id,
                        "doc_id": doc_id,
                        "page": int(pno),
                        "filename": meta.get("filename",""),
                        "text": chunk,
                    })
        # flush chroma
        if req.add_chroma and collection is not None and batch_docs:
            _collection_add(collection, batch_docs, batch_meta, batch_ids)
            total_chunks += len(batch_docs)
            batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
    finally:
        with contextlib.suppress(Exception):
            doc.close()
    meili_out = {"status": "skipped"}
    if req.add_meili and (req.meili_index or MEILI_PDF_INDEX) and meili_docs:
        meili_out = await _meili_upsert_documents((req.meili_index or MEILI_PDF_INDEX), meili_docs)
    return {
        "status": "ok",
        "doc_id": doc_id,
        "pages_indexed": len(pages_list),
        "chunks_added_chroma": int(total_chunks),
        "meili": meili_out,
        "collection_effective": collection_name_eff,
    }
# Registreer injecties
initialize_agent(
app=app,