added PDF stuff

This commit is contained in:
admin 2026-02-23 16:39:46 +01:00
parent cbfddf128e
commit 3d620d13b9
2 changed files with 449 additions and 0 deletions

35
README_PDF.md Normal file
View File

@ -0,0 +1,35 @@
# PDF endpoints in toolserver
This build adds PDF ingestion + page text + page rendering (+ optional embedded image extraction) and indexing into Chroma / Meilisearch.
## Enable / config (env)
- ENABLE_PDF=1
- PDF_STORE_DIR=/data/pdf_store
- PDF_MAX_MB=80
- MEILI_PDF_INDEX=pdf_docs (optional; used by /pdf/{doc_id}/index when add_meili=true)
PyMuPDF is required:
- pip install pymupdf
- import name: `fitz`
## Endpoints
- POST /pdf/ingest (multipart/form-data, field: file)
-> {doc_id, n_pages, sha256}
- GET /pdf/{doc_id}/text?page=1&mode=blocks|text|dict
-> blocks includes bbox + text
- GET /pdf/{doc_id}/render?page=1&dpi=200
-> image/png (cached on disk)
- GET /pdf/{doc_id}/images?page=1
-> list of embedded images (xref ids)
- GET /pdf/{doc_id}/image/{xref}
-> download embedded image
- POST /pdf/{doc_id}/index
body: PdfIndexRequest
-> chunks to Chroma + optional Meili, supports extra_text_by_page for vision captions.

414
app.py
View File

@ -24,6 +24,12 @@ import chromadb
import git
import base64
# --- Optional PDF backend (PyMuPDF) ---
try:
import fitz # PyMuPDF
except Exception:
fitz = None
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, Request, HTTPException, Body
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
from fastapi.middleware.cors import CORSMiddleware
@ -5464,6 +5470,414 @@ async def _get_git_repo_async(repo_url: str, branch: str = "main") -> str:
    # gitpython doet subprocess/IO → altijd in threadpool
    return await run_in_threadpool(get_git_repo, repo_url, branch)
# ============================
# PDF endpoints (toolserver)
# ============================
# Feature flag: any value other than "0"/"false"/"no" enables the PDF routes.
PDF_ENABLED = (os.getenv("ENABLE_PDF", "1").lower() not in ("0", "false", "no"))
# On-disk store for uploaded PDFs, their sidecar metadata JSON and the render cache.
PDF_STORE_DIR = Path(os.getenv("PDF_STORE_DIR", "/data/pdf_store")).resolve()
# Upload size cap in megabytes, enforced while streaming the upload to disk.
PDF_MAX_MB = int(os.getenv("PDF_MAX_MB", "80"))
# Meilisearch index used by /pdf/{doc_id}/index when add_meili=true and no override is given.
MEILI_PDF_INDEX = os.getenv("MEILI_PDF_INDEX", "pdf_docs")
def _pdf_require():
    """Gate for every PDF route: 404 when the feature is disabled, 500 when PyMuPDF is absent."""
    if not PDF_ENABLED:
        raise HTTPException(status_code=404, detail="PDF endpoints disabled (ENABLE_PDF=0)")
    if fitz is None:
        # fitz is set to None at import time when PyMuPDF is not installed
        raise HTTPException(status_code=500, detail="PyMuPDF (fitz) not installed in this container")
def _pdf_safe_doc_id(doc_id: str) -> str:
doc_id = (doc_id or "").strip()
if not re.fullmatch(r"[a-f0-9]{32}", doc_id):
raise HTTPException(status_code=400, detail="Invalid doc_id")
return doc_id
def _pdf_paths(doc_id: str) -> tuple[Path, Path]:
    """Resolve (pdf_path, meta_path) inside PDF_STORE_DIR for a validated doc_id."""
    safe_id = _pdf_safe_doc_id(doc_id)
    return PDF_STORE_DIR / f"{safe_id}.pdf", PDF_STORE_DIR / f"{safe_id}.json"
def _pdf_load_meta(doc_id: str) -> dict:
    """Load the sidecar JSON metadata for doc_id; 404 if absent, 500 if unparsable."""
    _, meta_path = _pdf_paths(doc_id)
    if not meta_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return json.loads(meta_path.read_text(encoding="utf-8"))
    except Exception:
        raise HTTPException(status_code=500, detail="corrupt metadata")
def _pdf_open(doc_id: str):
    """Open the stored PDF with PyMuPDF; 404 when the file is missing, 500 when it cannot be opened."""
    _pdf_require()
    pdf_file, _unused_meta = _pdf_paths(doc_id)
    if not pdf_file.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return fitz.open(str(pdf_file))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"failed to open pdf: {str(e)}")
def _ensure_pdf_store_dir():
    """Best-effort creation of PDF_STORE_DIR; ingest surfaces the error later if this fails."""
    with contextlib.suppress(Exception):
        PDF_STORE_DIR.mkdir(parents=True, exist_ok=True)


_ensure_pdf_store_dir()
@app.post("/pdf/ingest")
async def pdf_ingest(file: UploadFile = File(...)):
    """Upload a PDF, stream it to disk, and return {doc_id, n_pages, sha256}.

    Raises 400 on a missing filename or invalid PDF, 413 when the upload
    exceeds PDF_MAX_MB, and 500 on storage errors. Partial files are
    removed before any error is propagated.
    """
    _pdf_require()
    if not file.filename:
        raise HTTPException(status_code=400, detail="missing filename")
    ct = (file.content_type or "").lower()
    if ct and ("pdf" not in ct):
        # not a hard failure: some clients send a generic content-type
        logger.warning("pdf_ingest: suspicious content-type: %s", ct)
    # size guard (best-effort, we stream to disk)
    doc_id = uuid.uuid4().hex
    pdf_path, meta_path = _pdf_paths(doc_id)
    sha = hashlib.sha256()
    total = 0
    try:
        with open(pdf_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)  # 1 MiB per read
                if not chunk:
                    break
                total += len(chunk)
                if total > PDF_MAX_MB * 1024 * 1024:
                    raise HTTPException(status_code=413, detail=f"PDF too large (> {PDF_MAX_MB} MB)")
                sha.update(chunk)
                f.write(chunk)
    except HTTPException:
        # remove the partial file before propagating the HTTP error
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise
    except Exception as e:
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write pdf: {str(e)}")
    # Open to count pages (also validates that the stored bytes are a real PDF)
    try:
        doc = fitz.open(str(pdf_path))
        n_pages = int(doc.page_count)
        doc.close()
    except Exception as e:
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=400, detail=f"invalid PDF: {str(e)}")
    # sidecar metadata JSON stored next to the PDF
    meta = {
        "doc_id": doc_id,
        "filename": file.filename,
        "sha256": sha.hexdigest(),
        "bytes": total,
        "n_pages": n_pages,
        "created_utc": time.time(),
    }
    try:
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write metadata: {str(e)}")
    return {"doc_id": doc_id, "n_pages": n_pages, "sha256": meta["sha256"]}
@app.get("/pdf/{doc_id}/text")
async def pdf_text(doc_id: str, page: int = 1, mode: str = "blocks"):
    """Per-page text extraction. mode is one of: blocks (default), text, dict."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if not 1 <= page <= n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        page_obj = doc.load_page(page - 1)
        mode = (mode or "blocks").lower().strip()
        if mode == "text":
            return {"doc_id": doc_id, "page": page, "text": page_obj.get_text("text") or ""}
        if mode == "dict":
            return {"doc_id": doc_id, "page": page, "dict": page_obj.get_text("dict")}
        # default: structured blocks with bounding boxes plus concatenated text
        parsed = []
        fragments = []
        for raw in (page_obj.get_text("blocks") or []):
            try:
                x0, y0, x1, y1, txt, block_no, block_type = raw[:7]
                txt = (txt or "").strip()
                if txt:
                    fragments.append(txt)
                parsed.append({
                    "bbox": [float(x0), float(y0), float(x1), float(y1)],
                    "text": txt,
                    "block_no": int(block_no),
                    "block_type": int(block_type),
                })
            except Exception:
                # malformed block tuples are skipped rather than failing the request
                continue
        return {"doc_id": doc_id, "page": page, "text": "\n\n".join(fragments), "blocks": parsed}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/render")
async def pdf_render(doc_id: str, page: int = 1, dpi: int = 200):
    """Render one page to PNG; results are cached on disk per doc/page/dpi."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if not 1 <= page <= n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    # clamp DPI to a sane range before it becomes part of the cache key
    dpi = int(max(72, min(600, dpi)))
    cache_path = PDF_STORE_DIR / f"{doc_id}.p{page}.dpi{dpi}.png"
    if cache_path.exists():
        return StreamingResponse(BytesIO(cache_path.read_bytes()), media_type="image/png")
    doc = _pdf_open(doc_id)
    try:
        zoom = dpi / 72.0
        pixmap = doc.load_page(page - 1).get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        png_bytes = pixmap.tobytes("png")
        # cache write is best-effort; a failure only costs a re-render later
        with contextlib.suppress(Exception):
            cache_path.write_bytes(png_bytes)
        return StreamingResponse(BytesIO(png_bytes), media_type="image/png")
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/images")
async def pdf_images(doc_id: str, page: int = 1):
    """List embedded images on a page: xref ids plus basic geometry/format info."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if not 1 <= page <= n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        entries = []
        # tuple shape: (xref, smask, width, height, bpc, colorspace, alt, name, filter)
        for tup in (doc.load_page(page - 1).get_images(full=True) or []):
            try:
                entries.append({
                    "xref": int(tup[0]),
                    "smask": int(tup[1]) if tup[1] is not None else 0,
                    "width": int(tup[2]),
                    "height": int(tup[3]),
                    "bpc": int(tup[4]),
                    "colorspace": str(tup[5]),
                    "name": str(tup[7]) if len(tup) > 7 else "",
                    "filter": str(tup[8]) if len(tup) > 8 else "",
                })
            except Exception:
                # skip malformed tuples instead of failing the whole listing
                continue
        return {"doc_id": doc_id, "page": page, "images": entries}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/image/{xref}")
async def pdf_image(doc_id: str, xref: int):
    """Download one embedded image by its xref id."""
    _pdf_require()
    _pdf_load_meta(doc_id)  # 404 early when the document is unknown
    xref = int(xref)
    if xref <= 0:
        raise HTTPException(status_code=400, detail="invalid xref")
    doc = _pdf_open(doc_id)
    try:
        info = doc.extract_image(xref)
        if not info:
            raise HTTPException(status_code=404, detail="image not found")
        payload = info.get("image") or b""
        ext = (info.get("ext") or "bin").lower()
        if ext == "png":
            media = "image/png"
        elif ext in ("jpg", "jpeg"):
            media = "image/jpeg"
        else:
            media = "application/octet-stream"
        return StreamingResponse(BytesIO(payload), media_type=media)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        with contextlib.suppress(Exception):
            doc.close()
def _pdf_simple_chunks(text: str, target_chars: int = 1800, overlap: int = 200) -> list[str]:
text = (text or "").strip()
if not text:
return []
# normalize whitespace
text = re.sub(r"\r\n?", "\n", text)
# Split on paragraph boundaries first
paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
chunks = []
buf = ""
for p in paras:
if not buf:
buf = p
continue
if len(buf) + 2 + len(p) <= target_chars:
buf += "\n\n" + p
else:
chunks.append(buf)
# overlap tail
tail = buf[-overlap:] if overlap > 0 and len(buf) > overlap else ""
buf = (tail + "\n\n" + p).strip()
if buf:
chunks.append(buf)
# hard split very large chunks
out=[]
for c in chunks:
if len(c) <= target_chars * 1.7:
out.append(c)
else:
step = max(400, target_chars - overlap)
for i in range(0, len(c), step):
out.append(c[i:i+target_chars])
return [c for c in out if c.strip()]
async def _ensure_meili_index_generic(uid: str, primary_key: str = "id") -> bool:
    """Create a Meilisearch index if it does not exist yet.

    Returns False when Meili is unconfigured or uid is empty; True otherwise
    (creation errors, e.g. "already exists", are deliberately ignored).
    """
    if not MEILI_URL:
        return False
    index_uid = (uid or "").strip()
    if not index_uid:
        return False
    # create index; a failure here most likely means it already exists
    with contextlib.suppress(Exception):
        await _meili_req("POST", "/indexes", json_body={"uid": index_uid, "primaryKey": primary_key}, timeout=10.0)
    return True
async def _meili_upsert_documents(index_uid: str, docs: list[dict]) -> dict:
    """Upsert docs into a Meili index (creating it first); returns a status dict."""
    if not MEILI_URL:
        return {"status": "skipped", "reason": "MEILI_URL not set"}
    if not docs:
        return {"status": "skipped", "reason": "no docs"}
    await _ensure_meili_index_generic(index_uid, primary_key="id")
    resp = await _meili_req("POST", f"/indexes/{index_uid}/documents", json_body=docs, timeout=30.0)
    try:
        payload = resp.json()
    except Exception:
        # non-JSON response body: return a truncated text snapshot instead
        payload = (resp.text or "")[:4000]
    return {"status": "ok", "http": resp.status_code, "body": payload}
class PdfIndexRequest(BaseModel):
    """Request body for POST /pdf/{doc_id}/index."""
    collection_name: str = "pdf_docs"  # Chroma collection (resolved via _collection_effective)
    meili_index: str = ""  # overrides MEILI_PDF_INDEX when non-empty
    pages: Union[str, List[int]] = "all"  # "all" or list of page numbers (1-based)
    chunk_target_chars: int = 1800  # soft chunk size passed to _pdf_simple_chunks
    chunk_overlap: int = 200  # overlap tail carried between consecutive chunks
    extra_text_by_page: Dict[str, str] = {}  # e.g. {"1": "vision caption ..."}; keys are page numbers as strings
    add_meili: bool = True  # also upsert chunks into Meilisearch
    add_chroma: bool = True  # upsert chunks into Chroma
@app.post("/pdf/{doc_id}/index")
async def pdf_index(doc_id: str, req: PdfIndexRequest = Body(default=PdfIndexRequest())):
    """Chunk per-page text and upsert into Chroma (and optionally Meilisearch).

    Callers may supply extra (vision) text per page via req.extra_text_by_page;
    it is appended to the extracted text under a [VISION] marker before chunking.
    Raises 400 on an out-of-range or malformed pages selection.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # resolve requested pages: "all" or an explicit 1-based list
    pages = req.pages
    if isinstance(pages, str):
        if pages.lower().strip() == "all":
            pages_list = list(range(1, n_pages + 1))
        else:
            raise HTTPException(status_code=400, detail="pages must be 'all' or list[int]")
    else:
        pages_list = [int(p) for p in pages]
    for p in pages_list:
        if p < 1 or p > n_pages:
            raise HTTPException(status_code=400, detail=f"page out of range in pages: {p}")
    collection_name_eff = _collection_effective(req.collection_name or "pdf_docs")
    collection = _get_collection(collection_name_eff) if req.add_chroma else None
    batch_docs: list[str] = []
    batch_meta: list[dict] = []
    batch_ids: list[str] = []
    BATCH_SIZE = 64  # flush to Chroma in batches of this many chunks
    total_chunks = 0
    meili_docs: list[dict] = []
    doc = _pdf_open(doc_id)
    try:
        for pno in pages_list:
            page_obj = doc.load_page(pno - 1)
            base_txt = (page_obj.get_text("text") or "").strip()
            # optional caller-supplied text (e.g. vision captions), keyed by page number as string
            extra = (req.extra_text_by_page or {}).get(str(pno), "").strip()
            merged = base_txt
            if extra:
                merged = (merged + "\n\n[VISION]\n" + extra).strip()
            chunks = _pdf_simple_chunks(merged, target_chars=int(req.chunk_target_chars), overlap=int(req.chunk_overlap))
            for ci, chunk in enumerate(chunks):
                # deterministic id: pdf|<doc>|p<page>|c<chunk>|<16-char sha1 of text>
                h = hashlib.sha1(chunk.encode("utf-8", errors="ignore")).hexdigest()[:16]
                _id = f"pdf|{doc_id}|p{pno}|c{ci}|{h}"
                md = {
                    "source": "pdf",
                    "doc_id": doc_id,
                    "page": int(pno),
                    "filename": meta.get("filename",""),
                    "sha256": meta.get("sha256",""),
                }
                if req.add_chroma and collection is not None:
                    batch_docs.append(chunk)
                    batch_meta.append(md)
                    batch_ids.append(_id)
                    if len(batch_docs) >= BATCH_SIZE:
                        _collection_add(collection, batch_docs, batch_meta, batch_ids)
                        total_chunks += len(batch_docs)
                        batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
                if req.add_meili and (req.meili_index or MEILI_PDF_INDEX):
                    meili_docs.append({
                        "id": _id,
                        "doc_id": doc_id,
                        "page": int(pno),
                        "filename": meta.get("filename",""),
                        "text": chunk,
                    })
        # flush chroma
        if req.add_chroma and collection is not None and batch_docs:
            _collection_add(collection, batch_docs, batch_meta, batch_ids)
            total_chunks += len(batch_docs)
            batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
    finally:
        with contextlib.suppress(Exception):
            doc.close()
    # Meili upsert happens once, after the PDF handle has been released
    meili_out = {"status": "skipped"}
    if req.add_meili and (req.meili_index or MEILI_PDF_INDEX) and meili_docs:
        meili_out = await _meili_upsert_documents((req.meili_index or MEILI_PDF_INDEX), meili_docs)
    return {
        "status": "ok",
        "doc_id": doc_id,
        "pages_indexed": len(pages_list),
        "chunks_added_chroma": int(total_chunks),
        "meili": meili_out,
        "collection_effective": collection_name_eff,
    }
# Registreer injecties
initialize_agent(
    app=app,