added PDF stuff

This commit is contained in:
admin 2026-02-23 16:39:46 +01:00
parent cbfddf128e
commit 3d620d13b9
2 changed files with 449 additions and 0 deletions

35
README_PDF.md Normal file
View File

@ -0,0 +1,35 @@
# PDF endpoints in toolserver
This build adds PDF ingestion, per-page text extraction, page rendering (plus optional embedded-image extraction), and indexing into Chroma / Meilisearch.
## Enable / config (env)
- ENABLE_PDF=1
- PDF_STORE_DIR=/data/pdf_store
- PDF_MAX_MB=80
- MEILI_PDF_INDEX=pdf_docs (optional; used by /pdf/{doc_id}/index when add_meili=true)
PyMuPDF is required:
- pip install pymupdf
- import name: `fitz`
## Endpoints
- POST /pdf/ingest (multipart/form-data, field: file)
-> {doc_id, n_pages, sha256}
- GET /pdf/{doc_id}/text?page=1&mode=blocks|text|dict
-> blocks includes bbox + text
- GET /pdf/{doc_id}/render?page=1&dpi=200
-> image/png (cached on disk)
- GET /pdf/{doc_id}/images?page=1
-> list of embedded images (xref ids)
- GET /pdf/{doc_id}/image/{xref}
-> download embedded image
- POST /pdf/{doc_id}/index
body: PdfIndexRequest
-> chunks to Chroma + optional Meili, supports extra_text_by_page for vision captions.

414
app.py
View File

@ -24,6 +24,12 @@ import chromadb
import git
import base64
# --- Optional PDF backend (PyMuPDF) ---
try:
import fitz # PyMuPDF
except Exception:
fitz = None
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, Request, HTTPException, Body
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
from fastapi.middleware.cors import CORSMiddleware
@ -5464,6 +5470,414 @@ async def _get_git_repo_async(repo_url: str, branch: str = "main") -> str:
# gitpython doet subprocess/IO → altijd in threadpool
return await run_in_threadpool(get_git_repo, repo_url, branch)
# ============================
# PDF endpoints (toolserver)
# ============================
# Feature flag: any value other than "0"/"false"/"no" enables the PDF routes.
PDF_ENABLED = (os.getenv("ENABLE_PDF", "1").lower() not in ("0", "false", "no"))
# On-disk store for uploaded PDFs, their metadata JSON, and render caches.
PDF_STORE_DIR = Path(os.getenv("PDF_STORE_DIR", "/data/pdf_store")).resolve()
# Upload size cap in megabytes (enforced while streaming in pdf_ingest).
PDF_MAX_MB = int(os.getenv("PDF_MAX_MB", "80"))
# Default Meilisearch index uid used by /pdf/{doc_id}/index when none is given.
MEILI_PDF_INDEX = os.getenv("MEILI_PDF_INDEX", "pdf_docs")
def _pdf_require():
    """Guard for every PDF route: raise unless the feature is enabled and PyMuPDF imports.

    Raises 404 when disabled via ENABLE_PDF, 500 when fitz is missing.
    The 404 check deliberately comes first so a disabled feature never
    leaks whether the backend is installed.
    """
    if not PDF_ENABLED:
        raise HTTPException(status_code=404, detail="PDF endpoints disabled (ENABLE_PDF=0)")
    backend = fitz
    if backend is None:
        raise HTTPException(status_code=500, detail="PyMuPDF (fitz) not installed in this container")
def _pdf_safe_doc_id(doc_id: str) -> str:
doc_id = (doc_id or "").strip()
if not re.fullmatch(r"[a-f0-9]{32}", doc_id):
raise HTTPException(status_code=400, detail="Invalid doc_id")
return doc_id
def _pdf_paths(doc_id: str) -> tuple[Path, Path]:
    """Map a doc_id to its on-disk (pdf, metadata-json) pair inside PDF_STORE_DIR.

    Validates the id first, so only safe filenames are ever built.
    """
    safe_id = _pdf_safe_doc_id(doc_id)
    return PDF_STORE_DIR / f"{safe_id}.pdf", PDF_STORE_DIR / f"{safe_id}.json"
def _pdf_load_meta(doc_id: str) -> dict:
    """Load the stored metadata JSON for *doc_id*.

    Raises 404 when the document is unknown, 500 when the JSON cannot be
    read or parsed.
    """
    _pdf, meta_path = _pdf_paths(doc_id)
    if not meta_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        raw = meta_path.read_text(encoding="utf-8")
        return json.loads(raw)
    except Exception:
        raise HTTPException(status_code=500, detail="corrupt metadata")
def _pdf_open(doc_id: str):
    """Open the stored PDF for *doc_id* with PyMuPDF and return the document handle.

    Raises 404 when the file is missing and 500 when PyMuPDF fails to open it.
    Caller is responsible for closing the returned document.
    """
    _pdf_require()
    pdf_path, _meta = _pdf_paths(doc_id)
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return fitz.open(str(pdf_path))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"failed to open pdf: {str(exc)}")
def _ensure_pdf_store_dir():
    """Best-effort creation of PDF_STORE_DIR at import time."""
    try:
        PDF_STORE_DIR.mkdir(parents=True, exist_ok=True)
    except Exception:
        # if we cannot create, we fail at runtime on ingest anyway
        pass

# Run once at module import so ingest normally finds the directory in place.
_ensure_pdf_store_dir()
@app.post("/pdf/ingest")
async def pdf_ingest(file: UploadFile = File(...)):
    """Upload a PDF: stream it to disk, validate it, return doc_id + page count + sha256.

    Raises 400 (missing filename / invalid PDF), 413 (over PDF_MAX_MB),
    500 (disk or metadata write failure). Partial files are removed on error.
    """
    _pdf_require()
    if not file.filename:
        raise HTTPException(status_code=400, detail="missing filename")
    ct = (file.content_type or "").lower()
    if ct and ("pdf" not in ct):
        # Do not hard-fail: some clients send a generic content-type.
        logger.warning("pdf_ingest: suspicious content-type: %s", ct)
    # size guard (best-effort, we stream to disk)
    doc_id = uuid.uuid4().hex
    pdf_path, meta_path = _pdf_paths(doc_id)
    sha = hashlib.sha256()
    total = 0
    try:
        # Stream in 1 MiB chunks so large files never sit fully in memory.
        with open(pdf_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                total += len(chunk)
                if total > PDF_MAX_MB * 1024 * 1024:
                    raise HTTPException(status_code=413, detail=f"PDF too large (> {PDF_MAX_MB} MB)")
                sha.update(chunk)
                f.write(chunk)
    except HTTPException:
        # Remove the partial file, then re-raise the original HTTP error (e.g. 413).
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise
    except Exception as e:
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write pdf: {str(e)}")
    # Open to count pages
    try:
        doc = fitz.open(str(pdf_path))
        n_pages = int(doc.page_count)
        doc.close()
    except Exception as e:
        # Unparseable upload: discard the file and report a client error.
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=400, detail=f"invalid PDF: {str(e)}")
    meta = {
        "doc_id": doc_id,
        "filename": file.filename,
        "sha256": sha.hexdigest(),
        "bytes": total,
        "n_pages": n_pages,
        "created_utc": time.time(),
    }
    try:
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:
        # Without metadata the doc is unusable, so drop the PDF too.
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write metadata: {str(e)}")
    return {"doc_id": doc_id, "n_pages": n_pages, "sha256": meta["sha256"]}
@app.get("/pdf/{doc_id}/text")
async def pdf_text(doc_id: str, page: int = 1, mode: str = "blocks"):
    """Return the text of one page.

    mode=text   -> plain page text
    mode=dict   -> PyMuPDF "dict" structure
    mode=blocks -> (default, and fallback for unknown modes) text blocks
                   with bounding boxes plus the joined page text
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # API pages are 1-based; PyMuPDF page indices are 0-based below.
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        mode = (mode or "blocks").lower().strip()
        if mode == "text":
            txt = p.get_text("text") or ""
            return {"doc_id": doc_id, "page": page, "text": txt}
        if mode == "dict":
            d = p.get_text("dict")
            return {"doc_id": doc_id, "page": page, "dict": d}
        # blocks
        blocks_raw = p.get_text("blocks") or []
        blocks = []
        texts = []
        for b in blocks_raw:
            # PyMuPDF block tuple: (x0, y0, x1, y1, text, block_no, block_type)
            try:
                x0, y0, x1, y1, t, block_no, block_type = b[:7]
                t = (t or "").strip()
                if t:
                    texts.append(t)
                blocks.append({
                    "bbox": [float(x0), float(y0), float(x1), float(y1)],
                    "text": t,
                    "block_no": int(block_no),
                    "block_type": int(block_type),
                })
            except Exception:
                # Skip malformed block tuples rather than failing the page.
                continue
        return {"doc_id": doc_id, "page": page, "text": "\n\n".join(texts), "blocks": blocks}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/render")
async def pdf_render(doc_id: str, page: int = 1, dpi: int = 200):
    """Render one page to PNG; results are cached on disk per (doc, page, dpi)."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    # Clamp DPI to a sane range to bound rasterizer memory/CPU.
    dpi = int(max(72, min(600, dpi)))
    cache_path = PDF_STORE_DIR / f"{doc_id}.p{page}.dpi{dpi}.png"
    if cache_path.exists():
        data = cache_path.read_bytes()
        return StreamingResponse(BytesIO(data), media_type="image/png")
    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        # PDF user space is 72 dpi; the matrix scales to the requested dpi.
        m = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        pix = p.get_pixmap(matrix=m, alpha=False)
        data = pix.tobytes("png")
        # Cache write is best-effort; serving the image takes priority.
        with contextlib.suppress(Exception):
            cache_path.write_bytes(data)
        return StreamingResponse(BytesIO(data), media_type="image/png")
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/images")
async def pdf_images(doc_id: str, page: int = 1):
    """List embedded images on a page; xref ids feed /pdf/{doc_id}/image/{xref}."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)
        imgs = []
        for it in (p.get_images(full=True) or []):
            # tuple shape: (xref, smask, width, height, bpc, colorspace, alt, name, filter)
            try:
                xref = int(it[0])
                imgs.append({
                    "xref": xref,
                    "smask": int(it[1]) if it[1] is not None else 0,
                    "width": int(it[2]),
                    "height": int(it[3]),
                    "bpc": int(it[4]),
                    "colorspace": str(it[5]),
                    "name": str(it[7]) if len(it) > 7 else "",
                    "filter": str(it[8]) if len(it) > 8 else "",
                })
            except Exception:
                # Skip malformed tuples instead of failing the whole listing.
                continue
        return {"doc_id": doc_id, "page": page, "images": imgs}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/image/{xref}")
async def pdf_image(doc_id: str, xref: int):
    """Download an embedded image by its PDF xref id."""
    _pdf_require()
    _pdf_load_meta(doc_id)  # validate existence
    xref = int(xref)
    if xref <= 0:
        raise HTTPException(status_code=400, detail="invalid xref")
    doc = _pdf_open(doc_id)
    try:
        info = doc.extract_image(xref)
        if not info:
            raise HTTPException(status_code=404, detail="image not found")
        img_bytes = info.get("image") or b""
        # Only png/jpeg get a specific media type; everything else is opaque bytes.
        ext = (info.get("ext") or "bin").lower()
        mt = "image/png" if ext == "png" else ("image/jpeg" if ext in ("jpg","jpeg") else "application/octet-stream")
        return StreamingResponse(BytesIO(img_bytes), media_type=mt)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        with contextlib.suppress(Exception):
            doc.close()
def _pdf_simple_chunks(text: str, target_chars: int = 1800, overlap: int = 200) -> list[str]:
text = (text or "").strip()
if not text:
return []
# normalize whitespace
text = re.sub(r"\r\n?", "\n", text)
# Split on paragraph boundaries first
paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
chunks = []
buf = ""
for p in paras:
if not buf:
buf = p
continue
if len(buf) + 2 + len(p) <= target_chars:
buf += "\n\n" + p
else:
chunks.append(buf)
# overlap tail
tail = buf[-overlap:] if overlap > 0 and len(buf) > overlap else ""
buf = (tail + "\n\n" + p).strip()
if buf:
chunks.append(buf)
# hard split very large chunks
out=[]
for c in chunks:
if len(c) <= target_chars * 1.7:
out.append(c)
else:
step = max(400, target_chars - overlap)
for i in range(0, len(c), step):
out.append(c[i:i+target_chars])
return [c for c in out if c.strip()]
async def _ensure_meili_index_generic(uid: str, primary_key: str = "id") -> bool:
    """Best-effort creation of a Meilisearch index.

    Returns False when Meili is not configured or *uid* is blank, True
    otherwise — even when creation fails (the index may already exist).
    """
    if not MEILI_URL:
        return False
    index_uid = (uid or "").strip()
    if not index_uid:
        return False
    # Creating an already-existing index errors server-side; that (and any
    # transient failure) is deliberately swallowed — this is best-effort.
    with contextlib.suppress(Exception):
        await _meili_req("POST", "/indexes", json_body={"uid": index_uid, "primaryKey": primary_key}, timeout=10.0)
    return True
async def _meili_upsert_documents(index_uid: str, docs: list[dict]) -> dict:
    """Upsert *docs* into a Meilisearch index, creating the index first if needed.

    Returns a status dict; skipped when Meili is unconfigured or docs is empty.
    """
    if not MEILI_URL:
        return {"status": "skipped", "reason": "MEILI_URL not set"}
    if not docs:
        return {"status": "skipped", "reason": "no docs"}
    await _ensure_meili_index_generic(index_uid, primary_key="id")
    resp = await _meili_req("POST", f"/indexes/{index_uid}/documents", json_body=docs, timeout=30.0)
    # Prefer the parsed JSON body; fall back to truncated raw text.
    try:
        body = resp.json()
    except Exception:
        body = (resp.text or "")[:4000]
    return {"status": "ok", "http": resp.status_code, "body": body}
class PdfIndexRequest(BaseModel):
    """Options for POST /pdf/{doc_id}/index: targets, chunking, and extra per-page text."""
    # Chroma collection to upsert chunks into (resolved via _collection_effective).
    collection_name: str = "pdf_docs"
    # Meilisearch index uid; empty string falls back to MEILI_PDF_INDEX.
    meili_index: str = ""
    pages: Union[str, List[int]] = "all"  # "all" or list of page numbers (1-based)
    # Chunking knobs passed through to _pdf_simple_chunks.
    chunk_target_chars: int = 1800
    chunk_overlap: int = 200
    # Keys are 1-based page numbers as strings.
    extra_text_by_page: Dict[str, str] = {}  # e.g. {"1": "vision caption ..."}
    add_meili: bool = True
    add_chroma: bool = True
@app.post("/pdf/{doc_id}/index")
async def pdf_index(doc_id: str, req: PdfIndexRequest = Body(default=PdfIndexRequest())):
    """Chunk page text and upsert it to Chroma (and optionally Meilisearch).

    Extra (e.g. vision-caption) text can be supplied per page via
    req.extra_text_by_page; it is appended under a [VISION] marker before
    chunking. Returns per-target counts and the effective collection name.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # Resolve the page selection: "all" or an explicit 1-based list.
    pages = req.pages
    if isinstance(pages, str):
        if pages.lower().strip() == "all":
            pages_list = list(range(1, n_pages + 1))
        else:
            raise HTTPException(status_code=400, detail="pages must be 'all' or list[int]")
    else:
        pages_list = [int(p) for p in pages]
        for p in pages_list:
            if p < 1 or p > n_pages:
                raise HTTPException(status_code=400, detail=f"page out of range in pages: {p}")
    collection_name_eff = _collection_effective(req.collection_name or "pdf_docs")
    collection = _get_collection(collection_name_eff) if req.add_chroma else None
    batch_docs: list[str] = []
    batch_meta: list[dict] = []
    batch_ids: list[str] = []
    BATCH_SIZE = 64  # Chroma upsert batch size
    total_chunks = 0
    meili_docs: list[dict] = []
    doc = _pdf_open(doc_id)
    try:
        for pno in pages_list:
            page_obj = doc.load_page(pno - 1)
            base_txt = (page_obj.get_text("text") or "").strip()
            extra = (req.extra_text_by_page or {}).get(str(pno), "").strip()
            merged = base_txt
            if extra:
                merged = (merged + "\n\n[VISION]\n" + extra).strip()
            chunks = _pdf_simple_chunks(merged, target_chars=int(req.chunk_target_chars), overlap=int(req.chunk_overlap))
            for ci, chunk in enumerate(chunks):
                # Stable chunk id: doc, page, chunk index, short content hash.
                h = hashlib.sha1(chunk.encode("utf-8", errors="ignore")).hexdigest()[:16]
                _id = f"pdf|{doc_id}|p{pno}|c{ci}|{h}"
                md = {
                    "source": "pdf",
                    "doc_id": doc_id,
                    "page": int(pno),
                    "filename": meta.get("filename",""),
                    "sha256": meta.get("sha256",""),
                }
                if req.add_chroma and collection is not None:
                    batch_docs.append(chunk)
                    batch_meta.append(md)
                    batch_ids.append(_id)
                    if len(batch_docs) >= BATCH_SIZE:
                        _collection_add(collection, batch_docs, batch_meta, batch_ids)
                        total_chunks += len(batch_docs)
                        batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
                if req.add_meili and (req.meili_index or MEILI_PDF_INDEX):
                    # Meili docs are buffered and pushed once after the loop.
                    meili_docs.append({
                        "id": _id,
                        "doc_id": doc_id,
                        "page": int(pno),
                        "filename": meta.get("filename",""),
                        "text": chunk,
                    })
        # flush chroma
        if req.add_chroma and collection is not None and batch_docs:
            _collection_add(collection, batch_docs, batch_meta, batch_ids)
            total_chunks += len(batch_docs)
            batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
    finally:
        with contextlib.suppress(Exception):
            doc.close()
    meili_out = {"status": "skipped"}
    if req.add_meili and (req.meili_index or MEILI_PDF_INDEX) and meili_docs:
        meili_out = await _meili_upsert_documents((req.meili_index or MEILI_PDF_INDEX), meili_docs)
    return {
        "status": "ok",
        "doc_id": doc_id,
        "pages_indexed": len(pages_list),
        "chunks_added_chroma": int(total_chunks),
        "meili": meili_out,
        "collection_effective": collection_name_eff,
    }
# Registreer injecties
initialize_agent(
app=app,