added PDF stuff

This commit is contained in:
admin 2026-02-23 16:39:46 +01:00
parent cbfddf128e
commit 3d620d13b9
2 changed files with 449 additions and 0 deletions

35
README_PDF.md Normal file
View File

@ -0,0 +1,35 @@
# PDF endpoints in toolserver
This build adds PDF ingestion + page text + page rendering (+ optional embedded image extraction) and indexing into Chroma / Meilisearch.
## Enable / config (env)
- ENABLE_PDF=1
- PDF_STORE_DIR=/data/pdf_store
- PDF_MAX_MB=80
- MEILI_PDF_INDEX=pdf_docs (optional; used by /pdf/{doc_id}/index when add_meili=true)
PyMuPDF is required:
- pip install pymupdf
- import name: `fitz`
## Endpoints
- POST /pdf/ingest (multipart/form-data, field: file)
-> {doc_id, n_pages, sha256}
- GET /pdf/{doc_id}/text?page=1&mode=blocks|text|dict
-> blocks includes bbox + text
- GET /pdf/{doc_id}/render?page=1&dpi=200
-> image/png (cached on disk)
- GET /pdf/{doc_id}/images?page=1
-> list of embedded images (xref ids)
- GET /pdf/{doc_id}/image/{xref}
-> download embedded image
- POST /pdf/{doc_id}/index
body: PdfIndexRequest
-> chunks to Chroma + optional Meili, supports extra_text_by_page for vision captions.

414
app.py
View File

@ -24,6 +24,12 @@ import chromadb
import git
import base64
# --- Optional PDF backend (PyMuPDF) ---
try:
import fitz # PyMuPDF
except Exception:
fitz = None
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, Request, HTTPException, Body
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
from fastapi.middleware.cors import CORSMiddleware
@ -5464,6 +5470,414 @@ async def _get_git_repo_async(repo_url: str, branch: str = "main") -> str:
    # gitpython doet subprocess/IO → altijd in threadpool
    return await run_in_threadpool(get_git_repo, repo_url, branch)
# ============================
# PDF endpoints (toolserver)
# ============================
# Feature flag: any value other than "0"/"false"/"no" enables the PDF routes.
PDF_ENABLED = (os.getenv("ENABLE_PDF", "1").lower() not in ("0", "false", "no"))
# On-disk store for uploaded PDFs, their sidecar metadata JSON and the render cache.
PDF_STORE_DIR = Path(os.getenv("PDF_STORE_DIR", "/data/pdf_store")).resolve()
# Upload size cap in megabytes, enforced while streaming the upload to disk.
PDF_MAX_MB = int(os.getenv("PDF_MAX_MB", "80"))
# Meilisearch index used by /pdf/{doc_id}/index when add_meili=true and no override is given.
MEILI_PDF_INDEX = os.getenv("MEILI_PDF_INDEX", "pdf_docs")
def _pdf_require():
    """Gate for every PDF route: 404 when the feature is disabled, 500 when PyMuPDF is absent."""
    if not PDF_ENABLED:
        raise HTTPException(status_code=404, detail="PDF endpoints disabled (ENABLE_PDF=0)")
    if fitz is None:
        # fitz is set to None at import time when PyMuPDF is not installed
        raise HTTPException(status_code=500, detail="PyMuPDF (fitz) not installed in this container")
def _pdf_safe_doc_id(doc_id: str) -> str:
doc_id = (doc_id or "").strip()
if not re.fullmatch(r"[a-f0-9]{32}", doc_id):
raise HTTPException(status_code=400, detail="Invalid doc_id")
return doc_id
def _pdf_paths(doc_id: str) -> tuple[Path, Path]:
    """Resolve (pdf_path, meta_path) inside PDF_STORE_DIR for a validated doc_id."""
    safe_id = _pdf_safe_doc_id(doc_id)
    return PDF_STORE_DIR / f"{safe_id}.pdf", PDF_STORE_DIR / f"{safe_id}.json"
def _pdf_load_meta(doc_id: str) -> dict:
    """Load the sidecar JSON metadata for doc_id; 404 if absent, 500 if unparsable."""
    _, meta_path = _pdf_paths(doc_id)
    if not meta_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return json.loads(meta_path.read_text(encoding="utf-8"))
    except Exception:
        raise HTTPException(status_code=500, detail="corrupt metadata")
def _pdf_open(doc_id: str):
    """Open the stored PDF with PyMuPDF; 404 when the file is missing, 500 when it cannot be opened."""
    _pdf_require()
    pdf_file, _unused_meta = _pdf_paths(doc_id)
    if not pdf_file.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return fitz.open(str(pdf_file))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"failed to open pdf: {str(e)}")
def _ensure_pdf_store_dir():
    """Best-effort creation of PDF_STORE_DIR; ingest surfaces the error later if this fails."""
    with contextlib.suppress(Exception):
        PDF_STORE_DIR.mkdir(parents=True, exist_ok=True)


_ensure_pdf_store_dir()
@app.post("/pdf/ingest")
async def pdf_ingest(file: UploadFile = File(...)):
    """Upload a PDF, stream it to disk, and return {doc_id, n_pages, sha256}.

    Raises 400 on a missing filename or invalid PDF, 413 when the upload
    exceeds PDF_MAX_MB, and 500 on storage errors. Partial files are
    removed before any error is propagated.
    """
    _pdf_require()
    if not file.filename:
        raise HTTPException(status_code=400, detail="missing filename")
    ct = (file.content_type or "").lower()
    if ct and ("pdf" not in ct):
        # not a hard failure: some clients send a generic content-type
        logger.warning("pdf_ingest: suspicious content-type: %s", ct)
    # size guard (best-effort, we stream to disk)
    doc_id = uuid.uuid4().hex
    pdf_path, meta_path = _pdf_paths(doc_id)
    sha = hashlib.sha256()
    total = 0
    try:
        with open(pdf_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)  # 1 MiB per read
                if not chunk:
                    break
                total += len(chunk)
                if total > PDF_MAX_MB * 1024 * 1024:
                    raise HTTPException(status_code=413, detail=f"PDF too large (> {PDF_MAX_MB} MB)")
                sha.update(chunk)
                f.write(chunk)
    except HTTPException:
        # remove the partial file before propagating the HTTP error
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise
    except Exception as e:
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write pdf: {str(e)}")
    # Open to count pages (also validates that the stored bytes are a real PDF)
    try:
        doc = fitz.open(str(pdf_path))
        n_pages = int(doc.page_count)
        doc.close()
    except Exception as e:
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=400, detail=f"invalid PDF: {str(e)}")
    # sidecar metadata JSON stored next to the PDF
    meta = {
        "doc_id": doc_id,
        "filename": file.filename,
        "sha256": sha.hexdigest(),
        "bytes": total,
        "n_pages": n_pages,
        "created_utc": time.time(),
    }
    try:
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:
        with contextlib.suppress(Exception):
            pdf_path.unlink()
        raise HTTPException(status_code=500, detail=f"failed to write metadata: {str(e)}")
    return {"doc_id": doc_id, "n_pages": n_pages, "sha256": meta["sha256"]}
@app.get("/pdf/{doc_id}/text")
async def pdf_text(doc_id: str, page: int = 1, mode: str = "blocks"):
    """Per-page text extraction. mode is one of: blocks (default), text, dict."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if not 1 <= page <= n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        page_obj = doc.load_page(page - 1)
        mode = (mode or "blocks").lower().strip()
        if mode == "text":
            return {"doc_id": doc_id, "page": page, "text": page_obj.get_text("text") or ""}
        if mode == "dict":
            return {"doc_id": doc_id, "page": page, "dict": page_obj.get_text("dict")}
        # default: structured blocks with bounding boxes plus concatenated text
        parsed = []
        fragments = []
        for raw in (page_obj.get_text("blocks") or []):
            try:
                x0, y0, x1, y1, txt, block_no, block_type = raw[:7]
                txt = (txt or "").strip()
                if txt:
                    fragments.append(txt)
                parsed.append({
                    "bbox": [float(x0), float(y0), float(x1), float(y1)],
                    "text": txt,
                    "block_no": int(block_no),
                    "block_type": int(block_type),
                })
            except Exception:
                # malformed block tuples are skipped rather than failing the request
                continue
        return {"doc_id": doc_id, "page": page, "text": "\n\n".join(fragments), "blocks": parsed}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/render")
async def pdf_render(doc_id: str, page: int = 1, dpi: int = 200):
    """Render one page to PNG; results are cached on disk per doc/page/dpi."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if not 1 <= page <= n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    # clamp DPI to a sane range before it becomes part of the cache key
    dpi = int(max(72, min(600, dpi)))
    cache_path = PDF_STORE_DIR / f"{doc_id}.p{page}.dpi{dpi}.png"
    if cache_path.exists():
        return StreamingResponse(BytesIO(cache_path.read_bytes()), media_type="image/png")
    doc = _pdf_open(doc_id)
    try:
        zoom = dpi / 72.0
        pixmap = doc.load_page(page - 1).get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        png_bytes = pixmap.tobytes("png")
        # cache write is best-effort; a failure only costs a re-render later
        with contextlib.suppress(Exception):
            cache_path.write_bytes(png_bytes)
        return StreamingResponse(BytesIO(png_bytes), media_type="image/png")
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/images")
async def pdf_images(doc_id: str, page: int = 1):
    """List embedded images on a page: xref ids plus basic geometry/format info."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if not 1 <= page <= n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    doc = _pdf_open(doc_id)
    try:
        entries = []
        # tuple shape: (xref, smask, width, height, bpc, colorspace, alt, name, filter)
        for tup in (doc.load_page(page - 1).get_images(full=True) or []):
            try:
                entries.append({
                    "xref": int(tup[0]),
                    "smask": int(tup[1]) if tup[1] is not None else 0,
                    "width": int(tup[2]),
                    "height": int(tup[3]),
                    "bpc": int(tup[4]),
                    "colorspace": str(tup[5]),
                    "name": str(tup[7]) if len(tup) > 7 else "",
                    "filter": str(tup[8]) if len(tup) > 8 else "",
                })
            except Exception:
                # skip malformed tuples instead of failing the whole listing
                continue
        return {"doc_id": doc_id, "page": page, "images": entries}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
@app.get("/pdf/{doc_id}/image/{xref}")
async def pdf_image(doc_id: str, xref: int):
    """Download one embedded image by its xref id."""
    _pdf_require()
    _pdf_load_meta(doc_id)  # 404 early when the document is unknown
    xref = int(xref)
    if xref <= 0:
        raise HTTPException(status_code=400, detail="invalid xref")
    doc = _pdf_open(doc_id)
    try:
        info = doc.extract_image(xref)
        if not info:
            raise HTTPException(status_code=404, detail="image not found")
        payload = info.get("image") or b""
        ext = (info.get("ext") or "bin").lower()
        if ext == "png":
            media = "image/png"
        elif ext in ("jpg", "jpeg"):
            media = "image/jpeg"
        else:
            media = "application/octet-stream"
        return StreamingResponse(BytesIO(payload), media_type=media)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        with contextlib.suppress(Exception):
            doc.close()
def _pdf_simple_chunks(text: str, target_chars: int = 1800, overlap: int = 200) -> list[str]:
text = (text or "").strip()
if not text:
return []
# normalize whitespace
text = re.sub(r"\r\n?", "\n", text)
# Split on paragraph boundaries first
paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
chunks = []
buf = ""
for p in paras:
if not buf:
buf = p
continue
if len(buf) + 2 + len(p) <= target_chars:
buf += "\n\n" + p
else:
chunks.append(buf)
# overlap tail
tail = buf[-overlap:] if overlap > 0 and len(buf) > overlap else ""
buf = (tail + "\n\n" + p).strip()
if buf:
chunks.append(buf)
# hard split very large chunks
out=[]
for c in chunks:
if len(c) <= target_chars * 1.7:
out.append(c)
else:
step = max(400, target_chars - overlap)
for i in range(0, len(c), step):
out.append(c[i:i+target_chars])
return [c for c in out if c.strip()]
async def _ensure_meili_index_generic(uid: str, primary_key: str = "id") -> bool:
    """Create a Meilisearch index if it does not exist yet.

    Returns False when Meili is unconfigured or uid is empty; True otherwise
    (creation errors, e.g. "already exists", are deliberately ignored).
    """
    if not MEILI_URL:
        return False
    index_uid = (uid or "").strip()
    if not index_uid:
        return False
    # create index; a failure here most likely means it already exists
    with contextlib.suppress(Exception):
        await _meili_req("POST", "/indexes", json_body={"uid": index_uid, "primaryKey": primary_key}, timeout=10.0)
    return True
async def _meili_upsert_documents(index_uid: str, docs: list[dict]) -> dict:
    """Upsert docs into a Meili index (creating it first); returns a status dict."""
    if not MEILI_URL:
        return {"status": "skipped", "reason": "MEILI_URL not set"}
    if not docs:
        return {"status": "skipped", "reason": "no docs"}
    await _ensure_meili_index_generic(index_uid, primary_key="id")
    resp = await _meili_req("POST", f"/indexes/{index_uid}/documents", json_body=docs, timeout=30.0)
    try:
        payload = resp.json()
    except Exception:
        # non-JSON response body: return a truncated text snapshot instead
        payload = (resp.text or "")[:4000]
    return {"status": "ok", "http": resp.status_code, "body": payload}
class PdfIndexRequest(BaseModel):
    """Request body for POST /pdf/{doc_id}/index."""
    collection_name: str = "pdf_docs"  # Chroma collection (resolved via _collection_effective)
    meili_index: str = ""  # overrides MEILI_PDF_INDEX when non-empty
    pages: Union[str, List[int]] = "all"  # "all" or list of page numbers (1-based)
    chunk_target_chars: int = 1800  # soft chunk size passed to _pdf_simple_chunks
    chunk_overlap: int = 200  # overlap tail carried between consecutive chunks
    extra_text_by_page: Dict[str, str] = {}  # e.g. {"1": "vision caption ..."}; keys are page numbers as strings
    add_meili: bool = True  # also upsert chunks into Meilisearch
    add_chroma: bool = True  # upsert chunks into Chroma
@app.post("/pdf/{doc_id}/index")
async def pdf_index(doc_id: str, req: PdfIndexRequest = Body(default=PdfIndexRequest())):
    """Chunk per-page text and upsert into Chroma (and optionally Meilisearch).

    Callers may supply extra (vision) text per page via req.extra_text_by_page;
    it is appended to the extracted text under a [VISION] marker before chunking.
    Raises 400 on an out-of-range or malformed pages selection.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # resolve requested pages: "all" or an explicit 1-based list
    pages = req.pages
    if isinstance(pages, str):
        if pages.lower().strip() == "all":
            pages_list = list(range(1, n_pages + 1))
        else:
            raise HTTPException(status_code=400, detail="pages must be 'all' or list[int]")
    else:
        pages_list = [int(p) for p in pages]
    for p in pages_list:
        if p < 1 or p > n_pages:
            raise HTTPException(status_code=400, detail=f"page out of range in pages: {p}")
    collection_name_eff = _collection_effective(req.collection_name or "pdf_docs")
    collection = _get_collection(collection_name_eff) if req.add_chroma else None
    batch_docs: list[str] = []
    batch_meta: list[dict] = []
    batch_ids: list[str] = []
    BATCH_SIZE = 64  # flush to Chroma in batches of this many chunks
    total_chunks = 0
    meili_docs: list[dict] = []
    doc = _pdf_open(doc_id)
    try:
        for pno in pages_list:
            page_obj = doc.load_page(pno - 1)
            base_txt = (page_obj.get_text("text") or "").strip()
            # optional caller-supplied text (e.g. vision captions), keyed by page number as string
            extra = (req.extra_text_by_page or {}).get(str(pno), "").strip()
            merged = base_txt
            if extra:
                merged = (merged + "\n\n[VISION]\n" + extra).strip()
            chunks = _pdf_simple_chunks(merged, target_chars=int(req.chunk_target_chars), overlap=int(req.chunk_overlap))
            for ci, chunk in enumerate(chunks):
                # deterministic id: pdf|<doc>|p<page>|c<chunk>|<16-char sha1 of text>
                h = hashlib.sha1(chunk.encode("utf-8", errors="ignore")).hexdigest()[:16]
                _id = f"pdf|{doc_id}|p{pno}|c{ci}|{h}"
                md = {
                    "source": "pdf",
                    "doc_id": doc_id,
                    "page": int(pno),
                    "filename": meta.get("filename",""),
                    "sha256": meta.get("sha256",""),
                }
                if req.add_chroma and collection is not None:
                    batch_docs.append(chunk)
                    batch_meta.append(md)
                    batch_ids.append(_id)
                    if len(batch_docs) >= BATCH_SIZE:
                        _collection_add(collection, batch_docs, batch_meta, batch_ids)
                        total_chunks += len(batch_docs)
                        batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
                if req.add_meili and (req.meili_index or MEILI_PDF_INDEX):
                    meili_docs.append({
                        "id": _id,
                        "doc_id": doc_id,
                        "page": int(pno),
                        "filename": meta.get("filename",""),
                        "text": chunk,
                    })
        # flush chroma
        if req.add_chroma and collection is not None and batch_docs:
            _collection_add(collection, batch_docs, batch_meta, batch_ids)
            total_chunks += len(batch_docs)
            batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
    finally:
        with contextlib.suppress(Exception):
            doc.close()
    # Meili upsert happens once, after the PDF handle has been released
    meili_out = {"status": "skipped"}
    if req.add_meili and (req.meili_index or MEILI_PDF_INDEX) and meili_docs:
        meili_out = await _meili_upsert_documents((req.meili_index or MEILI_PDF_INDEX), meili_docs)
    return {
        "status": "ok",
        "doc_id": doc_id,
        "pages_indexed": len(pages_list),
        "chunks_added_chroma": int(total_chunks),
        "meili": meili_out,
        "collection_effective": collection_name_eff,
    }
# Registreer injecties
initialize_agent(
    app=app,