added PDF stuff
This commit is contained in:
parent
cbfddf128e
commit
3d620d13b9
35
README_PDF.md
Normal file
35
README_PDF.md
Normal file
@ -0,0 +1,35 @@
|
||||
# PDF endpoints in toolserver
|
||||
|
||||
This build adds PDF ingestion + page text + page rendering (+ optional embedded image extraction) and indexing into Chroma / Meilisearch.
|
||||
|
||||
## Enable / config (env)
|
||||
|
||||
- ENABLE_PDF=1
|
||||
- PDF_STORE_DIR=/data/pdf_store
|
||||
- PDF_MAX_MB=80
|
||||
- MEILI_PDF_INDEX=pdf_docs (optional; used by /pdf/{doc_id}/index when add_meili=true)
|
||||
|
||||
PyMuPDF is required:
|
||||
- pip install pymupdf
|
||||
- import name: `fitz`
|
||||
|
||||
## Endpoints
|
||||
|
||||
- POST /pdf/ingest (multipart/form-data, field: file)
|
||||
-> {doc_id, n_pages, sha256}
|
||||
|
||||
- GET /pdf/{doc_id}/text?page=1&mode=blocks|text|dict
|
||||
-> blocks includes bbox + text
|
||||
|
||||
- GET /pdf/{doc_id}/render?page=1&dpi=200
|
||||
-> image/png (cached on disk)
|
||||
|
||||
- GET /pdf/{doc_id}/images?page=1
|
||||
-> list of embedded images (xref ids)
|
||||
|
||||
- GET /pdf/{doc_id}/image/{xref}
|
||||
-> download embedded image
|
||||
|
||||
- POST /pdf/{doc_id}/index
|
||||
body: PdfIndexRequest
|
||||
-> chunks to Chroma + optional Meili, supports extra_text_by_page for vision captions.
|
||||
414
app.py
414
app.py
@ -24,6 +24,12 @@ import chromadb
|
||||
import git
|
||||
import base64
|
||||
|
||||
# --- Optional PDF backend (PyMuPDF) ---
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except Exception:
|
||||
fitz = None
|
||||
|
||||
from fastapi import FastAPI, APIRouter, UploadFile, File, Form, Request, HTTPException, Body
|
||||
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
@ -5464,6 +5470,414 @@ async def _get_git_repo_async(repo_url: str, branch: str = "main") -> str:
|
||||
# gitpython doet subprocess/IO → altijd in threadpool
|
||||
return await run_in_threadpool(get_git_repo, repo_url, branch)
|
||||
|
||||
|
||||
# ============================
|
||||
# PDF endpoints (toolserver)
|
||||
# ============================
|
||||
# Feature flag: any value other than "0"/"false"/"no" (case-insensitive) enables the PDF endpoints.
PDF_ENABLED = (os.getenv("ENABLE_PDF", "1").lower() not in ("0", "false", "no"))
# On-disk store for uploaded PDFs, their JSON metadata, and cached page renders.
PDF_STORE_DIR = Path(os.getenv("PDF_STORE_DIR", "/data/pdf_store")).resolve()
# Upload size cap in megabytes; enforced while streaming in pdf_ingest.
PDF_MAX_MB = int(os.getenv("PDF_MAX_MB", "80"))
# Default Meilisearch index uid used by /pdf/{doc_id}/index when none is given in the request.
MEILI_PDF_INDEX = os.getenv("MEILI_PDF_INDEX", "pdf_docs")
|
||||
|
||||
def _pdf_require():
    """Abort the request unless the PDF feature is usable.

    Raises:
        HTTPException: 404 when ENABLE_PDF turned the feature off,
            500 when the PyMuPDF (`fitz`) import failed at startup.
    """
    checks = (
        (not PDF_ENABLED, 404, "PDF endpoints disabled (ENABLE_PDF=0)"),
        (fitz is None, 500, "PyMuPDF (fitz) not installed in this container"),
    )
    for failed, status, message in checks:
        if failed:
            raise HTTPException(status_code=status, detail=message)
|
||||
|
||||
def _pdf_safe_doc_id(doc_id: str) -> str:
|
||||
doc_id = (doc_id or "").strip()
|
||||
if not re.fullmatch(r"[a-f0-9]{32}", doc_id):
|
||||
raise HTTPException(status_code=400, detail="Invalid doc_id")
|
||||
return doc_id
|
||||
|
||||
def _pdf_paths(doc_id: str) -> tuple[Path, Path]:
    """Map a doc_id onto its (pdf, metadata) file paths inside PDF_STORE_DIR.

    The id is validated (and normalized) via _pdf_safe_doc_id first.
    """
    safe_id = _pdf_safe_doc_id(doc_id)
    return (
        PDF_STORE_DIR / f"{safe_id}.pdf",
        PDF_STORE_DIR / f"{safe_id}.json",
    )
|
||||
|
||||
def _pdf_load_meta(doc_id: str) -> dict:
    """Load the stored metadata JSON for *doc_id*.

    Returns:
        The parsed metadata dict written by pdf_ingest
        (doc_id, filename, sha256, bytes, n_pages, created_utc).

    Raises:
        HTTPException: 404 when no metadata file exists for the id,
            500 when the file exists but cannot be parsed.
    """
    # The pdf path is not needed here; _pdf_paths also validates the id format.
    _, meta_path = _pdf_paths(doc_id)
    if not meta_path.exists():
        raise HTTPException(status_code=404, detail="doc_id not found")
    try:
        return json.loads(meta_path.read_text(encoding="utf-8"))
    except Exception as e:
        # Chain the original cause so server logs show *why* the JSON was bad.
        raise HTTPException(status_code=500, detail="corrupt metadata") from e
|
||||
|
||||
def _pdf_open(doc_id: str):
|
||||
_pdf_require()
|
||||
pdf_path, _ = _pdf_paths(doc_id)
|
||||
if not pdf_path.exists():
|
||||
raise HTTPException(status_code=404, detail="doc_id not found")
|
||||
try:
|
||||
return fitz.open(str(pdf_path))
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"failed to open pdf: {str(e)}")
|
||||
|
||||
def _ensure_pdf_store_dir():
    """Best-effort creation of PDF_STORE_DIR.

    Failure is deliberately ignored here; if the directory really cannot be
    created, pdf_ingest will fail loudly at upload time anyway.
    """
    with contextlib.suppress(Exception):
        PDF_STORE_DIR.mkdir(parents=True, exist_ok=True)


# Create the store directory once at import time.
_ensure_pdf_store_dir()
|
||||
|
||||
@app.post("/pdf/ingest")
async def pdf_ingest(file: UploadFile = File(...)):
    """Upload a PDF: stream it to disk, validate it, and return its identity.

    Returns:
        {"doc_id": 32-char hex id, "n_pages": int, "sha256": upload digest}

    Raises:
        HTTPException: 400 (missing filename / not a parseable PDF),
            413 (larger than PDF_MAX_MB), 500 (disk write failures).
    """
    _pdf_require()
    if not file.filename:
        raise HTTPException(status_code=400, detail="missing filename")
    ct = (file.content_type or "").lower()
    if ct and ("pdf" not in ct):
        # Do not hard-fail: some clients send a generic content-type.
        logger.warning("pdf_ingest: suspicious content-type: %s", ct)

    doc_id = uuid.uuid4().hex
    pdf_path, meta_path = _pdf_paths(doc_id)

    def _discard_partial():
        # Best-effort removal of a half-written / rejected upload.
        with contextlib.suppress(Exception):
            if pdf_path.exists():
                pdf_path.unlink()

    # Stream to disk in 1 MiB chunks, hashing and size-guarding as we go.
    sha = hashlib.sha256()
    total = 0
    try:
        with open(pdf_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                total += len(chunk)
                if total > PDF_MAX_MB * 1024 * 1024:
                    raise HTTPException(status_code=413, detail=f"PDF too large (> {PDF_MAX_MB} MB)")
                sha.update(chunk)
                f.write(chunk)
    except HTTPException:
        _discard_partial()
        raise
    except Exception as e:
        _discard_partial()
        raise HTTPException(status_code=500, detail=f"failed to write pdf: {str(e)}")

    # Open once to validate the file and count pages.
    # BUGFIX: close the fitz document even if page_count raises (was leaked before).
    try:
        doc = fitz.open(str(pdf_path))
        try:
            n_pages = int(doc.page_count)
        finally:
            with contextlib.suppress(Exception):
                doc.close()
    except Exception as e:
        _discard_partial()
        raise HTTPException(status_code=400, detail=f"invalid PDF: {str(e)}")

    meta = {
        "doc_id": doc_id,
        "filename": file.filename,
        "sha256": sha.hexdigest(),
        "bytes": total,
        "n_pages": n_pages,
        "created_utc": time.time(),
    }
    try:
        meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:
        _discard_partial()
        raise HTTPException(status_code=500, detail=f"failed to write metadata: {str(e)}")

    return {"doc_id": doc_id, "n_pages": n_pages, "sha256": meta["sha256"]}
|
||||
|
||||
@app.get("/pdf/{doc_id}/text")
async def pdf_text(doc_id: str, page: int = 1, mode: str = "blocks"):
    """Return text for one page.

    mode=text   -> plain page text
    mode=dict   -> PyMuPDF "dict" structure
    mode=blocks -> per-block bbox + text plus the joined page text
                   (default; unknown modes also fall through to this)
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")

    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)  # PyMuPDF pages are 0-based
        mode = (mode or "blocks").lower().strip()
        if mode == "text":
            txt = p.get_text("text") or ""
            return {"doc_id": doc_id, "page": page, "text": txt}
        if mode == "dict":
            d = p.get_text("dict")
            return {"doc_id": doc_id, "page": page, "dict": d}
        # blocks
        blocks_raw = p.get_text("blocks") or []
        blocks = []
        texts = []
        for b in blocks_raw:
            try:
                # block tuple: (x0, y0, x1, y1, text, block_no, block_type)
                x0, y0, x1, y1, t, block_no, block_type = b[:7]
                t = (t or "").strip()
                if t:
                    texts.append(t)
                    blocks.append({
                        "bbox": [float(x0), float(y0), float(x1), float(y1)],
                        "text": t,
                        "block_no": int(block_no),
                        "block_type": int(block_type),
                    })
            except Exception:
                # Skip malformed block tuples rather than failing the whole page.
                continue
        return {"doc_id": doc_id, "page": page, "text": "\n\n".join(texts), "blocks": blocks}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
|
||||
|
||||
@app.get("/pdf/{doc_id}/render")
async def pdf_render(doc_id: str, page: int = 1, dpi: int = 200):
    """Render one page to PNG; results are cached on disk per (doc, page, dpi)."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")
    # Clamp to a sane raster range before it becomes part of the cache key.
    dpi = int(max(72, min(600, dpi)))
    cache_path = PDF_STORE_DIR / f"{doc_id}.p{page}.dpi{dpi}.png"
    if cache_path.exists():
        data = cache_path.read_bytes()
        return StreamingResponse(BytesIO(data), media_type="image/png")

    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)  # 0-based page index
        # PyMuPDF scales from the PDF's native 72 dpi via a transform matrix.
        m = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        pix = p.get_pixmap(matrix=m, alpha=False)
        data = pix.tobytes("png")
        # Cache write is best-effort; serving the render matters more.
        with contextlib.suppress(Exception):
            cache_path.write_bytes(data)
        return StreamingResponse(BytesIO(data), media_type="image/png")
    finally:
        with contextlib.suppress(Exception):
            doc.close()
|
||||
|
||||
@app.get("/pdf/{doc_id}/images")
async def pdf_images(doc_id: str, page: int = 1):
    """List embedded images on a page; xref ids feed /pdf/{doc_id}/image/{xref}."""
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    if page < 1 or page > n_pages:
        raise HTTPException(status_code=400, detail=f"page out of range (1..{n_pages})")

    doc = _pdf_open(doc_id)
    try:
        p = doc.load_page(page - 1)  # 0-based page index
        imgs = []
        for it in (p.get_images(full=True) or []):
            # tuple shape: (xref, smask, width, height, bpc, colorspace, alt, name, filter)
            try:
                xref = int(it[0])
                imgs.append({
                    "xref": xref,
                    "smask": int(it[1]) if it[1] is not None else 0,
                    "width": int(it[2]),
                    "height": int(it[3]),
                    "bpc": int(it[4]),
                    "colorspace": str(it[5]),
                    "name": str(it[7]) if len(it) > 7 else "",
                    "filter": str(it[8]) if len(it) > 8 else "",
                })
            except Exception:
                # Skip malformed tuples instead of failing the whole listing.
                continue
        return {"doc_id": doc_id, "page": page, "images": imgs}
    finally:
        with contextlib.suppress(Exception):
            doc.close()
|
||||
|
||||
@app.get("/pdf/{doc_id}/image/{xref}")
async def pdf_image(doc_id: str, xref: int):
    """Download one embedded image by its PDF xref id."""
    _pdf_require()
    _pdf_load_meta(doc_id)  # validate existence
    xref = int(xref)
    if xref <= 0:
        raise HTTPException(status_code=400, detail="invalid xref")
    doc = _pdf_open(doc_id)
    try:
        info = doc.extract_image(xref)
        if not info:
            raise HTTPException(status_code=404, detail="image not found")
        img_bytes = info.get("image") or b""
        ext = (info.get("ext") or "bin").lower()
        # Only png/jpeg get a specific media type; anything else is served as opaque bytes.
        mt = "image/png" if ext == "png" else ("image/jpeg" if ext in ("jpg","jpeg") else "application/octet-stream")
        return StreamingResponse(BytesIO(img_bytes), media_type=mt)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        with contextlib.suppress(Exception):
            doc.close()
|
||||
|
||||
def _pdf_simple_chunks(text: str, target_chars: int = 1800, overlap: int = 200) -> list[str]:
|
||||
text = (text or "").strip()
|
||||
if not text:
|
||||
return []
|
||||
# normalize whitespace
|
||||
text = re.sub(r"\r\n?", "\n", text)
|
||||
# Split on paragraph boundaries first
|
||||
paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
|
||||
chunks = []
|
||||
buf = ""
|
||||
for p in paras:
|
||||
if not buf:
|
||||
buf = p
|
||||
continue
|
||||
if len(buf) + 2 + len(p) <= target_chars:
|
||||
buf += "\n\n" + p
|
||||
else:
|
||||
chunks.append(buf)
|
||||
# overlap tail
|
||||
tail = buf[-overlap:] if overlap > 0 and len(buf) > overlap else ""
|
||||
buf = (tail + "\n\n" + p).strip()
|
||||
if buf:
|
||||
chunks.append(buf)
|
||||
# hard split very large chunks
|
||||
out=[]
|
||||
for c in chunks:
|
||||
if len(c) <= target_chars * 1.7:
|
||||
out.append(c)
|
||||
else:
|
||||
step = max(400, target_chars - overlap)
|
||||
for i in range(0, len(c), step):
|
||||
out.append(c[i:i+target_chars])
|
||||
return [c for c in out if c.strip()]
|
||||
|
||||
async def _ensure_meili_index_generic(uid: str, primary_key: str = "id") -> bool:
    """Best-effort creation of a Meilisearch index.

    Returns:
        False when Meili is unconfigured or *uid* is blank; True otherwise,
        even when the create call fails (e.g. the index already exists).
    """
    cleaned_uid = (uid or "").strip()
    if not MEILI_URL or not cleaned_uid:
        return False
    with contextlib.suppress(Exception):
        # Meili rejects creating an existing index; that is fine here.
        await _meili_req("POST", "/indexes", json_body={"uid": cleaned_uid, "primaryKey": primary_key}, timeout=10.0)
    return True
|
||||
|
||||
async def _meili_upsert_documents(index_uid: str, docs: list[dict]) -> dict:
    """Upsert *docs* into Meilisearch index *index_uid*, creating it if needed.

    Returns:
        {"status": "skipped", "reason": ...} when Meili is unconfigured or
        *docs* is empty; otherwise {"status": "ok", "http": ..., "body": ...}
        where body is the JSON response, or truncated text if not JSON.
    """
    if not MEILI_URL:
        return {"status": "skipped", "reason": "MEILI_URL not set"}
    if not docs:
        return {"status": "skipped", "reason": "no docs"}
    await _ensure_meili_index_generic(index_uid, primary_key="id")
    r = await _meili_req("POST", f"/indexes/{index_uid}/documents", json_body=docs, timeout=30.0)
    result = {"status": "ok", "http": r.status_code}
    try:
        result["body"] = r.json()
    except Exception:
        # Non-JSON response: keep a truncated text body for diagnostics.
        result["body"] = (r.text or "")[:4000]
    return result
|
||||
|
||||
class PdfIndexRequest(BaseModel):
    """Request body for POST /pdf/{doc_id}/index."""
    # Chroma collection to upsert chunks into (resolved via _collection_effective).
    collection_name: str = "pdf_docs"
    # Meilisearch index uid; empty string falls back to MEILI_PDF_INDEX.
    meili_index: str = ""
    pages: Union[str, List[int]] = "all"  # "all" or list of page numbers (1-based)
    # Chunking knobs, forwarded to _pdf_simple_chunks.
    chunk_target_chars: int = 1800
    chunk_overlap: int = 200
    # NOTE(review): mutable defaults are safe on pydantic models (defaults are
    # copied per instance) but would be a shared-state bug on a plain class.
    extra_text_by_page: Dict[str, str] = {}  # e.g. {"1": "vision caption ..."}
    add_meili: bool = True
    add_chroma: bool = True
|
||||
|
||||
@app.post("/pdf/{doc_id}/index")
async def pdf_index(doc_id: str, req: PdfIndexRequest = Body(default=PdfIndexRequest())):
    """Chunk page text and upsert it to Chroma (and optionally Meilisearch).

    Extra text per page (e.g. vision captions) can be supplied via
    req.extra_text_by_page; it is appended under a [VISION] marker before
    chunking so it is indexed alongside the extracted text.
    """
    _pdf_require()
    meta = _pdf_load_meta(doc_id)
    n_pages = int(meta.get("n_pages") or 0)
    # Resolve the page selection: "all" or an explicit 1-based list.
    pages = req.pages
    if isinstance(pages, str):
        if pages.lower().strip() == "all":
            pages_list = list(range(1, n_pages + 1))
        else:
            raise HTTPException(status_code=400, detail="pages must be 'all' or list[int]")
    else:
        pages_list = [int(p) for p in pages]
        for p in pages_list:
            if p < 1 or p > n_pages:
                raise HTTPException(status_code=400, detail=f"page out of range in pages: {p}")

    collection_name_eff = _collection_effective(req.collection_name or "pdf_docs")
    collection = _get_collection(collection_name_eff) if req.add_chroma else None

    # Chroma upserts are batched; Meili docs are collected and sent once at the end.
    batch_docs: list[str] = []
    batch_meta: list[dict] = []
    batch_ids: list[str] = []
    BATCH_SIZE = 64
    total_chunks = 0

    meili_docs: list[dict] = []

    doc = _pdf_open(doc_id)
    try:
        for pno in pages_list:
            page_obj = doc.load_page(pno - 1)  # 0-based page index
            base_txt = (page_obj.get_text("text") or "").strip()
            extra = (req.extra_text_by_page or {}).get(str(pno), "").strip()
            merged = base_txt
            if extra:
                merged = (merged + "\n\n[VISION]\n" + extra).strip()
            chunks = _pdf_simple_chunks(merged, target_chars=int(req.chunk_target_chars), overlap=int(req.chunk_overlap))
            for ci, chunk in enumerate(chunks):
                # Deterministic id: doc/page/chunk position plus a short content hash.
                h = hashlib.sha1(chunk.encode("utf-8", errors="ignore")).hexdigest()[:16]
                _id = f"pdf|{doc_id}|p{pno}|c{ci}|{h}"
                md = {
                    "source": "pdf",
                    "doc_id": doc_id,
                    "page": int(pno),
                    "filename": meta.get("filename",""),
                    "sha256": meta.get("sha256",""),
                }
                if req.add_chroma and collection is not None:
                    batch_docs.append(chunk)
                    batch_meta.append(md)
                    batch_ids.append(_id)
                    if len(batch_docs) >= BATCH_SIZE:
                        _collection_add(collection, batch_docs, batch_meta, batch_ids)
                        total_chunks += len(batch_docs)
                        batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
                if req.add_meili and (req.meili_index or MEILI_PDF_INDEX):
                    meili_docs.append({
                        "id": _id,
                        "doc_id": doc_id,
                        "page": int(pno),
                        "filename": meta.get("filename",""),
                        "text": chunk,
                    })
        # flush chroma
        if req.add_chroma and collection is not None and batch_docs:
            _collection_add(collection, batch_docs, batch_meta, batch_ids)
            total_chunks += len(batch_docs)
            batch_docs.clear(); batch_meta.clear(); batch_ids.clear()
    finally:
        with contextlib.suppress(Exception):
            doc.close()

    meili_out = {"status": "skipped"}
    if req.add_meili and (req.meili_index or MEILI_PDF_INDEX) and meili_docs:
        meili_out = await _meili_upsert_documents((req.meili_index or MEILI_PDF_INDEX), meili_docs)

    return {
        "status": "ok",
        "doc_id": doc_id,
        "pages_indexed": len(pages_list),
        "chunks_added_chroma": int(total_chunks),
        "meili": meili_out,
        "collection_effective": collection_name_eff,
    }
|
||||
|
||||
|
||||
|
||||
|
||||
# Registreer injecties
|
||||
initialize_agent(
|
||||
app=app,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user