AI-markdown/docling-service/app/services/DoclingService.py

202 lines
7.1 KiB
Python

import os
import re
import json
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
# Settings read from the process environment once, when the module is imported.
# Base URL handed to the LLM client; an unset or empty variable becomes None.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL") or None
# Model name sent with each chat-completion request; "llava" when unset.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llava")
# Value of CLEANUP_MODEL, or an empty string when the variable is unset.
CLEANUP_MODEL = os.environ.get("CLEANUP_MODEL", "")
def _build_converter() -> DocumentConverter:
    """Create the Docling converter used for PDF and image inputs.

    The pipeline is first configured with OCR through the Tesseract CLI. If
    building those options raises, the error is logged as a warning and a
    pipeline with OCR switched off is used instead.

    Returns:
        A ``DocumentConverter`` whose PDF and IMAGE formats share the same
        pipeline options.
    """
    try:
        pipeline_options = PdfPipelineOptions(
            do_ocr=True,
            ocr_options=TesseractCliOcrOptions(),
        )
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as exc:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", exc)
        pipeline_options = PdfPipelineOptions(do_ocr=False)

    # ImageFormatOption also uses StandardPdfPipeline — hand it the same
    # options so docling does not fall back to RapidOCR / PP-OCRv6.
    per_format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options),
    }
    return DocumentConverter(format_options=per_format_options)
# Module-wide converter instance, built once when this module is imported.
converter = _build_converter()
# LLM client handle and status flag. Both start out "disabled" and are
# overwritten by _init_llm().
_llm_client = None
LLM_ACTIVE = False
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client.

    Updates the globals ``OLLAMA_BASE_URL``, ``OLLAMA_MODEL``, ``LLM_ACTIVE``
    and ``_llm_client``.

    Args:
        base_url: Base URL passed to the OpenAI client. A falsy value disables
            the LLM and clears the stored client.
        model: Model name to store in ``OLLAMA_MODEL``.

    Returns:
        True when a client was created and stored, False when the LLM is
        disabled or client construction failed.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client
    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client = None, model, False, None
        return False
    try:
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
    except Exception as e:
        logger.warning("Docling: LLM init failed (%s)", e)
        # Drop any client left over from an earlier successful call. Other
        # code in this module checks `_llm_client` rather than LLM_ACTIVE, so
        # a stale client would keep being used while reported as inactive.
        _llm_client = None
        LLM_ACTIVE = False
        return False
    OLLAMA_BASE_URL = base_url
    OLLAMA_MODEL = model
    _llm_client = client
    LLM_ACTIVE = True
    logger.info("Docling: LLM enabled via %s (model=%s)", base_url, model)
    return True
# Initial LLM setup from the environment-derived settings; runs at import time.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
# System prompt used by _llm_enrich() when the caller does not supply one.
DEFAULT_ENRICH_PROMPT = (
"You are a document cleaning assistant. "
"Fix OCR errors, normalise whitespace, and improve the Markdown structure. "
"Return ONLY the raw Markdown text — no code fences, no commentary, no explanation."
)
def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
    """Run extracted Markdown through the LLM for cleanup.

    Args:
        markdown: Text to clean up.
        system_prompt: Replacement system prompt; ``DEFAULT_ENRICH_PROMPT`` is
            used when this is falsy.

    Returns:
        The cleaned text. The input is returned unchanged when no client is
        configured, the input is blank, the model's answer is empty after
        stripping, or the request raises (the error is logged as a warning).
    """
    if not _llm_client:
        return markdown
    if not markdown.strip():
        return markdown

    conversation = [
        {"role": "system", "content": system_prompt or DEFAULT_ENRICH_PROMPT},
        {"role": "user", "content": markdown},
    ]
    try:
        completion = _llm_client.chat.completions.create(
            model=OLLAMA_MODEL,
            messages=conversation,
            temperature=0,
        )
        cleaned = (completion.choices[0].message.content or markdown).strip()
        # llava tends to wrap its answer in code fences regardless of the
        # instructions, so peel off a leading and a trailing fence.
        cleaned = re.sub(r"^```(?:markdown)?\s*\n?", "", cleaned)
        cleaned = re.sub(r"\n?```\s*$", "", cleaned.strip()).strip()
        return cleaned if cleaned else markdown
    except Exception as exc:
        logger.warning("Docling: LLM enrichment failed (%s) — returning raw output", exc)
        return markdown
# -----------------------------------------------------------------
# Upload extensions accepted by convert_file(), lower-case and without the dot.
ALLOWED_EXTENSIONS = (
    {"pdf", "docx", "xlsx", "pptx"}  # office / paged documents
    | {"html", "htm"}  # web pages
    | {"jpg", "jpeg", "png", "tiff", "tif", "bmp"}  # raster images
    | {"md", "txt", "asciidoc", "adoc"}  # plain-text markup
)
# Values accepted for the `output_format` argument of convert_file().
OUTPUT_FORMATS = {
    "markdown",
    "json",
    "html",
    "text",
}
def _allowed_file(filename: str) -> bool:
    """Return True when the text after the last dot is an allowed extension.

    The comparison is case-insensitive; a name without any dot is rejected.
    """
    if "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file with Docling and store the result.

    Args:
        file: The uploaded file. Its name must end in one of
            ``ALLOWED_EXTENSIONS``.
        db: Session used to persist the ``Conversion`` row.
        output_format: One of ``OUTPUT_FORMATS``.
        use_llm: When true and an LLM client is configured, markdown/text
            output is passed through ``_llm_enrich``.
        llm_prompt: Optional system prompt override for the enrichment step.

    Returns:
        The stored ``Conversion`` record, refreshed after commit.

    Raises:
        HTTPException: 422 for a missing/disallowed file name or an
            unsupported output format; 500 when reading the upload,
            converting, or persisting fails.

    NOTE(review): ``converter.convert`` and ``_llm_enrich`` are called without
    ``await`` here, i.e. synchronously inside this coroutine — confirm that is
    acceptable for the expected document sizes.
    """
    # An upload may arrive without a file name; treat that as "not allowed"
    # (422) instead of letting the extension check raise a TypeError.
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(
            status_code=422,
            detail=f"Output format not supported. Supported: {', '.join(sorted(OUTPUT_FORMATS))}"
        )

    # Take the extension the same way _allowed_file() does, so the temp-file
    # suffix and the stored file type always match what was validated (a name
    # such as ".pdf" passes validation but has no os.path.splitext suffix).
    extension = filename.rsplit(".", 1)[1]
    suffix = f".{extension}"
    file_type = extension.lower()

    tmp_path = None
    try:
        # Created inside the try block so the file is removed even when
        # reading the upload fails (delete=False leaves cleanup to us).
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        result = converter.convert(tmp_path)
        doc = result.document
        pages = getattr(doc, "pages", None)
        page_count = len(pages) if pages else None

        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        else:  # "text": markdown with heading and emphasis markers removed
            content = doc.export_to_markdown()
            # Only strip '#' runs that open a heading line; '#' characters
            # inside running text (e.g. "C#", "#42") are left alone.
            content = re.sub(r"^#{1,6}[ \t]+", "", content, flags=re.MULTILINE)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)

        # LLM enrichment — only for markdown / text output, and only if requested
        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        record = Conversion(
            filename=filename,
            file_type=file_type,
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)
async def get_conversion(conversion_id: int, db: AsyncSession) -> Conversion:
    """Fetch a single stored conversion by its id.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    query = select(Conversion).where(Conversion.id == conversion_id)
    found = (await db.execute(query)).scalar_one_or_none()
    if found:
        return found
    raise HTTPException(status_code=404, detail="Conversion not found")
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to `limit` conversions, ordered by `created_at` descending."""
    newest_first = (
        select(Conversion)
        .order_by(Conversion.created_at.desc())
        .limit(limit)
    )
    rows = await db.execute(newest_first)
    return rows.scalars().all()
async def delete_conversion(conversion_id: int, db: AsyncSession) -> dict:
    """Delete the conversion with the given id and commit.

    Returns:
        A dict with a single "message" key confirming the deletion.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    lookup = select(Conversion).where(Conversion.id == conversion_id)
    target = (await db.execute(lookup)).scalar_one_or_none()
    if not target:
        raise HTTPException(status_code=404, detail="Conversion not found")

    await db.delete(target)
    await db.commit()
    confirmation = f"Conversion {conversion_id} deleted"
    return {"message": confirmation}