Fix PP-OCRv6 error + MarkItDown LLM fallback

Docling: pass the same PdfPipelineOptions (Tesseract CLI) to ImageFormatOption
to prevent RapidOCR/PP-OCRv6 from being loaded for image files

MarkItDown: automatically fall back to plain (non-LLM) conversion when Ollama
returns HTTP 500 (OOM/crash) instead of propagating the error to the user

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Kai Ton 2026-06-25 07:53:22 +00:00
parent 94cbabe6d7
commit 22cc0d0857
2 changed files with 19 additions and 18 deletions

View File

@ -6,7 +6,7 @@ import logging
from fastapi import UploadFile, HTTPException from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select from sqlalchemy import select
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from app.models.ConvertModel import Conversion from app.models.ConvertModel import Conversion
@ -26,23 +26,14 @@ def _build_converter() -> DocumentConverter:
logger.info("Docling: OCR enabled via Tesseract CLI") logger.info("Docling: OCR enabled via Tesseract CLI")
except Exception as e: except Exception as e:
logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e) logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
ocr_opts = None
pdf_opts = PdfPipelineOptions(do_ocr=False) pdf_opts = PdfPipelineOptions(do_ocr=False)
fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)} # ImageFormatOption also uses StandardPdfPipeline — pass same pdf_opts
# to prevent docling from falling back to RapidOCR / PP-OCRv6
# Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback) return DocumentConverter(format_options={
if ocr_opts is not None: InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
try: InputFormat.IMAGE: ImageFormatOption(pipeline_options=pdf_opts),
from docling.document_converter import ImageFormatOption })
from docling.datamodel.pipeline_options import ImagePipelineOptions
img_opts = ImagePipelineOptions(do_ocr=True, ocr_options=ocr_opts)
for fmt in (InputFormat.IMAGE, InputFormat.PNG, InputFormat.JPEG, InputFormat.TIFF, InputFormat.BMP):
fmt_options[fmt] = ImageFormatOption(pipeline_options=img_opts)
except Exception:
pass # older docling without ImageFormatOption — PDF-only override is sufficient
return DocumentConverter(format_options=fmt_options)
converter = _build_converter() converter = _build_converter()

View File

@ -127,13 +127,23 @@ async def convert_file(
else: else:
converter = md_plain converter = md_plain
try:
try: try:
result = converter.convert(tmp_path) result = converter.convert(tmp_path)
actual_llm = use_llm_now
except Exception as llm_err:
# Ollama OOM / 500 — fallback to plain conversion without LLM
if use_llm_now and ("500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__):
logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
result = md_plain.convert(tmp_path)
actual_llm = False
else:
raise
record = Conversion( record = Conversion(
filename=file.filename, filename=file.filename,
file_type=file_type, file_type=file_type,
markdown=result.text_content, markdown=result.text_content,
llm_enabled=use_llm_now, llm_enabled=actual_llm,
) )
db.add(record) db.add(record)
await db.commit() await db.commit()