Fix PP-OCRv6 error + MarkItDown LLM fallback

Docling: pass PdfPipelineOptions (TesseractCLI) to ImageFormatOption to prevent RapidOCR/PP-OCRv6 from being loaded for image files.
MarkItDown: fall back automatically to plain conversion when Ollama returns 500 (OOM/crash) instead of propagating the error to the user.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-25 07:53:22 +00:00 · 2026-06-25 07:53:22 +00:00 · 22cc0d0857
parent 94cbabe6d7
commit 22cc0d0857
2 changed files with 19 additions and 18 deletions
--- a/docling-service/app/services/DoclingService.py
+++ b/docling-service/app/services/DoclingService.py
@@ -6,7 +6,7 @@ import logging
 from fastapi import UploadFile, HTTPException
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import select
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
 from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
 from docling.datamodel.base_models import InputFormat
 from app.models.ConvertModel import Conversion
@@ -26,23 +26,14 @@ def _build_converter() -> DocumentConverter:
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as e:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
-        ocr_opts = None
        pdf_opts = PdfPipelineOptions(do_ocr=False)

-    fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
-
-    # Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback)
-    if ocr_opts is not None:
-        try:
-            from docling.document_converter import ImageFormatOption
-            from docling.datamodel.pipeline_options import ImagePipelineOptions
-            img_opts = ImagePipelineOptions(do_ocr=True, ocr_options=ocr_opts)
-            for fmt in (InputFormat.IMAGE, InputFormat.PNG, InputFormat.JPEG, InputFormat.TIFF, InputFormat.BMP):
-                fmt_options[fmt] = ImageFormatOption(pipeline_options=img_opts)
-        except Exception:
-            pass  # older docling without ImageFormatOption — PDF-only override is sufficient
-
-    return DocumentConverter(format_options=fmt_options)
+    # ImageFormatOption also uses StandardPdfPipeline — pass same pdf_opts
+    # to prevent docling from falling back to RapidOCR / PP-OCRv6
+    return DocumentConverter(format_options={
+        InputFormat.PDF:   PdfFormatOption(pipeline_options=pdf_opts),
+        InputFormat.IMAGE: ImageFormatOption(pipeline_options=pdf_opts),
+    })

 converter = _build_converter()

--- a/markitdown-service/app/services/MarkitdownService.py
+++ b/markitdown-service/app/services/MarkitdownService.py
@@ -128,12 +128,22 @@ async def convert_file(
        converter = md_plain

    try:
-        result = converter.convert(tmp_path)
+        try:
+            result = converter.convert(tmp_path)
+            actual_llm = use_llm_now
+        except Exception as llm_err:
+            # Ollama OOM / 500 — fallback to plain conversion without LLM
+            if use_llm_now and ("500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__):
+                logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
+                result = md_plain.convert(tmp_path)
+                actual_llm = False
+            else:
+                raise
        record = Conversion(
            filename=file.filename,
            file_type=file_type,
            markdown=result.text_content,
-            llm_enabled=use_llm_now,
+            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()