Fix PP-OCRv6 error + MarkItDown LLM fallback
Docling: pass PdfPipelineOptions (Tesseract CLI) to ImageFormatOption to prevent RapidOCR/PP-OCRv6 from being loaded for image files. MarkItDown: automatically fall back to plain conversion when Ollama returns a 500 error (OOM/crash) instead of propagating the error to the user. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
94cbabe6d7
commit
22cc0d0857
|
|
@ -6,7 +6,7 @@ import logging
|
||||||
from fastapi import UploadFile, HTTPException
|
from fastapi import UploadFile, HTTPException
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from app.models.ConvertModel import Conversion
|
from app.models.ConvertModel import Conversion
|
||||||
|
|
@ -26,23 +26,14 @@ def _build_converter() -> DocumentConverter:
|
||||||
logger.info("Docling: OCR enabled via Tesseract CLI")
|
logger.info("Docling: OCR enabled via Tesseract CLI")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
|
logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
|
||||||
ocr_opts = None
|
|
||||||
pdf_opts = PdfPipelineOptions(do_ocr=False)
|
pdf_opts = PdfPipelineOptions(do_ocr=False)
|
||||||
|
|
||||||
fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
|
# ImageFormatOption also uses StandardPdfPipeline — pass same pdf_opts
|
||||||
|
# to prevent docling from falling back to RapidOCR / PP-OCRv6
|
||||||
# Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback)
|
return DocumentConverter(format_options={
|
||||||
if ocr_opts is not None:
|
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
|
||||||
try:
|
InputFormat.IMAGE: ImageFormatOption(pipeline_options=pdf_opts),
|
||||||
from docling.document_converter import ImageFormatOption
|
})
|
||||||
from docling.datamodel.pipeline_options import ImagePipelineOptions
|
|
||||||
img_opts = ImagePipelineOptions(do_ocr=True, ocr_options=ocr_opts)
|
|
||||||
for fmt in (InputFormat.IMAGE, InputFormat.PNG, InputFormat.JPEG, InputFormat.TIFF, InputFormat.BMP):
|
|
||||||
fmt_options[fmt] = ImageFormatOption(pipeline_options=img_opts)
|
|
||||||
except Exception:
|
|
||||||
pass # older docling without ImageFormatOption — PDF-only override is sufficient
|
|
||||||
|
|
||||||
return DocumentConverter(format_options=fmt_options)
|
|
||||||
|
|
||||||
converter = _build_converter()
|
converter = _build_converter()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -127,13 +127,23 @@ async def convert_file(
|
||||||
else:
|
else:
|
||||||
converter = md_plain
|
converter = md_plain
|
||||||
|
|
||||||
|
try:
|
||||||
try:
|
try:
|
||||||
result = converter.convert(tmp_path)
|
result = converter.convert(tmp_path)
|
||||||
|
actual_llm = use_llm_now
|
||||||
|
except Exception as llm_err:
|
||||||
|
# Ollama OOM / 500 — fallback to plain conversion without LLM
|
||||||
|
if use_llm_now and ("500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__):
|
||||||
|
logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
|
||||||
|
result = md_plain.convert(tmp_path)
|
||||||
|
actual_llm = False
|
||||||
|
else:
|
||||||
|
raise
|
||||||
record = Conversion(
|
record = Conversion(
|
||||||
filename=file.filename,
|
filename=file.filename,
|
||||||
file_type=file_type,
|
file_type=file_type,
|
||||||
markdown=result.text_content,
|
markdown=result.text_content,
|
||||||
llm_enabled=use_llm_now,
|
llm_enabled=actual_llm,
|
||||||
)
|
)
|
||||||
db.add(record)
|
db.add(record)
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue