diff --git a/docling-service/app/services/DoclingService.py b/docling-service/app/services/DoclingService.py index d17ae03..f6fa537 100644 --- a/docling-service/app/services/DoclingService.py +++ b/docling-service/app/services/DoclingService.py @@ -6,7 +6,7 @@ import logging from fastapi import UploadFile, HTTPException from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy import select -from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions from docling.datamodel.base_models import InputFormat from app.models.ConvertModel import Conversion @@ -26,23 +26,14 @@ def _build_converter() -> DocumentConverter: logger.info("Docling: OCR enabled via Tesseract CLI") except Exception as e: logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e) - ocr_opts = None pdf_opts = PdfPipelineOptions(do_ocr=False) - fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)} - - # Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback) - if ocr_opts is not None: - try: - from docling.document_converter import ImageFormatOption - from docling.datamodel.pipeline_options import ImagePipelineOptions - img_opts = ImagePipelineOptions(do_ocr=True, ocr_options=ocr_opts) - for fmt in (InputFormat.IMAGE, InputFormat.PNG, InputFormat.JPEG, InputFormat.TIFF, InputFormat.BMP): - fmt_options[fmt] = ImageFormatOption(pipeline_options=img_opts) - except Exception: - pass # older docling without ImageFormatOption — PDF-only override is sufficient - - return DocumentConverter(format_options=fmt_options) + # ImageFormatOption also uses StandardPdfPipeline — pass same pdf_opts + # to prevent docling from falling back to RapidOCR / PP-OCRv6 + return DocumentConverter(format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts), + InputFormat.IMAGE: ImageFormatOption(pipeline_options=pdf_opts), + }) converter = _build_converter() diff --git a/markitdown-service/app/services/MarkitdownService.py b/markitdown-service/app/services/MarkitdownService.py index 273c8f7..f48f5d0 100644 --- a/markitdown-service/app/services/MarkitdownService.py +++ b/markitdown-service/app/services/MarkitdownService.py @@ -128,12 +128,22 @@ async def convert_file( converter = md_plain try: - result = converter.convert(tmp_path) + try: + result = converter.convert(tmp_path) + actual_llm = use_llm_now + except Exception as llm_err: + # Ollama OOM / 500 — fallback to plain conversion without LLM + if use_llm_now and ("500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__): + logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err) + result = md_plain.convert(tmp_path) + actual_llm = False + else: + raise record = Conversion( filename=file.filename, file_type=file_type, markdown=result.text_content, - llm_enabled=use_llm_now, + llm_enabled=actual_llm, ) db.add(record) await db.commit()