Fix Docling PP-OCRv6 error + add DB init scripts

- Force TesseractCliOcrOptions for image formats (JPG/PNG/TIFF/BMP)
  to prevent RapidOCR/PP-OCRv6 fallback on docling 2.107
- Add db/init.sql and db/init_docling.sql for database initialization

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Kai Ton 2026-06-25 07:48:04 +00:00
parent 11de2d2175
commit 94cbabe6d7
3 changed files with 37 additions and 6 deletions

8
db/init.sql Normal file
View File

@ -0,0 +1,8 @@
-- markitdown database tables
--
-- conversions: one row per stored conversion.
-- IF NOT EXISTS makes the script safe to re-run against an initialised database.
CREATE TABLE IF NOT EXISTS conversions (
    id         SERIAL       PRIMARY KEY,               -- auto-incrementing surrogate key
    filename   VARCHAR(255) NOT NULL,                  -- required; the only NOT NULL data column
    file_type  VARCHAR(50),                            -- optional; presumably the source format -- confirm against the app model
    markdown   TEXT,                                   -- optional; presumably the converted Markdown -- confirm against the app model
    created_at TIMESTAMP    DEFAULT CURRENT_TIMESTAMP  -- filled in at insert time when not supplied
);

10
db/init_docling.sql Normal file
View File

@ -0,0 +1,10 @@
-- docling database tables
--
-- conversions: one row per stored conversion.
-- IF NOT EXISTS makes the script safe to re-run against an initialised database.
CREATE TABLE IF NOT EXISTS conversions (
    id            SERIAL       PRIMARY KEY,               -- auto-incrementing surrogate key
    filename      VARCHAR(255) NOT NULL,                  -- required; the only NOT NULL data column
    file_type     VARCHAR(50),                            -- optional; presumably the source format -- confirm against the app model
    output_format VARCHAR(20)  DEFAULT 'markdown',        -- falls back to 'markdown' when not supplied
    content       TEXT,                                   -- optional; presumably the converted document body -- confirm
    page_count    INTEGER,                                -- optional; NULL when not recorded
    created_at    TIMESTAMP    DEFAULT CURRENT_TIMESTAMP  -- filled in at insert time when not supplied
);

View File

@ -7,7 +7,7 @@ from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
from app.models.ConvertModel import Conversion
@ -21,15 +21,28 @@ CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
def _build_converter() -> DocumentConverter:
    """Build the Docling converter, forcing Tesseract CLI OCR where possible.

    PDF input always gets an explicit pipeline configuration: OCR through
    the Tesseract CLI when its options object can be created, otherwise OCR
    disabled. When Tesseract is available, image inputs are pinned to the
    same OCR engine so Docling does not pick its own default engine for
    them (the RapidOCR/PP-OCRv6 fallback this override exists to prevent).

    The image override is best-effort: any failure is logged and the
    converter is still returned with the PDF configuration alone.

    Returns:
        A ``DocumentConverter`` configured with the format options above.
    """
    try:
        ocr_opts = TesseractCliOcrOptions()
        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_opts)
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as e:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
        ocr_opts = None
        pdf_opts = PdfPipelineOptions(do_ocr=False)

    fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}

    # Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback).
    if ocr_opts is not None:
        try:
            fmt_options.update(_image_format_options(ocr_opts, pdf_opts))
        except Exception as e:
            # Keep start-up best-effort, but never hide why the override is missing.
            logger.warning(
                "Docling: could not force Tesseract for image inputs (%s)", e
            )

    return DocumentConverter(format_options=fmt_options)


def _image_format_options(ocr_opts, pdf_opts) -> dict:
    """Return format-option overrides that pin image inputs to Tesseract.

    Each optional piece of the Docling API is looked up on its own, so one
    missing name cannot silently discard the whole override.

    Args:
        ocr_opts: The Tesseract CLI OCR options shared with the PDF pipeline.
        pdf_opts: The already-built PDF pipeline options, reused for images
            when no dedicated image pipeline options class can be imported.

    Returns:
        A mapping from each available image ``InputFormat`` member to an
        ``ImageFormatOption``; empty when no override can be built.
    """
    try:
        from docling.document_converter import ImageFormatOption
    except ImportError as e:
        logger.warning(
            "Docling: ImageFormatOption unavailable (%s) — "
            "image inputs keep the default OCR engine",
            e,
        )
        return {}

    try:
        from docling.datamodel.pipeline_options import ImagePipelineOptions
    except ImportError:
        # NOTE(review): in the docling releases I know of, images run through
        # the PDF pipeline and there is no ImagePipelineOptions class, so the
        # PDF options (same Tesseract settings) are reused — confirm against
        # the pinned docling version.
        img_opts = pdf_opts
    else:
        img_opts = ImagePipelineOptions(do_ocr=True, ocr_options=ocr_opts)

    # Resolve members by name: an InputFormat enum that only defines IMAGE
    # (no PNG/JPEG/TIFF/BMP members) must still receive the IMAGE override.
    overrides = {}
    for name in ("IMAGE", "PNG", "JPEG", "TIFF", "BMP"):
        fmt = getattr(InputFormat, name, None)
        if fmt is not None:
            overrides[fmt] = ImageFormatOption(pipeline_options=img_opts)

    if not overrides:
        logger.warning(
            "Docling: no image InputFormat members found — "
            "image inputs keep the default OCR engine"
        )
    return overrides
converter = _build_converter()