Fix Docling PP-OCRv6 error + add DB init scripts
- Force TesseractCliOcrOptions for image formats (JPG/PNG/TIFF/BMP) to prevent the RapidOCR/PP-OCRv6 fallback on docling 2.107
- Add db/init.sql and db/init_docling.sql for database initialization

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
11de2d2175
commit
94cbabe6d7
|
|
@ -0,0 +1,8 @@
|
|||
-- markitdown database tables
|
||||
-- Creates the conversions table when it is missing; because of
-- IF NOT EXISTS an already-existing table is left untouched.
CREATE TABLE IF NOT EXISTS conversions (
    id          SERIAL        PRIMARY KEY,               -- auto-generated integer key
    filename    VARCHAR(255)  NOT NULL,                  -- the only mandatory column
    file_type   VARCHAR(50),                             -- optional, up to 50 characters
    markdown    TEXT,                                    -- optional; presumably the converted Markdown (confirm against the model)
    created_at  TIMESTAMP     DEFAULT CURRENT_TIMESTAMP  -- filled in at insert time when omitted
);
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
-- docling database tables
|
||||
-- Creates the conversions table when it is missing; because of
-- IF NOT EXISTS an already-existing table is left untouched.
-- NOTE(review): the markitdown init script creates a table with the same
-- name but different columns. If both scripts target one database, whichever
-- runs second becomes a silent no-op -- confirm they use separate databases.
CREATE TABLE IF NOT EXISTS conversions (
    id             SERIAL        PRIMARY KEY,               -- auto-generated integer key
    filename       VARCHAR(255)  NOT NULL,                  -- the only mandatory column
    file_type      VARCHAR(50),                             -- optional, up to 50 characters
    output_format  VARCHAR(20)   DEFAULT 'markdown',        -- falls back to 'markdown' when omitted
    content        TEXT,                                    -- optional; presumably the converted document body (confirm against the model)
    page_count     INTEGER,                                 -- optional
    created_at     TIMESTAMP     DEFAULT CURRENT_TIMESTAMP  -- filled in at insert time when omitted
);
|
||||
|
|
@ -7,7 +7,7 @@ from fastapi import UploadFile, HTTPException
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from app.models.ConvertModel import Conversion
|
||||
|
||||
|
|
@ -21,15 +21,28 @@ CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
|
|||
|
||||
def _image_format_options(pdf_opts, ocr_opts) -> dict:
    """Return per-format options that pin *ocr_opts* for image input.

    Args:
        pdf_opts: The PDF pipeline options already configured with
            *ocr_opts*; reused for images when docling has no dedicated
            image pipeline options class.
        ocr_opts: The Tesseract CLI OCR options to force.

    Returns:
        A mapping from every image-related ``InputFormat`` member that the
        installed docling defines to an ``ImageFormatOption``.

    Raises:
        ImportError: If the installed docling has no ``ImageFormatOption``.
    """
    from docling.document_converter import ImageFormatOption

    # NOTE(review): ImagePipelineOptions does not appear to exist in every
    # docling release; confirm against the pinned version. When it is
    # missing, reuse the PDF pipeline options, which carry the same OCR
    # settings.
    try:
        from docling.datamodel.pipeline_options import ImagePipelineOptions
    except ImportError:
        img_opts = pdf_opts
    else:
        img_opts = ImagePipelineOptions(do_ocr=True, ocr_options=ocr_opts)

    # Look the members up by name. Referencing a missing member directly
    # (e.g. ``InputFormat.PNG``) raises AttributeError while the tuple is
    # being built, i.e. before any format has been registered.
    # NOTE(review): some docling releases appear to define only
    # ``InputFormat.IMAGE`` for all raster types; confirm.
    candidates = ("IMAGE", "PNG", "JPEG", "TIFF", "BMP")
    members = (getattr(InputFormat, name, None) for name in candidates)
    return {
        fmt: ImageFormatOption(pipeline_options=img_opts)
        for fmt in members
        if fmt is not None
    }


def _build_converter() -> DocumentConverter:
    """Build the Docling converter with Tesseract CLI as the OCR engine.

    PDF input always receives an explicit pipeline configuration. When the
    Tesseract options can be constructed, the same engine is also registered
    for image input, so Docling does not fall back to RapidOCR/PP-OCRv6 for
    those files. If Tesseract is unavailable, OCR is disabled for PDFs and
    no image override is registered.

    Returns:
        A ``DocumentConverter`` configured with the per-format options.
    """
    try:
        ocr_opts = TesseractCliOcrOptions()
        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_opts)
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as e:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
        ocr_opts = None
        pdf_opts = PdfPipelineOptions(do_ocr=False)

    fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}

    # Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback)
    if ocr_opts is not None:
        try:
            image_options = _image_format_options(pdf_opts, ocr_opts)
        except Exception as e:
            # Best effort: keep the PDF-only converter, but say so instead of
            # silently leaving image input on docling's default OCR engine.
            logger.warning(
                "Docling: could not force Tesseract for image formats (%s)", e
            )
        else:
            if image_options:
                fmt_options.update(image_options)
            else:
                logger.warning(
                    "Docling: no image input formats found — image OCR override skipped"
                )

    return DocumentConverter(format_options=fmt_options)
|
||||
|
||||
# Converter instance created once, when this module is imported.
converter = _build_converter()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue