Fix Docling PP-OCRv6 error + add DB init scripts
- Force TesseractCliOcrOptions for image formats (JPG/PNG/TIFF/BMP) to prevent the RapidOCR/PP-OCRv6 fallback on docling 2.107
- Add db/init.sql and db/init_docling.sql for database initialization

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
11de2d2175
commit
94cbabe6d7
|
|
@ -0,0 +1,8 @@
|
||||||
|
-- markitdown database tables
--
-- Idempotent: IF NOT EXISTS makes re-running this script a no-op once the
-- table is present. SERIAL suggests a PostgreSQL target (TODO confirm).
CREATE TABLE IF NOT EXISTS conversions (
    id          SERIAL        PRIMARY KEY,                -- auto-incrementing key
    filename    VARCHAR(255)  NOT NULL,                   -- required, at most 255 characters
    file_type   VARCHAR(50),                              -- optional, at most 50 characters
    markdown    TEXT,                                     -- optional, unbounded text
    created_at  TIMESTAMP     DEFAULT CURRENT_TIMESTAMP   -- filled in when omitted on insert
);
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
-- docling database tables
--
-- Idempotent: IF NOT EXISTS makes re-running this script a no-op once the
-- table is present. SERIAL suggests a PostgreSQL target (TODO confirm).
CREATE TABLE IF NOT EXISTS conversions (
    id             SERIAL        PRIMARY KEY,                -- auto-incrementing key
    filename       VARCHAR(255)  NOT NULL,                   -- required, at most 255 characters
    file_type      VARCHAR(50),                              -- optional, at most 50 characters
    output_format  VARCHAR(20)   DEFAULT 'markdown',         -- 'markdown' when omitted on insert
    content        TEXT,                                     -- optional, unbounded text
    page_count     INTEGER,                                  -- optional
    created_at     TIMESTAMP     DEFAULT CURRENT_TIMESTAMP   -- filled in when omitted on insert
);
|
||||||
|
|
@ -7,7 +7,7 @@ from fastapi import UploadFile, HTTPException
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from app.models.ConvertModel import Conversion
|
from app.models.ConvertModel import Conversion
|
||||||
|
|
||||||
|
|
@ -21,15 +21,28 @@ CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
|
||||||
|
|
||||||
def _build_converter() -> DocumentConverter:
    """Build the Docling converter with Tesseract CLI pinned as the OCR engine.

    PDF input always gets an explicit pipeline configuration. When the
    Tesseract options can be constructed, the same engine is also pinned for
    image input so Docling does not fall back to another OCR engine
    (RapidOCR/PP-OCRv6). Otherwise OCR is disabled for PDFs and image input is
    left without an override.

    Returns:
        A ``DocumentConverter`` configured with the resulting format options.
    """
    try:
        ocr_opts = TesseractCliOcrOptions()
        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_opts)
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as e:
        # Deliberately broad: building the converter must not fail at import
        # time just because the OCR options cannot be constructed.
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
        ocr_opts = None
        pdf_opts = PdfPipelineOptions(do_ocr=False)

    fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}

    # Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback)
    if ocr_opts is not None:
        fmt_options.update(_image_format_options(ocr_opts, pdf_opts))

    return DocumentConverter(format_options=fmt_options)


def _image_format_options(ocr_opts, pdf_opts) -> dict:
    """Return format options that pin image input to the given OCR options.

    Best-effort: every failure is logged and yields an empty dict, so the
    caller still gets a working PDF-only configuration.

    Args:
        ocr_opts: OCR options to apply to image input.
        pdf_opts: PDF pipeline options already carrying ``ocr_opts``; reused
            for images when no image-specific options class can be imported.

    Returns:
        Mapping of ``InputFormat`` member to ``ImageFormatOption``; may be empty.
    """
    try:
        from docling.document_converter import ImageFormatOption
    except ImportError as e:
        logger.warning(
            "Docling: ImageFormatOption unavailable (%s) — image OCR engine not overridden", e
        )
        return {}

    try:
        from docling.datamodel.pipeline_options import ImagePipelineOptions as image_opts_cls
    except ImportError:
        # NOTE(review): assumes ImageFormatOption accepts PdfPipelineOptions
        # when there is no dedicated image options class — confirm against the
        # installed docling version.
        image_opts_cls = None

    # Look members up by name: referencing a missing one directly (for example
    # InputFormat.PNG) raises AttributeError and would abort the whole
    # override, including the formats that do exist.
    candidate_names = ("IMAGE", "PNG", "JPEG", "TIFF", "BMP")
    formats = [
        fmt
        for fmt in (getattr(InputFormat, name, None) for name in candidate_names)
        if fmt is not None
    ]
    if not formats:
        logger.warning("Docling: no image InputFormat found — image OCR engine not overridden")
        return {}

    try:
        if image_opts_cls is None:
            img_opts = pdf_opts
        else:
            img_opts = image_opts_cls(do_ocr=True, ocr_options=ocr_opts)
        overrides = {fmt: ImageFormatOption(pipeline_options=img_opts) for fmt in formats}
    except Exception as e:
        # Still best-effort, but no longer silent.
        logger.warning(
            "Docling: could not configure image OCR (%s) — image OCR engine not overridden", e
        )
        return {}

    logger.info(
        "Docling: Tesseract CLI pinned for image formats: %s",
        ", ".join(str(fmt) for fmt in formats),
    )
    return overrides
|
||||||
|
|
||||||
converter = _build_converter()
|
converter = _build_converter()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue