diff --git a/db/init.sql b/db/init.sql
new file mode 100644
index 0000000..7ac0eec
--- /dev/null
+++ b/db/init.sql
@@ -0,0 +1,8 @@
+-- markitdown database tables
+CREATE TABLE IF NOT EXISTS conversions (
+    id SERIAL PRIMARY KEY,
+    filename VARCHAR(255) NOT NULL,
+    file_type VARCHAR(50),
+    markdown TEXT,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
diff --git a/db/init_docling.sql b/db/init_docling.sql
new file mode 100644
index 0000000..a076004
--- /dev/null
+++ b/db/init_docling.sql
@@ -0,0 +1,10 @@
+-- docling database tables
+CREATE TABLE IF NOT EXISTS conversions (
+    id SERIAL PRIMARY KEY,
+    filename VARCHAR(255) NOT NULL,
+    file_type VARCHAR(50),
+    output_format VARCHAR(20) DEFAULT 'markdown',
+    content TEXT,
+    page_count INTEGER,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
diff --git a/docling-service/app/services/DoclingService.py b/docling-service/app/services/DoclingService.py
index 6ec787b..d17ae03 100644
--- a/docling-service/app/services/DoclingService.py
+++ b/docling-service/app/services/DoclingService.py
@@ -7,7 +7,7 @@ from fastapi import UploadFile, HTTPException
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import select
 from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
 from docling.datamodel.base_models import InputFormat
 
 from app.models.ConvertModel import Conversion
@@ -21,15 +21,36 @@ CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
 
 def _build_converter() -> DocumentConverter:
+    """Build the DocumentConverter used by this module, preferring Tesseract CLI OCR.
+
+    PDF conversion is configured with Tesseract CLI OCR options; if building
+    those options raises, OCR is disabled instead. When OCR is enabled, the
+    same pipeline options are also registered for InputFormat.IMAGE so that
+    image inputs do not fall back to a different OCR engine.
+    """
     try:
-        from docling.models.stages.ocr.tesseract_ocr_cli_model import TesseractCliOcrOptions
-        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractCliOcrOptions())
+        ocr_opts = TesseractCliOcrOptions()
+        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_opts)
         logger.info("Docling: OCR enabled via Tesseract CLI")
     except Exception as e:
         logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
+        ocr_opts = None
         pdf_opts = PdfPipelineOptions(do_ocr=False)
-    return DocumentConverter(
-        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
-    )
+
+    fmt_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
+
+    # Force Tesseract for image formats too (prevents RapidOCR/PP-OCRv6 fallback).
+    # NOTE(review): docling routes every raster image through InputFormat.IMAGE and
+    # runs it on the PDF pipeline, so the PDF options are reused — confirm on upgrade.
+    if ocr_opts is not None:
+        try:
+            from docling.document_converter import ImageFormatOption
+        except ImportError as e:
+            # Older docling without ImageFormatOption: only the PDF override applies.
+            logger.warning("Docling: ImageFormatOption unavailable (%s) — image OCR override skipped", e)
+        else:
+            fmt_options[InputFormat.IMAGE] = ImageFormatOption(pipeline_options=pdf_opts)
+
+    return DocumentConverter(format_options=fmt_options)
 
 
 converter = _build_converter()