MarkItDown vs Docling — LLM Input Processing

commit 11de2d21754b1a2b76512f1195674c2473975ee4 Author: kai.t@apactech.io Date: Thu Jun 25 06:47:35 2026 +0000 Initial commit — MarkItDown vs Docling demo - FastAPI microservices: MarkItDown + Docling với async SQLAlchemy - Caddy reverse proxy same-origin (no CORS) - Bootstrap 5 frontend với marked.js rendering - LLM settings card: Ollama URL, model select từ API, cleanup model - POST /cleanup endpoint với AI làm đẹp Markdown - GET /models fetch danh sách model từ Ollama - Runtime LLM re-init không cần restart container - PYTHONDONTWRITEBYTECODE + .dockerignore Co-Authored-By: Claude Sonnet 4.6 diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..8aa5d18 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +POSTGRES_USER=admin +POSTGRES_PASSWORD=admin +POSTGRES_DB=markitdown + +# Optional — Ollama LLM integration +# OLLAMA_BASE_URL=https://chat-ai.nswteam.net/ollama/v1 +# OLLAMA_MODEL=llava:7b diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b229342 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +.env +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.egg-info/ +dist/ +build/ +db/data/ +*.db +*.sqlite diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..42ac8ec --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,124 @@ +# ============================================================ +# AI Markdown Demo — MarkItDown vs Docling +# ============================================================ +# Naming convention: <layer>-<service> +# +# ui :8484 — Comparison UI (Caddy) +# api-markitdown:8282 — MarkItDown API (Microsoft) +# api-docling :8383 — Docling API (IBM) +# db-markitdown :5432 — PostgreSQL for markitdown +# db-docling :— — PostgreSQL for docling (internal only) +# db-admin :5050 — Adminer +# ============================================================ + +x-healthcheck-defaults: &healthcheck-defaults + interval: 10s + timeout: 5s + retries: 5 + +x-service-defaults: &service-defaults + restart: 
unless-stopped + networks: + - app-network + +# ========================== +# Services +# ========================== +services: + + # --- UI layer --------------------------------------------------- + ui: + image: caddy:alpine + <<: *service-defaults + ports: + - "8484:80" + volumes: + - ./frontend/Caddyfile:/etc/caddy/Caddyfile:ro + - ./frontend:/srv:ro + + # --- API layer -------------------------------------------------- + api-markitdown: + build: + context: ./markitdown-service + <<: *service-defaults + ports: + - "8282:8000" + env_file: .env + environment: + DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-markitdown:5432/${POSTGRES_DB:-markitdown} + OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-} + OLLAMA_MODEL: ${OLLAMA_MODEL:-llava} + depends_on: + db-markitdown: + condition: service_healthy + + api-docling: + build: + context: ./docling-service + <<: *service-defaults + ports: + - "8383:8000" + env_file: .env + environment: + DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-docling:5432/docling + OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-} + OLLAMA_MODEL: ${OLLAMA_MODEL:-llava} + depends_on: + db-docling: + condition: service_healthy + + # --- Database layer --------------------------------------------- + db-markitdown: + image: postgres:16-alpine + <<: *service-defaults + env_file: .env + environment: + POSTGRES_DB: ${POSTGRES_DB:-markitdown} + POSTGRES_USER: ${POSTGRES_USER:-admin} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin} + volumes: + - db_markitdown_data:/var/lib/postgresql/data + - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro + ports: + - "5432:5432" + healthcheck: + <<: *healthcheck-defaults + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d ${POSTGRES_DB:-markitdown}"] + + db-docling: + image: postgres:16-alpine + <<: *service-defaults + env_file: .env + environment: + POSTGRES_DB: docling + POSTGRES_USER: ${POSTGRES_USER:-admin} + POSTGRES_PASSWORD: 
${POSTGRES_PASSWORD:-admin} + volumes: + - db_docling_data:/var/lib/postgresql/data + - ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro + healthcheck: + <<: *healthcheck-defaults + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"] + + # --- Admin layer ------------------------------------------------ + db-admin: + image: adminer:4.8.1 + <<: *service-defaults + ports: + - "5050:8080" + environment: + ADMINER_DEFAULT_SERVER: db-markitdown + depends_on: + db-markitdown: + condition: service_healthy + +# ========================== +# Infrastructure +# ========================== +networks: + app-network: + driver: bridge + +volumes: + db_markitdown_data: + db_docling_data: diff --git a/docling-service/.dockerignore b/docling-service/.dockerignore new file mode 100644 index 0000000..77de465 --- /dev/null +++ b/docling-service/.dockerignore @@ -0,0 +1,4 @@ +__pycache__ +*.pyc +*.pyo +.env diff --git a/docling-service/Dockerfile b/docling-service/Dockerfile new file mode 100644 index 0000000..0f4d282 --- /dev/null +++ b/docling-service/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +RUN apt-get update && apt-get install -y \ + libgl1 libglib2.0-0 libgomp1 \ + poppler-utils tesseract-ocr \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY main.py . 
+COPY app/ ./app/ + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docling-service/app/controllers/ConvertController.py b/docling-service/app/controllers/ConvertController.py new file mode 100644 index 0000000..6df3183 --- /dev/null +++ b/docling-service/app/controllers/ConvertController.py @@ -0,0 +1,122 @@ +from fastapi import APIRouter, UploadFile, File, Depends, Query +from sqlalchemy.ext.asyncio import AsyncSession +from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord +from app.services import DoclingService as docling_service +from app.database import get_db +from pydantic import BaseModel + +router = APIRouter() + +class SettingsRequest(BaseModel): + ollama_base_url: str | None = None + ollama_model: str = "llava" + cleanup_model: str | None = None + +class SettingsResponse(BaseModel): + llm_enabled: bool + ollama_base_url: str | None + ollama_model: str + cleanup_model: str | None = None + default_prompt: str | None = None + +SUPPORTED_INPUT_FORMATS = sorted([ + "pdf", "docx", "xlsx", "pptx", + "html", "htm", "jpg", "jpeg", "png", + "tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc" +]) + +SUPPORTED_OUTPUT_FORMATS = ["markdown", "json", "html", "text"] + + +@router.get("/settings", response_model=SettingsResponse) +def get_settings(): + return SettingsResponse( + llm_enabled=docling_service.LLM_ACTIVE, + ollama_base_url=docling_service.OLLAMA_BASE_URL, + ollama_model=docling_service.OLLAMA_MODEL, + cleanup_model=docling_service.CLEANUP_MODEL or None, + default_prompt=docling_service.DEFAULT_ENRICH_PROMPT, + ) + +@router.post("/settings", response_model=SettingsResponse) +def update_settings(req: SettingsRequest): + docling_service._init_llm(req.ollama_base_url or None, req.ollama_model) + docling_service.CLEANUP_MODEL = req.cleanup_model or "" + return SettingsResponse( + llm_enabled=docling_service.LLM_ACTIVE, + ollama_base_url=docling_service.OLLAMA_BASE_URL, + 
ollama_model=docling_service.OLLAMA_MODEL, + cleanup_model=docling_service.CLEANUP_MODEL or None, + default_prompt=docling_service.DEFAULT_ENRICH_PROMPT, + ) + +@router.get("/health", response_model=HealthResponse) +def health(): + from app.services.DoclingService import LLM_ACTIVE, OLLAMA_MODEL + ocr = "tesseract" if _ocr_available() else "none" + return HealthResponse( + status="ok", + supported_formats=SUPPORTED_INPUT_FORMATS, + output_formats=SUPPORTED_OUTPUT_FORMATS, + llm_enabled=LLM_ACTIVE, + llm_model=OLLAMA_MODEL if LLM_ACTIVE else None, + ocr_engine=ocr, + ) + + +def _ocr_available() -> bool: + import shutil + return shutil.which("tesseract") is not None + + +@router.post("/convert", response_model=ConvertResponse) +async def convert( + file: UploadFile = File(...), + output_format: str = Query(default="markdown", description="Output format: markdown | json | html | text"), + use_llm: bool = Query(default=True, description="Run LLM enrichment on extracted text"), + llm_prompt: str | None = Query(default=None, description="Custom system prompt for LLM enrichment"), + db: AsyncSession = Depends(get_db), +): + record = await docling_service.convert_file(file, db, output_format, use_llm=use_llm, llm_prompt=llm_prompt) + return ConvertResponse( + id=record.id, + filename=record.filename, + output_format=record.output_format, + content=record.content, + page_count=record.page_count, + llm_enabled=record.llm_enabled, + ) + + +@router.get("/conversions/{conversion_id}", response_model=ConvertResponse) +async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)): + record = await docling_service.get_conversion(conversion_id, db) + return ConvertResponse( + id=record.id, + filename=record.filename, + output_format=record.output_format, + content=record.content, + page_count=record.page_count, + llm_enabled=record.llm_enabled, + ) + + +@router.get("/history", response_model=list[ConversionRecord]) +async def history(limit: int = 20, db: 
AsyncSession = Depends(get_db)): + records = await docling_service.get_history(db, limit) + return [ + ConversionRecord( + id=r.id, + filename=r.filename, + file_type=r.file_type, + output_format=r.output_format, + page_count=r.page_count, + created_at=str(r.created_at), + ) + for r in records + ] + + +@router.delete("/conversions/{conversion_id}") +async def delete_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)): + return await docling_service.delete_conversion(conversion_id, db) diff --git a/docling-service/app/database.py b/docling-service/app/database.py new file mode 100644 index 0000000..83f045a --- /dev/null +++ b/docling-service/app/database.py @@ -0,0 +1,20 @@ +import os +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession +from sqlalchemy.orm import sessionmaker, DeclarativeBase + +DATABASE_URL = os.getenv("DATABASE_URL", "postgresql+asyncpg://admin:secret@db:5432/docling") + +DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+asyncpg://") + +engine = create_async_engine(DATABASE_URL, echo=False) + +AsyncSessionLocal = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) + + +class Base(DeclarativeBase): + pass + + +async def get_db(): + async with AsyncSessionLocal() as session: + yield session diff --git a/docling-service/app/models/ConvertModel.py b/docling-service/app/models/ConvertModel.py new file mode 100644 index 0000000..3ebe29d --- /dev/null +++ b/docling-service/app/models/ConvertModel.py @@ -0,0 +1,51 @@ +from typing import Optional +from pydantic import BaseModel +from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, func +from app.database import Base + + +class Conversion(Base): + __tablename__ = "conversions" + + id = Column(Integer, primary_key=True, index=True) + filename = Column(String(255), nullable=False) + file_type = Column(String(50)) + output_format = Column(String(20), default="markdown") + content = Column(Text) + page_count = Column(Integer, 
nullable=True) + llm_enabled = Column(Boolean, default=False) + created_at = Column(DateTime, server_default=func.now()) + + +class ConvertResponse(BaseModel): + id: int + filename: str + output_format: str + content: str + page_count: Optional[int] = None + llm_enabled: bool = False + + class Config: + from_attributes = True + + +class ConversionRecord(BaseModel): + id: int + filename: str + file_type: Optional[str] = None + output_format: str + page_count: Optional[int] = None + llm_enabled: bool = False + created_at: str + + class Config: + from_attributes = True + + +class HealthResponse(BaseModel): + status: str + supported_formats: list[str] + output_formats: list[str] + llm_enabled: bool = False + llm_model: Optional[str] = None + ocr_engine: str = "none" diff --git a/docling-service/app/services/DoclingService.py b/docling-service/app/services/DoclingService.py new file mode 100644 index 0000000..6ec787b --- /dev/null +++ b/docling-service/app/services/DoclingService.py @@ -0,0 +1,197 @@ +import os +import re +import json +import tempfile +import logging +from fastapi import UploadFile, HTTPException +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.base_models import InputFormat +from app.models.ConvertModel import Conversion + +logger = logging.getLogger(__name__) + +import openai as _openai + +OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llava") +CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "") + +def _build_converter() -> DocumentConverter: + try: + from docling.models.stages.ocr.tesseract_ocr_cli_model import TesseractCliOcrOptions + pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractCliOcrOptions()) + logger.info("Docling: OCR enabled via Tesseract CLI") + except Exception as e: + 
logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e) + pdf_opts = PdfPipelineOptions(do_ocr=False) + return DocumentConverter( + format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)} + ) + +converter = _build_converter() + +_llm_client = None +LLM_ACTIVE = False + + +def _init_llm(base_url: str | None, model: str) -> bool: + global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client + if not base_url: + OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client = None, model, False, None + return False + try: + client = _openai.OpenAI(base_url=base_url, api_key="ollama") + OLLAMA_BASE_URL = base_url + OLLAMA_MODEL = model + _llm_client = client + LLM_ACTIVE = True + logger.info("Docling: LLM enabled via %s (model=%s)", base_url, model) + return True + except Exception as e: + logger.warning("Docling: LLM init failed (%s)", e) + LLM_ACTIVE = False + return False + + +_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL) + + +DEFAULT_ENRICH_PROMPT = ( + "You are a document cleaning assistant. " + "Fix OCR errors, normalise whitespace, and improve the Markdown structure. " + "Return ONLY the raw Markdown text — no code fences, no commentary, no explanation." +) + + +def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str: + """Send extracted markdown to LLM for cleanup. 
import asyncio
import contextlib
import threading

# The shared Docling converter used to be called only from the event-loop
# thread, i.e. strictly one call at a time. Its thread-safety is not established
# in this module, so calls made from worker threads stay serialized.
_CONVERT_LOCK = threading.Lock()


def _export_document(path: str, output_format: str) -> tuple[str, int | None]:
    """Convert the file at *path* and export it as *output_format* (blocking).

    Returns:
        ``(content, page_count)``; ``page_count`` is None when the converted
        document exposes no pages.
    """
    with _CONVERT_LOCK:
        doc = converter.convert(path).document

    page_count = len(doc.pages) if hasattr(doc, "pages") and doc.pages else None

    if output_format == "json":
        content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
    elif output_format == "html":
        content = doc.export_to_html()
    else:
        # "markdown" and "text" both start from the Markdown export.
        content = doc.export_to_markdown()
        if output_format == "text":
            # Crude Markdown-to-text: drop heading markers, bold and emphasis.
            content = re.sub(r"#{1,6}\s?", "", content)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)
    return content, page_count


async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file with Docling and persist the result.

    Args:
        file: The uploaded document.
        db: Session used to store the ``Conversion`` row.
        output_format: One of ``OUTPUT_FORMATS``.
        use_llm: Run LLM enrichment (markdown/text output only) when a client
            is configured.
        llm_prompt: Optional system prompt overriding the default one.

    Returns:
        The stored ``Conversion`` record.

    Raises:
        HTTPException: 422 for an unsupported (or missing) file name or output
            format, 500 when conversion or persistence fails.
    """
    # UploadFile.filename may be None; treat that as "no extension" (-> 422)
    # instead of crashing inside _allowed_file with a TypeError.
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(
            status_code=422,
            detail=f"Output format not supported. Supported: {', '.join(sorted(OUTPUT_FORMATS))}"
        )

    suffix = os.path.splitext(filename)[1]
    file_type = suffix.lstrip(".").lower()

    # Read the upload before creating the temp file so a failed read cannot
    # leave a stray file behind.
    payload = await file.read()

    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(payload)

        # Docling conversion is blocking and slow; keep it off the event loop.
        content, page_count = await asyncio.to_thread(
            _export_document, tmp_path, output_format
        )

        # LLM enrichment — only for markdown / text output, and only if requested.
        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            # The OpenAI client is synchronous, so this call also goes to a thread.
            content = await asyncio.to_thread(
                _llm_enrich, content, system_prompt=llm_prompt or None
            )
            llm_used = True

        record = Conversion(
            filename=filename,
            file_type=file_type,
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record

    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always remove the temp file, including when writing it failed.
        if tmp_path is not None:
            with contextlib.suppress(FileNotFoundError):
                os.unlink(tmp_path)
select(Conversion).order_by(Conversion.created_at.desc()).limit(limit) + ) + return result.scalars().all() + + +async def delete_conversion(conversion_id: int, db: AsyncSession) -> dict: + result = await db.execute(select(Conversion).where(Conversion.id == conversion_id)) + record = result.scalar_one_or_none() + if not record: + raise HTTPException(status_code=404, detail="Conversion not found") + await db.delete(record) + await db.commit() + return {"message": f"Conversion {conversion_id} deleted"} diff --git a/docling-service/main.py b/docling-service/main.py new file mode 100644 index 0000000..66395d5 --- /dev/null +++ b/docling-service/main.py @@ -0,0 +1,29 @@ +from contextlib import asynccontextmanager +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from app.controllers.ConvertController import router +from app.database import engine, Base + + +@asynccontextmanager +async def lifespan(app: FastAPI): + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + yield + + +app = FastAPI( + title="Docling API", + version="1.0.0", + description="Advanced document conversion service powered by Docling. 
Supports PDF, DOCX, PPTX, XLSX, HTML, images and more.", + lifespan=lifespan, +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(router) diff --git a/docling-service/requirements.txt b/docling-service/requirements.txt new file mode 100644 index 0000000..e023c30 --- /dev/null +++ b/docling-service/requirements.txt @@ -0,0 +1,7 @@ +docling +fastapi +uvicorn +python-multipart +asyncpg +sqlalchemy[asyncio] +openai diff --git a/example/1.pdf b/example/1.pdf new file mode 100644 index 0000000..e18d7de Binary files /dev/null and b/example/1.pdf differ diff --git a/frontend/Caddyfile b/frontend/Caddyfile new file mode 100644 index 0000000..76c8932 --- /dev/null +++ b/frontend/Caddyfile @@ -0,0 +1,19 @@ +:80 { + encode gzip + + # Reverse proxy — strips prefix and forwards to backend + handle_path /api/markitdown/* { + reverse_proxy api-markitdown:8000 + } + + handle_path /api/docling/* { + reverse_proxy api-docling:8000 + } + + # Static files with SPA fallback + handle { + root * /srv + file_server + try_files {path} /index.html + } +} diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..36a8f57 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,822 @@ + + + + + + + MarkItDown vs Docling — LLM Input Processing + + + + + + + + + +

+ + +

Tải lên tài liệu để so sánh

+ +

+ + +

Kéo thả hoặc click để chọn file

+ +

+ PDF + DOCX + XLSX + PPTX + HTML + CSV + TXT + JPG/PNG + EPUB + TIFF + AsciiDoc +

+ + +

+ Docling format + +

+ +

+ + LLM bật +

+ + + +

+ + +

+ Cài đặt LLM (Ollama) +

+ Ollama Base URL +

+ + +

OpenAI-compatible endpoint

+ Model mặc định + +

Convert + Docling enrich

+ Model Format + +

Để trống = dùng model mặc định

MarkItDown LLM

Docling LLM

+ +

+ + + + + +

Tóm tắt so sánh

+ + + + + + + + + + +

Tiêu chí	MarkItDown	Docling	Ghi chú

+ + +

+ + MarkItDown + +

+ + +

Raw
Preview

Tải file lên để + xem kết quả

+ + Docling + +

+ +

Raw
Preview

Tải file lên để + xem kết quả

+ + +

Lịch sử chuyển đổi gần đây

+ MarkItDown +

Chưa có lịch sử

+ Docling +

Chưa có lịch sử

+ +

+ + + + + + + \ No newline at end of file diff --git a/markitdown-service/.dockerignore b/markitdown-service/.dockerignore new file mode 100644 index 0000000..77de465 --- /dev/null +++ b/markitdown-service/.dockerignore @@ -0,0 +1,4 @@ +__pycache__ +*.pyc +*.pyo +.env diff --git a/markitdown-service/Dockerfile b/markitdown-service/Dockerfile new file mode 100644 index 0000000..8e57562 --- /dev/null +++ b/markitdown-service/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +RUN apt-get update && apt-get install -y \ + ffmpeg libmagic1 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY main.py . +COPY app/ ./app/ + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/markitdown-service/app/controllers/ConvertController.py b/markitdown-service/app/controllers/ConvertController.py new file mode 100644 index 0000000..562e21a --- /dev/null +++ b/markitdown-service/app/controllers/ConvertController.py @@ -0,0 +1,107 @@ +from fastapi import APIRouter, UploadFile, File, Depends, Query, Body, HTTPException +from sqlalchemy.ext.asyncio import AsyncSession +from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord +from app.services import MarkitdownService as markitdown_service +from app.database import get_db +from pydantic import BaseModel + +class CleanupRequest(BaseModel): + text: str + prompt: str | None = None + model: str | None = None + +class CleanupResponse(BaseModel): + text: str + +class SettingsRequest(BaseModel): + ollama_base_url: str | None = None + ollama_model: str = "llava" + cleanup_model: str | None = None + +class SettingsResponse(BaseModel): + llm_enabled: bool + ollama_base_url: str | None + ollama_model: str + cleanup_model: str | None = None + default_prompt: str | None = None + +router = APIRouter() + + 
+@router.get("/health", response_model=HealthResponse) +def health(): + return HealthResponse( + status="ok", + llm_enabled=markitdown_service.LLM_ACTIVE, + llm_model=markitdown_service.OLLAMA_MODEL if markitdown_service.LLM_ACTIVE else None, + ) + + +@router.post("/convert", response_model=ConvertResponse) +async def convert( + file: UploadFile = File(...), + use_llm: bool = Query(default=True, description="Use LLM vision for image understanding"), + llm_prompt: str | None = Query(default=None, description="Custom prompt for LLM vision"), + db: AsyncSession = Depends(get_db), +): + record = await markitdown_service.convert_file(file, db, use_llm=use_llm, llm_prompt=llm_prompt) + return record + + +@router.get("/models") +def list_models(): + if not markitdown_service.OLLAMA_BASE_URL: + return {"models": []} + try: + import httpx, re + base = re.sub(r"/v1/?$", "", markitdown_service.OLLAMA_BASE_URL.rstrip("/")) + resp = httpx.get(f"{base}/api/tags", timeout=5) + resp.raise_for_status() + names = [m["name"] for m in resp.json().get("models", [])] + return {"models": sorted(names)} + except Exception as e: + return {"models": [], "error": str(e)} + +@router.get("/settings", response_model=SettingsResponse) +def get_settings(): + return SettingsResponse( + llm_enabled=markitdown_service.LLM_ACTIVE, + ollama_base_url=markitdown_service.OLLAMA_BASE_URL, + ollama_model=markitdown_service.OLLAMA_MODEL, + cleanup_model=markitdown_service.CLEANUP_MODEL or None, + default_prompt=markitdown_service.DEFAULT_CLEANUP_PROMPT, + ) + +@router.post("/settings", response_model=SettingsResponse) +def update_settings(req: SettingsRequest): + markitdown_service._init_llm(req.ollama_base_url or None, req.ollama_model) + markitdown_service.CLEANUP_MODEL = req.cleanup_model or "" + return SettingsResponse( + llm_enabled=markitdown_service.LLM_ACTIVE, + ollama_base_url=markitdown_service.OLLAMA_BASE_URL, + ollama_model=markitdown_service.OLLAMA_MODEL, + 
@router.post("/cleanup", response_model=CleanupResponse)
async def cleanup(req: CleanupRequest):
    """Clean up Markdown text with the configured LLM.

    Args:
        req: Text to clean plus an optional prompt and model override.

    Returns:
        ``CleanupResponse`` carrying the cleaned text.

    Raises:
        HTTPException: 503 when no LLM is configured.
    """
    import asyncio  # function-scope import keeps this handler self-contained

    if not markitdown_service.LLM_ACTIVE:
        raise HTTPException(status_code=503, detail="LLM not configured")

    # llm_cleanup performs a synchronous chat-completion request; running it
    # inline would stall every other request served by this event loop.
    cleaned = await asyncio.to_thread(
        markitdown_service.llm_cleanup, req.text, req.prompt, req.model
    )
    return CleanupResponse(text=cleaned)
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client and LLM-backed converter.

    Args:
        base_url: OpenAI-compatible endpoint. A falsy value disables the LLM.
        model: Model name to store in ``OLLAMA_MODEL``.

    Returns:
        True when the client and the LLM-enabled ``MarkItDown`` instance were
        both created, otherwise False.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md

    if not base_url:
        # Explicitly disabled: fall back to the plain converter.
        OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md = None, model, False, None, md_plain
        return False

    try:
        # Build both objects first so a failure cannot leave the module
        # globals half-updated.
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        llm_md = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        # Leave no stale client or LLM-backed converter behind.
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    OLLAMA_BASE_URL = base_url
    OLLAMA_MODEL = model
    _llm_client = client
    md = llm_md
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True
import asyncio
import contextlib
import threading

# Conversions used to run only on the event-loop thread, i.e. one at a time.
# Thread-safety of the shared MarkItDown instances is not established in this
# module, so calls made from worker threads stay serialized.
_CONVERT_LOCK = threading.Lock()


def _run_conversion(converter: MarkItDown, path: str) -> str:
    """Convert the file at *path* with *converter* and return its text (blocking)."""
    with _CONVERT_LOCK:
        return converter.convert(path).text_content


async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown with MarkItDown and persist it.

    Args:
        file: The uploaded document.
        db: Session used to store the ``Conversion`` row.
        use_llm: Use the LLM-backed converter when the LLM is active.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The stored ``Conversion`` record.

    Raises:
        HTTPException: 422 for an unsupported (or missing) file name, 500 when
            conversion or persistence fails.
    """
    # UploadFile.filename may be None; treat that as "no extension" (-> 422)
    # instead of crashing inside _allowed_file with a TypeError.
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    suffix = os.path.splitext(filename)[1]
    file_type = suffix.lstrip(".").lower()

    # Choose converter: LLM only if enabled globally AND requested per-call.
    # This happens before any temp file exists, so a failure here leaks nothing.
    use_llm_now = LLM_ACTIVE and use_llm
    if use_llm_now and llm_prompt:
        # Custom prompt: build a one-off MarkItDown carrying that prompt.
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # older markitdown versions may not support llm_prompt kwarg
            converter = md
    elif use_llm_now:
        converter = md
    else:
        converter = md_plain

    # Read the upload before creating the temp file so a failed read cannot
    # leave a stray file behind.
    payload = await file.read()

    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(payload)

        # Conversion is blocking (and may call the LLM synchronously), so keep
        # it off the event loop.
        markdown = await asyncio.to_thread(_run_conversion, converter, tmp_path)

        record = Conversion(
            filename=filename,
            file_type=file_type,
            markdown=markdown,
            llm_enabled=use_llm_now,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always remove the temp file, including when writing it failed.
        if tmp_path is not None:
            with contextlib.suppress(FileNotFoundError):
                os.unlink(tmp_path)
version="1.0.0", lifespan=lifespan) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(router) diff --git a/markitdown-service/requirements.txt b/markitdown-service/requirements.txt new file mode 100644 index 0000000..634f36a --- /dev/null +++ b/markitdown-service/requirements.txt @@ -0,0 +1,8 @@ +markitdown[all] +fastapi +uvicorn +python-multipart +asyncpg +sqlalchemy[asyncio] +openai +httpx