Initial commit — MarkItDown vs Docling demo

- FastAPI microservices: MarkItDown + Docling với async SQLAlchemy
- Caddy reverse proxy same-origin (no CORS)
- Bootstrap 5 frontend với marked.js rendering
- LLM settings card: Ollama URL, model select từ API, cleanup model
- POST /cleanup endpoint với AI làm đẹp Markdown
- GET /models fetch danh sách model từ Ollama
- Runtime LLM re-init không cần restart container
- PYTHONDONTWRITEBYTECODE + .dockerignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Kai Ton 2026-06-25 06:47:35 +00:00
commit 11de2d2175
22 changed files with 1814 additions and 0 deletions

7
.env.example Normal file
View File

@ -0,0 +1,7 @@
POSTGRES_USER=admin
POSTGRES_PASSWORD=admin
POSTGRES_DB=markitdown
# Optional — Ollama LLM integration
# OLLAMA_BASE_URL=https://chat-ai.nswteam.net/ollama/v1
# OLLAMA_MODEL=llava:7b

12
.gitignore vendored Normal file
View File

@ -0,0 +1,12 @@
.env
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.egg-info/
dist/
build/
db/data/
*.db
*.sqlite

124
docker-compose.yml Normal file
View File

@ -0,0 +1,124 @@
# ============================================================
# AI Markdown Demo — MarkItDown vs Docling
# ============================================================
# Naming convention: <layer>-<service>
#
# ui :8484 — Comparison UI (Caddy)
# api-markitdown:8282 — MarkItDown API (Microsoft)
# api-docling :8383 — Docling API (IBM)
# db-markitdown :5432 — PostgreSQL for markitdown
# db-docling :— — PostgreSQL for docling (internal only)
# db-admin :5050 — Adminer
# ============================================================
# Shared healthcheck timing; merged into the database healthchecks below.
x-healthcheck-defaults: &healthcheck-defaults
  interval: 10s
  timeout: 5s
  retries: 5
# Shared restart policy and network attachment; merged into every service.
x-service-defaults: &service-defaults
  restart: unless-stopped
  networks:
    - app-network
# ==========================
# Services
# ==========================
services:
# --- UI layer ---------------------------------------------------
# Front end: Caddy container configured by ./frontend/Caddyfile, with
# ./frontend mounted read-only at /srv. Published on host port 8484.
ui:
  image: caddy:alpine
  <<: *service-defaults
  ports:
    - "8484:80"
  volumes:
    - ./frontend/Caddyfile:/etc/caddy/Caddyfile:ro
    - ./frontend:/srv:ro
# --- API layer --------------------------------------------------
# MarkItDown API, built from ./markitdown-service. Container port 8000 is
# published on host port 8282. Starts only after db-markitdown is healthy.
api-markitdown:
  build:
    context: ./markitdown-service
  <<: *service-defaults
  ports:
    - "8282:8000"
  env_file: .env
  environment:
    # Each value falls back to the default after ":-" when the variable is unset.
    DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-markitdown:5432/${POSTGRES_DB:-markitdown}
    # Empty by default.
    OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
    OLLAMA_MODEL: ${OLLAMA_MODEL:-llava}
  depends_on:
    db-markitdown:
      condition: service_healthy
# Docling API, built from ./docling-service. Container port 8000 is
# published on host port 8383. Starts only after db-docling is healthy.
api-docling:
  build:
    context: ./docling-service
  <<: *service-defaults
  ports:
    - "8383:8000"
  env_file: .env
  environment:
    # Database name is fixed to "docling"; POSTGRES_DB from .env is not used here.
    DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-docling:5432/docling
    # Empty by default.
    OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
    OLLAMA_MODEL: ${OLLAMA_MODEL:-llava}
  depends_on:
    db-docling:
      condition: service_healthy
# --- Database layer ---------------------------------------------
# PostgreSQL for the MarkItDown service. Data lives in the named volume
# db_markitdown_data; ./db/init.sql is mounted into the image's init directory.
db-markitdown:
  image: postgres:16-alpine
  <<: *service-defaults
  env_file: .env
  environment:
    POSTGRES_DB: ${POSTGRES_DB:-markitdown}
    POSTGRES_USER: ${POSTGRES_USER:-admin}
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
  volumes:
    - db_markitdown_data:/var/lib/postgresql/data
    - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
  ports:
    # Bound to loopback only: the credentials default to admin/admin, so the
    # port must not be reachable from other machines. Local tools can still
    # connect on localhost:5432; other containers use the app-network.
    - "127.0.0.1:5432:5432"
  healthcheck:
    <<: *healthcheck-defaults
    test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d ${POSTGRES_DB:-markitdown}"]
# PostgreSQL for the Docling service. No host port is published, so it is
# reachable only over app-network. The database name is fixed to "docling".
db-docling:
  image: postgres:16-alpine
  <<: *service-defaults
  env_file: .env
  environment:
    POSTGRES_DB: docling
    POSTGRES_USER: ${POSTGRES_USER:-admin}
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
  volumes:
    - db_docling_data:/var/lib/postgresql/data
    - ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro
  healthcheck:
    <<: *healthcheck-defaults
    test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
# --- Admin layer ------------------------------------------------
# Adminer web UI, published on host port 5050. The default server is
# db-markitdown; the service starts only after that database is healthy.
db-admin:
  image: adminer:4.8.1
  <<: *service-defaults
  ports:
    - "5050:8080"
  environment:
    ADMINER_DEFAULT_SERVER: db-markitdown
  depends_on:
    db-markitdown:
      condition: service_healthy
# ==========================
# Infrastructure
# ==========================
# Single bridge network shared by every service (via x-service-defaults).
networks:
  app-network:
    driver: bridge
# Named volumes that hold the two PostgreSQL data directories.
volumes:
  db_markitdown_data:
  db_docling_data:

View File

@ -0,0 +1,4 @@
__pycache__
*.pyc
*.pyo
.env

View File

@ -0,0 +1,21 @@
# Image for the service whose entry point is main:app (served by uvicorn on
# port 8000). Installs OpenGL/GLib/OpenMP runtime libraries, poppler-utils and
# the tesseract-ocr CLI before the Python dependencies.
FROM python:3.11-slim

# Skip .pyc generation and keep stdout/stderr unbuffered so logs appear promptly.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

# --no-install-recommends keeps optional "recommended" packages out of the
# image; only the packages listed here and their hard dependencies are installed.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 libglib2.0-0 libgomp1 \
        poppler-utils tesseract-ocr \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Copy and install requirements first so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .
COPY app/ ./app/

EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@ -0,0 +1,122 @@
from fastapi import APIRouter, UploadFile, File, Depends, Query
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord
from app.services import DoclingService as docling_service
from app.database import get_db
from pydantic import BaseModel
# Router that collects every endpoint declared in this module.
router = APIRouter()


class SettingsRequest(BaseModel):
    """Request body accepted by ``POST /settings``."""

    # Base URL of the LLM endpoint; a missing or empty value disables the LLM.
    ollama_base_url: str | None = None
    # Model name passed to the service's LLM initialisation.
    ollama_model: str = "llava"
    # Optional model name stored as the service's CLEANUP_MODEL.
    cleanup_model: str | None = None
class SettingsResponse(BaseModel):
    """Snapshot of the service's LLM configuration returned by the /settings endpoints."""

    # Mirrors the service's LLM_ACTIVE flag.
    llm_enabled: bool
    # Mirrors the service's OLLAMA_BASE_URL (None when no endpoint is configured).
    ollama_base_url: str | None
    # Mirrors the service's OLLAMA_MODEL.
    ollama_model: str
    # Service's CLEANUP_MODEL, with an empty value reported as None.
    cleanup_model: str | None = None
    # Service's DEFAULT_ENRICH_PROMPT.
    default_prompt: str | None = None
# Input extensions advertised by /health. Derived from the service's own
# allow-list so the advertised formats cannot drift from what /convert accepts.
SUPPORTED_INPUT_FORMATS = sorted(docling_service.ALLOWED_EXTENSIONS)

# Output formats advertised by /health, in display order.
SUPPORTED_OUTPUT_FORMATS = ["markdown", "json", "html", "text"]
@router.get("/settings", response_model=SettingsResponse)
def get_settings():
    """Return the LLM configuration currently held by the Docling service module."""
    svc = docling_service
    # An empty cleanup model is reported as null rather than "".
    cleanup = svc.CLEANUP_MODEL or None
    snapshot = SettingsResponse(
        llm_enabled=svc.LLM_ACTIVE,
        ollama_base_url=svc.OLLAMA_BASE_URL,
        ollama_model=svc.OLLAMA_MODEL,
        cleanup_model=cleanup,
        default_prompt=svc.DEFAULT_ENRICH_PROMPT,
    )
    return snapshot
@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
    """Re-initialise the service's LLM client at runtime and return the new state.

    Args:
        req: Desired base URL, model and optional cleanup model.

    Returns:
        The service's LLM configuration after the update has been applied.
    """
    # Treat whitespace-only values as "not provided".
    base_url = (req.ollama_base_url or "").strip() or None
    # A blank model name would make every later LLM call fail, so keep the
    # model that is currently configured instead.
    model = (req.ollama_model or "").strip() or docling_service.OLLAMA_MODEL
    docling_service._init_llm(base_url, model)
    docling_service.CLEANUP_MODEL = (req.cleanup_model or "").strip()
    return SettingsResponse(
        llm_enabled=docling_service.LLM_ACTIVE,
        ollama_base_url=docling_service.OLLAMA_BASE_URL,
        ollama_model=docling_service.OLLAMA_MODEL,
        cleanup_model=docling_service.CLEANUP_MODEL or None,
        default_prompt=docling_service.DEFAULT_ENRICH_PROMPT,
    )
@router.get("/health", response_model=HealthResponse)
def health():
    """Report service status, supported formats, LLM state and OCR availability."""
    # Read the flags from the service module on every call so runtime
    # changes made through /settings are reflected.
    llm_on = docling_service.LLM_ACTIVE
    active_model = docling_service.OLLAMA_MODEL if llm_on else None
    engine_name = "tesseract" if _ocr_available() else "none"
    return HealthResponse(
        status="ok",
        supported_formats=SUPPORTED_INPUT_FORMATS,
        output_formats=SUPPORTED_OUTPUT_FORMATS,
        llm_enabled=llm_on,
        llm_model=active_model,
        ocr_engine=engine_name,
    )
def _ocr_available() -> bool:
import shutil
return shutil.which("tesseract") is not None
@router.post("/convert", response_model=ConvertResponse)
async def convert(
    file: UploadFile = File(...),
    output_format: str = Query(default="markdown", description="Output format: markdown | json | html | text"),
    use_llm: bool = Query(default=True, description="Run LLM enrichment on extracted text"),
    llm_prompt: str | None = Query(default=None, description="Custom system prompt for LLM enrichment"),
    db: AsyncSession = Depends(get_db),
):
    """Convert an uploaded file through the Docling service and return the stored result."""
    saved = await docling_service.convert_file(
        file,
        db,
        output_format,
        use_llm=use_llm,
        llm_prompt=llm_prompt,
    )
    # Copy the persisted fields into the response model.
    payload = {
        "id": saved.id,
        "filename": saved.filename,
        "output_format": saved.output_format,
        "content": saved.content,
        "page_count": saved.page_count,
        "llm_enabled": saved.llm_enabled,
    }
    return ConvertResponse(**payload)
@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
    """Return one stored conversion by id (the service raises 404 when it is missing)."""
    found = await docling_service.get_conversion(conversion_id, db)
    exposed = ("id", "filename", "output_format", "content", "page_count", "llm_enabled")
    return ConvertResponse(**{name: getattr(found, name) for name in exposed})
@router.get("/history", response_model=list[ConversionRecord])
async def history(
    limit: int = Query(default=20, ge=0, description="Maximum number of records to return"),
    db: AsyncSession = Depends(get_db),
):
    """List the most recent conversions, newest first, without their content.

    Args:
        limit: Maximum number of rows to return. Negative values are rejected
            at validation time instead of being passed to the database.
        db: Request-scoped async database session.

    Returns:
        One ``ConversionRecord`` per stored conversion.
    """
    records = await docling_service.get_history(db, limit)
    return [
        ConversionRecord(
            id=r.id,
            filename=r.filename,
            file_type=r.file_type,
            output_format=r.output_format,
            page_count=r.page_count,
            # Previously omitted, so the field always fell back to its False default.
            llm_enabled=bool(r.llm_enabled),
            created_at=str(r.created_at),
        )
        for r in records
    ]
@router.delete("/conversions/{conversion_id}")
async def delete_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
    """Delete one stored conversion and relay the service's confirmation payload."""
    outcome = await docling_service.delete_conversion(conversion_id, db)
    return outcome

View File

@ -0,0 +1,20 @@
import os
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker, DeclarativeBase
# Connection string from the environment; the literal fallback applies only
# when DATABASE_URL is unset.
# NOTE(review): the fallback's host/credentials ("db", "secret") differ from
# the values used in docker-compose.yml — confirm it is still wanted.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql+asyncpg://admin:secret@db:5432/docling")
# Rewrite a plain "postgresql://" scheme to the asyncpg driver scheme; a URL
# that already names the asyncpg driver does not contain that prefix and is
# left unchanged.
DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+asyncpg://")

# Async engine and session factory shared by the whole application.
engine = create_async_engine(DATABASE_URL, echo=False)
# expire_on_commit=False: loaded attributes are not expired when a session commits.
AsyncSessionLocal = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
class Base(DeclarativeBase):
    """Declarative base class that the ORM models inherit from."""

    pass
async def get_db():
    """Yield an ``AsyncSession`` from ``AsyncSessionLocal``.

    The session is opened with ``async with``, so its context manager exits
    once the consumer of this generator is finished.
    """
    async with AsyncSessionLocal() as session:
        yield session

View File

@ -0,0 +1,51 @@
from typing import Optional
from pydantic import BaseModel
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, func
from app.database import Base
class Conversion(Base):
    """ORM model mapped to the ``conversions`` table."""

    __tablename__ = "conversions"

    id = Column(Integer, primary_key=True, index=True)
    # File name of the converted upload (required).
    filename = Column(String(255), nullable=False)
    # Short file-type label, up to 50 characters.
    file_type = Column(String(50))
    # Name of the requested output format; defaults to "markdown".
    output_format = Column(String(20), default="markdown")
    # Converted document body.
    content = Column(Text)
    # Page count; nullable.
    page_count = Column(Integer, nullable=True)
    # Whether LLM enrichment was applied; defaults to False.
    llm_enabled = Column(Boolean, default=False)
    # Set by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())
class ConvertResponse(BaseModel):
    """API response describing one conversion, including its full content."""

    id: int
    filename: str
    output_format: str
    content: str
    page_count: Optional[int] = None
    llm_enabled: bool = False

    class Config:
        # Pydantic setting that permits building the model from objects that
        # expose these fields as attributes.
        from_attributes = True
class ConversionRecord(BaseModel):
    """API model for a conversion summary; carries no ``content`` field."""

    id: int
    filename: str
    file_type: Optional[str] = None
    output_format: str
    page_count: Optional[int] = None
    llm_enabled: bool = False
    # Creation timestamp, carried as a string.
    created_at: str

    class Config:
        # Pydantic setting that permits building the model from objects that
        # expose these fields as attributes.
        from_attributes = True
class HealthResponse(BaseModel):
    """API model for a health/status report."""

    status: str
    # Accepted input file extensions.
    supported_formats: list[str]
    # Available output format names.
    output_formats: list[str]
    llm_enabled: bool = False
    # Optional model name; defaults to None.
    llm_model: Optional[str] = None
    # OCR engine name; defaults to "none".
    ocr_engine: str = "none"

View File

@ -0,0 +1,197 @@
import os
import re
import json
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai

# LLM configuration read from the environment at import time. These module
# globals are reassigned later by _init_llm().
# An empty OLLAMA_BASE_URL is normalised to None.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llava")
# Optional separate model name; empty string when unset.
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
def _build_converter() -> DocumentConverter:
    """Build the module-wide ``DocumentConverter`` with PDF pipeline options.

    OCR through the Tesseract CLI is enabled only when a ``tesseract``
    executable is actually present on PATH; otherwise the converter is built
    with OCR disabled and a warning is logged.

    Returns:
        A ``DocumentConverter`` whose PDF format option carries the chosen
        pipeline options.
    """
    import shutil

    try:
        # Check for the binary itself: importing the options class succeeds
        # even when the CLI is not installed.
        if shutil.which("tesseract") is None:
            raise RuntimeError("tesseract executable not found on PATH")
        # Same module that already provides PdfPipelineOptions.
        from docling.datamodel.pipeline_options import TesseractCliOcrOptions

        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractCliOcrOptions())
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as e:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
        pdf_opts = PdfPipelineOptions(do_ocr=False)
    return DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
    )
# Single converter instance, built once at import time and reused by convert_file().
converter = _build_converter()

# LLM runtime state; both values are managed by _init_llm().
_llm_client = None
LLM_ACTIVE = False
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client.

    Args:
        base_url: Base URL of the OpenAI-compatible endpoint. A falsy value
            disables the LLM.
        model: Model name to record as ``OLLAMA_MODEL``.

    Returns:
        True when a client was created and the LLM is active, False otherwise.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client
    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client = None, model, False, None
        return False
    try:
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
    except Exception as e:
        logger.warning("Docling: LLM init failed (%s)", e)
        # Drop any previously created client as well: convert_file() and
        # _llm_enrich() gate on _llm_client, so leaving the old client in
        # place would keep enrichment running while LLM_ACTIVE says it is off.
        _llm_client = None
        LLM_ACTIVE = False
        return False
    OLLAMA_BASE_URL = base_url
    OLLAMA_MODEL = model
    _llm_client = client
    LLM_ACTIVE = True
    logger.info("Docling: LLM enabled via %s (model=%s)", base_url, model)
    return True
# Initialise the LLM once at import time from the environment-derived values.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)

# System prompt used by _llm_enrich() when the caller supplies none.
DEFAULT_ENRICH_PROMPT = (
    "You are a document cleaning assistant. "
    "Fix OCR errors, normalise whitespace, and improve the Markdown structure. "
    "Return ONLY the raw Markdown text — no code fences, no commentary, no explanation."
)
def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
    """Ask the configured LLM to clean up ``markdown``.

    Returns the input unchanged when no client is configured, when the input
    is blank, when the model yields nothing usable, or when the call fails.
    ``system_prompt`` replaces ``DEFAULT_ENRICH_PROMPT`` when given.
    """
    if not _llm_client:
        return markdown
    if not markdown.strip():
        return markdown

    conversation = [
        {"role": "system", "content": system_prompt or DEFAULT_ENRICH_PROMPT},
        {"role": "user", "content": markdown},
    ]
    try:
        completion = _llm_client.chat.completions.create(
            model=OLLAMA_MODEL,
            messages=conversation,
            temperature=0,
        )
        cleaned = (completion.choices[0].message.content or markdown).strip()
        # llava tends to wrap its answer in code fences despite the prompt;
        # remove a leading and a trailing fence.
        cleaned = re.sub(r"^```(?:markdown)?\s*\n?", "", cleaned)
        cleaned = re.sub(r"\n?```\s*$", "", cleaned.strip())
        cleaned = cleaned.strip()
        return cleaned if cleaned else markdown
    except Exception as e:
        logger.warning("Docling: LLM enrichment failed (%s) — returning raw output", e)
        return markdown
# -----------------------------------------------------------------
# Lower-case file extensions (without the dot) that convert_file() accepts.
ALLOWED_EXTENSIONS = {
    "pdf", "docx", "xlsx", "pptx",
    "html", "htm", "jpg", "jpeg", "png",
    "tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc"
}
# Output format names that convert_file() accepts.
OUTPUT_FORMATS = {"markdown", "json", "html", "text"}
def _allowed_file(filename: str | None) -> bool:
    """Return True when ``filename`` ends in an extension from ``ALLOWED_EXTENSIONS``.

    A missing or empty filename, or one without a dot, is rejected instead of
    raising.
    """
    if not filename or "." not in filename:
        return False
    return filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded document, optionally enrich it with the LLM, and persist it.

    Args:
        file: The uploaded file; its extension must be in ``ALLOWED_EXTENSIONS``.
        db: Async session used to store the resulting ``Conversion`` row.
        output_format: One of ``OUTPUT_FORMATS``.
        use_llm: When True and an LLM client is configured, markdown/text
            output is passed through ``_llm_enrich``.
        llm_prompt: Optional system prompt overriding the default one.

    Returns:
        The persisted ``Conversion`` record.

    Raises:
        HTTPException: 422 for a missing/unsupported file name or an
            unsupported output format; 500 when conversion or persistence fails.

    NOTE(review): ``converter.convert`` and the LLM call are invoked
    synchronously inside this coroutine; if they block, other requests served
    by the same event loop wait. Consider off-loading them to a worker thread.
    """
    # The upload may arrive without a filename; reject it up front.
    if not file.filename or not _allowed_file(file.filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(
            status_code=422,
            detail=f"Output format not supported. Supported: {', '.join(sorted(OUTPUT_FORMATS))}"
        )

    suffix = os.path.splitext(file.filename)[1]
    file_type = suffix.lstrip(".").lower()

    tmp_path = None
    try:
        # Created inside the try block so the temp file is removed even when
        # reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        result = converter.convert(tmp_path)
        doc = result.document
        page_count = len(doc.pages) if hasattr(doc, "pages") and doc.pages else None

        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        elif output_format == "text":
            content = doc.export_to_markdown()
            # Strip heading markers only at the start of a line, so '#'
            # characters inside ordinary text (e.g. "C# ") are preserved.
            content = re.sub(r"^#{1,6}[ \t]+", "", content, flags=re.MULTILINE)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)

        # LLM enrichment — only for markdown / text output, and only if requested.
        # llm_used records that enrichment was attempted; _llm_enrich falls
        # back to the raw text on failure without signalling it.
        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        record = Conversion(
            filename=file.filename,
            file_type=file_type,
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        # Keep the traceback in the service log; the client only sees str(e).
        logger.exception("Docling: conversion failed for %s", file.filename)
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
async def get_conversion(conversion_id: int, db: AsyncSession) -> Conversion:
    """Load one ``Conversion`` by primary key, raising 404 when it does not exist."""
    by_id = select(Conversion).where(Conversion.id == conversion_id)
    found = (await db.execute(by_id)).scalar_one_or_none()
    if found:
        return found
    raise HTTPException(status_code=404, detail="Conversion not found")
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to ``limit`` conversions ordered by ``created_at``, newest first."""
    newest_first = select(Conversion).order_by(Conversion.created_at.desc())
    rows = await db.execute(newest_first.limit(limit))
    return rows.scalars().all()
async def delete_conversion(conversion_id: int, db: AsyncSession) -> dict:
    """Delete one ``Conversion`` by primary key and return a confirmation message.

    Raises a 404 ``HTTPException`` when no row has that id.
    """
    # get_conversion runs the same primary-key lookup and raises the same 404.
    doomed = await get_conversion(conversion_id, db)
    await db.delete(doomed)
    await db.commit()
    return {"message": f"Conversion {conversion_id} deleted"}

29
docling-service/main.py Normal file
View File

@ -0,0 +1,29 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.controllers.ConvertController import router
from app.database import engine, Base
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: create the ORM tables before the app starts serving.

    ``Base.metadata.create_all`` runs inside an engine transaction before
    control is yielded to the application. Nothing is done after the yield.
    """
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    yield
# FastAPI application; table creation is wired in through the lifespan hook.
app = FastAPI(
    title="Docling API",
    version="1.0.0",
    description="Advanced document conversion service powered by Docling. Supports PDF, DOCX, PPTX, XLSX, HTML, images and more.",
    lifespan=lifespan,
)

# CORS is fully open: any origin, method and header is allowed.
# NOTE(review): the commit message describes a same-origin reverse-proxy setup
# ("no CORS"); confirm whether the wildcard policy is still needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register the conversion endpoints.
app.include_router(router)

View File

@ -0,0 +1,7 @@
docling
fastapi
uvicorn
python-multipart
asyncpg
sqlalchemy[asyncio]
openai

BIN
example/1.pdf Normal file

Binary file not shown.

19
frontend/Caddyfile Normal file
View File

@ -0,0 +1,19 @@
# Single site on port 80: proxies the two API prefixes to their backends and
# serves everything else from /srv.
:80 {
    encode gzip

    # Reverse proxy strips prefix and forwards to backend
    handle_path /api/markitdown/* {
        reverse_proxy api-markitdown:8000
    }
    handle_path /api/docling/* {
        reverse_proxy api-docling:8000
    }

    # Static files with SPA fallback
    handle {
        root * /srv
        file_server
        # Serve the requested path when it exists, otherwise /index.html.
        try_files {path} /index.html
    }
}

822
frontend/index.html Normal file
View File

@ -0,0 +1,822 @@
<!DOCTYPE html>
<html lang="vi">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>MarkItDown vs Docling — LLM Input Processing</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css" rel="stylesheet" />
<style>
body {
background: #f8f9fa;
}
/* Upload zone */
#UploadZone {
border: 2px dashed #dee2e6;
border-radius: .5rem;
padding: 3rem 1.5rem;
text-align: center;
cursor: pointer;
transition: border-color .2s, background .2s;
}
#UploadZone:hover,
#UploadZone.dragover {
border-color: #0d6efd;
background: #f0f6ff;
}
#UploadZone input[type="file"] {
display: none;
}
/* Result pane */
.ResultPre {
max-height: 460px;
overflow: auto;
white-space: pre-wrap;
word-break: break-word;
font-size: .78rem;
background: #f8f9fa;
}
.PreviewPane {
max-height: 460px;
overflow: auto;
font-size: .85rem;
padding: 1rem;
line-height: 1.7;
}
.PreviewPane table {
border-collapse: collapse;
width: 100%;
margin: .5rem 0;
}
.PreviewPane th,
.PreviewPane td {
border: 1px solid #dee2e6;
padding: .3rem .6rem;
font-size: .8rem;
}
.PreviewPane th {
background: #f1f3f5;
}
.PreviewPane code {
background: #f1f3f5;
padding: 1px 4px;
border-radius: 3px;
font-size: .85em;
}
.PreviewPane blockquote {
border-left: 3px solid #dee2e6;
padding-left: .75rem;
color: #6c757d;
}
</style>
</head>
<body>
<nav class="navbar navbar-light bg-white border-bottom px-4 py-2">
<span class="navbar-brand fw-bold mb-0">MarkItDown <span class="text-muted fw-normal">vs</span> Docling</span>
<span class="badge bg-primary-subtle text-primary">Demo — LLM Input Processing</span>
</nav>
<div class="container-xl py-4">
<!-- Upload card -->
<div class="card shadow-sm mb-4">
<div class="card-body">
<h6 class="card-title fw-semibold mb-3">Tải lên tài liệu để so sánh</h6>
<div id="UploadZone">
<input type="file" id="FileInput"
accept=".pdf,.docx,.xlsx,.pptx,.html,.htm,.csv,.txt,.jpg,.jpeg,.png,.tiff,.tif,.bmp,.md,.epub,.zip,.asciidoc,.adoc" />
<i class="bi bi-file-earmark-text fs-1 text-secondary"></i>
<p class="text-muted mt-2 mb-1">Kéo thả hoặc click để chọn file</p>
<div id="FileName" class="fw-semibold text-primary small"></div>
</div>
<div class="d-flex flex-wrap gap-1 mt-2">
<span class="badge bg-secondary-subtle text-secondary">PDF</span>
<span class="badge bg-secondary-subtle text-secondary">DOCX</span>
<span class="badge bg-secondary-subtle text-secondary">XLSX</span>
<span class="badge bg-secondary-subtle text-secondary">PPTX</span>
<span class="badge bg-secondary-subtle text-secondary">HTML</span>
<span class="badge bg-secondary-subtle text-secondary">CSV</span>
<span class="badge bg-secondary-subtle text-secondary">TXT</span>
<span class="badge bg-secondary-subtle text-secondary">JPG/PNG</span>
<span class="badge bg-secondary-subtle text-secondary">EPUB</span>
<span class="badge bg-secondary-subtle text-secondary">TIFF</span>
<span class="badge bg-secondary-subtle text-secondary">ASCIIDoc</span>
</div>
<!-- Controls row -->
<div class="d-flex flex-wrap align-items-center gap-3 mt-3">
<div class="d-flex align-items-center gap-2">
<label class="form-label mb-0 small fw-medium" for="DoclingFormat">Docling format</label>
<select class="form-select form-select-sm" id="DoclingFormat" style="width:auto">
<option value="markdown">Markdown</option>
<option value="json">JSON</option>
<option value="html">HTML</option>
<option value="text">Plain Text</option>
</select>
</div>
<div class="form-check form-switch mb-0">
<input class="form-check-input" type="checkbox" id="LlmToggle" checked onchange="OnLlmToggle()" />
<label class="form-check-label small fw-medium" for="LlmToggle" id="LlmToggleLabel">LLM bật</label>
</div>
<button class="btn btn-primary btn-sm" id="ConvertBtn" disabled onclick="RunConversion()">
<i class="bi bi-play-fill me-1"></i>Chuyển đổi & So sánh
</button>
<button class="btn btn-outline-secondary btn-sm" onclick="ClearResults()">
<i class="bi bi-x-circle me-1"></i>Xoá
</button>
</div>
<!-- LLM Prompt panel -->
<div id="LlmPanel" class="mt-3 d-none">
<div class="row g-3">
<!-- Col 1: Custom prompt -->
<div class="col-md-6">
<div class="h-100 p-3 rounded border bg-primary-subtle">
<div class="d-flex justify-content-between align-items-center mb-2">
<span class="small fw-semibold text-primary"><i class="bi bi-pencil-square me-1"></i>Custom LLM
Prompt</span>
<button class="btn btn-link btn-sm p-0 text-primary text-decoration-underline"
onclick="ResetPrompt()">Xoá</button>
</div>
<textarea class="form-control form-control-sm" id="LlmPrompt" rows="6"
placeholder="Để trống = dùng Default prompt bên phải&#10;&#10;Nhập custom system prompt để override..."></textarea>
<div class="form-text mt-1 text-primary small">
<b>MarkItDown</b>: vision prompt &nbsp;|&nbsp; <b>Docling</b>: enrich prompt
</div>
</div>
</div>
<!-- Col 2: Default prompts with tabs -->
<div class="col-md-6">
<div class="h-100 p-3 rounded border bg-light">
<ul class="nav nav-tabs nav-sm mb-2" id="DefaultPromptTabs">
<li class="nav-item">
<a class="nav-link active py-1 px-2 small" href="#"
onclick="SwitchDefaultTab('Md',this);return false">
<span class="badge bg-primary rounded-circle p-1 me-1">&nbsp;</span>MarkItDown
</a>
</li>
<li class="nav-item">
<a class="nav-link py-1 px-2 small" href="#" onclick="SwitchDefaultTab('Dl',this);return false">
<span class="badge bg-success rounded-circle p-1 me-1">&nbsp;</span>Docling
</a>
</li>
</ul>
<div id="DefaultPrompt-Md">
<textarea class="form-control form-control-sm font-monospace" id="MdDefaultPrompt" rows="5" readonly
style="font-size:.7rem;resize:none;background:#fff"></textarea>
<button class="btn btn-outline-primary btn-sm mt-2 w-100" onclick="UsePrompt('Md')">
<i class="bi bi-arrow-left me-1"></i>Dùng prompt này
</button>
</div>
<div id="DefaultPrompt-Dl" class="d-none">
<textarea class="form-control form-control-sm font-monospace" id="DlDefaultPrompt" rows="5" readonly
style="font-size:.7rem;resize:none;background:#fff"></textarea>
<button class="btn btn-outline-success btn-sm mt-2 w-100" onclick="UsePrompt('Dl')">
<i class="bi bi-arrow-left me-1"></i>Dùng prompt này
</button>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Settings card -->
<div class="card shadow-sm mb-4">
<div class="card-header py-2">
<span class="fw-semibold small"><i class="bi bi-gear me-2"></i>Cài đặt LLM (Ollama)</span>
</div>
<div class="card-body" id="SettingsBody">
<div class="row g-3 align-items-end">
<div class="col-md-5">
<label class="form-label fw-medium small">Ollama Base URL</label>
<div class="input-group input-group-sm">
<input type="url" class="form-control" id="SettingUrl" placeholder="https://your-ollama-server/v1" />
<button class="btn btn-outline-secondary" onclick="FetchModels()" title="Tải danh sách model">
<i class="bi bi-arrow-clockwise"></i>
</button>
</div>
<div class="form-text">OpenAI-compatible endpoint</div>
</div>
<div class="col-md-3">
<label class="form-label fw-medium small">Model mặc định</label>
<select class="form-select form-select-sm" id="SettingModel">
<option value="">-- chọn model --</option>
</select>
<div class="form-text">Convert + Docling enrich</div>
</div>
<div class="col-md-4">
<label class="form-label fw-medium small">Model Format</label>
<select class="form-select form-select-sm" id="CleanupModel">
<option value="">-- dùng model trên --</option>
</select>
<div class="form-text">Để trống = dùng model mặc định</div>
</div>
</div>
<div class="row g-2 mt-3 text-center">
<div class="col-6">
<div class="border rounded py-2 px-3">
<div class="small text-muted mb-1"><span
class="badge bg-primary rounded-circle p-1 me-1">&nbsp;</span>MarkItDown LLM</div>
<div id="MdLlmStatus"><span class="spinner-border spinner-border-sm"></span></div>
</div>
</div>
<div class="col-6">
<div class="border rounded py-2 px-3">
<div class="small text-muted mb-1"><span
class="badge bg-success rounded-circle p-1 me-1">&nbsp;</span>Docling LLM</div>
<div id="DlLlmStatus"><span class="spinner-border spinner-border-sm"></span></div>
</div>
</div>
</div>
<div class="mt-3 text-end">
<button class="btn btn-primary btn-sm px-4" id="SaveSettingsBtn" onclick="SaveSettings()">
<span id="SaveSpinner" class="spinner-border spinner-border-sm d-none"></span>
<i class="bi bi-check-lg me-1"></i>Lưu & Áp dụng
</button>
</div>
</div>
</div>
<!-- Progress row -->
<div class="row g-3 mb-4 d-none" id="ProgressRow">
<div class="col-md-6">
<div class="card h-100">
<div class="card-body">
<h6 class="card-title d-flex align-items-center gap-2">
<span class="badge bg-primary rounded-circle p-1">&nbsp;</span>MarkItDown
</h6>
<div class="d-flex align-items-center gap-2" id="MdStatusLine">
<div class="spinner-border spinner-border-sm text-primary d-none" id="MdSpinner"></div>
<small id="MdStatus" class="text-muted">Đang chờ...</small>
</div>
<div class="row g-2 mt-2 d-none" id="MdMetrics">
<div class="col-4">
<div class="border rounded text-center py-2" id="MdTimeCard">
<div class="fw-bold" id="MdTimeVal">-</div>
<div class="text-muted" style="font-size:.7rem">ms</div>
</div>
</div>
<div class="col-4">
<div class="border rounded text-center py-2">
<div class="fw-bold" id="MdCharsVal">-</div>
<div class="text-muted" style="font-size:.7rem">ký tự</div>
</div>
</div>
<div class="col-4">
<div class="border rounded text-center py-2">
<div class="fw-bold" id="MdLinesVal">-</div>
<div class="text-muted" style="font-size:.7rem">dòng</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card h-100">
<div class="card-body">
<h6 class="card-title d-flex align-items-center gap-2">
<span class="badge bg-success rounded-circle p-1">&nbsp;</span>Docling
</h6>
<div class="d-flex align-items-center gap-2" id="DlStatusLine">
<div class="spinner-border spinner-border-sm text-success d-none" id="DlSpinner"></div>
<small id="DlStatus" class="text-muted">Đang chờ...</small>
</div>
<div class="row g-2 mt-2 d-none" id="DlMetrics">
<div class="col-3">
<div class="border rounded text-center py-2" id="DlTimeCard">
<div class="fw-bold" id="DlTimeVal">-</div>
<div class="text-muted" style="font-size:.7rem">ms</div>
</div>
</div>
<div class="col-3">
<div class="border rounded text-center py-2">
<div class="fw-bold" id="DlCharsVal">-</div>
<div class="text-muted" style="font-size:.7rem">ký tự</div>
</div>
</div>
<div class="col-3">
<div class="border rounded text-center py-2">
<div class="fw-bold" id="DlLinesVal">-</div>
<div class="text-muted" style="font-size:.7rem">dòng</div>
</div>
</div>
<div class="col-3 d-none" id="DlPagesCard">
<div class="border rounded text-center py-2">
<div class="fw-bold" id="DlPagesVal">-</div>
<div class="text-muted" style="font-size:.7rem">trang</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Compare banner -->
<div class="card shadow-sm mb-4 d-none" id="CompareBanner">
<div class="card-body">
<h6 class="card-title fw-semibold mb-3"><i class="bi bi-bar-chart-line me-1"></i>Tóm tắt so sánh</h6>
<table class="table table-sm table-bordered mb-0">
<thead class="table-light">
<tr>
<th>Tiêu chí</th>
<th>MarkItDown</th>
<th>Docling</th>
<th class="text-muted">Ghi chú</th>
</tr>
</thead>
<tbody id="CompareRows"></tbody>
</table>
</div>
</div>
<!-- Results -->
<div class="row g-3 mb-4">
<div class="col-md-6">
<div class="card shadow-sm h-100" id="MdResultCard">
<div class="card-header d-flex justify-content-between align-items-center py-2">
<span class="fw-semibold small d-flex align-items-center gap-2">
<span class="badge bg-primary rounded-circle p-1">&nbsp;</span>MarkItDown
</span>
<div class="d-flex align-items-center gap-2">
<button class="btn btn-outline-warning btn-sm py-0 px-2 d-none" id="CleanupBtn"
onclick="CleanupMarkItDown()" title="Dùng LLM tái cấu trúc output">
<span id="CleanupSpinner" class="spinner-border spinner-border-sm d-none"></span>
✨ Làm đẹp
</button>
<button class="btn btn-outline-secondary btn-sm py-0 px-2 d-none" id="MdDownloadBtn"
onclick="DownloadMd('Md')" title="Tải xuống .md">
<i class="bi bi-download"></i> .md
</button>
<ul class="nav nav-tabs card-header-tabs border-0" id="MdTabs">
<li class="nav-item"><a class="nav-link active py-1 px-2 small" href="#"
onclick="SwitchTab('Md','Raw',this);return false">Raw</a></li>
<li class="nav-item"><a class="nav-link py-1 px-2 small" href="#"
onclick="SwitchTab('Md','Preview',this);return false">Preview</a></li>
</ul>
</div>
</div>
<div class="card-body p-0" id="MdBody">
<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để
xem kết quả</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card shadow-sm h-100" id="DlResultCard">
<div class="card-header d-flex justify-content-between align-items-center py-2">
<span class="fw-semibold small d-flex align-items-center gap-2">
<span class="badge bg-success rounded-circle p-1">&nbsp;</span>Docling
</span>
<div class="d-flex align-items-center gap-2">
<button class="btn btn-outline-secondary btn-sm py-0 px-2 d-none" id="DlDownloadBtn"
onclick="DownloadMd('Dl')" title="Tải xuống .md">
<i class="bi bi-download"></i> .md
</button>
<ul class="nav nav-tabs card-header-tabs border-0" id="DlTabs">
<li class="nav-item"><a class="nav-link active py-1 px-2 small" href="#"
onclick="SwitchTab('Dl','Raw',this);return false">Raw</a></li>
<li class="nav-item"><a class="nav-link py-1 px-2 small" href="#"
onclick="SwitchTab('Dl','Preview',this);return false">Preview</a></li>
</ul>
</div>
</div>
<div class="card-body p-0" id="DlBody">
<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để
xem kết quả</div>
</div>
</div>
</div>
</div>
<!-- History -->
<h6 class="fw-semibold mb-3">Lịch sử chuyển đổi gần đây</h6>
<div class="row g-3">
<div class="col-md-6">
<div class="card shadow-sm">
<div class="card-header py-2 small fw-semibold d-flex align-items-center gap-2">
<span class="badge bg-primary rounded-circle p-1">&nbsp;</span>MarkItDown
</div>
<div id="MdHistory">
<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card shadow-sm">
<div class="card-header py-2 small fw-semibold d-flex align-items-center gap-2">
<span class="badge bg-success rounded-circle p-1">&nbsp;</span>Docling
</div>
<div id="DlHistory">
<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>
</div>
</div>
</div>
</div>
</div><!-- /container -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/marked@13/marked.min.js"></script>
<script>
// Base paths of the two conversion APIs (same origin as this page).
const MarkItDownUrl = '/api/markitdown',
  DoclingUrl = '/api/docling';
// Currently selected upload and the latest output text of each converter.
let CurrentFile = null,
  MdContent = '',
  DlContent = '';
// ── File input ────────────────────────────────────────────────
const UploadZone = document.getElementById('UploadZone');
const FileInput = document.getElementById('FileInput');
// Clicking anywhere on the drop zone opens the native file picker.
UploadZone.addEventListener('click', function () {
  FileInput.click();
});
// Highlight the zone while a drag hovers over it; preventDefault allows dropping.
UploadZone.addEventListener('dragover', function (Ev) {
  Ev.preventDefault();
  UploadZone.classList.add('dragover');
});
UploadZone.addEventListener('dragleave', function () {
  UploadZone.classList.remove('dragover');
});
// Only the first dropped file is used.
UploadZone.addEventListener('drop', function (Ev) {
  Ev.preventDefault();
  UploadZone.classList.remove('dragover');
  const Dropped = Ev.dataTransfer.files[0];
  if (Dropped) SelectFile(Dropped);
});
FileInput.addEventListener('change', function () {
  const Picked = FileInput.files[0];
  if (Picked) SelectFile(Picked);
});
/**
 * Remember the chosen file, show its name and size, and enable the convert button.
 * @param {File} File_ file picked through the input or dropped on the zone
 */
function SelectFile(File_) {
  CurrentFile = File_;
  const Label = `${File_.name} (${FormatBytes(File_.size)})`;
  document.getElementById('FileName').textContent = Label;
  document.getElementById('ConvertBtn').disabled = false;
}
/**
 * Format a byte count as "N B", "N.N KB" or "N.N MB" (1024-based units).
 * @param {number} Bytes size in bytes
 * @returns {string} human-readable size
 */
function FormatBytes(Bytes) {
  const Kib = 1024;
  const Mib = Kib * Kib;
  return Bytes < Kib
    ? Bytes + ' B'
    : Bytes < Mib
      ? (Bytes / Kib).toFixed(1) + ' KB'
      : (Bytes / Mib).toFixed(1) + ' MB';
}
// ── LLM toggle ────────────────────────────────────────────────
/**
 * Sync the UI with the LLM switch: update its label, show or hide the LLM
 * panel, and persist the choice under the `llm_enabled` localStorage key.
 */
function OnLlmToggle() {
  const Enabled = document.getElementById('LlmToggle').checked;
  let LabelText;
  if (Enabled) {
    LabelText = 'LLM bật';
  } else {
    LabelText = 'LLM tắt';
  }
  document.getElementById('LlmToggleLabel').textContent = LabelText;
  document.getElementById('LlmPanel').classList.toggle('d-none', !Enabled);
  localStorage.setItem('llm_enabled', Enabled ? '1' : '0');
}
/** Empty the custom prompt box and drop its persisted copy. */
function ResetPrompt() {
  const PromptBox = document.getElementById('LlmPrompt');
  PromptBox.value = '';
  localStorage.removeItem('llm_prompt');
}
// Persist every edit of the custom prompt so it can be restored on the next load.
document.getElementById('LlmPrompt').addEventListener('input', function () {
  const Current = document.getElementById('LlmPrompt').value;
  localStorage.setItem('llm_prompt', Current);
});
// ── Conversion ────────────────────────────────────────────────
/**
 * Send the selected file to both converters in parallel, then render the
 * outcome of each, rebuild the comparison and reload the history lists.
 * Does nothing when no file is selected.
 */
async function RunConversion() {
  if (!CurrentFile) return;
  document.getElementById('ConvertBtn').disabled = true;
  SetDisplay('ProgressRow', true);
  SetDisplay('CompareBanner', false);
  // Placeholder shown in each result card while its request is running.
  const Busy = Tone =>
    '<div class="text-center text-muted py-5 small"><div class="spinner-border spinner-border-sm' + Tone + ' mb-2"></div><br>Đang xử lý...</div>';
  document.getElementById('MdBody').innerHTML = Busy('');
  document.getElementById('DlBody').innerHTML = Busy(' text-success');
  for (const Prefix of ['Md', 'Dl']) SetDisplay(Prefix + 'Metrics', false);
  for (const Prefix of ['Md', 'Dl']) SetSpinner(Prefix, true, 'Đang chuyển đổi...');
  const DoclingFmt = document.getElementById('DoclingFormat').value;
  const UseLlm = document.getElementById('LlmToggle').checked;
  const CustomPrompt = document.getElementById('LlmPrompt').value.trim();
  // allSettled: one converter failing must not discard the other's result.
  const Outcomes = await Promise.allSettled([
    ConvertMarkItDown(CurrentFile, UseLlm, CustomPrompt),
    ConvertDocling(CurrentFile, DoclingFmt, UseLlm, CustomPrompt),
  ]);
  document.getElementById('ConvertBtn').disabled = false;
  ShowCompare(Outcomes[0], Outcomes[1]);
  LoadHistory();
}
/**
 * Upload a file to the MarkItDown `/convert` endpoint.
 * @param {File} File_ file to convert
 * @param {boolean} UseLlm sent as the `use_llm` query parameter
 * @param {string} CustomPrompt sent as `llm_prompt` when non-empty
 * @returns {Promise<{Content: string, Ms: number, LlmEnabled: boolean}>}
 *   output text, round-trip time in ms, and the server's `llm_enabled` flag
 * @throws {Error} on a non-2xx response; the message is the response's
 *   `detail` when available, otherwise the HTTP status text/code
 */
async function ConvertMarkItDown(File_, UseLlm, CustomPrompt) {
  const Form = new FormData();
  Form.append('file', File_);
  const Params = new URLSearchParams({ use_llm: UseLlm });
  if (CustomPrompt) Params.set('llm_prompt', CustomPrompt);
  const T0 = performance.now();
  const Res = await fetch(MarkItDownUrl + '/convert?' + Params, { method: 'POST', body: Form });
  const Ms = Math.round(performance.now() - T0);
  if (!Res.ok) {
    // The error body is not guaranteed to be JSON (e.g. an HTML page from the
    // proxy), and `detail` may be a structured value rather than a string.
    let Detail;
    try { Detail = (await Res.json()).detail; } catch { Detail = undefined; }
    if (Detail && typeof Detail !== 'string') Detail = JSON.stringify(Detail);
    throw new Error(Detail || Res.statusText || 'HTTP ' + Res.status);
  }
  const Data = await Res.json();
  return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
}
/**
 * Upload a file to the Docling `/convert` endpoint.
 * @param {File} File_ file to convert
 * @param {string} Fmt sent as the `output_format` query parameter
 * @param {boolean} UseLlm sent as the `use_llm` query parameter
 * @param {string} CustomPrompt sent as `llm_prompt` when non-empty
 * @returns {Promise<{Content: string, Ms: number, Pages: number, LlmEnabled: boolean}>}
 *   output text, round-trip time in ms, reported page count, and the
 *   server's `llm_enabled` flag
 * @throws {Error} on a non-2xx response; the message is the response's
 *   `detail` when available, otherwise the HTTP status text/code
 */
async function ConvertDocling(File_, Fmt, UseLlm, CustomPrompt) {
  const Form = new FormData();
  Form.append('file', File_);
  const Params = new URLSearchParams({ output_format: Fmt, use_llm: UseLlm });
  if (CustomPrompt) Params.set('llm_prompt', CustomPrompt);
  const T0 = performance.now();
  const Res = await fetch(DoclingUrl + '/convert?' + Params, { method: 'POST', body: Form });
  const Ms = Math.round(performance.now() - T0);
  if (!Res.ok) {
    // The error body is not guaranteed to be JSON (e.g. an HTML page from the
    // proxy), and `detail` may be a structured value rather than a string.
    let Detail;
    try { Detail = (await Res.json()).detail; } catch { Detail = undefined; }
    if (Detail && typeof Detail !== 'string') Detail = JSON.stringify(Detail);
    throw new Error(Detail || Res.statusText || 'HTTP ' + Res.status);
  }
  const Data = await Res.json();
  return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
}
/**
 * Show or hide an element by toggling Bootstrap's `d-none` class.
 * @param {string} Id element id
 * @param {boolean} Visible true to show, false to hide
 */
function SetDisplay(Id, Visible) {
  const Hidden = !Visible;
  document.getElementById(Id).classList.toggle('d-none', Hidden);
}
/**
 * Update one converter's progress row.
 * @param {string} Prefix id prefix of the converter ('Md' or 'Dl')
 * @param {boolean} Active whether the `<Prefix>Spinner` element is shown
 * @param {string} Msg text written to the `<Prefix>Status` element
 */
function SetSpinner(Prefix, Active, Msg) {
  const SpinnerEl = document.getElementById(`${Prefix}Spinner`);
  SpinnerEl.classList.toggle('d-none', !Active);
  const StatusEl = document.getElementById(`${Prefix}Status`);
  StatusEl.textContent = Msg;
}
/**
 * Render the settled outcome of both conversions.
 * For each converter: on success store its text, show metrics and output;
 * on failure clear the stored text and show the error. Action buttons are
 * shown only for a converter that succeeded, so buttons left over from an
 * earlier successful run are hidden when the new run fails.
 * The comparison table is built only when both succeeded.
 * @param {PromiseSettledResult} MdResult settled result of ConvertMarkItDown
 * @param {PromiseSettledResult} DlResult settled result of ConvertDocling
 */
function ShowCompare(MdResult, DlResult) {
  const MdOk = MdResult.status === 'fulfilled';
  const DlOk = DlResult.status === 'fulfilled';
  if (MdOk) {
    MdContent = MdResult.value.Content;
    const LlmTag = MdResult.value.LlmEnabled ? ' 🤖' : '';
    SetSpinner('Md', false, '✅ Hoàn tất (' + MdResult.value.Ms + ' ms)' + LlmTag);
    ShowMetrics('Md', MdResult.value);
    RenderResult('Md', MdContent);
  } else {
    MdContent = '';
    SetSpinner('Md', false, '❌ ' + MdResult.reason.message);
    document.getElementById('MdBody').innerHTML = '<div class="alert alert-danger m-3 small">' + EscHtml(MdResult.reason.message) + '</div>';
  }
  // Cleanup/download only make sense while there is MarkItDown output.
  SetDisplay('CleanupBtn', MdOk);
  SetDisplay('MdDownloadBtn', MdOk);
  if (DlOk) {
    DlContent = DlResult.value.Content;
    const LlmTag = DlResult.value.LlmEnabled ? ' 🤖' : '';
    SetSpinner('Dl', false, '✅ Hoàn tất (' + DlResult.value.Ms + ' ms)' + LlmTag);
    ShowMetrics('Dl', DlResult.value);
    RenderResult('Dl', DlContent);
  } else {
    DlContent = '';
    SetSpinner('Dl', false, '❌ ' + DlResult.reason.message);
    document.getElementById('DlBody').innerHTML = '<div class="alert alert-danger m-3 small">' + EscHtml(DlResult.reason.message) + '</div>';
  }
  SetDisplay('DlDownloadBtn', DlOk);
  if (MdOk && DlOk) BuildCompareTable(MdResult.value, DlResult.value);
}
/**
 * Fill and reveal the metric cards (time, characters, lines) of a converter.
 * For Docling the page-count card is shown when a page count is present and
 * hidden otherwise, so a count from a previous run does not linger.
 * @param {string} Prefix 'Md' or 'Dl'
 * @param {{Content: string, Ms: number, Pages?: number}} Data conversion result
 */
function ShowMetrics(Prefix, Data) {
  const Lines = Data.Content.split('\n').length;
  const Chars = Data.Content.length;
  document.getElementById(Prefix + 'TimeVal').textContent = Data.Ms.toLocaleString();
  document.getElementById(Prefix + 'CharsVal').textContent = Chars.toLocaleString();
  document.getElementById(Prefix + 'LinesVal').textContent = Lines.toLocaleString();
  if (Prefix === 'Dl') {
    if (Data.Pages) document.getElementById('DlPagesVal').textContent = Data.Pages;
    document.getElementById('DlPagesCard').classList.toggle('d-none', !Data.Pages);
  }
  SetDisplay(Prefix + 'Metrics', true);
}
/**
 * Fill the comparison table (time, output length, line count) and show it.
 * The better value in each row gets a trophy and highlight; when both values
 * are equal neither side is marked as the winner.
 * @param {{Content: string, Ms: number}} Md MarkItDown result
 * @param {{Content: string, Ms: number}} Dl Docling result
 */
function BuildCompareTable(Md, Dl) {
  const MdLines = Md.Content.split('\n').length;
  const DlLines = Dl.Content.split('\n').length;
  // Score > 0: MarkItDown wins; < 0: Docling wins; 0: tie.
  const Rows = [
    { Label: 'Thời gian xử lý', MdVal: Md.Ms + ' ms', DlVal: Dl.Ms + ' ms', Score: Dl.Ms - Md.Ms, Note: 'Thấp hơn = nhanh hơn' },
    { Label: 'Độ dài output', MdVal: Md.Content.length + ' ký tự', DlVal: Dl.Content.length + ' ký tự', Score: Md.Content.length - Dl.Content.length, Note: 'Nhiều hơn = giữ được nội dung hơn' },
    { Label: 'Số dòng', MdVal: MdLines, DlVal: DlLines, Score: MdLines - DlLines, Note: '' },
  ];
  const Cell = (Wins, Val) =>
    `<td class="${Wins ? 'text-success fw-semibold' : 'text-muted'}">${Wins ? '🏆 ' : ''}${Val}</td>`;
  document.getElementById('CompareRows').innerHTML = Rows.map(R => `
<tr>
<td>${R.Label}</td>
${Cell(R.Score > 0, R.MdVal)}
${Cell(R.Score < 0, R.DlVal)}
<td class="text-muted small">${R.Note}</td>
</tr>`).join('');
  SetDisplay('CompareBanner', true);
}
// ── Render result ─────────────────────────────────────────────
/**
 * Render a conversion result into `<Prefix>Body` as two panes: the escaped
 * raw text (visible) and the Markdown preview (hidden until its tab is used).
 *
 * The text comes from an uploaded document and marked does not sanitize its
 * output, so the generated HTML is parsed into an inert <template> and
 * stripped of active content before it is attached to the page.
 * NOTE(review): this is a best-effort filter; a vetted sanitizer such as
 * DOMPurify would be preferable if a new dependency is acceptable.
 * @param {string} Prefix 'Md' or 'Dl'
 * @param {string} Content_ converter output text
 */
function RenderResult(Prefix, Content_) {
  const Body = document.getElementById(Prefix + 'Body');
  Body.innerHTML = `
<div id="${Prefix}-RawPane"><pre class="ResultPre p-3 m-0">${EscHtml(Content_)}</pre></div>
<div id="${Prefix}-PreviewPane" class="d-none"><div class="PreviewPane"></div></div>`;
  const Tpl = document.createElement('template');
  Tpl.innerHTML = marked.parse(Content_);
  // Elements that can execute script or load/submit external content.
  Tpl.content
    .querySelectorAll('script,iframe,object,embed,link,meta,base,form,style')
    .forEach(Node => Node.remove());
  const UrlAttrs = ['href', 'src', 'xlink:href', 'action', 'formaction'];
  Tpl.content.querySelectorAll('*').forEach(Node => {
    for (const Attr of [...Node.attributes]) {
      const Name = Attr.name.toLowerCase();
      // Browsers ignore whitespace/control characters inside a URL scheme.
      const Compact = Attr.value.replace(/[\u0000-\u0020]+/g, '').toLowerCase();
      const BadUrl = /^(?:javascript|vbscript):/.test(Compact) ||
        (/^data:/.test(Compact) && !/^data:image\//.test(Compact));
      if (Name.startsWith('on') || Name === 'srcdoc' || (UrlAttrs.includes(Name) && BadUrl)) {
        Node.removeAttribute(Attr.name);
      }
    }
  });
  Body.querySelector('.PreviewPane').appendChild(Tpl.content);
}
/**
 * Activate a result tab and show the matching pane of a converter card.
 * @param {string} Prefix 'Md' or 'Dl'
 * @param {string} Tab 'Raw' or 'Preview'
 * @param {Element} Link the clicked tab link, marked active
 */
function SwitchTab(Prefix, Tab, Link) {
  for (const TabLink of document.querySelectorAll(`#${Prefix}Tabs .nav-link`)) {
    TabLink.classList.remove('active');
  }
  Link.classList.add('active');
  const ShowRaw = Tab === 'Raw';
  const ShowPreview = Tab === 'Preview';
  document.getElementById(`${Prefix}-RawPane`).classList.toggle('d-none', !ShowRaw);
  document.getElementById(`${Prefix}-PreviewPane`).classList.toggle('d-none', !ShowPreview);
}
/**
 * Escape a value for insertion into HTML text or a quoted attribute.
 * Escapes & < > " and ' (the single quote keeps the result safe inside
 * single-quoted attributes as well).
 * @param {*} Str value to escape; converted with String()
 * @returns {string} escaped text
 */
function EscHtml(Str) {
  const Entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(Str).replace(/[&<>"']/g, Ch => Entities[Ch]);
}
/**
 * Ask the MarkItDown `/cleanup` endpoint to tidy the current MarkItDown
 * output, then re-render it and switch the card to its last tab.
 * Sends the custom prompt and the selected cleanup model when set.
 * Failures are reported with alert(); the button and spinner are always
 * restored. Does nothing when there is no MarkItDown output.
 */
async function CleanupMarkItDown() {
  if (!MdContent) return;
  const Btn = document.getElementById('CleanupBtn');
  const Spinner = document.getElementById('CleanupSpinner');
  Btn.disabled = true;
  Spinner.classList.remove('d-none');
  try {
    const CustomPrompt = document.getElementById('LlmPrompt').value.trim();
    const Res = await fetch(MarkItDownUrl + '/cleanup', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: MdContent, prompt: CustomPrompt || null, model: document.getElementById('CleanupModel').value.trim() || null }),
    });
    if (!Res.ok) {
      // The error body may not be JSON, and `detail` may not be a string.
      let Detail;
      try { Detail = (await Res.json()).detail; } catch { Detail = undefined; }
      if (Detail && typeof Detail !== 'string') Detail = JSON.stringify(Detail);
      throw new Error(Detail || Res.statusText || 'HTTP ' + Res.status);
    }
    const Data = await Res.json();
    MdContent = Data.text;
    RenderResult('Md', MdContent);
    // Pick the last tab link explicitly: a `.nav-link:last-child` selector
    // matches every link that is the last child of its own <li>, so
    // querySelector would return the first tab instead.
    const TabLinks = document.querySelectorAll('#MdTabs .nav-link');
    if (TabLinks.length) TabLinks[TabLinks.length - 1].click();
  } catch (E) {
    alert('Cleanup failed: ' + E.message);
  } finally {
    Btn.disabled = false;
    Spinner.classList.add('d-none');
  }
}
/**
 * Reset the page to its initial state: forget the file and both outputs,
 * hide progress/comparison/action buttons and restore the empty placeholders.
 */
function ClearResults() {
  MdContent = '';
  DlContent = '';
  CurrentFile = null;
  SetDisplay('ProgressRow', false);
  SetDisplay('CompareBanner', false);
  const Placeholder = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
  for (const BodyId of ['MdBody', 'DlBody']) {
    document.getElementById(BodyId).innerHTML = Placeholder;
  }
  document.getElementById('FileName').textContent = '';
  document.getElementById('ConvertBtn').disabled = true;
  for (const BtnId of ['CleanupBtn', 'MdDownloadBtn', 'DlDownloadBtn']) {
    document.getElementById(BtnId).classList.add('d-none');
  }
  // Clear the input so picking the same file again still fires `change`.
  FileInput.value = '';
}
// ── History ───────────────────────────────────────────────────
/**
 * Fetch the latest 8 history entries from each backend and render them.
 * The two requests are independent: a failure of one (network error or
 * non-2xx status) is logged and leaves that list untouched without
 * preventing the other list from updating.
 */
async function LoadHistory() {
  const Sources = [
    { Url: MarkItDownUrl, ElId: 'MdHistory' },
    { Url: DoclingUrl, ElId: 'DlHistory' },
  ];
  const Outcomes = await Promise.allSettled(Sources.map(async S => {
    const Res = await fetch(S.Url + '/history?limit=8');
    if (!Res.ok) throw new Error('HTTP ' + Res.status);
    return Res.json();
  }));
  Outcomes.forEach((Outcome, I) => {
    if (Outcome.status === 'fulfilled') {
      RenderHistory(Sources[I].ElId, Outcome.value);
    } else {
      console.warn('History load failed', Outcome.reason);
    }
  });
}
/**
 * Render a history list into the given element. Shows the "no history"
 * placeholder when the payload is empty or not a list. Every
 * server-provided value is HTML-escaped before insertion.
 * @param {string} ElId id of the container element
 * @param {Array<{filename: string, file_type?: string, llm_enabled?: boolean, created_at: string}>} Items
 */
function RenderHistory(ElId, Items) {
  const El = document.getElementById(ElId);
  if (!Array.isArray(Items) || !Items.length) {
    El.innerHTML = '<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>';
    return;
  }
  El.innerHTML = '<ul class="list-group list-group-flush">' +
    Items.map(Item => `
<li class="list-group-item d-flex justify-content-between align-items-center py-2 px-3">
<span class="small fw-medium text-truncate me-2" style="max-width:60%">${EscHtml(Item.filename)}</span>
<span class="d-flex gap-1 align-items-center flex-shrink-0">
${Item.file_type ? `<span class="badge bg-secondary-subtle text-secondary">${EscHtml(Item.file_type)}</span>` : ''}
${Item.llm_enabled ? '<span class="badge bg-primary-subtle text-primary">🤖 LLM</span>' : ''}
<span class="text-muted" style="font-size:.7rem">${EscHtml(FormatDate(Item.created_at))}</span>
</span>
</li>`).join('') +
    '</ul>';
}
/**
 * Format a timestamp as a vi-VN local time string.
 * `new Date()` does not throw on unparseable input (it yields an invalid
 * date), so validity is checked explicitly and the input is returned
 * unchanged when it cannot be parsed or formatted.
 * @param {string} Str timestamp text from the API
 * @returns {string} formatted time, or the original value as a fallback
 */
function FormatDate(Str) {
  try {
    const Parsed = new Date(Str);
    if (Number.isNaN(Parsed.getTime())) return Str;
    return Parsed.toLocaleTimeString('vi-VN');
  } catch {
    return Str;
  }
}
// ── Settings modal ───────────────────────────────────────────
/**
 * Load the settings of both backends, fill the settings form (Ollama URL
 * and default prompts), show each backend's LLM status and populate the
 * model dropdowns. Failures are logged and otherwise ignored.
 */
async function LoadSettings() {
  const GetJson = Url => fetch(Url).then(R => R.json());
  try {
    const Loaded = await Promise.all([
      GetJson(MarkItDownUrl + '/settings'),
      GetJson(DoclingUrl + '/settings'),
    ]);
    const Md = Loaded[0];
    const Dl = Loaded[1];
    // The shared URL and model fields are taken from the MarkItDown backend.
    document.getElementById('SettingUrl').value = Md.ollama_base_url || '';
    document.getElementById('MdDefaultPrompt').value = Md.default_prompt || '';
    document.getElementById('DlDefaultPrompt').value = Dl.default_prompt || '';
    RenderLlmStatus('MdLlmStatus', Md);
    RenderLlmStatus('DlLlmStatus', Dl);
    await FetchModels(Md.ollama_model, Md.cleanup_model);
  } catch (Err) {
    console.warn('Settings load failed', Err);
  }
}
/**
 * Fetch the model list from the MarkItDown `/models` endpoint and fill both
 * model dropdowns. Failures are logged and otherwise ignored.
 * @param {string} [SelectModel] model to preselect in the main dropdown
 * @param {string} [SelectCleanup] model to preselect in the cleanup dropdown
 */
async function FetchModels(SelectModel, SelectCleanup) {
  try {
    const Res = await fetch(MarkItDownUrl + '/models');
    const Payload = await Res.json();
    const Names = Payload.models || [];
    PopulateSelect('SettingModel', Names, SelectModel || null, '-- chọn model --');
    PopulateSelect('CleanupModel', Names, SelectCleanup || null, '-- dùng model trên --');
  } catch (Err) {
    console.warn('FetchModels failed', Err);
  }
}
/**
 * Rebuild a <select> with an empty first option followed by one option per
 * model. The select's current value wins over `Selected` when choosing the
 * preselected entry. Model names come from the API and are HTML-escaped.
 * @param {string} ElId id of the <select>
 * @param {string[]} Models model names
 * @param {?string} Selected fallback model to preselect
 * @param {string} EmptyLabel label of the empty first option
 */
function PopulateSelect(ElId, Models, Selected, EmptyLabel) {
  const Sel = document.getElementById(ElId);
  const Prev = Sel.value || Selected;
  Sel.innerHTML = `<option value="">${EmptyLabel}</option>` +
    Models.map(M => {
      const Safe = EscHtml(M);
      return `<option value="${Safe}"${M === Prev ? ' selected' : ''}>${Safe}</option>`;
    }).join('');
}
/**
 * Copy a backend's default prompt into the custom prompt box and persist it.
 * @param {string} Prefix 'Md' for MarkItDown; any other value uses Docling
 */
function UsePrompt(Prefix) {
  let SourceId = 'DlDefaultPrompt';
  if (Prefix === 'Md') SourceId = 'MdDefaultPrompt';
  const PromptText = document.getElementById(SourceId).value;
  document.getElementById('LlmPrompt').value = PromptText;
  localStorage.setItem('llm_prompt', PromptText);
}
/**
 * Download a converter's current output as a `.md` file named
 * `<upload name without extension>_markitdown.md` / `_docling.md`
 * (`output…` when no file is selected). Does nothing without content.
 * @param {string} Prefix 'Md' for MarkItDown; any other value uses Docling
 */
function DownloadMd(Prefix) {
  const Content = Prefix === 'Md' ? MdContent : DlContent;
  if (!Content) return;
  const BaseName = (CurrentFile ? CurrentFile.name.replace(/\.[^.]+$/, '') : 'output');
  const Suffix = Prefix === 'Md' ? '_markitdown' : '_docling';
  // Must not be named `Blob`: a local `const Blob` shadows the global
  // constructor inside its own initializer and throws a ReferenceError.
  const FileBlob = new Blob([Content], { type: 'text/markdown' });
  const A = document.createElement('a');
  A.href = URL.createObjectURL(FileBlob);
  A.download = BaseName + Suffix + '.md';
  A.click();
  URL.revokeObjectURL(A.href);
}
/**
 * Switch between the MarkItDown and Docling default-prompt panes.
 * @param {string} Prefix 'Md' or 'Dl' — the pane to show
 * @param {Element} Link the clicked tab link, marked active
 */
function SwitchDefaultTab(Prefix, Link) {
  for (const TabLink of document.querySelectorAll('#DefaultPromptTabs .nav-link')) {
    TabLink.classList.remove('active');
  }
  Link.classList.add('active');
  for (const Key of ['Md', 'Dl']) {
    document.getElementById('DefaultPrompt-' + Key).classList.toggle('d-none', Prefix !== Key);
  }
}
/**
 * Show a backend's LLM status badge: the active model name when the LLM is
 * enabled, otherwise an "off" badge. The model name comes from the API and
 * is HTML-escaped before insertion.
 * @param {string} ElId id of the status container
 * @param {{llm_enabled: boolean, ollama_model?: string}} Data settings payload
 */
function RenderLlmStatus(ElId, Data) {
  const El = document.getElementById(ElId);
  El.innerHTML = Data.llm_enabled
    ? `<span class="badge bg-success-subtle text-success">✅ ${EscHtml(Data.ollama_model)}</span>`
    : `<span class="badge bg-secondary-subtle text-secondary">⚪ Tắt</span>`;
}
/**
 * Post the LLM settings (Ollama URL, model, cleanup model) to both backends
 * and refresh their status badges. The model defaults to 'llava' when none
 * is selected. A network error or non-2xx response from either backend is
 * reported with alert() instead of being rendered as a status.
 */
async function SaveSettings() {
  const Url = document.getElementById('SettingUrl').value.trim();
  const Model = document.getElementById('SettingModel').value || 'llava';
  const CleanupModel = document.getElementById('CleanupModel').value;
  const Btn = document.getElementById('SaveSettingsBtn');
  const Spin = document.getElementById('SaveSpinner');
  Btn.disabled = true; Spin.classList.remove('d-none');
  try {
    const Body = JSON.stringify({ ollama_base_url: Url || null, ollama_model: Model, cleanup_model: CleanupModel || null });
    const Post = async Base => {
      const Res = await fetch(Base + '/settings', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: Body });
      // Without this check an error payload would be rendered as "LLM off".
      if (!Res.ok) throw new Error(Res.statusText || 'HTTP ' + Res.status);
      return Res.json();
    };
    const [Md, Dl] = await Promise.all([Post(MarkItDownUrl), Post(DoclingUrl)]);
    RenderLlmStatus('MdLlmStatus', Md);
    RenderLlmStatus('DlLlmStatus', Dl);
  } catch (E) {
    alert('Lưu thất bại: ' + E.message);
  } finally {
    Btn.disabled = false; Spin.classList.add('d-none');
  }
}
LoadSettings();
// Startup: restore the prompt and LLM switch persisted in localStorage.
const _savedPrompt = localStorage.getItem('llm_prompt');
const _savedLlm = localStorage.getItem('llm_enabled');
if (_savedPrompt) {
  document.getElementById('LlmPrompt').value = _savedPrompt;
}
// Only an explicit '0' turns the switch off; otherwise its markup state is kept.
if (_savedLlm === '0') document.getElementById('LlmToggle').checked = false;
OnLlmToggle();
LoadHistory();
</script>
</body>
</html>

View File

@ -0,0 +1,4 @@
__pycache__
*.pyc
*.pyo
.env

View File

@ -0,0 +1,20 @@
FROM python:3.11-slim
# Do not write .pyc files; do not buffer stdout/stderr.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
WORKDIR /app
# System packages; --no-install-recommends avoids pulling optional extras
# into the image, and the apt lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg libmagic1 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*
# Dependencies are installed before the sources are copied so this layer
# stays cached when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
COPY app/ ./app/
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@ -0,0 +1,107 @@
from fastapi import APIRouter, UploadFile, File, Depends, Query, Body, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord
from app.services import MarkitdownService as markitdown_service
from app.database import get_db
from pydantic import BaseModel
class CleanupRequest(BaseModel):
    """Request body of ``POST /cleanup``."""

    # Markdown text to clean up.
    text: str
    # Optional prompt override forwarded to the cleanup call.
    prompt: str | None = None
    # Optional model override forwarded to the cleanup call.
    model: str | None = None
class CleanupResponse(BaseModel):
    """Response body of ``POST /cleanup``."""

    # Text returned by the cleanup call.
    text: str
class SettingsRequest(BaseModel):
    """Request body of ``POST /settings``."""

    # Ollama endpoint URL; a missing/empty value is passed on as ``None``.
    ollama_base_url: str | None = None
    # Model name handed to the LLM initialisation.
    ollama_model: str = "llava"
    # Model stored as the cleanup model; empty/None is stored as "".
    cleanup_model: str | None = None
class SettingsResponse(BaseModel):
    """Settings snapshot returned by ``GET /settings`` and ``POST /settings``."""

    # Whether the service currently reports its LLM as active.
    llm_enabled: bool
    ollama_base_url: str | None
    ollama_model: str
    cleanup_model: str | None = None
    # The service's built-in cleanup prompt.
    default_prompt: str | None = None
# Router that collects every endpoint declared in this module.
router = APIRouter()
@router.get("/health", response_model=HealthResponse)
def health():
    """Report liveness plus whether an LLM is active and, if so, which model."""
    llm_on = markitdown_service.LLM_ACTIVE
    active_model = markitdown_service.OLLAMA_MODEL if llm_on else None
    return HealthResponse(status="ok", llm_enabled=llm_on, llm_model=active_model)
@router.post("/convert", response_model=ConvertResponse)
async def convert(
    file: UploadFile = File(...),
    use_llm: bool = Query(default=True, description="Use LLM vision for image understanding"),
    llm_prompt: str | None = Query(default=None, description="Custom prompt for LLM vision"),
    db: AsyncSession = Depends(get_db),
):
    """Convert an uploaded file through the service and return the stored record."""
    return await markitdown_service.convert_file(
        file,
        db,
        use_llm=use_llm,
        llm_prompt=llm_prompt,
    )
@router.get("/models")
def list_models():
    """List the model names reported by the configured Ollama server.

    Returns ``{"models": []}`` when no base URL is configured, and
    ``{"models": [], "error": <message>}`` when the lookup fails.
    """
    configured_url = markitdown_service.OLLAMA_BASE_URL
    if not configured_url:
        return {"models": []}
    try:
        import httpx, re
        # The tags endpoint is addressed without the trailing "/v1" segment.
        root = re.sub(r"/v1/?$", "", configured_url.rstrip("/"))
        response = httpx.get(f"{root}/api/tags", timeout=5)
        response.raise_for_status()
        entries = response.json().get("models", [])
        return {"models": sorted(entry["name"] for entry in entries)}
    except Exception as e:
        return {"models": [], "error": str(e)}
@router.get("/settings", response_model=SettingsResponse)
def get_settings():
    """Return the service's current LLM settings."""
    svc = markitdown_service
    snapshot = {
        "llm_enabled": svc.LLM_ACTIVE,
        "ollama_base_url": svc.OLLAMA_BASE_URL,
        "ollama_model": svc.OLLAMA_MODEL,
        # An empty cleanup model is reported as None.
        "cleanup_model": svc.CLEANUP_MODEL or None,
        "default_prompt": svc.DEFAULT_CLEANUP_PROMPT,
    }
    return SettingsResponse(**snapshot)
@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
    """Re-initialise the LLM with the posted settings and return the result."""
    svc = markitdown_service
    svc._init_llm(req.ollama_base_url or None, req.ollama_model)
    # The service stores "no cleanup model" as an empty string.
    svc.CLEANUP_MODEL = req.cleanup_model or ""
    snapshot = {
        "llm_enabled": svc.LLM_ACTIVE,
        "ollama_base_url": svc.OLLAMA_BASE_URL,
        "ollama_model": svc.OLLAMA_MODEL,
        "cleanup_model": svc.CLEANUP_MODEL or None,
        "default_prompt": svc.DEFAULT_CLEANUP_PROMPT,
    }
    return SettingsResponse(**snapshot)
@router.post("/cleanup", response_model=CleanupResponse)
async def cleanup(req: CleanupRequest):
    """Clean up Markdown text with the configured LLM.

    Raises:
        HTTPException: 503 when no LLM is active.
    """
    import asyncio

    if not markitdown_service.LLM_ACTIVE:
        raise HTTPException(status_code=503, detail="LLM not configured")
    # llm_cleanup performs a blocking client call; running it in a worker
    # thread keeps the event loop free to serve other requests meanwhile.
    cleaned = await asyncio.to_thread(
        markitdown_service.llm_cleanup, req.text, req.prompt, req.model
    )
    return CleanupResponse(text=cleaned)
@router.get("/history", response_model=list[ConversionRecord])
async def history(
    limit: int = Query(default=20, ge=1, le=100, description="Maximum number of records to return"),
    db: AsyncSession = Depends(get_db),
):
    """Return the most recent conversions, newest first.

    ``limit`` is validated (1-100) so negative or very large values are
    rejected with a 422 instead of being passed to the database query.
    """
    records = await markitdown_service.get_history(db, limit)
    return [
        ConversionRecord(
            id=r.id,
            filename=r.filename,
            file_type=r.file_type,
            llm_enabled=r.llm_enabled,
            # The response model declares created_at as a string.
            created_at=str(r.created_at),
        )
        for r in records
    ]

View File

@ -0,0 +1,21 @@
import os
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker, DeclarativeBase
# Connection URL from the environment; a plain "postgresql://" scheme is
# rewritten so the asyncpg driver is used.
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+asyncpg://admin:secret@db:5432/markitdown",
).replace("postgresql://", "postgresql+asyncpg://")

engine = create_async_engine(DATABASE_URL, echo=False)
# Session factory; expire_on_commit=False keeps loaded attributes readable
# after a commit.
AsyncSessionLocal = sessionmaker(
    engine,
    class_=AsyncSession,
    expire_on_commit=False,
)
class Base(DeclarativeBase):
    """Declarative base class shared by the ORM models."""
async def get_db():
    """Yield a database session and close it when the consumer is done."""
    async with AsyncSessionLocal() as db_session:
        yield db_session

View File

@ -0,0 +1,42 @@
from typing import Optional
from pydantic import BaseModel
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, func
from app.database import Base
class Conversion(Base):
    """ORM model for the ``conversions`` table: one row per stored conversion."""

    __tablename__ = "conversions"

    id = Column(Integer, primary_key=True, index=True)
    filename = Column(String(255), nullable=False)
    file_type = Column(String(50))
    # Converted output text.
    markdown = Column(Text)
    llm_enabled = Column(Boolean, default=False)
    # Filled in by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())
class ConvertResponse(BaseModel):
    """API schema for a finished conversion; can be built from an ORM object."""

    id: int
    filename: str
    markdown: str
    llm_enabled: bool = False

    class Config:
        # Allow population from attribute-bearing objects, not only dicts.
        from_attributes = True
class HealthResponse(BaseModel):
    """API schema of the health check."""

    status: str
    llm_enabled: bool = False
    # Name of the active model, if any.
    llm_model: Optional[str] = None
class ConversionRecord(BaseModel):
    """API schema for one history entry (no converted text included)."""

    id: int
    filename: str
    file_type: Optional[str] = None
    llm_enabled: bool = False
    # Creation timestamp as text.
    created_at: str

    class Config:
        # Allow population from attribute-bearing objects, not only dicts.
        from_attributes = True

View File

@ -0,0 +1,153 @@
import os
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from markitdown import MarkItDown
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
# LLM settings read from the environment; _init_llm() may replace them later.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None  # empty string -> None
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llava")
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")

# Runtime LLM state, managed by _init_llm().
LLM_ACTIVE = False
_llm_client = None

# `md_plain` never uses an LLM; `md` is the converter currently in effect and
# starts out as the same plain instance.
md = md_plain = MarkItDown()
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM state.

    Args:
        base_url: OpenAI-compatible endpoint URL; a falsy value disables the LLM.
        model: Model name to use for LLM-assisted conversion.

    Returns:
        True when the LLM-backed converter is active afterwards, else False.

    The client and converter are built before any global is touched, so a
    failure cannot leave the module half-updated; on failure the module falls
    back to the plain converter with no client.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md
    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL = None, model
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False
    try:
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        converter = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False
    OLLAMA_BASE_URL, OLLAMA_MODEL = base_url, model
    _llm_client, md = client, converter
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True
# Apply the environment-provided settings once at import time.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)

# System prompt used by llm_cleanup() when the caller supplies none.
DEFAULT_CLEANUP_PROMPT = """You are a technical document formatter. \
The text below was extracted from a multi-column PDF using OCR and is poorly structured: \
columns are merged, headers are mixed with values, and content is out of order.
Your task:
1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)
2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |
3. Keep bullet lists where appropriate (e.g. ports, certifications)
4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)
5. Preserve all technical values exactly do not paraphrase specs
Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."""
import re as _re
def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the LLM to tidy ``text`` and return the result.

    Args:
        text: Markdown to clean up.
        prompt: System prompt; defaults to DEFAULT_CLEANUP_PROMPT.
        model: Model name; defaults to OLLAMA_MODEL.

    Returns:
        The cleaned text with a surrounding code fence removed. The input is
        returned unchanged when no client is configured, the input is blank,
        the reply is empty, or the call fails (failure is logged).
    """
    if not _llm_client or not text.strip():
        return text
    system_prompt = prompt or DEFAULT_CLEANUP_PROMPT
    chosen_model = model or OLLAMA_MODEL
    try:
        completion = _llm_client.chat.completions.create(
            model=chosen_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0,
        )
        reply = completion.choices[0].message.content or text
        # Drop an opening fence (optionally tagged "markdown"), then a closing one.
        without_open = _re.sub(r"^```(?:markdown)?\s*\n?", "", reply.strip())
        without_fences = _re.sub(r"\n?```\s*$", "", without_open.strip())
        return without_fences.strip() or text
    except Exception as e:
        logger.warning("MarkItDown: cleanup failed (%s)", e)
        return text
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
}
def _allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file and store the result.

    Args:
        file: The upload; its extension must be in ALLOWED_EXTENSIONS.
        db: Session used to persist the conversion row.
        use_llm: Per-call switch; the LLM is used only if it is also active
            globally (LLM_ACTIVE).
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The persisted Conversion row.

    Raises:
        HTTPException: 422 for a missing or disallowed file name/extension,
            500 (with the error text as detail) when conversion or
            persistence fails.
    """
    import asyncio

    # The upload may carry no filename at all.
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    suffix = os.path.splitext(filename)[1]
    file_type = suffix.lstrip(".").lower()
    # Choose converter: LLM only if enabled globally AND requested per-call
    use_llm_now = LLM_ACTIVE and use_llm
    tmp_path = None
    try:
        # Everything that can fail after the temp file exists runs inside
        # this try, so the finally below always removes the file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())
        # If custom prompt provided, create a one-off MarkItDown with that prompt
        if use_llm_now and llm_prompt:
            try:
                converter = MarkItDown(
                    llm_client=_llm_client,
                    llm_model=OLLAMA_MODEL,
                    llm_prompt=llm_prompt,
                )
            except TypeError:
                # older markitdown versions may not support llm_prompt kwarg
                converter = md
        elif use_llm_now:
            converter = md
        else:
            converter = md_plain
        # convert() is synchronous and can take long; run it in a worker
        # thread so the event loop keeps serving other requests.
        # NOTE(review): this lets conversions overlap on the shared converter
        # instances — confirm MarkItDown.convert is safe to call concurrently.
        result = await asyncio.to_thread(converter.convert, tmp_path)
        record = Conversion(
            filename=filename,
            file_type=file_type,
            markdown=result.text_content,
            llm_enabled=use_llm_now,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to ``limit`` conversions ordered by creation time, newest first."""
    newest_first = Conversion.created_at.desc()
    query = select(Conversion).order_by(newest_first).limit(limit)
    rows = await db.execute(query)
    return rows.scalars().all()

View File

@ -0,0 +1,24 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.controllers.ConvertController import router
from app.database import engine, Base
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the tables declared on ``Base`` at startup, then hand over to the app."""
    async with engine.begin() as connection:
        # create_all is synchronous, so it is run through run_sync.
        await connection.run_sync(Base.metadata.create_all)
    yield
# Application instance; tables are created by `lifespan` on startup.
app = FastAPI(
    title="MarkItDown API",
    version="1.0.0",
    lifespan=lifespan,
)
# CORS is left fully open: any origin, method and header is accepted.
_cors_any = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_any,
    allow_methods=_cors_any,
    allow_headers=_cors_any,
)
app.include_router(router)

View File

@ -0,0 +1,8 @@
markitdown[all]
fastapi
uvicorn
python-multipart
asyncpg
sqlalchemy[asyncio]
openai
httpx