Initial commit — MarkItDown vs Docling demo
- FastAPI microservices: MarkItDown + Docling với async SQLAlchemy - Caddy reverse proxy same-origin (no CORS) - Bootstrap 5 frontend với marked.js rendering - LLM settings card: Ollama URL, model select từ API, cleanup model - POST /cleanup endpoint với AI làm đẹp Markdown - GET /models fetch danh sách model từ Ollama - Runtime LLM re-init không cần restart container - PYTHONDONTWRITEBYTECODE + .dockerignore Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
11de2d2175
|
|
@ -0,0 +1,7 @@
|
|||
POSTGRES_USER=admin
|
||||
POSTGRES_PASSWORD=admin
|
||||
POSTGRES_DB=markitdown
|
||||
|
||||
# Optional — Ollama LLM integration
|
||||
# OLLAMA_BASE_URL=https://chat-ai.nswteam.net/ollama/v1
|
||||
# OLLAMA_MODEL=llava:7b
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
.env
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.Python
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
db/data/
|
||||
*.db
|
||||
*.sqlite
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
# ============================================================
|
||||
# AI Markdown Demo — MarkItDown vs Docling
|
||||
# ============================================================
|
||||
# Naming convention: <layer>-<service>
|
||||
#
|
||||
# ui :8484 — Comparison UI (Caddy)
|
||||
# api-markitdown:8282 — MarkItDown API (Microsoft)
|
||||
# api-docling :8383 — Docling API (IBM)
|
||||
# db-markitdown :5432 — PostgreSQL for markitdown
|
||||
# db-docling :— — PostgreSQL for docling (internal only)
|
||||
# db-admin :5050 — Adminer
|
||||
# ============================================================
|
||||
|
||||
x-healthcheck-defaults: &healthcheck-defaults
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
x-service-defaults: &service-defaults
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- app-network
|
||||
|
||||
# ==========================
|
||||
# Services
|
||||
# ==========================
|
||||
services:
|
||||
|
||||
# --- UI layer ---------------------------------------------------
|
||||
ui:
|
||||
image: caddy:alpine
|
||||
<<: *service-defaults
|
||||
ports:
|
||||
- "8484:80"
|
||||
volumes:
|
||||
- ./frontend/Caddyfile:/etc/caddy/Caddyfile:ro
|
||||
- ./frontend:/srv:ro
|
||||
|
||||
# --- API layer --------------------------------------------------
|
||||
api-markitdown:
|
||||
build:
|
||||
context: ./markitdown-service
|
||||
<<: *service-defaults
|
||||
ports:
|
||||
- "8282:8000"
|
||||
env_file: .env
|
||||
environment:
|
||||
DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-markitdown:5432/${POSTGRES_DB:-markitdown}
|
||||
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
|
||||
OLLAMA_MODEL: ${OLLAMA_MODEL:-llava}
|
||||
depends_on:
|
||||
db-markitdown:
|
||||
condition: service_healthy
|
||||
|
||||
api-docling:
|
||||
build:
|
||||
context: ./docling-service
|
||||
<<: *service-defaults
|
||||
ports:
|
||||
- "8383:8000"
|
||||
env_file: .env
|
||||
environment:
|
||||
DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-docling:5432/docling
|
||||
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
|
||||
OLLAMA_MODEL: ${OLLAMA_MODEL:-llava}
|
||||
depends_on:
|
||||
db-docling:
|
||||
condition: service_healthy
|
||||
|
||||
# --- Database layer ---------------------------------------------
|
||||
db-markitdown:
|
||||
image: postgres:16-alpine
|
||||
<<: *service-defaults
|
||||
env_file: .env
|
||||
environment:
|
||||
POSTGRES_DB: ${POSTGRES_DB:-markitdown}
|
||||
POSTGRES_USER: ${POSTGRES_USER:-admin}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
|
||||
volumes:
|
||||
- db_markitdown_data:/var/lib/postgresql/data
|
||||
- ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
|
||||
ports:
|
||||
- "5432:5432"
|
||||
healthcheck:
|
||||
<<: *healthcheck-defaults
|
||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d ${POSTGRES_DB:-markitdown}"]
|
||||
|
||||
db-docling:
|
||||
image: postgres:16-alpine
|
||||
<<: *service-defaults
|
||||
env_file: .env
|
||||
environment:
|
||||
POSTGRES_DB: docling
|
||||
POSTGRES_USER: ${POSTGRES_USER:-admin}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
|
||||
volumes:
|
||||
- db_docling_data:/var/lib/postgresql/data
|
||||
- ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro
|
||||
healthcheck:
|
||||
<<: *healthcheck-defaults
|
||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
|
||||
|
||||
# --- Admin layer ------------------------------------------------
|
||||
db-admin:
|
||||
image: adminer:4.8.1
|
||||
<<: *service-defaults
|
||||
ports:
|
||||
- "5050:8080"
|
||||
environment:
|
||||
ADMINER_DEFAULT_SERVER: db-markitdown
|
||||
depends_on:
|
||||
db-markitdown:
|
||||
condition: service_healthy
|
||||
|
||||
# ==========================
|
||||
# Infrastructure
|
||||
# ==========================
|
||||
networks:
|
||||
app-network:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
db_markitdown_data:
|
||||
db_docling_data:
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
.env
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
FROM python:3.11-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
libgl1 libglib2.0-0 libgomp1 \
|
||||
poppler-utils tesseract-ocr \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY main.py .
|
||||
COPY app/ ./app/
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
from fastapi import APIRouter, UploadFile, File, Depends, Query
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord
|
||||
from app.services import DoclingService as docling_service
|
||||
from app.database import get_db
|
||||
from pydantic import BaseModel
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Request body accepted by POST /settings.
class SettingsRequest(BaseModel):
    # Base URL passed to the LLM initialiser; None or "" switches the LLM off.
    ollama_base_url: str | None = None
    # Model name passed to the LLM initialiser.
    ollama_model: str = "llava"
    # Optional separate model name; stored as "" on the service when omitted.
    cleanup_model: str | None = None
|
||||
|
||||
# Response body returned by GET /settings and POST /settings.
class SettingsResponse(BaseModel):
    # Mirrors the service module's LLM_ACTIVE flag.
    llm_enabled: bool
    # Mirrors OLLAMA_BASE_URL (None when no URL is configured).
    ollama_base_url: str | None
    # Mirrors OLLAMA_MODEL.
    ollama_model: str
    # Mirrors CLEANUP_MODEL; an empty string is reported as None.
    cleanup_model: str | None = None
    # Built-in enrichment system prompt exposed for display.
    default_prompt: str | None = None
|
||||
|
||||
# File extensions reported by GET /health, sorted alphabetically at import time.
SUPPORTED_INPUT_FORMATS = sorted(
    (
        "pdf", "docx", "xlsx", "pptx",
        "html", "htm", "jpg", "jpeg", "png",
        "tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc",
    )
)

# Output formats reported by GET /health, in display order.
SUPPORTED_OUTPUT_FORMATS = ["markdown", "json", "html", "text"]
|
||||
|
||||
|
||||
@router.get("/settings", response_model=SettingsResponse)
def get_settings():
    # Snapshot of the LLM configuration currently held by the Docling service module.
    svc = docling_service
    snapshot = {
        "llm_enabled": svc.LLM_ACTIVE,
        "ollama_base_url": svc.OLLAMA_BASE_URL,
        "ollama_model": svc.OLLAMA_MODEL,
        # An empty CLEANUP_MODEL string is reported as None.
        "cleanup_model": svc.CLEANUP_MODEL or None,
        "default_prompt": svc.DEFAULT_ENRICH_PROMPT,
    }
    return SettingsResponse(**snapshot)
|
||||
|
||||
@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
    # Re-initialise the service's LLM client at runtime, then echo the resulting state.
    base_url = req.ollama_base_url or None
    docling_service._init_llm(base_url, req.ollama_model)

    # The service stores "no cleanup model" as an empty string.
    docling_service.CLEANUP_MODEL = req.cleanup_model if req.cleanup_model else ""

    # GET /settings builds exactly the same response from the module state.
    return get_settings()
|
||||
|
||||
@router.get("/health", response_model=HealthResponse)
def health():
    # Read the flags at call time so changes made via POST /settings are reflected.
    llm_on = docling_service.LLM_ACTIVE
    active_model = docling_service.OLLAMA_MODEL if llm_on else None

    if _ocr_available():
        ocr = "tesseract"
    else:
        ocr = "none"

    return HealthResponse(
        status="ok",
        supported_formats=SUPPORTED_INPUT_FORMATS,
        output_formats=SUPPORTED_OUTPUT_FORMATS,
        llm_enabled=llm_on,
        llm_model=active_model,
        ocr_engine=ocr,
    )
|
||||
|
||||
|
||||
def _ocr_available() -> bool:
    """Return True when a ``tesseract`` executable can be located on PATH."""
    import shutil

    tesseract_path = shutil.which("tesseract")
    return tesseract_path is not None
|
||||
|
||||
|
||||
@router.post("/convert", response_model=ConvertResponse)
async def convert(
    file: UploadFile = File(...),
    output_format: str = Query(default="markdown", description="Output format: markdown | json | html | text"),
    use_llm: bool = Query(default=True, description="Run LLM enrichment on extracted text"),
    llm_prompt: str | None = Query(default=None, description="Custom system prompt for LLM enrichment"),
    db: AsyncSession = Depends(get_db),
):
    # Delegate conversion and persistence to the service, then project the row onto the response model.
    saved = await docling_service.convert_file(
        file, db, output_format, use_llm=use_llm, llm_prompt=llm_prompt
    )
    response_fields = ("id", "filename", "output_format", "content", "page_count", "llm_enabled")
    return ConvertResponse(**{name: getattr(saved, name) for name in response_fields})
|
||||
|
||||
|
||||
@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
    # The service raises a 404 HTTPException when the id is unknown.
    stored = await docling_service.get_conversion(conversion_id, db)
    response_fields = ("id", "filename", "output_format", "content", "page_count", "llm_enabled")
    return ConvertResponse(**{name: getattr(stored, name) for name in response_fields})
|
||||
|
||||
|
||||
@router.get("/history", response_model=list[ConversionRecord])
async def history(limit: int = 20, db: AsyncSession = Depends(get_db)):
    """List the most recent conversions, newest first (content omitted).

    Args:
        limit: Maximum number of rows to return.
        db: Request-scoped database session.

    Returns:
        One ``ConversionRecord`` per stored conversion.
    """
    records = await docling_service.get_history(db, limit)
    return [
        ConversionRecord(
            id=r.id,
            filename=r.filename,
            file_type=r.file_type,
            output_format=r.output_format,
            page_count=r.page_count,
            # Previously omitted, so every history entry reported llm_enabled=False.
            # bool() guards against a NULL column value.
            llm_enabled=bool(r.llm_enabled),
            created_at=str(r.created_at),
        )
        for r in records
    ]
|
||||
|
||||
|
||||
@router.delete("/conversions/{conversion_id}")
async def delete_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
    # The service raises a 404 HTTPException for an unknown id and returns a message dict otherwise.
    outcome = await docling_service.delete_conversion(conversion_id, db)
    return outcome
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
import os
|
||||
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
|
||||
from sqlalchemy.orm import sessionmaker, DeclarativeBase
|
||||
|
||||
# Read the connection string from the environment and make sure it names the
# asyncpg driver: a plain "postgresql://" scheme is rewritten, while a URL that
# already says "postgresql+asyncpg://" is left untouched by the replace.
DATABASE_URL = os.environ.get(
    "DATABASE_URL", "postgresql+asyncpg://admin:secret@db:5432/docling"
).replace("postgresql://", "postgresql+asyncpg://")

# Single async engine for the module (SQL echo disabled).
engine = create_async_engine(DATABASE_URL, echo=False)

# Session factory; expire_on_commit=False keeps attribute values loaded after commit.
AsyncSessionLocal = sessionmaker(
    engine,
    class_=AsyncSession,
    expire_on_commit=False,
)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this service inherit from."""
|
||||
|
||||
|
||||
async def get_db():
    """Yield one ``AsyncSession``; the context manager closes it when the caller is done."""
    async with AsyncSessionLocal() as db_session:
        yield db_session
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, func
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Conversion(Base):
    """ORM row for one stored document conversion (table ``conversions``)."""

    __tablename__ = "conversions"

    # Primary key.
    id = Column(Integer, primary_key=True, index=True)

    # Name of the uploaded file (required) and its extension-derived type.
    filename = Column(String(255), nullable=False)
    file_type = Column(String(50))

    # Requested output format and the converted document text.
    output_format = Column(String(20), default="markdown")
    content = Column(Text)

    # Page count when available, otherwise NULL.
    page_count = Column(Integer, nullable=True)

    # Whether LLM enrichment was applied to this conversion.
    llm_enabled = Column(Boolean, default=False)

    # Timestamp assigned by the database server on insert.
    created_at = Column(DateTime, server_default=func.now())
|
||||
|
||||
|
||||
# API payload for a single conversion, including the converted content.
class ConvertResponse(BaseModel):
    id: int
    filename: str
    output_format: str
    # Converted document text in the requested output format.
    content: str
    page_count: Optional[int] = None
    llm_enabled: bool = False

    class Config:
        # Allow the model to be populated from attribute-bearing objects.
        from_attributes = True
|
||||
|
||||
|
||||
# API payload for one history entry; carries metadata only, no content field.
class ConversionRecord(BaseModel):
    id: int
    filename: str
    file_type: Optional[str] = None
    output_format: str
    page_count: Optional[int] = None
    llm_enabled: bool = False
    # Creation timestamp, already rendered as a string by the caller.
    created_at: str

    class Config:
        # Allow the model to be populated from attribute-bearing objects.
        from_attributes = True
|
||||
|
||||
|
||||
# API payload returned by GET /health.
class HealthResponse(BaseModel):
    status: str
    # Accepted input extensions and available output formats.
    supported_formats: list[str]
    output_formats: list[str]
    # LLM state; llm_model defaults to None.
    llm_enabled: bool = False
    llm_model: Optional[str] = None
    # Name of the OCR engine, or "none".
    ocr_engine: str = "none"
|
||||
|
|
@ -0,0 +1,197 @@
|
|||
import os
|
||||
import re
|
||||
import json
|
||||
import tempfile
|
||||
import logging
|
||||
from fastapi import UploadFile, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from app.models.ConvertModel import Conversion
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import openai as _openai
|
||||
|
||||
# An unset and an empty OLLAMA_BASE_URL both normalise to None.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL") or None

# Model name used for enrichment; falls back to "llava" when unset.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llava")

# Optional separate cleanup model; "" means none configured.
CLEANUP_MODEL = os.environ.get("CLEANUP_MODEL", "")
|
||||
|
||||
def _build_converter() -> DocumentConverter:
    """Create the Docling converter, enabling Tesseract CLI OCR for PDFs when possible.

    If importing or constructing the Tesseract options raises, the failure is
    logged as a warning and the PDF pipeline is built with OCR switched off.
    """
    try:
        from docling.models.stages.ocr.tesseract_ocr_cli_model import TesseractCliOcrOptions

        pipeline = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractCliOcrOptions())
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as e:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", e)
        pipeline = PdfPipelineOptions(do_ocr=False)

    pdf_format = PdfFormatOption(pipeline_options=pipeline)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})


# Converter instance built once at import time and shared by convert_file.
converter = _build_converter()

# LLM client and activity flag; both are (re)assigned by _init_llm.
_llm_client = None
LLM_ACTIVE = False
|
||||
|
||||
|
||||
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client.

    Args:
        base_url: OpenAI-compatible endpoint; a falsy value disables the LLM.
        model: Model name to record in ``OLLAMA_MODEL``.

    Returns:
        True when a client was created and the LLM is active, otherwise False.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client

    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client = None, model, False, None
        return False

    try:
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
    except Exception as e:
        logger.warning("Docling: LLM init failed (%s)", e)
        # Drop any previously configured client too: convert_file gates on
        # _llm_client, so keeping a stale client would keep enrichment running
        # while LLM_ACTIVE reports the LLM as disabled.
        _llm_client = None
        LLM_ACTIVE = False
        return False

    OLLAMA_BASE_URL = base_url
    OLLAMA_MODEL = model
    _llm_client = client
    LLM_ACTIVE = True
    logger.info("Docling: LLM enabled via %s (model=%s)", base_url, model)
    return True


# Apply the environment-provided configuration once at import time.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
|
||||
|
||||
|
||||
# System prompt used when the caller does not supply one.
DEFAULT_ENRICH_PROMPT = (
    "You are a document cleaning assistant. "
    "Fix OCR errors, normalise whitespace, and improve the Markdown structure. "
    "Return ONLY the raw Markdown text — no code fences, no commentary, no explanation."
)


def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
    """Ask the configured LLM to clean up *markdown* and return the result.

    The input is returned unchanged when no client is configured, when the
    input is blank, when the model replies with nothing usable, or when the
    request raises (the error is logged as a warning).

    Args:
        markdown: Text to clean.
        system_prompt: Optional replacement for ``DEFAULT_ENRICH_PROMPT``.
    """
    if not _llm_client:
        return markdown
    if not markdown.strip():
        return markdown

    prompt = system_prompt or DEFAULT_ENRICH_PROMPT
    try:
        completion = _llm_client.chat.completions.create(
            model=OLLAMA_MODEL,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": markdown},
            ],
            temperature=0,
        )
        cleaned = completion.choices[0].message.content or markdown
        # The model may wrap its answer in a code fence despite the prompt:
        # remove an opening fence at the start and a closing fence at the end.
        cleaned = re.sub(r"^```(?:markdown)?\s*\n?", "", cleaned.strip())
        cleaned = re.sub(r"\n?```\s*$", "", cleaned.strip())
        return cleaned.strip() or markdown
    except Exception as e:
        logger.warning("Docling: LLM enrichment failed (%s) — returning raw output", e)
        return markdown
|
||||
|
||||
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
ALLOWED_EXTENSIONS = {
|
||||
"pdf", "docx", "xlsx", "pptx",
|
||||
"html", "htm", "jpg", "jpeg", "png",
|
||||
"tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc"
|
||||
}
|
||||
OUTPUT_FORMATS = {"markdown", "json", "html", "text"}
|
||||
|
||||
|
||||
def _allowed_file(filename: str) -> bool:
|
||||
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
||||
|
||||
|
||||
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded document with Docling and persist the result.

    Args:
        file: The uploaded document.
        db: Database session used to store the conversion.
        output_format: One of ``OUTPUT_FORMATS``.
        use_llm: Run LLM enrichment when a client is configured (markdown/text only).
        llm_prompt: Optional system prompt overriding the default enrichment prompt.

    Returns:
        The stored ``Conversion`` row, refreshed after commit.

    Raises:
        HTTPException: 422 for a disallowed file extension or output format;
            500 when reading the upload, converting, or saving fails.
    """
    # The upload may arrive without a filename; treat that as "not allowed"
    # instead of crashing on None.
    filename = file.filename or ""

    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(
            status_code=422,
            detail=f"Output format not supported. Supported: {', '.join(sorted(OUTPUT_FORMATS))}"
        )

    suffix = os.path.splitext(filename)[1]
    file_type = suffix.lstrip(".").lower()

    tmp_path = None
    try:
        # Created inside the try so the finally clause removes the file even
        # when reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # NOTE(review): converter.convert and _llm_enrich are synchronous calls
        # made from this coroutine, so they run on the event loop thread.
        result = converter.convert(tmp_path)
        doc = result.document

        page_count = len(doc.pages) if hasattr(doc, "pages") and doc.pages else None

        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        elif output_format == "text":
            # Plain text = markdown with heading markers and emphasis stripped.
            content = doc.export_to_markdown()
            content = re.sub(r"#{1,6}\s?", "", content)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)

        # LLM enrichment — only for markdown / text output, and only if requested
        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        record = Conversion(
            filename=filename,
            file_type=file_type,
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record

    except Exception as e:
        # Log the full traceback before translating to an HTTP error.
        logger.exception("Docling: conversion of %r failed", filename)
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                # A cleanup failure must not mask the result or the original error.
                logger.warning("Docling: could not remove temp file %s (%s)", tmp_path, cleanup_error)
|
||||
|
||||
|
||||
async def get_conversion(conversion_id: int, db: AsyncSession) -> Conversion:
    """Load one conversion by primary key.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    query = select(Conversion).where(Conversion.id == conversion_id)
    found = (await db.execute(query)).scalar_one_or_none()
    if found:
        return found
    raise HTTPException(status_code=404, detail="Conversion not found")
|
||||
|
||||
|
||||
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to *limit* conversions ordered by ``created_at``, newest first."""
    newest_first = Conversion.created_at.desc()
    query = select(Conversion).order_by(newest_first).limit(limit)
    rows = await db.execute(query)
    return rows.scalars().all()
|
||||
|
||||
|
||||
async def delete_conversion(conversion_id: int, db: AsyncSession) -> dict:
    """Delete one conversion by primary key and return a confirmation message.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    # get_conversion runs the same lookup and raises the same 404.
    doomed = await get_conversion(conversion_id, db)
    await db.delete(doomed)
    await db.commit()
    return {"message": f"Conversion {conversion_id} deleted"}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from app.controllers.ConvertController import router
|
||||
from app.database import engine, Base
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create any missing tables for ``Base``'s metadata before the app starts serving."""
    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)
    # Nothing to tear down after the yield.
    yield
|
||||
|
||||
|
||||
# Application object; table creation runs through the lifespan hook.
app = FastAPI(
    title="Docling API",
    version="1.0.0",
    description="Advanced document conversion service powered by Docling. Supports PDF, DOCX, PPTX, XLSX, HTML, images and more.",
    lifespan=lifespan,
)

# NOTE(review): CORS is wide open (any origin, method and header) — confirm this
# is still wanted when the API is reached through a same-origin reverse proxy.
_cors_options = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)

# Mount all conversion/settings routes at the root path.
app.include_router(router)
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
docling
|
||||
fastapi
|
||||
uvicorn
|
||||
python-multipart
|
||||
asyncpg
|
||||
sqlalchemy[asyncio]
|
||||
openai
|
||||
Binary file not shown.
|
|
@ -0,0 +1,19 @@
|
|||
:80 {
|
||||
encode gzip
|
||||
|
||||
# Reverse proxy — strips prefix and forwards to backend
|
||||
handle_path /api/markitdown/* {
|
||||
reverse_proxy api-markitdown:8000
|
||||
}
|
||||
|
||||
handle_path /api/docling/* {
|
||||
reverse_proxy api-docling:8000
|
||||
}
|
||||
|
||||
# Static files with SPA fallback
|
||||
handle {
|
||||
root * /srv
|
||||
file_server
|
||||
try_files {path} /index.html
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,822 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="vi">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>MarkItDown vs Docling — LLM Input Processing</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css" rel="stylesheet" />
|
||||
<style>
|
||||
body {
|
||||
background: #f8f9fa;
|
||||
}
|
||||
|
||||
/* Upload zone */
|
||||
#UploadZone {
|
||||
border: 2px dashed #dee2e6;
|
||||
border-radius: .5rem;
|
||||
padding: 3rem 1.5rem;
|
||||
text-align: center;
|
||||
cursor: pointer;
|
||||
transition: border-color .2s, background .2s;
|
||||
}
|
||||
|
||||
#UploadZone:hover,
|
||||
#UploadZone.dragover {
|
||||
border-color: #0d6efd;
|
||||
background: #f0f6ff;
|
||||
}
|
||||
|
||||
#UploadZone input[type="file"] {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* Result pane */
|
||||
.ResultPre {
|
||||
max-height: 460px;
|
||||
overflow: auto;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
font-size: .78rem;
|
||||
background: #f8f9fa;
|
||||
}
|
||||
|
||||
.PreviewPane {
|
||||
max-height: 460px;
|
||||
overflow: auto;
|
||||
font-size: .85rem;
|
||||
padding: 1rem;
|
||||
line-height: 1.7;
|
||||
}
|
||||
|
||||
.PreviewPane table {
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
margin: .5rem 0;
|
||||
}
|
||||
|
||||
.PreviewPane th,
|
||||
.PreviewPane td {
|
||||
border: 1px solid #dee2e6;
|
||||
padding: .3rem .6rem;
|
||||
font-size: .8rem;
|
||||
}
|
||||
|
||||
.PreviewPane th {
|
||||
background: #f1f3f5;
|
||||
}
|
||||
|
||||
.PreviewPane code {
|
||||
background: #f1f3f5;
|
||||
padding: 1px 4px;
|
||||
border-radius: 3px;
|
||||
font-size: .85em;
|
||||
}
|
||||
|
||||
.PreviewPane blockquote {
|
||||
border-left: 3px solid #dee2e6;
|
||||
padding-left: .75rem;
|
||||
color: #6c757d;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<nav class="navbar navbar-light bg-white border-bottom px-4 py-2">
|
||||
<span class="navbar-brand fw-bold mb-0">MarkItDown <span class="text-muted fw-normal">vs</span> Docling</span>
|
||||
<span class="badge bg-primary-subtle text-primary">Demo — LLM Input Processing</span>
|
||||
</nav>
|
||||
|
||||
<div class="container-xl py-4">
|
||||
|
||||
<!-- Upload card -->
|
||||
<div class="card shadow-sm mb-4">
|
||||
<div class="card-body">
|
||||
<h6 class="card-title fw-semibold mb-3">Tải lên tài liệu để so sánh</h6>
|
||||
|
||||
<div id="UploadZone">
|
||||
<input type="file" id="FileInput"
|
||||
accept=".pdf,.docx,.xlsx,.pptx,.html,.htm,.csv,.txt,.jpg,.jpeg,.png,.tiff,.tif,.bmp,.md,.epub,.zip,.asciidoc,.adoc" />
|
||||
<i class="bi bi-file-earmark-text fs-1 text-secondary"></i>
|
||||
<p class="text-muted mt-2 mb-1">Kéo thả hoặc click để chọn file</p>
|
||||
<div id="FileName" class="fw-semibold text-primary small"></div>
|
||||
</div>
|
||||
|
||||
<div class="d-flex flex-wrap gap-1 mt-2">
|
||||
<span class="badge bg-secondary-subtle text-secondary">PDF</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">DOCX</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">XLSX</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">PPTX</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">HTML</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">CSV</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">TXT</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">JPG/PNG</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">EPUB</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">TIFF</span>
|
||||
<span class="badge bg-secondary-subtle text-secondary">ASCIIDoc</span>
|
||||
</div>
|
||||
|
||||
<!-- Controls row -->
|
||||
<div class="d-flex flex-wrap align-items-center gap-3 mt-3">
|
||||
<div class="d-flex align-items-center gap-2">
|
||||
<label class="form-label mb-0 small fw-medium" for="DoclingFormat">Docling format</label>
|
||||
<select class="form-select form-select-sm" id="DoclingFormat" style="width:auto">
|
||||
<option value="markdown">Markdown</option>
|
||||
<option value="json">JSON</option>
|
||||
<option value="html">HTML</option>
|
||||
<option value="text">Plain Text</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="form-check form-switch mb-0">
|
||||
<input class="form-check-input" type="checkbox" id="LlmToggle" checked onchange="OnLlmToggle()" />
|
||||
<label class="form-check-label small fw-medium" for="LlmToggle" id="LlmToggleLabel">LLM bật</label>
|
||||
</div>
|
||||
|
||||
<button class="btn btn-primary btn-sm" id="ConvertBtn" disabled onclick="RunConversion()">
|
||||
<i class="bi bi-play-fill me-1"></i>Chuyển đổi & So sánh
|
||||
</button>
|
||||
<button class="btn btn-outline-secondary btn-sm" onclick="ClearResults()">
|
||||
<i class="bi bi-x-circle me-1"></i>Xoá
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- LLM Prompt panel -->
|
||||
<div id="LlmPanel" class="mt-3 d-none">
|
||||
<div class="row g-3">
|
||||
<!-- Col 1: Custom prompt -->
|
||||
<div class="col-md-6">
|
||||
<div class="h-100 p-3 rounded border bg-primary-subtle">
|
||||
<div class="d-flex justify-content-between align-items-center mb-2">
|
||||
<span class="small fw-semibold text-primary"><i class="bi bi-pencil-square me-1"></i>Custom LLM
|
||||
Prompt</span>
|
||||
<button class="btn btn-link btn-sm p-0 text-primary text-decoration-underline"
|
||||
onclick="ResetPrompt()">Xoá</button>
|
||||
</div>
|
||||
<textarea class="form-control form-control-sm" id="LlmPrompt" rows="6"
|
||||
placeholder="Để trống = dùng Default prompt bên phải Nhập custom system prompt để override..."></textarea>
|
||||
<div class="form-text mt-1 text-primary small">
|
||||
<b>MarkItDown</b>: vision prompt | <b>Docling</b>: enrich prompt
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Col 2: Default prompts with tabs -->
|
||||
<div class="col-md-6">
|
||||
<div class="h-100 p-3 rounded border bg-light">
|
||||
<ul class="nav nav-tabs nav-sm mb-2" id="DefaultPromptTabs">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link active py-1 px-2 small" href="#"
|
||||
onclick="SwitchDefaultTab('Md',this);return false">
|
||||
<span class="badge bg-primary rounded-circle p-1 me-1"> </span>MarkItDown
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link py-1 px-2 small" href="#" onclick="SwitchDefaultTab('Dl',this);return false">
|
||||
<span class="badge bg-success rounded-circle p-1 me-1"> </span>Docling
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
<div id="DefaultPrompt-Md">
|
||||
<textarea class="form-control form-control-sm font-monospace" id="MdDefaultPrompt" rows="5" readonly
|
||||
style="font-size:.7rem;resize:none;background:#fff"></textarea>
|
||||
<button class="btn btn-outline-primary btn-sm mt-2 w-100" onclick="UsePrompt('Md')">
|
||||
<i class="bi bi-arrow-left me-1"></i>Dùng prompt này
|
||||
</button>
|
||||
</div>
|
||||
<div id="DefaultPrompt-Dl" class="d-none">
|
||||
<textarea class="form-control form-control-sm font-monospace" id="DlDefaultPrompt" rows="5" readonly
|
||||
style="font-size:.7rem;resize:none;background:#fff"></textarea>
|
||||
<button class="btn btn-outline-success btn-sm mt-2 w-100" onclick="UsePrompt('Dl')">
|
||||
<i class="bi bi-arrow-left me-1"></i>Dùng prompt này
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Settings card -->
|
||||
<div class="card shadow-sm mb-4">
|
||||
<div class="card-header py-2">
|
||||
<span class="fw-semibold small"><i class="bi bi-gear me-2"></i>Cài đặt LLM (Ollama)</span>
|
||||
</div>
|
||||
<div class="card-body" id="SettingsBody">
|
||||
<div class="row g-3 align-items-end">
|
||||
<div class="col-md-5">
|
||||
<label class="form-label fw-medium small">Ollama Base URL</label>
|
||||
<div class="input-group input-group-sm">
|
||||
<input type="url" class="form-control" id="SettingUrl" placeholder="https://your-ollama-server/v1" />
|
||||
<button class="btn btn-outline-secondary" onclick="FetchModels()" title="Tải danh sách model">
|
||||
<i class="bi bi-arrow-clockwise"></i>
|
||||
</button>
|
||||
</div>
|
||||
<div class="form-text">OpenAI-compatible endpoint</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<label class="form-label fw-medium small">Model mặc định</label>
|
||||
<select class="form-select form-select-sm" id="SettingModel">
|
||||
<option value="">-- chọn model --</option>
|
||||
</select>
|
||||
<div class="form-text">Convert + Docling enrich</div>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<label class="form-label fw-medium small">Model Format</label>
|
||||
<select class="form-select form-select-sm" id="CleanupModel">
|
||||
<option value="">-- dùng model trên --</option>
|
||||
</select>
|
||||
<div class="form-text">Để trống = dùng model mặc định</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="row g-2 mt-3 text-center">
|
||||
<div class="col-6">
|
||||
<div class="border rounded py-2 px-3">
|
||||
<div class="small text-muted mb-1"><span
|
||||
class="badge bg-primary rounded-circle p-1 me-1"> </span>MarkItDown LLM</div>
|
||||
<div id="MdLlmStatus"><span class="spinner-border spinner-border-sm"></span></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-6">
|
||||
<div class="border rounded py-2 px-3">
|
||||
<div class="small text-muted mb-1"><span
|
||||
class="badge bg-success rounded-circle p-1 me-1"> </span>Docling LLM</div>
|
||||
<div id="DlLlmStatus"><span class="spinner-border spinner-border-sm"></span></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="mt-3 text-end">
|
||||
<button class="btn btn-primary btn-sm px-4" id="SaveSettingsBtn" onclick="SaveSettings()">
|
||||
<span id="SaveSpinner" class="spinner-border spinner-border-sm d-none"></span>
|
||||
<i class="bi bi-check-lg me-1"></i>Lưu & Áp dụng
|
||||
</button>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Progress row -->
|
||||
<div class="row g-3 mb-4 d-none" id="ProgressRow">
|
||||
<div class="col-md-6">
|
||||
<div class="card h-100">
|
||||
<div class="card-body">
|
||||
<h6 class="card-title d-flex align-items-center gap-2">
|
||||
<span class="badge bg-primary rounded-circle p-1"> </span>MarkItDown
|
||||
</h6>
|
||||
<div class="d-flex align-items-center gap-2" id="MdStatusLine">
|
||||
<div class="spinner-border spinner-border-sm text-primary d-none" id="MdSpinner"></div>
|
||||
<small id="MdStatus" class="text-muted">Đang chờ...</small>
|
||||
</div>
|
||||
<div class="row g-2 mt-2 d-none" id="MdMetrics">
|
||||
<div class="col-4">
|
||||
<div class="border rounded text-center py-2" id="MdTimeCard">
|
||||
<div class="fw-bold" id="MdTimeVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">ms</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-4">
|
||||
<div class="border rounded text-center py-2">
|
||||
<div class="fw-bold" id="MdCharsVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">ký tự</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-4">
|
||||
<div class="border rounded text-center py-2">
|
||||
<div class="fw-bold" id="MdLinesVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">dòng</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<div class="card h-100">
|
||||
<div class="card-body">
|
||||
<h6 class="card-title d-flex align-items-center gap-2">
|
||||
<span class="badge bg-success rounded-circle p-1"> </span>Docling
|
||||
</h6>
|
||||
<div class="d-flex align-items-center gap-2" id="DlStatusLine">
|
||||
<div class="spinner-border spinner-border-sm text-success d-none" id="DlSpinner"></div>
|
||||
<small id="DlStatus" class="text-muted">Đang chờ...</small>
|
||||
</div>
|
||||
<div class="row g-2 mt-2 d-none" id="DlMetrics">
|
||||
<div class="col-3">
|
||||
<div class="border rounded text-center py-2" id="DlTimeCard">
|
||||
<div class="fw-bold" id="DlTimeVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">ms</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-3">
|
||||
<div class="border rounded text-center py-2">
|
||||
<div class="fw-bold" id="DlCharsVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">ký tự</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-3">
|
||||
<div class="border rounded text-center py-2">
|
||||
<div class="fw-bold" id="DlLinesVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">dòng</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-3 d-none" id="DlPagesCard">
|
||||
<div class="border rounded text-center py-2">
|
||||
<div class="fw-bold" id="DlPagesVal">-</div>
|
||||
<div class="text-muted" style="font-size:.7rem">trang</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Compare banner -->
|
||||
<div class="card shadow-sm mb-4 d-none" id="CompareBanner">
|
||||
<div class="card-body">
|
||||
<h6 class="card-title fw-semibold mb-3"><i class="bi bi-bar-chart-line me-1"></i>Tóm tắt so sánh</h6>
|
||||
<table class="table table-sm table-bordered mb-0">
|
||||
<thead class="table-light">
|
||||
<tr>
|
||||
<th>Tiêu chí</th>
|
||||
<th>MarkItDown</th>
|
||||
<th>Docling</th>
|
||||
<th class="text-muted">Ghi chú</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="CompareRows"></tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Results -->
|
||||
<div class="row g-3 mb-4">
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm h-100" id="MdResultCard">
|
||||
<div class="card-header d-flex justify-content-between align-items-center py-2">
|
||||
<span class="fw-semibold small d-flex align-items-center gap-2">
|
||||
<span class="badge bg-primary rounded-circle p-1"> </span>MarkItDown
|
||||
</span>
|
||||
<div class="d-flex align-items-center gap-2">
|
||||
<button class="btn btn-outline-warning btn-sm py-0 px-2 d-none" id="CleanupBtn"
|
||||
onclick="CleanupMarkItDown()" title="Dùng LLM tái cấu trúc output">
|
||||
<span id="CleanupSpinner" class="spinner-border spinner-border-sm d-none"></span>
|
||||
✨ Làm đẹp
|
||||
</button>
|
||||
<button class="btn btn-outline-secondary btn-sm py-0 px-2 d-none" id="MdDownloadBtn"
|
||||
onclick="DownloadMd('Md')" title="Tải xuống .md">
|
||||
<i class="bi bi-download"></i> .md
|
||||
</button>
|
||||
<ul class="nav nav-tabs card-header-tabs border-0" id="MdTabs">
|
||||
<li class="nav-item"><a class="nav-link active py-1 px-2 small" href="#"
|
||||
onclick="SwitchTab('Md','Raw',this);return false">Raw</a></li>
|
||||
<li class="nav-item"><a class="nav-link py-1 px-2 small" href="#"
|
||||
onclick="SwitchTab('Md','Preview',this);return false">Preview</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body p-0" id="MdBody">
|
||||
<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để
|
||||
xem kết quả</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm h-100" id="DlResultCard">
|
||||
<div class="card-header d-flex justify-content-between align-items-center py-2">
|
||||
<span class="fw-semibold small d-flex align-items-center gap-2">
|
||||
<span class="badge bg-success rounded-circle p-1"> </span>Docling
|
||||
</span>
|
||||
<div class="d-flex align-items-center gap-2">
|
||||
<button class="btn btn-outline-secondary btn-sm py-0 px-2 d-none" id="DlDownloadBtn"
|
||||
onclick="DownloadMd('Dl')" title="Tải xuống .md">
|
||||
<i class="bi bi-download"></i> .md
|
||||
</button>
|
||||
<ul class="nav nav-tabs card-header-tabs border-0" id="DlTabs">
|
||||
<li class="nav-item"><a class="nav-link active py-1 px-2 small" href="#"
|
||||
onclick="SwitchTab('Dl','Raw',this);return false">Raw</a></li>
|
||||
<li class="nav-item"><a class="nav-link py-1 px-2 small" href="#"
|
||||
onclick="SwitchTab('Dl','Preview',this);return false">Preview</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body p-0" id="DlBody">
|
||||
<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để
|
||||
xem kết quả</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- History -->
|
||||
<h6 class="fw-semibold mb-3">Lịch sử chuyển đổi gần đây</h6>
|
||||
<div class="row g-3">
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm">
|
||||
<div class="card-header py-2 small fw-semibold d-flex align-items-center gap-2">
|
||||
<span class="badge bg-primary rounded-circle p-1"> </span>MarkItDown
|
||||
</div>
|
||||
<div id="MdHistory">
|
||||
<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm">
|
||||
<div class="card-header py-2 small fw-semibold d-flex align-items-center gap-2">
|
||||
<span class="badge bg-success rounded-circle p-1"> </span>Docling
|
||||
</div>
|
||||
<div id="DlHistory">
|
||||
<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div><!-- /container -->
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked@13/marked.min.js"></script>
|
||||
<script>
|
||||
// Relative API prefixes of the two converter backends.
const MarkItDownUrl = '/api/markitdown';
const DoclingUrl = '/api/docling';

// Page-level state: the file picked by the user and the latest output of each converter.
let CurrentFile = null;
let MdContent = '';
let DlContent = '';

// ── File input ────────────────────────────────────────────────
const UploadZone = document.getElementById('UploadZone');
const FileInput = document.getElementById('FileInput');

// Clicking the drop zone forwards to the file input.
UploadZone.addEventListener('click', () => {
  FileInput.click();
});

// Cancel the default dragover handling and highlight the zone while a file hovers over it.
UploadZone.addEventListener('dragover', Evt => {
  Evt.preventDefault();
  UploadZone.classList.add('dragover');
});

UploadZone.addEventListener('dragleave', () => {
  UploadZone.classList.remove('dragover');
});

// Only the first dropped file is used.
UploadZone.addEventListener('drop', Evt => {
  Evt.preventDefault();
  UploadZone.classList.remove('dragover');
  const Dropped = Evt.dataTransfer.files[0];
  if (Dropped) SelectFile(Dropped);
});

FileInput.addEventListener('change', () => {
  const Picked = FileInput.files[0];
  if (Picked) SelectFile(Picked);
});
|
||||
|
||||
/**
 * Remember the chosen file, show its name and size, and enable the Convert button.
 * @param {File} File_ - file picked through the input or dropped on the zone
 */
function SelectFile(File_) {
  CurrentFile = File_;
  const Label = File_.name + ' (' + FormatBytes(File_.size) + ')';
  const NameEl = document.getElementById('FileName');
  NameEl.textContent = Label;
  const ConvertBtn = document.getElementById('ConvertBtn');
  ConvertBtn.disabled = false;
}
|
||||
|
||||
/**
 * Format a byte count as a short human-readable string.
 * Values below 1 KiB are shown as whole bytes; larger values use one decimal in KB or MB.
 * @param {number} Bytes
 * @returns {string}
 */
function FormatBytes(Bytes) {
  const Kib = 1024;
  const Mib = Kib * Kib;
  if (Bytes >= Mib) {
    return (Bytes / Mib).toFixed(1) + ' MB';
  }
  if (Bytes >= Kib) {
    return (Bytes / Kib).toFixed(1) + ' KB';
  }
  return Bytes + ' B';
}
|
||||
|
||||
// ── LLM toggle ────────────────────────────────────────────────
|
||||
/**
 * Sync the UI with the LLM switch: update its label, show or hide the LLM panel,
 * and persist the choice in localStorage under 'llm_enabled' ('1' or '0').
 */
function OnLlmToggle() {
  const Enabled = document.getElementById('LlmToggle').checked;
  const LabelEl = document.getElementById('LlmToggleLabel');
  const PanelEl = document.getElementById('LlmPanel');

  if (Enabled) {
    LabelEl.textContent = 'LLM bật';
  } else {
    LabelEl.textContent = 'LLM tắt';
  }
  PanelEl.classList.toggle('d-none', !Enabled);
  localStorage.setItem('llm_enabled', Enabled ? '1' : '0');
}
|
||||
|
||||
/** Clear the custom prompt box and forget the value stored under 'llm_prompt'. */
function ResetPrompt() {
  const PromptEl = document.getElementById('LlmPrompt');
  PromptEl.value = '';
  localStorage.removeItem('llm_prompt');
}
|
||||
|
||||
// Persist the custom prompt on every edit so it can be restored on the next page load.
{
  const PromptEl = document.getElementById('LlmPrompt');
  PromptEl.addEventListener('input', () => {
    localStorage.setItem('llm_prompt', PromptEl.value);
  });
}
|
||||
|
||||
// ── Conversion ────────────────────────────────────────────────
|
||||
/**
 * Convert the selected file with both backends in parallel and render the outcome.
 *
 * Does nothing when no file is selected. Both requests are awaited with
 * Promise.allSettled, so a failure of one backend does not hide the result of the other.
 */
async function RunConversion() {
  if (!CurrentFile) return;

  const ConvertBtn = document.getElementById('ConvertBtn');
  ConvertBtn.disabled = true;

  try {
    SetDisplay('ProgressRow', true);
    SetDisplay('CompareBanner', false);

    // Hide the per-result actions while a new run is in flight, so they cannot
    // operate on the previous output; ShowCompare re-shows them on success.
    ['CleanupBtn', 'MdDownloadBtn', 'DlDownloadBtn'].forEach(Id => SetDisplay(Id, false));

    document.getElementById('MdBody').innerHTML = '<div class="text-center text-muted py-5 small"><div class="spinner-border spinner-border-sm mb-2"></div><br>Đang xử lý...</div>';
    document.getElementById('DlBody').innerHTML = '<div class="text-center text-muted py-5 small"><div class="spinner-border spinner-border-sm text-success mb-2"></div><br>Đang xử lý...</div>';
    SetDisplay('MdMetrics', false);
    SetDisplay('DlMetrics', false);

    SetSpinner('Md', true, 'Đang chuyển đổi...');
    SetSpinner('Dl', true, 'Đang chuyển đổi...');

    const DoclingFmt = document.getElementById('DoclingFormat').value;
    const UseLlm = document.getElementById('LlmToggle').checked;
    const CustomPrompt = document.getElementById('LlmPrompt').value.trim();

    const [MdResult, DlResult] = await Promise.allSettled([
      ConvertMarkItDown(CurrentFile, UseLlm, CustomPrompt),
      ConvertDocling(CurrentFile, DoclingFmt, UseLlm, CustomPrompt),
    ]);

    ShowCompare(MdResult, DlResult);
  } finally {
    // Always release the button, even if rendering throws. Keep it disabled when
    // the selection was cleared while the requests were running.
    ConvertBtn.disabled = !CurrentFile;
  }

  LoadHistory();
}
|
||||
|
||||
/**
 * Upload a file to the MarkItDown backend and return its Markdown output.
 *
 * @param {File} File_ - file to convert
 * @param {boolean} UseLlm - sent as the `use_llm` query parameter
 * @param {string} CustomPrompt - sent as `llm_prompt` when non-empty
 * @returns {Promise<{Content: string, Ms: number, LlmEnabled: boolean}>}
 *   Ms is the client-side round-trip time in milliseconds.
 * @throws {Error} with the server's `detail` (or the HTTP status) on a non-2xx response
 */
async function ConvertMarkItDown(File_, UseLlm, CustomPrompt) {
  const Form = new FormData();
  Form.append('file', File_);

  const Params = new URLSearchParams({ use_llm: UseLlm });
  if (CustomPrompt) Params.set('llm_prompt', CustomPrompt);

  const T0 = performance.now();
  const Res = await fetch(MarkItDownUrl + '/convert?' + Params, { method: 'POST', body: Form });
  const Ms = Math.round(performance.now() - T0);

  if (!Res.ok) {
    let Detail = Res.statusText;
    try {
      const Err = await Res.json();
      if (Err && Err.detail) {
        // `detail` may be a structured object; stringify it so it stays readable.
        Detail = typeof Err.detail === 'string' ? Err.detail : JSON.stringify(Err.detail);
      }
    } catch {
      // The error body was not JSON (e.g. a proxy error page); fall back to the status line.
    }
    throw new Error(Detail || ('HTTP ' + Res.status));
  }

  const Data = await Res.json();
  return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
}
|
||||
|
||||
/**
 * Upload a file to the Docling backend and return its converted output.
 *
 * @param {File} File_ - file to convert
 * @param {string} Fmt - sent as the `output_format` query parameter
 * @param {boolean} UseLlm - sent as the `use_llm` query parameter
 * @param {string} CustomPrompt - sent as `llm_prompt` when non-empty
 * @returns {Promise<{Content: string, Ms: number, Pages: number, LlmEnabled: boolean}>}
 *   Ms is the client-side round-trip time in milliseconds.
 * @throws {Error} with the server's `detail` (or the HTTP status) on a non-2xx response
 */
async function ConvertDocling(File_, Fmt, UseLlm, CustomPrompt) {
  const Form = new FormData();
  Form.append('file', File_);

  const Params = new URLSearchParams({ output_format: Fmt, use_llm: UseLlm });
  if (CustomPrompt) Params.set('llm_prompt', CustomPrompt);

  const T0 = performance.now();
  const Res = await fetch(DoclingUrl + '/convert?' + Params, { method: 'POST', body: Form });
  const Ms = Math.round(performance.now() - T0);

  if (!Res.ok) {
    let Detail = Res.statusText;
    try {
      const Err = await Res.json();
      if (Err && Err.detail) {
        // `detail` may be a structured object; stringify it so it stays readable.
        Detail = typeof Err.detail === 'string' ? Err.detail : JSON.stringify(Err.detail);
      }
    } catch {
      // The error body was not JSON (e.g. a proxy error page); fall back to the status line.
    }
    throw new Error(Detail || ('HTTP ' + Res.status));
  }

  const Data = await Res.json();
  return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
}
|
||||
|
||||
/**
 * Show or hide an element by toggling Bootstrap's `d-none` class.
 * @param {string} Id - element id
 * @param {boolean} Visible - true to show, false to hide
 */
function SetDisplay(Id, Visible) {
  const Target = document.getElementById(Id);
  const Hidden = !Visible;
  Target.classList.toggle('d-none', Hidden);
}
|
||||
|
||||
/**
 * Update one converter's progress line.
 * @param {string} Prefix - 'Md' or 'Dl'; selects the `<Prefix>Spinner` / `<Prefix>Status` elements
 * @param {boolean} Active - whether the spinner is visible
 * @param {string} Msg - status text
 */
function SetSpinner(Prefix, Active, Msg) {
  const SpinnerEl = document.getElementById(`${Prefix}Spinner`);
  const StatusEl = document.getElementById(`${Prefix}Status`);
  SpinnerEl.classList.toggle('d-none', !Active);
  StatusEl.textContent = Msg;
}
|
||||
|
||||
/**
 * Render the outcome of both conversions.
 *
 * For each side: on success store the content, update the status line and metrics,
 * render the result and reveal its action buttons; on failure clear the stored
 * content and show the error. The comparison table is built only when both succeeded.
 *
 * @param {PromiseSettledResult} MdResult - settled result of ConvertMarkItDown
 * @param {PromiseSettledResult} DlResult - settled result of ConvertDocling
 */
function ShowCompare(MdResult, DlResult) {
  const MdOk = MdResult.status === 'fulfilled';
  const DlOk = DlResult.status === 'fulfilled';

  // ── MarkItDown side ──
  if (!MdOk) {
    MdContent = '';
    SetSpinner('Md', false, '❌ ' + MdResult.reason.message);
    const ErrHtml = EscHtml(MdResult.reason.message);
    document.getElementById('MdBody').innerHTML = '<div class="alert alert-danger m-3 small">' + ErrHtml + '</div>';
  } else {
    const Md = MdResult.value;
    MdContent = Md.Content;
    const Suffix = Md.LlmEnabled ? ' 🤖' : '';
    SetSpinner('Md', false, '✅ Hoàn tất (' + Md.Ms + ' ms)' + Suffix);
    ShowMetrics('Md', Md);
    RenderResult('Md', MdContent);
    for (const BtnId of ['CleanupBtn', 'MdDownloadBtn']) {
      document.getElementById(BtnId).classList.remove('d-none');
    }
  }

  // ── Docling side ──
  if (!DlOk) {
    DlContent = '';
    SetSpinner('Dl', false, '❌ ' + DlResult.reason.message);
    const ErrHtml = EscHtml(DlResult.reason.message);
    document.getElementById('DlBody').innerHTML = '<div class="alert alert-danger m-3 small">' + ErrHtml + '</div>';
  } else {
    const Dl = DlResult.value;
    DlContent = Dl.Content;
    document.getElementById('DlDownloadBtn').classList.remove('d-none');
    const Suffix = Dl.LlmEnabled ? ' 🤖' : '';
    SetSpinner('Dl', false, '✅ Hoàn tất (' + Dl.Ms + ' ms)' + Suffix);
    ShowMetrics('Dl', Dl);
    RenderResult('Dl', DlContent);
  }

  if (MdOk && DlOk) {
    BuildCompareTable(MdResult.value, DlResult.value);
  }
}
|
||||
|
||||
/**
 * Fill and reveal the metric cards (time, characters, lines, and pages for Docling).
 *
 * @param {string} Prefix - 'Md' or 'Dl'
 * @param {{Content: string, Ms: number, Pages?: number}} Data - conversion result
 */
function ShowMetrics(Prefix, Data) {
  // Treat missing content as empty so a null payload cannot crash the metrics.
  const Text = Data.Content == null ? '' : String(Data.Content);
  const Lines = Text.split('\n').length;
  const Chars = Text.length;

  document.getElementById(Prefix + 'TimeVal').textContent = Data.Ms.toLocaleString();
  document.getElementById(Prefix + 'CharsVal').textContent = Chars.toLocaleString();
  document.getElementById(Prefix + 'LinesVal').textContent = Lines.toLocaleString();

  if (Prefix === 'Dl') {
    // Toggle both ways: a result without a page count must hide the card again,
    // otherwise the page count of an earlier conversion stays on screen.
    const HasPages = Boolean(Data.Pages);
    document.getElementById('DlPagesVal').textContent = HasPages ? Data.Pages : '-';
    document.getElementById('DlPagesCard').classList.toggle('d-none', !HasPages);
  }

  SetDisplay(Prefix + 'Metrics', true);
}
|
||||
|
||||
/**
 * Fill the comparison table and reveal the banner.
 *
 * Each row marks one side as the winner: the faster one for processing time (Docling on a tie),
 * the longer output for character and line counts (MarkItDown on a tie).
 *
 * @param {{Content: string, Ms: number}} Md - MarkItDown result
 * @param {{Content: string, Ms: number}} Dl - Docling result
 */
function BuildCompareTable(Md, Dl) {
  const MdChars = Md.Content.length;
  const DlChars = Dl.Content.length;
  const MdLines = Md.Content.split('\n').length;
  const DlLines = Dl.Content.split('\n').length;

  const Rows = [
    { Label: 'Thời gian xử lý', MdVal: Md.Ms + ' ms', DlVal: Dl.Ms + ' ms', MdWin: Md.Ms < Dl.Ms, Note: 'Thấp hơn = nhanh hơn' },
    { Label: 'Độ dài output', MdVal: MdChars + ' ký tự', DlVal: DlChars + ' ký tự', MdWin: MdChars >= DlChars, Note: 'Nhiều hơn = giữ được nội dung hơn' },
    { Label: 'Số dòng', MdVal: MdLines, DlVal: DlLines, MdWin: MdLines >= DlLines, Note: '' },
  ];

  const WinClass = 'text-success fw-semibold';
  const LoseClass = 'text-muted';

  let Html = '';
  for (const R of Rows) {
    const DlWin = !R.MdWin;
    Html += `
    <tr>
    <td>${R.Label}</td>
    <td class="${R.MdWin ? WinClass : LoseClass}">${R.MdWin ? '🏆 ' : ''}${R.MdVal}</td>
    <td class="${DlWin ? WinClass : LoseClass}">${DlWin ? '🏆 ' : ''}${R.DlVal}</td>
    <td class="text-muted small">${R.Note}</td>
    </tr>`;
  }

  document.getElementById('CompareRows').innerHTML = Html;
  SetDisplay('CompareBanner', true);
}
|
||||
|
||||
// ── Render result ─────────────────────────────────────────────
|
||||
/**
 * Render a conversion result as two panes: escaped raw text and a Markdown preview.
 *
 * The pane matching the currently highlighted tab is the one left visible, so a
 * re-render while the Preview tab is active no longer shows Raw under a highlighted
 * Preview tab.
 *
 * @param {string} Prefix - 'Md' or 'Dl'
 * @param {string} Content_ - Markdown text to display
 */
function RenderResult(Prefix, Content_) {
  const ActiveTab = document.querySelector('#' + Prefix + 'Tabs .nav-link.active');
  const ShowPreview = Boolean(ActiveTab) && ActiveTab.textContent.trim() === 'Preview';
  const RawHidden = ShowPreview ? ' class="d-none"' : '';
  const PreviewHidden = ShowPreview ? '' : ' class="d-none"';

  // NOTE(review): marked.parse() output is inserted without sanitization, so HTML embedded
  // in a converted document is rendered as-is. Sanitize it if untrusted files are processed.
  document.getElementById(Prefix + 'Body').innerHTML = `
    <div id="${Prefix}-RawPane"${RawHidden}><pre class="ResultPre p-3 m-0">${EscHtml(Content_)}</pre></div>
    <div id="${Prefix}-PreviewPane"${PreviewHidden}><div class="PreviewPane">${marked.parse(Content_)}</div></div>`;
}
|
||||
|
||||
/**
 * Activate a result tab and show the matching pane.
 *
 * @param {string} Prefix - 'Md' or 'Dl'
 * @param {string} Tab - 'Raw' or 'Preview'
 * @param {Element} Link - the clicked tab link, which becomes the active one
 */
function SwitchTab(Prefix, Tab, Link) {
  document.querySelectorAll('#' + Prefix + 'Tabs .nav-link').forEach(L => L.classList.remove('active'));
  Link.classList.add('active');

  // The panes exist only after a result has been rendered; before that the body holds
  // a placeholder, and touching the missing panes would throw a TypeError.
  const RawPane = document.getElementById(Prefix + '-RawPane');
  const PreviewPane = document.getElementById(Prefix + '-PreviewPane');
  if (RawPane) RawPane.classList.toggle('d-none', Tab !== 'Raw');
  if (PreviewPane) PreviewPane.classList.toggle('d-none', Tab !== 'Preview');
}
|
||||
|
||||
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * The previous replacements mapped each character to itself, so nothing was escaped.
 * `&` is replaced first so the entities produced by the later steps are not re-escaped.
 *
 * @param {*} Str - value to escape; coerced with String()
 * @returns {string}
 */
function EscHtml(Str) {
  return String(Str)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
|
||||
|
||||
/**
 * Ask the MarkItDown backend to restructure the current Markdown with the LLM,
 * then re-render the result and switch to the Preview tab.
 *
 * Does nothing when there is no MarkItDown output. Failures are reported with alert().
 */
async function CleanupMarkItDown() {
  if (!MdContent) return;

  const Btn = document.getElementById('CleanupBtn');
  const Spinner = document.getElementById('CleanupSpinner');
  Btn.disabled = true;
  Spinner.classList.remove('d-none');

  try {
    const CustomPrompt = document.getElementById('LlmPrompt').value.trim();
    const Res = await fetch(MarkItDownUrl + '/cleanup', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        text: MdContent,
        prompt: CustomPrompt || null,
        model: document.getElementById('CleanupModel').value.trim() || null,
      }),
    });

    if (!Res.ok) {
      let Detail = Res.statusText;
      try {
        const Err = await Res.json();
        if (Err && Err.detail) {
          Detail = typeof Err.detail === 'string' ? Err.detail : JSON.stringify(Err.detail);
        }
      } catch {
        // Non-JSON error body; keep the HTTP status text.
      }
      throw new Error(Detail || ('HTTP ' + Res.status));
    }

    const Data = await Res.json();
    MdContent = Data.text;
    RenderResult('Md', MdContent);

    // Every tab link is the last child of its own <li>, so '.nav-link:last-child'
    // matched the first (Raw) link. Select the link inside the last tab item instead.
    const PreviewLink = document.querySelector('#MdTabs .nav-item:last-child .nav-link');
    if (PreviewLink) PreviewLink.click();
  } catch (E) {
    alert('Cleanup failed: ' + E.message);
  } finally {
    Btn.disabled = false;
    Spinner.classList.add('d-none');
  }
}
|
||||
|
||||
/**
 * Reset the page to its initial state: drop the selected file and both outputs,
 * hide the progress and comparison sections, restore the placeholders, and hide the
 * per-result action buttons.
 */
function ClearResults() {
  const Placeholder = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';

  MdContent = '';
  DlContent = '';
  CurrentFile = null;

  SetDisplay('ProgressRow', false);
  SetDisplay('CompareBanner', false);

  for (const BodyId of ['MdBody', 'DlBody']) {
    document.getElementById(BodyId).innerHTML = Placeholder;
  }

  document.getElementById('FileName').textContent = '';
  document.getElementById('ConvertBtn').disabled = true;

  for (const BtnId of ['CleanupBtn', 'MdDownloadBtn', 'DlDownloadBtn']) {
    document.getElementById(BtnId).classList.add('d-none');
  }

  // Empty the input so that picking the same file again fires a change event.
  FileInput.value = '';
}
|
||||
|
||||
// ── History ───────────────────────────────────────────────────
|
||||
/**
 * Fetch the latest conversions from both backends and render each history panel.
 *
 * The two requests are independent: when one backend is down or returns an error
 * status, the other panel is still refreshed and only a warning is logged.
 */
async function LoadHistory() {
  const Sources = [
    { Url: MarkItDownUrl + '/history?limit=8', ElId: 'MdHistory' },
    { Url: DoclingUrl + '/history?limit=8', ElId: 'DlHistory' },
  ];

  const Results = await Promise.allSettled(
    Sources.map(S => fetch(S.Url).then(R => {
      if (!R.ok) throw new Error('HTTP ' + R.status);
      return R.json();
    }))
  );

  Results.forEach((Result, Index) => {
    const ElId = Sources[Index].ElId;
    if (Result.status !== 'fulfilled') {
      console.warn('History load failed', ElId, Result.reason);
      return;
    }
    try {
      RenderHistory(ElId, Result.value);
    } catch (E) {
      console.warn('History load failed', E);
    }
  });
}
|
||||
|
||||
/**
 * Render a list of history records into a panel.
 *
 * @param {string} ElId - id of the container element
 * @param {Array<{filename: string, file_type?: string, llm_enabled?: boolean, created_at: string}>} Items
 *   records to show; anything that is not a non-empty array renders the empty placeholder
 */
function RenderHistory(ElId, Items) {
  const El = document.getElementById(ElId);

  // Array.isArray also covers an error object returned in place of the list.
  if (!Array.isArray(Items) || !Items.length) {
    El.innerHTML = '<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>';
    return;
  }

  // file_type is server-provided text like the filename, so it is escaped the same way.
  El.innerHTML = '<ul class="list-group list-group-flush">' +
    Items.map(Item => `
      <li class="list-group-item d-flex justify-content-between align-items-center py-2 px-3">
        <span class="small fw-medium text-truncate me-2" style="max-width:60%">${EscHtml(Item.filename)}</span>
        <span class="d-flex gap-1 align-items-center flex-shrink-0">
          ${Item.file_type ? `<span class="badge bg-secondary-subtle text-secondary">${EscHtml(Item.file_type)}</span>` : ''}
          ${Item.llm_enabled ? '<span class="badge bg-primary-subtle text-primary">🤖 LLM</span>' : ''}
          <span class="text-muted" style="font-size:.7rem">${EscHtml(FormatDate(Item.created_at))}</span>
        </span>
      </li>`).join('') +
    '</ul>';
}
|
||||
|
||||
/**
 * Format a timestamp string as a Vietnamese-locale time of day.
 *
 * `new Date()` does not throw on unparseable input — it yields an invalid date whose
 * formatted form is "Invalid Date" — so validity is checked explicitly and the original
 * string is returned as the fallback.
 *
 * @param {string} Str - timestamp as sent by the backend
 * @returns {string} the formatted time, or Str unchanged when it cannot be parsed
 */
function FormatDate(Str) {
  const Parsed = new Date(Str);
  if (Number.isNaN(Parsed.getTime())) return Str;
  try {
    return Parsed.toLocaleTimeString('vi-VN');
  } catch {
    return Str;
  }
}
|
||||
|
||||
// ── Settings modal ───────────────────────────────────────────
|
||||
/**
 * Load the LLM settings of both backends into the settings card.
 *
 * The Ollama URL and model selections are taken from the MarkItDown response; each
 * backend supplies its own default prompt and status badge. Failures are only logged.
 */
async function LoadSettings() {
  const GetJson = Url => fetch(Url).then(R => R.json());

  try {
    const Pending = [GetJson(MarkItDownUrl + '/settings'), GetJson(DoclingUrl + '/settings')];
    const [Md, Dl] = await Promise.all(Pending);

    const SetValue = (Id, Value) => { document.getElementById(Id).value = Value || ''; };
    SetValue('SettingUrl', Md.ollama_base_url);
    SetValue('MdDefaultPrompt', Md.default_prompt);
    SetValue('DlDefaultPrompt', Dl.default_prompt);

    RenderLlmStatus('MdLlmStatus', Md);
    RenderLlmStatus('DlLlmStatus', Dl);

    await FetchModels(Md.ollama_model, Md.cleanup_model);
  } catch (E) {
    console.warn('Settings load failed', E);
  }
}
|
||||
|
||||
/**
 * Fetch the model list from the MarkItDown backend and refill both model dropdowns.
 *
 * @param {string} [SelectModel] - model to preselect in the default-model dropdown
 * @param {string} [SelectCleanup] - model to preselect in the cleanup-model dropdown
 */
async function FetchModels(SelectModel, SelectCleanup) {
  try {
    const Res = await fetch(MarkItDownUrl + '/models');
    const Payload = await Res.json();
    const Models = Payload.models || [];

    const Targets = [
      ['SettingModel', SelectModel || null, '-- chọn model --'],
      ['CleanupModel', SelectCleanup || null, '-- dùng model trên --'],
    ];
    for (const [ElId, Chosen, EmptyLabel] of Targets) {
      PopulateSelect(ElId, Models, Chosen, EmptyLabel);
    }
  } catch (E) {
    console.warn('FetchModels failed', E);
  }
}
|
||||
|
||||
/**
 * Replace the options of a <select> with an empty choice followed by the given models.
 *
 * Options are created as DOM nodes rather than concatenated into an HTML string, so a
 * model name containing quotes or angle brackets cannot break or inject markup.
 *
 * @param {string} ElId - id of the <select>
 * @param {string[]} Models - model names; each is used as both value and label
 * @param {?string} Selected - model to select when the dropdown has no current value
 * @param {string} EmptyLabel - label of the leading empty option
 */
function PopulateSelect(ElId, Models, Selected, EmptyLabel) {
  const Sel = document.getElementById(ElId);
  // A value the user already picked takes precedence over the server-provided one.
  const Prev = Sel.value || Selected;

  Sel.innerHTML = '';
  Sel.add(new Option(EmptyLabel, ''));
  Models.forEach(M => {
    const IsSelected = M === Prev;
    Sel.add(new Option(M, M, IsSelected, IsSelected));
  });
}
|
||||
|
||||
/**
 * Copy a backend's default prompt into the custom prompt box and persist it.
 * @param {string} Prefix - 'Md' selects the MarkItDown prompt; any other value selects Docling's
 */
function UsePrompt(Prefix) {
  let SourceId = 'DlDefaultPrompt';
  if (Prefix === 'Md') {
    SourceId = 'MdDefaultPrompt';
  }
  const PromptText = document.getElementById(SourceId).value;
  document.getElementById('LlmPrompt').value = PromptText;
  localStorage.setItem('llm_prompt', PromptText);
}
|
||||
|
||||
/**
 * Download one converter's current output as a `.md` file.
 *
 * The file is named after the selected file (extension stripped, 'output' when none is
 * selected) plus a `_markitdown` or `_docling` suffix. Does nothing when there is no content.
 *
 * @param {string} Prefix - 'Md' for MarkItDown; any other value for Docling
 */
function DownloadMd(Prefix) {
  const Content = Prefix === 'Md' ? MdContent : DlContent;
  if (!Content) return;

  const BaseName = CurrentFile ? CurrentFile.name.replace(/\.[^.]+$/, '') : 'output';
  const Suffix = Prefix === 'Md' ? '_markitdown' : '_docling';

  // The local must not be named `Blob`: `const Blob = new Blob(...)` reads the local
  // binding inside its own initializer (temporal dead zone) and throws a ReferenceError.
  const FileBlob = new Blob([Content], { type: 'text/markdown' });
  const ObjectUrl = URL.createObjectURL(FileBlob);

  const A = document.createElement('a');
  A.href = ObjectUrl;
  A.download = BaseName + Suffix + '.md';
  A.click();

  // Release the object URL on the next tick, after the click has been dispatched.
  setTimeout(() => URL.revokeObjectURL(ObjectUrl), 0);
}
|
||||
|
||||
/**
 * Switch between the MarkItDown and Docling default-prompt panes.
 * @param {string} Prefix - 'Md' or 'Dl'
 * @param {Element} Link - the clicked tab link, which becomes the active one
 */
function SwitchDefaultTab(Prefix, Link) {
  const TabLinks = document.querySelectorAll('#DefaultPromptTabs .nav-link');
  for (const TabLink of TabLinks) {
    TabLink.classList.remove('active');
  }
  Link.classList.add('active');

  const MdPane = document.getElementById('DefaultPrompt-Md');
  const DlPane = document.getElementById('DefaultPrompt-Dl');
  MdPane.classList.toggle('d-none', Prefix !== 'Md');
  DlPane.classList.toggle('d-none', Prefix !== 'Dl');
}
|
||||
|
||||
/**
 * Show a backend's LLM status as a badge: the active model name, or "disabled".
 *
 * The badge is built as a DOM node and the model name is set via textContent, so a
 * model name is never interpreted as HTML.
 *
 * @param {string} ElId - id of the container element
 * @param {{llm_enabled: boolean, ollama_model: string}} Data - settings payload
 */
function RenderLlmStatus(ElId, Data) {
  const El = document.getElementById(ElId);
  const Badge = document.createElement('span');

  if (Data.llm_enabled) {
    Badge.className = 'badge bg-success-subtle text-success';
    Badge.textContent = '✅ ' + Data.ollama_model;
  } else {
    Badge.className = 'badge bg-secondary-subtle text-secondary';
    Badge.textContent = '⚪ Tắt';
  }

  El.textContent = '';
  El.appendChild(Badge);
}
|
||||
|
||||
/**
 * Save the LLM settings to both backends and refresh their status badges.
 *
 * The same payload (Ollama URL, default model, cleanup model) is posted to each
 * backend. A non-2xx response is treated as a failure and reported with alert(),
 * instead of its error body being rendered as a "disabled" status.
 */
async function SaveSettings() {
  const Url = document.getElementById('SettingUrl').value.trim();
  const Model = document.getElementById('SettingModel').value || 'llava';
  const CleanupModel = document.getElementById('CleanupModel').value;
  const Btn = document.getElementById('SaveSettingsBtn');
  const Spin = document.getElementById('SaveSpinner');

  Btn.disabled = true;
  Spin.classList.remove('d-none');

  // POST the settings to one backend and return its parsed response.
  const PostSettings = async (BaseUrl, Body) => {
    const Res = await fetch(BaseUrl + '/settings', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: Body,
    });
    if (!Res.ok) {
      let Detail = Res.statusText;
      try {
        const Err = await Res.json();
        if (Err && Err.detail) {
          Detail = typeof Err.detail === 'string' ? Err.detail : JSON.stringify(Err.detail);
        }
      } catch {
        // Non-JSON error body; keep the HTTP status text.
      }
      throw new Error(Detail || ('HTTP ' + Res.status));
    }
    return Res.json();
  };

  try {
    const Body = JSON.stringify({ ollama_base_url: Url || null, ollama_model: Model, cleanup_model: CleanupModel || null });
    const [Md, Dl] = await Promise.all([
      PostSettings(MarkItDownUrl, Body),
      PostSettings(DoclingUrl, Body),
    ]);
    RenderLlmStatus('MdLlmStatus', Md);
    RenderLlmStatus('DlLlmStatus', Dl);
  } catch (E) {
    alert('Lưu thất bại: ' + E.message);
  } finally {
    Btn.disabled = false;
    Spin.classList.add('d-none');
  }
}
|
||||
|
||||
// Start loading the backend settings; the call is not awaited.
LoadSettings();

// init — restore saved state
const _savedPrompt = localStorage.getItem('llm_prompt');
if (_savedPrompt) {
  document.getElementById('LlmPrompt').value = _savedPrompt;
}

// Only an explicit '0' turns the switch off; otherwise the markup's initial state is kept.
const _savedLlm = localStorage.getItem('llm_enabled');
if (_savedLlm === '0') {
  document.getElementById('LlmToggle').checked = false;
}

// Apply the switch state to the label and panel, then load the history panels.
OnLlmToggle();
LoadHistory();
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
.env
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
FROM python:3.11-slim

# Do not write .pyc files and keep stdout/stderr unbuffered so logs appear immediately.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

# System packages installed for the service. --no-install-recommends keeps optional
# packages out of the image; the apt lists are removed in the same layer to keep it small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ffmpeg libmagic1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying the code so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .
COPY app/ ./app/

EXPOSE 8000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
from fastapi import APIRouter, UploadFile, File, Depends, Query, Body, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord
|
||||
from app.services import MarkitdownService as markitdown_service
|
||||
from app.database import get_db
|
||||
from pydantic import BaseModel
|
||||
|
||||
class CleanupRequest(BaseModel):
    # Body of POST /cleanup.
    # text:   content to clean up (required).
    # prompt: optional prompt passed through to the service.
    # model:  optional model name passed through to the service.
    text: str
    prompt: str | None = None
    model: str | None = None
|
||||
|
||||
class CleanupResponse(BaseModel):
    # Response of POST /cleanup: the text returned by the cleanup service.
    text: str
|
||||
|
||||
class SettingsRequest(BaseModel):
    # Body of POST /settings.
    # ollama_base_url: endpoint URL; an empty or missing value disables the LLM.
    # ollama_model:    model name, "llava" when omitted.
    # cleanup_model:   optional model for /cleanup; stored as "" when not given.
    ollama_base_url: str | None = None
    ollama_model: str = "llava"
    cleanup_model: str | None = None
|
||||
|
||||
class SettingsResponse(BaseModel):
    # Current LLM configuration, returned by GET and POST /settings.
    llm_enabled: bool
    ollama_base_url: str | None
    ollama_model: str
    cleanup_model: str | None = None
    # Filled from the service's DEFAULT_CLEANUP_PROMPT by the endpoints below.
    default_prompt: str | None = None
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/health", response_model=HealthResponse)
def health():
    # Health probe: always reports "ok"; the model name is included only while the LLM is active.
    llm_on = markitdown_service.LLM_ACTIVE
    if llm_on:
        active_model = markitdown_service.OLLAMA_MODEL
    else:
        active_model = None
    return HealthResponse(status="ok", llm_enabled=llm_on, llm_model=active_model)
|
||||
|
||||
|
||||
@router.post("/convert", response_model=ConvertResponse)
async def convert(
    file: UploadFile = File(...),
    use_llm: bool = Query(default=True, description="Use LLM vision for image understanding"),
    llm_prompt: str | None = Query(default=None, description="Custom prompt for LLM vision"),
    db: AsyncSession = Depends(get_db),
):
    # Delegate the conversion to the service and return whatever it produced;
    # FastAPI serializes the result through ConvertResponse.
    return await markitdown_service.convert_file(
        file,
        db,
        use_llm=use_llm,
        llm_prompt=llm_prompt,
    )
|
||||
|
||||
|
||||
@router.get("/models")
def list_models():
    # List the model names reported by the configured server's /api/tags endpoint.
    # Best effort: returns an empty list when no base URL is configured, and an empty
    # list plus an "error" message when the lookup fails for any reason.
    configured_url = markitdown_service.OLLAMA_BASE_URL
    if not configured_url:
        return {"models": []}

    try:
        # Imported inside the try so that a missing module is reported like any other failure.
        import httpx
        import re

        # /api/tags lives at the server root, so strip a trailing "/v1" from the base URL.
        server_root = re.sub(r"/v1/?$", "", configured_url.rstrip("/"))
        response = httpx.get(f"{server_root}/api/tags", timeout=5)
        response.raise_for_status()
        tags = response.json().get("models", [])
        return {"models": sorted(entry["name"] for entry in tags)}
    except Exception as exc:
        return {"models": [], "error": str(exc)}
|
||||
|
||||
@router.get("/settings", response_model=SettingsResponse)
def get_settings():
    # Snapshot of the service's current LLM configuration.
    svc = markitdown_service
    snapshot = {
        "llm_enabled": svc.LLM_ACTIVE,
        "ollama_base_url": svc.OLLAMA_BASE_URL,
        "ollama_model": svc.OLLAMA_MODEL,
        # The service stores "no cleanup model" as an empty string; expose it as null.
        "cleanup_model": svc.CLEANUP_MODEL or None,
        "default_prompt": svc.DEFAULT_CLEANUP_PROMPT,
    }
    return SettingsResponse(**snapshot)
|
||||
|
||||
@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
    # Re-initialize the LLM with the submitted URL and model, store the cleanup model,
    # and return the resulting configuration.
    base_url = req.ollama_base_url or None  # an empty string counts as "not configured"
    markitdown_service._init_llm(base_url, req.ollama_model)
    markitdown_service.CLEANUP_MODEL = req.cleanup_model or ""

    # The response is the same snapshot that GET /settings builds.
    return get_settings()
|
||||
|
||||
@router.post("/cleanup", response_model=CleanupResponse)
async def cleanup(req: CleanupRequest):
    # Pass the submitted text through the service's LLM cleanup and return the result.
    # Responds with 503 when no LLM is configured.
    #
    # llm_cleanup is called without await, i.e. as a synchronous function. Running it
    # directly inside this coroutine would stall the event loop (and every other request)
    # for the duration of the LLM call, so it is executed in a worker thread instead.
    # NOTE(review): assumes llm_cleanup is a plain blocking function — confirm in the service.
    import asyncio

    if not markitdown_service.LLM_ACTIVE:
        raise HTTPException(status_code=503, detail="LLM not configured")

    cleaned = await asyncio.to_thread(
        markitdown_service.llm_cleanup, req.text, req.prompt, req.model
    )
    return CleanupResponse(text=cleaned)
|
||||
|
||||
|
||||
@router.get("/history", response_model=list[ConversionRecord])
async def history(limit: int = 20, db: AsyncSession = Depends(get_db)):
    # Return up to `limit` history rows from the service as ConversionRecord items.
    rows = await markitdown_service.get_history(db, limit)

    listing = []
    for row in rows:
        entry = ConversionRecord(
            id=row.id,
            filename=row.filename,
            file_type=row.file_type,
            llm_enabled=row.llm_enabled,
            # The response model declares created_at as a string.
            created_at=str(row.created_at),
        )
        listing.append(entry)
    return listing
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
import os
|
||||
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
|
||||
from sqlalchemy.orm import sessionmaker, DeclarativeBase
|
||||
|
||||
# Connection URL from the environment, with a built-in fallback. A plain
# "postgresql://" scheme is rewritten so the asyncpg driver is always used.
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+asyncpg://admin:secret@db:5432/markitdown",
).replace("postgresql://", "postgresql+asyncpg://")

# Async engine shared by the module; SQL echo is off.
engine = create_async_engine(DATABASE_URL, echo=False)

# Session factory; expire_on_commit=False keeps loaded attributes readable after commit.
AsyncSessionLocal = sessionmaker(
    engine,
    class_=AsyncSession,
    expire_on_commit=False,
)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class for the ORM models."""
|
||||
|
||||
|
||||
async def get_db():
    """Yield a session from AsyncSessionLocal; the context manager closes it afterwards."""
    async with AsyncSessionLocal() as db_session:
        yield db_session
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, func
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Conversion(Base):
    """ORM model for the "conversions" table."""

    __tablename__ = "conversions"

    # Integer primary key (indexed).
    id = Column(Integer, primary_key=True, index=True)
    # File name, required, up to 255 characters.
    filename = Column(String(255), nullable=False)
    # Optional file type, up to 50 characters.
    file_type = Column(String(50))
    # Markdown text.
    markdown = Column(Text)
    # LLM flag; False unless set.
    llm_enabled = Column(Boolean, default=False)
    # Set by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())
|
||||
|
||||
|
||||
# Response schema carrying a finished conversion, including its Markdown.
class ConvertResponse(BaseModel):
    id: int
    filename: str
    markdown: str
    llm_enabled: bool = False

    class Config:
        # Allow building this model from attribute-bearing objects
        # (such as ORM rows) rather than only from dicts.
        from_attributes = True
|
||||
|
||||
|
||||
# Status payload: a status string plus the current LLM flag and model name.
# NOTE(review): presumably returned by a health-check route not visible here.
class HealthResponse(BaseModel):
    status: str
    llm_enabled: bool = False
    llm_model: Optional[str] = None
|
||||
|
||||
|
||||
# History entry schema: conversion metadata without the Markdown body.
class ConversionRecord(BaseModel):
    id: int
    filename: str
    file_type: Optional[str] = None
    llm_enabled: bool = False
    # Kept as a plain string rather than a datetime.
    created_at: str

    class Config:
        # Allow building this model from attribute-bearing objects
        # (such as ORM rows) rather than only from dicts.
        from_attributes = True
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
from fastapi import UploadFile, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select
|
||||
from markitdown import MarkItDown
|
||||
from app.models.ConvertModel import Conversion
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import openai as _openai
|
||||
|
||||
# LLM settings read from the environment at import time.
# An unset or empty OLLAMA_BASE_URL becomes None.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL") or None
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llava")
CLEANUP_MODEL = os.environ.get("CLEANUP_MODEL", "")

# Runtime LLM state; starts disabled and is rewritten by _init_llm().
LLM_ACTIVE = False
_llm_client = None

# `md_plain` is the converter without any LLM attached. `md` is the
# "current" converter and initially refers to the same plain instance.
md = md_plain = MarkItDown()
|
||||
|
||||
|
||||
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client and LLM-backed converter.

    Updates the globals OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE,
    _llm_client and md.

    Args:
        base_url: OpenAI-compatible endpoint URL. A falsy value disables
            the LLM and restores the plain converter.
        model: Model name handed to MarkItDown for LLM-assisted conversion.

    Returns:
        True when the client and converter were created and activated,
        False when the LLM is disabled or initialisation failed.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md

    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL = None, model
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    # Build everything first so a failure cannot leave half-updated globals.
    try:
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        converter = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        # Drop any previously active client/converter as well: otherwise
        # llm_cleanup() would keep using a client while LLM_ACTIVE is False.
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    OLLAMA_BASE_URL, OLLAMA_MODEL = base_url, model
    _llm_client, md = client, converter
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True


# Apply the environment-provided settings once at import time.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
|
||||
|
||||
# System prompt used by llm_cleanup() when the caller supplies none.
# The first three pieces form a single line of the prompt.
DEFAULT_CLEANUP_PROMPT = (
    "You are a technical document formatter. "
    "The text below was extracted from a multi-column PDF using OCR and is poorly structured: "
    "columns are merged, headers are mixed with values, and content is out of order.\n"
    "\n"
    "Your task:\n"
    "1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)\n"
    "2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |\n"
    "3. Keep bullet lists where appropriate (e.g. ports, certifications)\n"
    '4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)\n'
    "5. Preserve all technical values exactly — do not paraphrase specs\n"
    "\n"
    "Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."
)
|
||||
|
||||
import re as _re
|
||||
|
||||
def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured LLM to tidy up ``text`` and return the result.

    Args:
        text: Markdown/text to clean.
        prompt: System prompt; DEFAULT_CLEANUP_PROMPT is used when falsy.
        model: Model name; OLLAMA_MODEL is used when falsy.

    Returns:
        The cleaned text with one leading/trailing code fence removed.
        The input is returned unchanged when no client is configured, the
        input is blank, the model returns nothing, or the request fails.
    """
    if not (_llm_client and text.strip()):
        return text

    system_prompt = prompt or DEFAULT_CLEANUP_PROMPT
    chosen_model = model or OLLAMA_MODEL

    try:
        completion = _llm_client.chat.completions.create(
            model=chosen_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0,
        )
        cleaned = (completion.choices[0].message.content or text).strip()
        # Models sometimes wrap the answer in a ``` / ```markdown fence
        # despite the prompt; peel off the opening and closing fence.
        cleaned = _re.sub(r"^```(?:markdown)?\s*\n?", "", cleaned)
        cleaned = _re.sub(r"\n?```\s*$", "", cleaned.strip()).strip()
        return cleaned or text
    except Exception as e:
        # Best effort: a failed cleanup must not lose the original text.
        logger.warning("MarkItDown: cleanup failed (%s)", e)
        return text
|
||||
|
||||
|
||||
ALLOWED_EXTENSIONS = {
|
||||
"pdf", "docx", "xlsx", "pptx",
|
||||
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
|
||||
}
|
||||
|
||||
|
||||
def _allowed_file(filename: str) -> bool:
|
||||
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
||||
|
||||
|
||||
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown and persist the result.

    Args:
        file: The upload; its filename extension selects the file type.
        db: Session used to store the new Conversion row.
        use_llm: Per-call opt-in; the LLM converter is used only when
            this is true AND the LLM is globally active.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The committed and refreshed Conversion row.

    Raises:
        HTTPException: 422 when the file type is not allowed, 500 when
            reading, converting or storing fails.
    """
    # UploadFile.filename may be None; treat that as "no extension".
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    suffix = os.path.splitext(filename)[1]
    file_type = suffix.lstrip(".").lower()

    # Choose converter: LLM only if enabled globally AND requested per-call
    use_llm_now = LLM_ACTIVE and use_llm

    # Pick the converter before touching the disk so that a failure here
    # cannot leave a temporary file behind.
    if use_llm_now and llm_prompt:
        # Custom prompt: build a one-off MarkItDown carrying that prompt.
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # older markitdown versions may not support llm_prompt kwarg
            converter = md
    elif use_llm_now:
        converter = md
    else:
        converter = md_plain

    tmp_path = None
    try:
        # delete=False: the converter reopens the file by path after the
        # handle is closed; removal happens in the finally block below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # NOTE(review): convert() is a synchronous call inside a coroutine;
        # confirm whether long conversions should run off the event loop.
        result = converter.convert(tmp_path)
        record = Conversion(
            filename=filename,
            file_type=file_type,
            markdown=result.text_content,
            llm_enabled=use_llm_now,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except HTTPException:
        raise
    except Exception as e:
        logger.exception("MarkItDown: conversion of %r failed", filename)
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                # Never let cleanup mask the real outcome of the request.
                logger.warning(
                    "MarkItDown: could not remove temp file %s (%s)",
                    tmp_path,
                    cleanup_error,
                )
|
||||
|
||||
|
||||
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to ``limit`` Conversion rows, most recently created first."""
    newest_first = (
        select(Conversion)
        .order_by(Conversion.created_at.desc())
        .limit(limit)
    )
    rows = await db.execute(newest_first)
    return rows.scalars().all()
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from app.controllers.ConvertController import router
|
||||
from app.database import engine, Base
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run ``Base.metadata.create_all`` on the engine before the app serves.

    Nothing is done after the ``yield``; there is no shutdown step here.
    """
    async with engine.begin() as connection:
        # create_all is a synchronous API, hence run_sync on the connection.
        await connection.run_sync(Base.metadata.create_all)
    yield
|
||||
|
||||
|
||||
# Application object; `lifespan` runs the startup hook defined above.
app = FastAPI(
    title="MarkItDown API",
    version="1.0.0",
    lifespan=lifespan,
)

# CORS is fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)

# Mount the conversion routes.
app.include_router(router)
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
markitdown[all]
|
||||
fastapi
|
||||
uvicorn
|
||||
python-multipart
|
||||
asyncpg
|
||||
sqlalchemy[asyncio]
|
||||
openai
|
||||
httpx
|
||||
Loading…
Reference in New Issue