# AI-markdown/docling-service/app/controllers/ConvertController.py

from fastapi import APIRouter, UploadFile, File, Depends, Query
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.ConvertModel import ConvertResponse, HealthResponse, ConversionRecord
from app.services import DoclingService as docling_service
from app.database import get_db
from pydantic import BaseModel

# Router that collects the settings, health, conversion and history
# endpoints declared in this module.
router = APIRouter()

class SettingsRequest(BaseModel):
    """Request body accepted by ``POST /settings``."""

    # Ollama base URL; a missing or empty value is forwarded to the service
    # as None by the handler.
    ollama_base_url: str | None = None
    # Model name handed to the service's LLM initialisation; defaults to "llava".
    ollama_model: str = "llava"
    # Optional cleanup model name; the handler stores a missing or empty
    # value as an empty string on the service module.
    cleanup_model: str | None = None

class SettingsResponse(BaseModel):
    """Response body returned by the ``/settings`` endpoints.

    Every field is filled from the corresponding module-level attribute of
    the docling service at the time of the request.
    """

    # Mirrors the service's LLM_ACTIVE flag.
    llm_enabled: bool
    # Mirrors the service's OLLAMA_BASE_URL (may be None).
    ollama_base_url: str | None
    # Mirrors the service's OLLAMA_MODEL.
    ollama_model: str
    # Service's CLEANUP_MODEL, with an empty value reported as None.
    cleanup_model: str | None = None
    # Mirrors the service's DEFAULT_ENRICH_PROMPT.
    default_prompt: str | None = None

# Input format names reported by /health, kept in alphabetical order.
SUPPORTED_INPUT_FORMATS = sorted((
    # office / paginated documents
    "pdf", "docx", "xlsx", "pptx",
    # web pages
    "html", "htm",
    # raster images
    "jpg", "jpeg", "png", "tiff", "tif", "bmp",
    # plain text and lightweight markup
    "md", "txt", "asciidoc", "adoc",
))

# Output format names reported by /health, in declaration order.
SUPPORTED_OUTPUT_FORMATS = [
    "markdown",
    "json",
    "html",
    "text",
]


@router.get("/settings", response_model=SettingsResponse)
def get_settings():
    """Return the LLM-related settings currently held by the docling service.

    Returns:
        SettingsResponse: a snapshot of the service module's LLM flag,
        Ollama URL and model, cleanup model (empty reported as ``None``)
        and default enrichment prompt.
    """
    snapshot = {
        "llm_enabled": docling_service.LLM_ACTIVE,
        "ollama_base_url": docling_service.OLLAMA_BASE_URL,
        "ollama_model": docling_service.OLLAMA_MODEL,
        # An empty cleanup model is reported as None rather than "".
        "cleanup_model": docling_service.CLEANUP_MODEL or None,
        "default_prompt": docling_service.DEFAULT_ENRICH_PROMPT,
    }
    return SettingsResponse(**snapshot)

@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
    """Apply new LLM settings to the docling service and report the result.

    Args:
        req: Requested Ollama URL, model and optional cleanup model.

    Returns:
        SettingsResponse: the service settings as they stand after the update.
    """
    # A missing or empty URL is passed on as None.
    base_url = req.ollama_base_url if req.ollama_base_url else None
    docling_service._init_llm(base_url, req.ollama_model)

    # The service stores "no cleanup model" as an empty string.
    docling_service.CLEANUP_MODEL = req.cleanup_model if req.cleanup_model else ""

    # The response has exactly the shape of the GET handler's, so reuse it.
    return get_settings()

@router.get("/health", response_model=HealthResponse)
def health():
    """Report service status together with its advertised capabilities.

    Returns:
        HealthResponse: status ``"ok"``, the supported input and output
        format lists, the service's LLM flag (plus the model name when the
        flag is set, otherwise ``None``) and the OCR engine, which is
        ``"tesseract"`` when the binary is found and ``"none"`` otherwise.
    """
    # Read LLM state through the module object, as the /settings handlers do,
    # instead of a redundant function-local import. Snapshot the flag once so
    # llm_enabled and llm_model are derived from the same value.
    llm_active = docling_service.LLM_ACTIVE
    llm_model = docling_service.OLLAMA_MODEL if llm_active else None
    ocr = "tesseract" if _ocr_available() else "none"
    return HealthResponse(
        status="ok",
        supported_formats=SUPPORTED_INPUT_FORMATS,
        output_formats=SUPPORTED_OUTPUT_FORMATS,
        llm_enabled=llm_active,
        llm_model=llm_model,
        ocr_engine=ocr,
    )


def _ocr_available() -> bool:
    """Return True when a ``tesseract`` executable can be located on PATH."""
    import shutil

    tesseract_path = shutil.which("tesseract")
    return tesseract_path is not None


@router.post("/convert", response_model=ConvertResponse)
async def convert(
    file: UploadFile = File(...),
    output_format: str = Query(default="markdown", description="Output format: markdown | json | html | text"),
    use_llm: bool = Query(default=True, description="Run LLM enrichment on extracted text"),
    llm_prompt: str | None = Query(default=None, description="Custom system prompt for LLM enrichment"),
    db: AsyncSession = Depends(get_db),
):
    """Convert an uploaded file through the docling service.

    Args:
        file: The uploaded document.
        output_format: Requested output format name.
        use_llm: Whether LLM enrichment should be requested.
        llm_prompt: Optional custom prompt passed through to the service.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        ConvertResponse: built from the record returned by the service.
    """
    converted = await docling_service.convert_file(
        file,
        db,
        output_format,
        use_llm=use_llm,
        llm_prompt=llm_prompt,
    )
    # Copy exactly the attributes the response model exposes.
    response_fields = ("id", "filename", "output_format", "content", "page_count", "llm_enabled")
    return ConvertResponse(**{name: getattr(converted, name) for name in response_fields})


@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
    """Fetch one stored conversion by its id.

    Args:
        conversion_id: Identifier taken from the URL path.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        ConvertResponse: built from the record returned by the service.
    """
    stored = await docling_service.get_conversion(conversion_id, db)
    payload = {
        "id": stored.id,
        "filename": stored.filename,
        "output_format": stored.output_format,
        "content": stored.content,
        "page_count": stored.page_count,
        "llm_enabled": stored.llm_enabled,
    }
    return ConvertResponse(**payload)


@router.get("/history", response_model=list[ConversionRecord])
async def history(
    limit: int = Query(default=20, ge=0, description="Maximum number of history records to return"),
    db: AsyncSession = Depends(get_db),
):
    """List stored conversions as lightweight records.

    Args:
        limit: Record limit forwarded to the service; defaults to 20.
            Negative values are rejected at the API boundary instead of
            being passed on to the service.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        list[ConversionRecord]: one entry per record returned by the service,
        with ``created_at`` rendered through ``str()``.
    """
    records = await docling_service.get_history(db, limit)
    return [
        ConversionRecord(
            id=r.id,
            filename=r.filename,
            file_type=r.file_type,
            output_format=r.output_format,
            page_count=r.page_count,
            # The response model takes the timestamp as text.
            created_at=str(r.created_at),
        )
        for r in records
    ]


@router.delete("/conversions/{conversion_id}")
async def delete_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
    """Delete one conversion by id and relay whatever the service returns.

    Args:
        conversion_id: Identifier taken from the URL path.
        db: Database session supplied by the ``get_db`` dependency.
    """
    outcome = await docling_service.delete_conversion(conversion_id, db)
    return outcome