# AI-markdown/markitdown-service/app/services/MarkitdownService.py

import os
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from markitdown import MarkItDown
from app.models.ConvertModel import Conversion

logger = logging.getLogger(__name__)

import openai as _openai

# Environment-driven LLM configuration.
# `or None` normalizes an unset or empty OLLAMA_BASE_URL to None, which
# _init_llm treats as "LLM disabled".
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None
# Model name passed to MarkItDown and used as the default for llm_cleanup.
OLLAMA_MODEL    = os.getenv("OLLAMA_MODEL", "llava")
# Not referenced in this part of the module; presumably the model callers
# pass to llm_cleanup (empty string falls back to OLLAMA_MODEL there) —
# TODO confirm against callers.
CLEANUP_MODEL   = os.getenv("CLEANUP_MODEL", "")

# Mutable module state, rebound by _init_llm.
LLM_ACTIVE = False      # True only after a successful _init_llm
_llm_client = None      # OpenAI-compatible client, or None when disabled
md_plain = MarkItDown()  # converter without any LLM attached
md = md_plain           # converter used for LLM-assisted conversions


def _init_llm(base_url: str | None, model: str) -> bool:
    """Configure (or disable) the module-level LLM client and converter.

    Rebinds the module globals ``OLLAMA_BASE_URL``, ``OLLAMA_MODEL``,
    ``LLM_ACTIVE``, ``_llm_client`` and ``md``.

    Args:
        base_url: Base URL of the OpenAI-compatible endpoint. A falsy value
            disables the LLM and resets ``md`` to the plain converter.
        model: Model name handed to MarkItDown and stored in ``OLLAMA_MODEL``.

    Returns:
        True when the LLM-backed converter was created and activated,
        False when the LLM is disabled or initialization failed.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md
    if not base_url:
        # No endpoint configured: fall back to the plain converter and clear
        # the client, but still remember the requested model name.
        OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md = None, model, False, None, md_plain
        return False
    try:
        # Build both objects before touching any global so that a failure in
        # either constructor cannot leave a half-applied configuration
        # (e.g. a new client paired with the previous converter).
        # The api_key is a placeholder; presumably the Ollama endpoint
        # ignores it — confirm for other backends.
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        converter = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        # Keep the previous URL/model/client untouched; only mark the LLM
        # as inactive so conversions use the plain converter.
        LLM_ACTIVE = False
        return False

    # Commit the new configuration in one go.
    OLLAMA_BASE_URL = base_url
    OLLAMA_MODEL = model
    _llm_client = client
    md = converter
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True


# Apply the environment-provided configuration once, at import time.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)

# System prompt used by llm_cleanup when the caller does not supply one.
# The trailing backslashes join the first three source lines into a single
# paragraph inside the string.
DEFAULT_CLEANUP_PROMPT = """You are a technical document formatter. \
The text below was extracted from a multi-column PDF using OCR and is poorly structured: \
columns are merged, headers are mixed with values, and content is out of order.

Your task:
1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)
2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |
3. Keep bullet lists where appropriate (e.g. ports, certifications)
4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)
5. Preserve all technical values exactly — do not paraphrase specs

Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."""

import re as _re

def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured LLM to tidy up extracted Markdown.

    Args:
        text: The Markdown/text to clean.
        prompt: System prompt; ``DEFAULT_CLEANUP_PROMPT`` is used when falsy.
        model: Model name; ``OLLAMA_MODEL`` is used when falsy.

    Returns:
        The cleaned text with a surrounding code fence stripped, or the
        original ``text`` unchanged when no client is configured, the input
        is blank, the model returns nothing, or the request fails.
    """
    # Nothing to do without a client or with whitespace-only input.
    if not _llm_client:
        return text
    if not text.strip():
        return text

    try:
        completion = _llm_client.chat.completions.create(
            model=model or OLLAMA_MODEL,
            messages=[
                {"role": "system", "content": prompt or DEFAULT_CLEANUP_PROMPT},
                {"role": "user", "content": text},
            ],
            temperature=0,
        )
        answer = completion.choices[0].message.content or text

        # Models sometimes wrap the answer in a ``` / ```markdown fence even
        # when told not to; peel off the opening and closing fence.
        without_opening = _re.sub(r"^```(?:markdown)?\s*\n?", "", answer.strip())
        without_fences = _re.sub(r"\n?```\s*$", "", without_opening.strip())

        cleaned = without_fences.strip()
        return cleaned if cleaned else text
    except Exception as exc:
        # Best-effort step: on any failure hand back the untouched input.
        logger.warning("MarkItDown: cleanup failed (%s)", exc)
        return text


ALLOWED_EXTENSIONS = {
    "pdf", "docx", "xlsx", "pptx",
    "html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
}


def _allowed_file(filename: str) -> bool:
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown and store the result.

    The upload is written to a temporary file, converted with MarkItDown
    (LLM-assisted when the LLM is globally active and ``use_llm`` is true),
    and persisted as a ``Conversion`` row.

    Args:
        file: The uploaded file; its filename decides the file type.
        db: Async database session used to persist the record.
        use_llm: Per-call switch; the LLM is used only if it is also
            globally active (``LLM_ACTIVE``).
        llm_prompt: Optional custom prompt; when given (and the LLM is in
            use) a one-off converter is built with it.

    Returns:
        The committed and refreshed ``Conversion`` record.

    Raises:
        HTTPException: 422 when the filename is missing or its extension is
            not allowed; 500 when conversion or persistence fails.
    """
    filename = file.filename
    # An upload may arrive without a filename; reject it as a disallowed
    # type instead of failing with a TypeError inside the extension check.
    if not filename or not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Derive the extension the same way _allowed_file does (text after the
    # last dot). os.path.splitext treats names such as ".pdf" as having no
    # extension, which would yield an empty suffix and file_type.
    extension = filename.rsplit(".", 1)[1]
    suffix = f".{extension}"
    file_type = extension.lower()

    # Choose converter: LLM only if enabled globally AND requested per-call
    use_llm_now = LLM_ACTIVE and use_llm

    tmp_path: str | None = None
    try:
        # The temp file is created inside the try so that it is removed even
        # when reading the upload or building the converter fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # If custom prompt provided, create a one-off MarkItDown with that prompt
        if use_llm_now and llm_prompt:
            try:
                converter = MarkItDown(
                    llm_client=_llm_client,
                    llm_model=OLLAMA_MODEL,
                    llm_prompt=llm_prompt,
                )
            except TypeError:
                # older markitdown versions may not support llm_prompt kwarg
                converter = md
        elif use_llm_now:
            converter = md
        else:
            converter = md_plain

        try:
            try:
                # NOTE(review): convert() is a synchronous call made from a
                # coroutine, so it occupies the event loop while it runs.
                result = converter.convert(tmp_path)
                actual_llm = use_llm_now
            except Exception as llm_err:
                # Ollama OOM / 500 — fallback to plain conversion without LLM
                if use_llm_now and ("500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__):
                    logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
                    result = md_plain.convert(tmp_path)
                    actual_llm = False
                else:
                    raise
            record = Conversion(
                filename=filename,
                file_type=file_type,
                markdown=result.text_content,
                llm_enabled=actual_llm,
            )
            db.add(record)
            await db.commit()
            await db.refresh(record)
            return record
        except Exception as e:
            await db.rollback()
            # Chain the cause so the original traceback stays available.
            raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_err:
                # Cleanup is best-effort; it must not replace the response
                # (or the real error) of the conversion itself.
                logger.warning(
                    "MarkItDown: could not remove temp file %s (%s)",
                    tmp_path,
                    cleanup_err,
                )


async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Fetch the most recent conversions, newest first.

    Args:
        db: Async database session to query.
        limit: Maximum number of records to return.

    Returns:
        Up to ``limit`` ``Conversion`` rows ordered by ``created_at``
        descending.
    """
    newest_first = Conversion.created_at.desc()
    query = select(Conversion).order_by(newest_first).limit(limit)
    rows = await db.execute(query)
    return rows.scalars().all()