AI-markdown/markitdown-service/app/services/MarkitdownService.py

164 lines
5.5 KiB
Python

import os
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from markitdown import MarkItDown
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
# Runtime configuration, read once from the environment at import time.
# An unset or empty OLLAMA_BASE_URL collapses to None.
OLLAMA_BASE_URL: str | None = os.environ.get("OLLAMA_BASE_URL") or None
OLLAMA_MODEL: str = os.environ.get("OLLAMA_MODEL", "llava")
CLEANUP_MODEL: str = os.environ.get("CLEANUP_MODEL", "")

# Mutable LLM state; _init_llm() rebinds these module globals.
LLM_ACTIVE: bool = False
_llm_client = None

# md_plain is constructed without any LLM arguments. md starts out as an alias
# of md_plain and is rebound by _init_llm() when an LLM is configured.
md_plain = MarkItDown()
md = md_plain
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client and LLM-backed converter.

    Module state (OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md)
    is only switched to the LLM-enabled configuration once both the OpenAI
    client and the MarkItDown converter have been built successfully, so a
    failure can never leave a half-initialised mix of old and new values.

    Args:
        base_url: Base URL passed to the OpenAI-compatible client. A falsy
            value disables the LLM and restores the plain converter.
        model: Model name handed to MarkItDown as ``llm_model``.

    Returns:
        True when the LLM converter is active after the call, False when the
        LLM was disabled (no base URL) or initialisation failed.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md

    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL = None, model
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    try:
        # Build everything into locals first; globals are committed below only
        # if both constructors succeed.
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        converter = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        # Fall back to a consistent "LLM disabled" state. Clearing the client
        # matters because llm_cleanup() checks _llm_client, not LLM_ACTIVE.
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    OLLAMA_BASE_URL, OLLAMA_MODEL = base_url, model
    _llm_client, md = client, converter
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True
# Configure the LLM once at import time from the environment-derived settings.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
# System prompt used by llm_cleanup() when the caller does not supply one.
# The trailing backslashes join the first three source lines into a single
# line of prompt text (no newline is emitted at those positions).
DEFAULT_CLEANUP_PROMPT = """You are a technical document formatter. \
The text below was extracted from a multi-column PDF using OCR and is poorly structured: \
columns are merged, headers are mixed with values, and content is out of order.
Your task:
1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)
2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |
3. Keep bullet lists where appropriate (e.g. ports, certifications)
4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)
5. Preserve all technical values exactly — do not paraphrase specs
Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."""
import re as _re
def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured LLM to rewrite extracted text as clean Markdown.

    This is best-effort: the input is returned unchanged when no LLM client
    is configured, when ``text`` is blank, when the request raises, or when
    the reply is empty after stripping.

    Args:
        text: Raw extracted text to clean up.
        prompt: System prompt override; DEFAULT_CLEANUP_PROMPT when falsy.
        model: Model override; OLLAMA_MODEL when falsy.

    Returns:
        The cleaned Markdown with any wrapping code fence removed, or ``text``.
    """
    if not (_llm_client and text.strip()):
        return text

    try:
        completion = _llm_client.chat.completions.create(
            model=model or OLLAMA_MODEL,
            messages=[
                {"role": "system", "content": prompt or DEFAULT_CLEANUP_PROMPT},
                {"role": "user", "content": text},
            ],
            temperature=0,
        )
        reply = completion.choices[0].message.content or text
        # Drop a leading ``` / ```markdown fence, then a trailing ``` fence.
        without_opening = _re.sub(r"^```(?:markdown)?\s*\n?", "", reply.strip())
        without_fences = _re.sub(r"\n?```\s*$", "", without_opening.strip())
        cleaned = without_fences.strip()
        if cleaned:
            return cleaned
        return text
    except Exception as e:
        logger.warning("MarkItDown: cleanup failed (%s)", e)
        return text
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
}
def _allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
def _select_converter(use_llm_now: bool, llm_prompt: str | None) -> MarkItDown:
    """Return the MarkItDown instance to use for a single conversion."""
    if not use_llm_now:
        return md_plain
    if not llm_prompt:
        return md
    # A custom prompt needs a one-off converter carrying that prompt.
    try:
        return MarkItDown(
            llm_client=_llm_client,
            llm_model=OLLAMA_MODEL,
            llm_prompt=llm_prompt,
        )
    except TypeError:
        # Older markitdown versions may not support the llm_prompt kwarg.
        return md


async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown and persist the result.

    The upload is written to a temporary file, converted with MarkItDown
    (LLM-backed only when the LLM is globally active AND requested for this
    call), stored as a Conversion row, and the temporary file is removed.

    Args:
        file: The uploaded file; its filename decides the file type.
        db: Async SQLAlchemy session used to store the Conversion record.
        use_llm: Per-call switch for LLM-assisted conversion.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The persisted (committed and refreshed) Conversion record.

    Raises:
        HTTPException: 422 when the filename is missing or its extension is
            not allowed; 500 when conversion or persistence fails.
    """
    filename = file.filename
    # UploadFile.filename may be None; treat that like a disallowed type
    # rather than letting a TypeError escape.
    if not filename or not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Same "text after the last dot" rule that _allowed_file validated, so the
    # stored file_type and the temp-file suffix always match the checked
    # extension (os.path.splitext disagrees for names such as ".pdf").
    extension = filename.rsplit(".", 1)[1]
    suffix = f".{extension}"
    file_type = extension.lower()

    # Choose converter: LLM only if enabled globally AND requested per-call.
    use_llm_now = bool(LLM_ACTIVE and use_llm)

    # Read the upload before creating the temp file so that a failed read
    # leaves nothing behind on disk.
    payload = await file.read()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp.name
    try:
        # Everything after the temp file exists runs under the finally below,
        # so the file is removed even if writing or converter setup fails.
        with tmp:
            tmp.write(payload)

        converter = _select_converter(use_llm_now, llm_prompt)

        # NOTE(review): convert() is synchronous and runs on the event loop.
        try:
            result = converter.convert(tmp_path)
            actual_llm = use_llm_now
        except Exception as llm_err:
            # Ollama OOM / 500 — fall back to plain conversion without LLM.
            llm_backend_failed = (
                "500" in str(llm_err)
                or "InternalServerError" in type(llm_err).__name__
            )
            if not (use_llm_now and llm_backend_failed):
                raise
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            result = md_plain.convert(tmp_path)
            actual_llm = False

        record = Conversion(
            filename=filename,
            file_type=file_type,
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        logger.exception("MarkItDown: conversion of %s failed", filename)
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Cleanup is best-effort: a failed unlink must not replace the real
        # result or error of the request.
        try:
            os.unlink(tmp_path)
        except OSError as cleanup_err:
            logger.warning(
                "MarkItDown: could not remove temp file %s (%s)", tmp_path, cleanup_err
            )
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return stored conversions ordered by ``created_at``, newest first.

    Args:
        db: Async SQLAlchemy session to query.
        limit: Maximum number of rows to return.
    """
    newest_first = Conversion.created_at.desc()
    statement = select(Conversion).order_by(newest_first).limit(limit)
    rows = await db.execute(statement)
    return rows.scalars().all()