220 lines
7.1 KiB
Python
220 lines
7.1 KiB
Python
import os
|
|
import tempfile
|
|
import logging
|
|
from fastapi import UploadFile, HTTPException
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import select
|
|
from markitdown import MarkItDown
|
|
from app.models.ConvertModel import Conversion
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
import openai as _openai
|
|
|
|
# --- LLM settings, read once from the environment at import time ---------
# An unset *or empty* OLLAMA_BASE_URL collapses to None.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL") or None
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llava")
CLEANUP_MODEL = os.environ.get("CLEANUP_MODEL", "")

# --- Mutable module state; rebound by _init_llm() -------------------------
LLM_ACTIVE = False
_llm_client = None

# `md_plain` never uses an LLM. `md` is the converter used for LLM-enabled
# requests and starts out as an alias of the plain one.
md_plain = MarkItDown()
md = md_plain
|
|
|
|
|
|
def _init_llm(base_url: str | None, model: str) -> bool:
    """(Re)configure the module-level LLM client and MarkItDown converter.

    With a falsy ``base_url`` the module is reset to plain, non-LLM
    conversion. Otherwise an OpenAI-compatible client pointed at ``base_url``
    is created and an LLM-backed ``MarkItDown`` instance is built around it.

    Args:
        base_url: Endpoint of the OpenAI-compatible server; ``None`` or an
            empty string disables the LLM.
        model: Model name stored in ``OLLAMA_MODEL`` and passed to MarkItDown
            as ``llm_model``.

    Returns:
        ``True`` if the LLM-backed converter is now active, ``False`` if the
        LLM was disabled or initialisation failed.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md

    if not base_url:
        OLLAMA_BASE_URL = None
        OLLAMA_MODEL = model
        LLM_ACTIVE = False
        _llm_client = None
        md = md_plain
        return False

    # Build everything into locals first so that a failure cannot leave the
    # module half-configured (e.g. a new client paired with a stale converter).
    try:
        # NOTE(review): "ollama" looks like a placeholder key for a local
        # endpoint -- confirm the server ignores it.
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        converter = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        LLM_ACTIVE = False
        return False

    # Both objects exist: publish the new configuration in one go.
    OLLAMA_BASE_URL = base_url
    OLLAMA_MODEL = model
    _llm_client = client
    md = converter
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True
|
|
|
|
|
|
# Configure the converter once at import time from the environment-derived
# settings; the boolean result is intentionally ignored here.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
|
|
|
|
# System prompt used by llm_cleanup() when the caller does not supply one.
# Written as adjacent literals; the first paragraph is a single long line.
DEFAULT_CLEANUP_PROMPT = (
    "You are a technical document formatter. "
    "The text below was extracted from a multi-column PDF using OCR and is poorly structured: "
    "columns are merged, headers are mixed with values, and content is out of order.\n"
    "\n"
    "Your task:\n"
    "1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)\n"
    "2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |\n"
    "3. Keep bullet lists where appropriate (e.g. ports, certifications)\n"
    '4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)\n'
    "5. Preserve all technical values exactly — do not paraphrase specs\n"
    "\n"
    "Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."
)
|
|
|
|
import re as _re
|
|
|
|
def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured LLM to restructure *text* into clean Markdown.

    Args:
        text: Raw Markdown/OCR text to clean.
        prompt: System prompt; ``DEFAULT_CLEANUP_PROMPT`` is used when falsy.
        model: Model name; ``OLLAMA_MODEL`` is used when falsy.

    Returns:
        The model's answer with a leading/trailing code fence stripped. The
        original *text* is returned unchanged when no client is configured,
        *text* is blank, the answer is empty, or the request fails.
    """
    if not (_llm_client and text.strip()):
        return text

    chat_messages = [
        {"role": "system", "content": prompt or DEFAULT_CLEANUP_PROMPT},
        {"role": "user", "content": text},
    ]
    try:
        completion = _llm_client.chat.completions.create(
            model=model or OLLAMA_MODEL,
            messages=chat_messages,
            temperature=0,
        )
        cleaned = completion.choices[0].message.content or text
        # Strip an opening fence (optionally tagged "markdown"), then a
        # closing fence, re-trimming surrounding whitespace before each pass.
        for fence in (r"^```(?:markdown)?\s*\n?", r"\n?```\s*$"):
            cleaned = _re.sub(fence, "", cleaned.strip())
        return cleaned.strip() or text
    except Exception as e:
        # Best effort: cleanup must never break a conversion.
        logger.warning("MarkItDown: cleanup failed (%s)", e)
        return text
|
|
|
|
|
|
# Upload extensions (lower-case, without the leading dot) that
# _allowed_file() accepts.
ALLOWED_EXTENSIONS = {
    # documents
    "pdf",
    "docx",
    "xlsx",
    "pptx",
    "epub",
    # text / markup
    "html",
    "csv",
    "txt",
    # images
    "jpg",
    "jpeg",
    "png",
    # archives
    "zip",
}

# Optional scheme and "www." followed by one of the YouTube URL forms
# (watch page, youtu.be short link, Shorts).
YOUTUBE_PATTERN = _re.compile(
    r"(https?://)?(www\.)?(youtube\.com/watch|youtu\.be/|youtube\.com/shorts/)"
)
|
|
|
|
|
|
async def convert_url(
    url: str,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert the resource at *url* to Markdown and store it as a ``Conversion``.

    Converter choice:
      * LLM inactive or ``use_llm`` false -> plain converter (``md_plain``);
      * LLM active with ``llm_prompt``    -> one-off ``MarkItDown`` using that prompt;
      * LLM active without a prompt       -> shared LLM converter (``md``).

    If an LLM-backed conversion fails with an error whose text contains "500"
    or whose type name contains "InternalServerError", the conversion is
    retried once with the plain converter.

    Args:
        url: Address handed to ``MarkItDown.convert``.
        db: Async SQLAlchemy session used to persist the record.
        use_llm: Per-call switch; only effective when the LLM is active.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The committed and refreshed ``Conversion`` row.

    Raises:
        HTTPException: 500 if the conversion (including the plain-converter
            retry) or the database write fails.
    """
    use_llm_now = LLM_ACTIVE and use_llm

    if not use_llm_now:
        converter = md_plain
    elif llm_prompt:
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # older markitdown versions may not support llm_prompt kwarg
            converter = md
    else:
        converter = md

    # NOTE(review): convert() is called synchronously inside a coroutine, so
    # it presumably blocks the event loop while fetching -- consider
    # offloading to a worker thread once thread-safety is confirmed.
    actual_llm = use_llm_now
    try:
        try:
            result = converter.convert(url)
        except Exception as llm_err:
            llm_server_error = use_llm_now and (
                "500" in str(llm_err)
                or "InternalServerError" in type(llm_err).__name__
            )
            if not llm_server_error:
                raise
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            # The retry sits inside the outer try so that a failing fallback
            # is reported as an HTTP 500 too instead of escaping raw.
            result = md_plain.convert(url)
            actual_llm = False
    except Exception as conv_err:
        raise HTTPException(status_code=500, detail=str(conv_err)) from conv_err

    # Use last segment of URL as filename
    slug = url.rstrip("/").split("/")[-1].split("?")[0] or "youtube"
    filename = f"{slug}.md"

    try:
        # NOTE(review): file_type is always "youtube" here, whatever the URL
        # is -- confirm callers validate against YOUTUBE_PATTERN first.
        record = Conversion(
            filename=filename,
            file_type="youtube",
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
|
|
def _allowed_file(filename: str) -> bool:
|
|
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
|
|
|
|
|
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown and store it as a ``Conversion``.

    The upload is written to a temporary file, converted with MarkItDown and
    the temporary file is removed again, whatever the outcome.

    Converter choice:
      * LLM inactive or ``use_llm`` false -> plain converter (``md_plain``);
      * LLM active with ``llm_prompt``    -> one-off ``MarkItDown`` using that prompt;
      * LLM active without a prompt       -> shared LLM converter (``md``).

    If an LLM-backed conversion fails with an error whose text contains "500"
    or whose type name contains "InternalServerError", the conversion is
    retried once with the plain converter.

    Args:
        file: The uploaded file; its name decides whether it is accepted.
        db: Async SQLAlchemy session used to persist the record.
        use_llm: Per-call switch; only effective when the LLM is active.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The committed and refreshed ``Conversion`` row.

    Raises:
        HTTPException: 422 if the file name is missing or its extension is
            not allowed; 500 if reading, converting or saving fails.
    """
    # UploadFile.filename may be None; treat that like an empty name so the
    # request is rejected with 422 instead of crashing.
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Derive the extension the same way the validation does (text after the
    # last dot), so a name such as ".pdf" still gets a suffix and file type.
    extension = filename.rsplit(".", 1)[1]
    suffix = f".{extension}"
    file_type = extension.lower()

    # Choose converter: LLM only if enabled globally AND requested per-call
    use_llm_now = LLM_ACTIVE and use_llm

    if not use_llm_now:
        converter = md_plain
    elif llm_prompt:
        # Custom prompt: create a one-off MarkItDown with that prompt
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # older markitdown versions may not support llm_prompt kwarg
            converter = md
    else:
        converter = md

    tmp_path = None
    try:
        # The temp file is created inside the try so the finally clause
        # removes it even when reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # NOTE(review): convert() is called synchronously inside a coroutine,
        # so it presumably blocks the event loop -- consider offloading to a
        # worker thread once thread-safety is confirmed.
        actual_llm = use_llm_now
        try:
            result = converter.convert(tmp_path)
        except Exception as llm_err:
            # Ollama OOM / 500 — fallback to plain conversion without LLM
            llm_server_error = use_llm_now and (
                "500" in str(llm_err)
                or "InternalServerError" in type(llm_err).__name__
            )
            if not llm_server_error:
                raise
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            result = md_plain.convert(tmp_path)
            actual_llm = False

        record = Conversion(
            filename=filename,
            file_type=file_type,
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_err:
                # Never let a cleanup problem mask the real outcome.
                logger.warning(
                    "MarkItDown: could not remove temp file %s (%s)",
                    tmp_path,
                    cleanup_err,
                )
|
|
|
|
|
|
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to *limit* conversions, ordered by ``created_at`` descending."""
    newest_first = select(Conversion).order_by(Conversion.created_at.desc())
    rows = await db.execute(newest_first.limit(limit))
    return rows.scalars().all()
|