"""File-to-Markdown conversion service built on MarkItDown.

Holds the module-level MarkItDown converters (a plain one and, when an
OpenAI-compatible endpoint is configured through ``OLLAMA_BASE_URL``, an
LLM-backed one), an optional LLM clean-up pass for extracted text, and the
async helpers that convert an upload and persist / list ``Conversion`` rows.
"""
import logging
import os
import re as _re
import tempfile

import openai as _openai
from fastapi import HTTPException, UploadFile
from markitdown import MarkItDown
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.ConvertModel import Conversion

logger = logging.getLogger(__name__)

# Runtime configuration, read once from the environment at import time.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llava")
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")

# Mutable module state managed by _init_llm().
LLM_ACTIVE = False
_llm_client = None
md_plain = MarkItDown()  # converter without any LLM attached
md = md_plain  # converter used for LLM-enabled requests (rebound by _init_llm)


def _init_llm(base_url: str | None, model: str) -> bool:
    """Configure (or disable) the LLM-backed converter.

    Args:
        base_url: Base URL passed to ``openai.OpenAI``. A falsy value
            disables the LLM and resets the module state to the plain
            converter.
        model: Model name handed to MarkItDown as ``llm_model``.

    Returns:
        True when the LLM-backed converter was set up, False otherwise.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md

    if not base_url:
        OLLAMA_BASE_URL, OLLAMA_MODEL = None, model
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    # Build everything into locals first so a failure half-way through cannot
    # leave the globals in a mixed state (e.g. a client set while the LLM is
    # flagged inactive, which llm_cleanup() would still happily use).
    try:
        client = _openai.OpenAI(base_url=base_url, api_key="ollama")
        converter = MarkItDown(llm_client=client, llm_model=model)
    except Exception as e:
        logger.warning("MarkItDown: LLM init failed (%s)", e)
        LLM_ACTIVE, _llm_client, md = False, None, md_plain
        return False

    OLLAMA_BASE_URL, OLLAMA_MODEL = base_url, model
    _llm_client, md = client, converter
    LLM_ACTIVE = True
    logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    return True


_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)


DEFAULT_CLEANUP_PROMPT = """You are a technical document formatter. \
The text below was extracted from a multi-column PDF using OCR and is poorly structured: \
columns are merged, headers are mixed with values, and content is out of order.

Your task:
1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)
2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |
3. Keep bullet lists where appropriate (e.g. ports, certifications)
4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)
5. Preserve all technical values exactly — do not paraphrase specs

Return ONLY the cleaned Markdown.
No code fences, no commentary, no preamble."""

# Patterns that strip a leading / trailing Markdown code fence from a reply.
_FENCE_OPEN_RE = _re.compile(r"^```(?:markdown)?\s*\n?")
_FENCE_CLOSE_RE = _re.compile(r"\n?```\s*$")


def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured LLM to restructure extracted text.

    Args:
        text: Text to clean up.
        prompt: System prompt; defaults to ``DEFAULT_CLEANUP_PROMPT``.
        model: Model name; defaults to ``OLLAMA_MODEL``.

    Returns:
        The model's reply with any surrounding code fence removed. The input
        ``text`` is returned unchanged when no client is configured, the
        input is blank, the reply is empty, or the request fails.
    """
    if not _llm_client or not text.strip():
        return text
    try:
        resp = _llm_client.chat.completions.create(
            model=model or OLLAMA_MODEL,
            messages=[
                {"role": "system", "content": prompt or DEFAULT_CLEANUP_PROMPT},
                {"role": "user", "content": text},
            ],
            temperature=0,
        )
        result = resp.choices[0].message.content or text
        result = _FENCE_OPEN_RE.sub("", result.strip())
        result = _FENCE_CLOSE_RE.sub("", result.strip())
        return result.strip() or text
    except Exception as e:
        # Best effort: a failed clean-up must not lose the extracted text.
        logger.warning("MarkItDown: cleanup failed (%s)", e)
        return text


ALLOWED_EXTENSIONS = {
    "pdf", "docx", "xlsx", "pptx", "html", "csv", "txt",
    "jpg", "jpeg", "png", "zip", "epub",
}


def _file_extension(filename: str | None) -> str:
    """Return the lower-cased text after the last dot, or "" if there is none."""
    if not filename or "." not in filename:
        return ""
    return filename.rsplit(".", 1)[1].lower()


def _allowed_file(filename: str | None) -> bool:
    """Return True when *filename* ends in one of ``ALLOWED_EXTENSIONS``."""
    return _file_extension(filename) in ALLOWED_EXTENSIONS


def _select_converter(use_llm_now: bool, llm_prompt: str | None) -> MarkItDown:
    """Pick the converter for one request: plain, shared LLM, or one-off with a custom prompt."""
    if not use_llm_now:
        return md_plain
    if not llm_prompt:
        return md
    try:
        return MarkItDown(
            llm_client=_llm_client,
            llm_model=OLLAMA_MODEL,
            llm_prompt=llm_prompt,
        )
    except TypeError:
        # older markitdown versions may not support llm_prompt kwarg
        return md


def _convert_with_fallback(converter: MarkItDown, path: str, use_llm_now: bool):
    """Convert *path*; on an LLM-side 500 retry once with the plain converter.

    Returns a ``(result, llm_used)`` pair. Any other failure is re-raised.
    """
    try:
        return converter.convert(path), use_llm_now
    except Exception as llm_err:
        # Ollama OOM / 500 — fallback to plain conversion without LLM
        if use_llm_now and (
            "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
        ):
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            return md_plain.convert(path), False
        raise


async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown and store the result.

    Args:
        file: The upload to convert; its extension must be in
            ``ALLOWED_EXTENSIONS``.
        db: Session used to persist the ``Conversion`` row.
        use_llm: Per-call switch; the LLM is used only if it is also enabled
            globally (``LLM_ACTIVE``).
        llm_prompt: Optional custom prompt for a one-off LLM converter.

    Returns:
        The committed and refreshed ``Conversion`` record.

    Raises:
        HTTPException: 422 when the file type is not allowed (including a
            missing filename); 500 when conversion or persistence fails.
    """
    if not _allowed_file(file.filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Derive the suffix from the same split the allow-list check used, so a
    # name such as ".pdf" cannot pass the check yet yield an empty suffix.
    file_type = _file_extension(file.filename)
    suffix = f".{file_type}"

    # Choose converter: LLM only if enabled globally AND requested per-call
    use_llm_now = LLM_ACTIVE and use_llm
    converter = _select_converter(use_llm_now, llm_prompt)

    # Read the upload before creating the temp file so a failed read leaves
    # nothing behind on disk.
    payload = await file.read()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp.name
    try:
        with tmp:
            tmp.write(payload)

        try:
            # NOTE(review): convert() is a synchronous call made directly from
            # this coroutine; consider off-loading it if conversions are slow.
            result, actual_llm = _convert_with_fallback(converter, tmp_path, use_llm_now)

            record = Conversion(
                filename=file.filename,
                file_type=file_type,
                markdown=result.text_content,
                llm_enabled=actual_llm,
            )
            db.add(record)
            await db.commit()
            await db.refresh(record)
            return record
        except Exception as e:
            await db.rollback()
            raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Cleanup is best effort: never let it mask the real outcome.
        try:
            os.unlink(tmp_path)
        except OSError as cleanup_err:
            logger.warning("MarkItDown: could not remove temp file %s (%s)", tmp_path, cleanup_err)


async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to *limit* conversions, newest first (by ``created_at``)."""
    result = await db.execute(
        select(Conversion).order_by(Conversion.created_at.desc()).limit(limit)
    )
    return list(result.scalars().all())