Migrate to Laravel app + unified services; add email conversion

- Move docling/markitdown services under services/ alongside new
  unlimited-ocr and vision services
- Add Laravel app for email-to-markdown conversion and OCR frontend
- Add email export tooling and example emails/output
- Update docker-compose, Caddyfile, and frontend assets

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kai Ton 2026-07-01 07:22:08 +00:00
parent 6ba704865f
commit 3f8765a581
116 changed files with 30757 additions and 1449 deletions

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"git.ignoreLimitWarning": true
}

11
db/init_unlimited_ocr.sql Normal file
View File

@ -0,0 +1,11 @@
-- unlimited_ocr database tables
-- Table of stored conversion results for the unlimited_ocr database.
-- IF NOT EXISTS makes the statement a no-op when the table is already present.
-- NOTE(review): column names match the fields the Python services set on their
-- Conversion records (filename, file_type, output_format, content, page_count,
-- llm_enabled) — confirm against the unlimited-ocr service's ORM model.
CREATE TABLE IF NOT EXISTS conversions (
id SERIAL PRIMARY KEY,
filename VARCHAR(255) NOT NULL,
file_type VARCHAR(50),
output_format VARCHAR(20) DEFAULT 'markdown',
content TEXT,
page_count INTEGER,
llm_enabled BOOLEAN DEFAULT TRUE,
-- NOTE(review): TIMESTAMP is "without time zone" in PostgreSQL; confirm the
-- service treats these values as UTC, or switch to TIMESTAMPTZ.
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

View File

@ -1,14 +1,16 @@
# ============================================================
# AI Markdown Demo — MarkItDown vs Docling
# AI Markdown Demo — MarkItDown vs Docling vs Unlimited-OCR
# ============================================================
# Naming convention: <layer>-<service>
#
# ui :8484 — Comparison UI (nginx)
# api-markitdown:8282 — MarkItDown API (Microsoft)
# api-docling :8383 — Docling API (IBM)
# db-markitdown :5432 — PostgreSQL for markitdown
# db-docling :— — PostgreSQL for docling (internal only)
# db-admin :5050 — Adminer
# ui :8484 — Comparison UI (Laravel)
# api-markitdown :8282 — MarkItDown API (Microsoft)
# api-docling :8383 — Docling API (IBM)
# api-unlimited-ocr :8181 — Unlimited-OCR API (Baidu)
# api-vision :8585 — Vision API (shared by markitdown + docling)
# db-markitdown :5432 — PostgreSQL for markitdown
# db-docling :— — PostgreSQL for docling (internal only)
# db-unlimited-ocr :— — PostgreSQL for unlimited-ocr (internal only)
# db-admin :5050 — Adminer
# ============================================================
x-healthcheck-defaults: &healthcheck-defaults
@ -28,44 +30,98 @@ services:
# --- UI layer ---------------------------------------------------
ui:
image: caddy:alpine
image: shinsenter/laravel:php8.3
<<: *service-defaults
ports:
- "8484:80"
volumes:
- ./frontend/Caddyfile:/etc/caddy/Caddyfile:ro
- ./frontend:/srv:ro
- ./laravel-app:/var/www/html
- .:/workspace:ro
environment:
APP_ENV: production
APP_DEBUG: "false"
LOG_CHANNEL: stderr
depends_on:
db-markitdown:
condition: service_healthy
db-docling:
condition: service_healthy
db-unlimited-ocr:
condition: service_healthy
# --- API layer --------------------------------------------------
api-markitdown:
build:
context: ./markitdown-service
context: ./services/markitdown-service
<<: *service-defaults
ports:
- "8282:8000"
env_file: .env
environment:
DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-markitdown:5432/${POSTGRES_DB:-markitdown}
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
OLLAMA_MODEL: ${OLLAMA_MODEL:-llava}
VISION_URL: ${VISION_URL:-http://api-vision:8000}
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- .:/workspace:rw
depends_on:
db-markitdown:
condition: service_healthy
api-vision:
condition: service_started
api-docling:
build:
context: ./docling-service
context: ./services/docling-service
<<: *service-defaults
ports:
- "8383:8000"
env_file: .env
environment:
DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-docling:5432/docling
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
OLLAMA_MODEL: ${OLLAMA_MODEL:-llava}
VISION_URL: ${VISION_URL:-http://api-vision:8000}
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- .:/workspace:ro
depends_on:
db-docling:
condition: service_healthy
api-vision:
condition: service_started
api-unlimited-ocr:
build:
context: ./services/unlimited-ocr-service
<<: *service-defaults
ports:
- "8181:8000"
env_file: .env
environment:
DATABASE_URL: postgresql://${POSTGRES_USER:-admin}:${POSTGRES_PASSWORD:-admin}@db-unlimited-ocr:5432/unlimited_ocr
OCR_BASE_URL: ${OCR_BASE_URL:-}
OCR_API_KEY: ${OCR_API_KEY:-none}
OCR_MODEL: ${OCR_MODEL:-Unlimited-OCR}
OCR_DPI: ${OCR_DPI:-300}
extra_hosts:
- "host.docker.internal:host-gateway"
depends_on:
db-unlimited-ocr:
condition: service_healthy
# --- Vision layer (OCRFlux, shared by markitdown + docling) ---
api-vision:
build:
context: ./services/vision-service
<<: *service-defaults
ports:
- "8585:8000"
env_file: .env
environment:
LLM_BASE_URL: ${LLM_BASE_URL:-http://host.docker.internal:11436/v1}
LLM_MODEL: ${LLM_MODEL:-ggml-org/Qwen2.5-VL-3B-Instruct-GGUF}
extra_hosts:
- "host.docker.internal:host-gateway"
# --- Database layer ---------------------------------------------
db-markitdown:
@ -100,6 +156,21 @@ services:
<<: *healthcheck-defaults
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
db-unlimited-ocr:
image: postgres:16-alpine
<<: *service-defaults
env_file: .env
environment:
POSTGRES_DB: unlimited_ocr
POSTGRES_USER: ${POSTGRES_USER:-admin}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
volumes:
- db_unlimited_ocr_data:/var/lib/postgresql/data
- ./db/init_unlimited_ocr.sql:/docker-entrypoint-initdb.d/init.sql:ro
healthcheck:
<<: *healthcheck-defaults
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d unlimited_ocr"]
# --- Admin layer ------------------------------------------------
db-admin:
image: adminer:4.8.1
@ -122,3 +193,4 @@ networks:
volumes:
db_markitdown_data:
db_docling_data:
db_unlimited_ocr_data:

View File

@ -1,343 +0,0 @@
import os
import re
import json
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
# LLM settings read from the environment at import time.
# `or None` collapses an unset *or empty* OLLAMA_BASE_URL to None, which
# `_init_llm` treats as "LLM disabled".
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None
# Model name passed to the chat-completions call in `_llm_enrich`.
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llava")
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether it is still used before relying on it.
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
def _build_converter() -> DocumentConverter:
    """Create the Docling converter used for PDF and image inputs.

    OCR is configured through the Tesseract CLI options. If building those
    options raises, the failure is logged and the converter is built with
    OCR turned off instead.
    """
    try:
        pipeline = PdfPipelineOptions(
            do_ocr=True,
            ocr_options=TesseractCliOcrOptions(),
        )
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as exc:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", exc)
        pipeline = PdfPipelineOptions(do_ocr=False)

    # Images go through the same standard PDF pipeline, so they receive the
    # same options; otherwise Docling would fall back to RapidOCR / PP-OCRv6.
    per_format = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline),
        InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline),
    }
    return DocumentConverter(format_options=per_format)
# Module-level converter, built once at import time and shared by
# `convert_url` and `convert_file`.
converter = _build_converter()
# LLM client state; both names are (re)assigned by `_init_llm`.
# The conversion functions gate enrichment on `_llm_client`, not on LLM_ACTIVE.
_llm_client = None
LLM_ACTIVE = False
def _init_llm(base_url: str | None, model: str) -> bool:
global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client
if not base_url:
OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client = None, model, False, None
return False
try:
client = _openai.OpenAI(base_url=base_url, api_key="ollama")
OLLAMA_BASE_URL = base_url
OLLAMA_MODEL = model
_llm_client = client
LLM_ACTIVE = True
logger.info("Docling: LLM enabled via %s (model=%s)", base_url, model)
return True
except Exception as e:
logger.warning("Docling: LLM init failed (%s)", e)
LLM_ACTIVE = False
return False
# Configure the LLM client once at import time from the environment values.
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
# System prompt used by `_llm_enrich` when the caller supplies none.
DEFAULT_ENRICH_PROMPT = (
"You are a document cleaning assistant. "
"Fix OCR errors, normalise whitespace, and improve the Markdown structure. "
"Return ONLY the raw Markdown text — no code fences, no commentary, no explanation."
)
def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
    """Ask the configured LLM to clean up extracted Markdown.

    Args:
        markdown: Text produced by the converter.
        system_prompt: Optional replacement for ``DEFAULT_ENRICH_PROMPT``.

    Returns:
        The cleaned text with any wrapping code fence removed. The input is
        returned unchanged when no client is configured, the input is blank,
        the model returns nothing usable, or the request fails.
    """
    if not _llm_client or not markdown.strip():
        return markdown

    try:
        conversation = [
            {"role": "system", "content": system_prompt or DEFAULT_ENRICH_PROMPT},
            {"role": "user", "content": markdown},
        ]
        completion = _llm_client.chat.completions.create(
            model=OLLAMA_MODEL,
            messages=conversation,
            temperature=0,
        )
        cleaned = completion.choices[0].message.content or markdown
        # llava tends to wrap its answer in a code fence regardless of the
        # instructions, so strip a leading and a trailing fence.
        cleaned = re.sub(r"^```(?:markdown)?\s*\n?", "", cleaned.strip())
        cleaned = re.sub(r"\n?```\s*$", "", cleaned.strip())
        cleaned = cleaned.strip()
        return cleaned if cleaned else markdown
    except Exception as exc:
        logger.warning("Docling: LLM enrichment failed (%s) — returning raw output", exc)
        return markdown
async def convert_url(
url: str,
db: AsyncSession,
output_format: str = "markdown",
use_llm: bool = True,
llm_prompt: str | None = None,
) -> "Conversion":
"""Summarise a video URL as Markdown via yt-dlp, convert it with Docling, and store it.

The function extracts metadata with yt-dlp (no media download), builds a
Markdown document from title, channel, duration, upload date, view count,
a transcript (or the description as fallback) and chapters, writes it to a
temporary ``.md`` file, runs the module-level ``converter`` on it, optionally
passes the result through ``_llm_enrich``, and persists a ``Conversion`` row.

Args:
    url: URL handed to ``yt_dlp.YoutubeDL.extract_info``.
    db: Async SQLAlchemy session used to persist the record.
    output_format: "markdown", "json" or "html" select the matching Docling
        export; any other value stores the pre-Docling Markdown as-is.
    use_llm: When true and an LLM client is configured, enrich markdown/text
        output.
    llm_prompt: Optional system prompt override for the enrichment call.

Returns:
    The persisted and refreshed ``Conversion`` record.

Raises:
    HTTPException: 500 if yt-dlp is not installed; 422 if metadata extraction
        fails; 500 (after a session rollback) if conversion or persistence
        fails.

NOTE(review): unlike ``convert_file``, ``output_format`` is not validated
against ``OUTPUT_FORMATS`` here.
NOTE(review): yt-dlp, ``urllib.request.urlopen`` and ``converter.convert`` are
called synchronously inside this coroutine.
"""
try:
import yt_dlp # noqa: PLC0415
except ImportError:
raise HTTPException(status_code=500, detail="yt-dlp not installed")
# Metadata-only run: nothing is downloaded ("skip_download" here and
# download=False below); subtitle info is requested for "vi" and "en".
ydl_opts = {
"quiet": True,
"skip_download": True,
"writesubtitles": True,
"writeautomaticsub": True,
"subtitleslangs": ["vi", "en"],
"outtmpl": "%(id)s.%(ext)s",
}
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=False)
except Exception as e:
raise HTTPException(status_code=422, detail=f"yt-dlp error: {e}")
title = info.get("title", "YouTube Video")
description = info.get("description", "") or ""
# Falls back to "uploader" when "channel" is absent.
channel = info.get("channel", info.get("uploader", ""))
duration = info.get("duration_string", "")
upload_date = info.get("upload_date", "")
view_count = info.get("view_count")
chapters = info.get("chapters") or []
# Build markdown from available metadata
lines = [f"# {title}", ""]
meta_rows = []
if channel:
meta_rows.append(f"**Kênh:** {channel}")
if duration:
meta_rows.append(f"**Thời lượng:** {duration}")
# An 8-character value is split 4/2/2 and joined with dashes
# (presumably YYYYMMDD -> YYYY-MM-DD — confirm against yt-dlp's output).
if upload_date and len(upload_date) == 8:
meta_rows.append(f"**Ngày đăng:** {upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}")
if view_count is not None:
meta_rows.append(f"**Lượt xem:** {view_count:,}")
meta_rows.append(f"**URL:** {url}")
lines.extend(meta_rows)
lines.append("")
# Subtitles/transcript
# Language preference is "vi" then "en"; within a language, manual
# subtitles are preferred over automatic captions.
subtitles = info.get("subtitles") or {}
auto_subtitles = info.get("automatic_captions") or {}
transcript_text = None
for lang in ("vi", "en"):
tracks = subtitles.get(lang) or auto_subtitles.get(lang)
if tracks:
# Find a json3 or srv3 track to extract plain text
for track in tracks:
if track.get("ext") in ("json3", "srv3", "ttml", "vtt"):
try:
import urllib.request
with urllib.request.urlopen(track["url"], timeout=15) as r:
raw = r.read().decode("utf-8", errors="ignore")
# Strip VTT timestamps for vtt format
if track.get("ext") == "vtt":
cleaned = re.sub(r"\d{2}:\d{2}:\d{2}\.\d+ --> .*", "", raw)
cleaned = re.sub(r"^\d+$", "", cleaned, flags=re.MULTILINE)
cleaned = re.sub(r"<[^>]+>", "", cleaned)
transcript_text = re.sub(r"\n{3,}", "\n\n", cleaned).strip()
else:
# NOTE(review): json3/srv3/ttml payloads are used raw, without
# being parsed into plain text.
transcript_text = raw
break
except Exception:
# NOTE(review): fetch/decode failures are swallowed silently and the
# next track is tried; nothing is logged.
pass
if transcript_text:
break
# Body section: transcript if one was fetched, otherwise the description
# truncated to 3000 characters, otherwise a placeholder note.
if transcript_text:
lines += ["## Transcript / Phụ đề", "", transcript_text, ""]
elif description:
lines += ["## Mô tả", "", description[:3000], ""]
else:
lines += ["## Ghi chú", "", "_Không có transcript hoặc mô tả._", ""]
if chapters:
lines += ["## Chapters", ""]
for ch in chapters:
start = ch.get("start_time", 0)
# Minutes are not folded into hours, so starts past one hour
# render with a minute count of 60 or more.
m, s = divmod(int(start), 60)
lines.append(f"- **{m:02d}:{s:02d}** — {ch.get('title', '')}")
lines.append("")
markdown_text = "\n".join(lines)
# Write temp file and run through Docling
video_id = info.get("id", "youtube")
# delete=False: the file must outlive the `with` so the converter can open
# it by path; it is removed in the `finally` below.
with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp:
tmp.write(markdown_text)
tmp_path = tmp.name
try:
result = converter.convert(tmp_path)
doc = result.document
# No page count is recorded for URL conversions.
page_count = None
if output_format == "markdown":
content = doc.export_to_markdown()
elif output_format == "json":
content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
elif output_format == "html":
content = doc.export_to_html()
else:
# Any other format (including "text") stores the Markdown built above,
# not the Docling export.
content = markdown_text
# llm_used records that enrichment was attempted; `_llm_enrich` itself
# returns the input unchanged when the LLM call fails.
llm_used = False
if _llm_client and use_llm and output_format in ("markdown", "text"):
content = _llm_enrich(content, system_prompt=llm_prompt or None)
llm_used = True
# Local re-import of a name that is already imported at module level.
from app.models.ConvertModel import Conversion
record = Conversion(
filename=f"{video_id}.md",
file_type="youtube",
output_format=output_format,
content=content,
page_count=page_count,
llm_enabled=llm_used,
)
db.add(record)
await db.commit()
await db.refresh(record)
return record
except Exception as e:
# Any failure in conversion or persistence is rolled back and surfaced as a 500.
await db.rollback()
raise HTTPException(status_code=500, detail=str(e))
finally:
# Always remove the temporary Markdown file.
os.unlink(tmp_path)
# -----------------------------------------------------------------
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
"html", "htm", "jpg", "jpeg", "png",
"tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc"
}
OUTPUT_FORMATS = {"markdown", "json", "html", "text"}
def _allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded document with Docling and store the result.

    Args:
        file: The uploaded file; its extension must be in ``ALLOWED_EXTENSIONS``.
        db: Async SQLAlchemy session used to persist the record.
        output_format: One of ``OUTPUT_FORMATS``.
        use_llm: When true and an LLM client is configured, enrich
            markdown/text output through ``_llm_enrich``.
        llm_prompt: Optional system prompt override for the enrichment call.

    Returns:
        The persisted and refreshed ``Conversion`` record.

    Raises:
        HTTPException: 422 for a missing/disallowed filename or an unsupported
            output format; 500 (after a session rollback) if conversion or
            persistence fails.
    """
    # An upload can arrive without a filename; treat that as "not allowed"
    # (422) instead of crashing on None in the extension check.
    filename = file.filename or ""
    if not _allowed_file(filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(
            status_code=422,
            detail=f"Output format not supported. Supported: {', '.join(sorted(OUTPUT_FORMATS))}"
        )

    suffix = os.path.splitext(filename)[1]
    file_type = suffix.lstrip(".").lower()

    # Read the upload before creating the temp file: with delete=False, a read
    # failure inside the `with` would leave an orphaned file behind.
    payload = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    try:
        result = converter.convert(tmp_path)
        doc = result.document
        page_count = len(doc.pages) if hasattr(doc, "pages") and doc.pages else None

        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        elif output_format == "text":
            # Plain text is the Markdown export with heading markers and
            # bold/italic asterisks removed.
            # NOTE(review): the heading pattern is unanchored, so it also
            # removes '#' runs that appear inside body text.
            content = doc.export_to_markdown()
            content = re.sub(r"#{1,6}\s?", "", content)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)

        # LLM enrichment — only for markdown / text output, and only if requested
        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        record = Conversion(
            filename=filename,
            file_type=file_type,
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        # Chain the cause so the original traceback stays visible in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always remove the temporary copy of the upload.
        os.unlink(tmp_path)
async def get_conversion(conversion_id: int, db: AsyncSession) -> Conversion:
    """Fetch the conversion whose id equals *conversion_id*.

    Raises:
        HTTPException: 404 when no such record exists.
    """
    query = select(Conversion).where(Conversion.id == conversion_id)
    found = (await db.execute(query)).scalar_one_or_none()
    if found:
        return found
    raise HTTPException(status_code=404, detail="Conversion not found")
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to *limit* conversions, most recently created first."""
    newest_first = select(Conversion).order_by(Conversion.created_at.desc())
    rows = await db.execute(newest_first.limit(limit))
    return rows.scalars().all()
async def delete_conversion(conversion_id: int, db: AsyncSession) -> dict:
    """Delete the conversion with the given id and commit.

    Returns:
        A dict with a single "message" key confirming the deletion.

    Raises:
        HTTPException: 404 when no such record exists.
    """
    # Reuse the shared lookup so the "not found" handling lives in one place.
    record = await get_conversion(conversion_id, db)
    await db.delete(record)
    await db.commit()
    return {"message": f"Conversion {conversion_id} deleted"}

3
emails/input/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Ignore all input files (raw emails); keep the directory itself
*
!.gitignore

3
emails/output/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Ignore all output files (converted Markdown); keep the directory itself
*
!.gitignore

2095
example/1.eml Normal file

File diff suppressed because it is too large Load Diff

10546
example/1.md Normal file

File diff suppressed because it is too large Load Diff

97
example/1_extraction.md Normal file
View File

@ -0,0 +1,97 @@
# IP Supply — Shipment extraction from 1.md (4 prompts)
Compiled from a full read of `example/1.md` (all 10,546 lines). SNs quoted exactly.
## Summary counts
- **Prompt 1 — Shipped:** ~145 shipment/return sections, ~700+ individual SNs (each has a `t#` tracking or is a confirmed outbound PO/return). **Kept/held:** ~12 explicit groups.
- **Prompt 2 — Items with issue:** ~24 groups / 30+ faulty/troubleshooting SNs.
- **Prompt 3 — Transferred to AUS:** ~10 "Import Australia" shipments, 150+ SNs.
- **Prompt 4 — xxxx-xxxx-xxxx codes (shipped/outbound):** 27 codes.
> Note: prompt 1's full per-SN list is very large. This file captures the analytical lists (2/3/4) in full plus prompt-1 kept items and the shipment index. For every SN under each shipment, see the corresponding section in `1.md`.
---
## Prompt 1 — Kept / NOT shipped
- QSFP-40G-SR-BD lot `10-2945-02` (TNT Import) — "total 8 units only, the rest keep in US", "*not here yet"
- 4x JNP-SFP-25G-SR — "*not here yet"
- C9200-STACK ×28 + STACK-T4-50CM ×10 — "keep aside, will ship AUS" (see AUS list)
- C9120AXE-B FJC28081EXK — "already shipped to CMS last wk" (do not reship)
- QFX top-cover kept aside (for C9120AXE-B FJC27151F4C return)
- QSFP-40G-LR4 SPC23020002 — "can't find it here"
- M.2 SSDs / EM7455/EM7430 355813079879315 — "→ next shipment" (held)
- WS-C2960X-48TS-L FOC2111S08L — NOB, staged for AUS
- MR86-HW Q3AE-DMMG-D9KV — "Do NOT ship"
## Prompt 2 — Items with an issue (PID | SN | reason)
- C9300-24P-A/E | FCW2303DHNT | "van bi loi"; earlier "BOOT FAIL W..."
- C9120AXE-B | FJC27041KKE | TS24, "recover them"
- C9120AXE-E | FGL2812LK6C | TS12
- C9120AXE-B | FJC27102CXZ | "no console / chuyen sang line khac"
- SUP32/PFC2/MSFC2 | SAL1338Z2TC / SAL1339ZJD8 / SAL1339ZGPX | "Traceback ... Error75"
- C9300-24P-E | FCW2238S0BA | "line 5 bi loi"
- NIM-2GE-CU-SFP | FOC23503PHV, FOC24331W30 | bad → returned
- MV86-HW | Q3AE-EGKE-B3X2, Q3AE-SZD8-3PPD, Q3AE-6S3U-Y6N5, Q3AE-VWFF-6MGG, Q3AE-8P53-P74T, Q3AE-SFXM-7CK2 | "failed test / LED orange"
- C8300-1N1S-6T | FLM290910DT, FLM280510M6 | "RFID module broken, can't fix"
- WS-SUP32-GE-3B | SAL09402986 | "damaged power connector" (returned)
- MR86-HW (6 units, no SN) | — | "LED orange when tested" (OSI PO-25694)
- N5K-C5596UP | (TS17/TS21) | "no console"
- C1111-8PLTEEAWB (2) | — | "no console after restart"
- WS-C3750X-48P/24P-S | batch | "most were bad, scratches"
- DS-C9148T-K9 | JPG2603008A | "can't unbend chassis/ports"
- CISCO2821 | FTX1134A22A, FTX0950C0GD, FTX1418AKP2 | missing FAN/PSU/faceplate
- ISR4331/K9 | FLM1940W0QB | "PASS WITH WARNING"
- WS-C3560G-24PS-S | FOC1124Z45D | "no console, no faceplate"
- C9120AXE-B (return) | FGL2812LPJG, FJC27151F4C | return; one "without top cover from AUS"
## Prompt 3 — SN transferred to AUS / Australia
(dedicated "Import Australia" shipments — key SNs)
**C9200-STACK batch (t#1ZJW78500439236591):**
SFP-10G-LRM: FNS22050YND, FNS23051B30, FNS23051B32, FNS230514Q4, FNS23051B20, OPM23170H22
C9200-STACK (28): JAE234004PC, JAE234004T3, JAE25360Z1S, JAE25360ZXV, JAE25360ZXW, JAE25360WQ5, JAE25210K5G, JAE25360U84, JAE25360U7W, JAE26440B4B, JAE26402RM2, FDO24260JB2, JAE2610039Z, JAE25210GZQ, FDO19321P8C, JAE25360VU0, FDO20120G3J, JAE23051476, JAE230601NH, FVH27510R69, JAE25161CTG, FDO24250TUS, FDO24250T0B, FDO24260G32
STACK-T4-50CM (10): LCC2428GA36, LCC2618GGEJ, MOP2732A3H9, LCC2246GX01, LCC2625GFEE, MOC261800C4, MOC261801GT, MOC261700NW, MOP2732A3EX, LCC2320GCQW
AIR-AP1562E-B-K9 FJC263610EP; EX-SFP-10GE-SR SCJ19YP0M8, SCJ19YP0V4, SCJ19YP02N, SAZH1TSH
**Import Australia UPS (t#1ZJW78500415223596):** UCSX-TPM-002C FCH270476WX, FCH26177EG0, FCH270476HD, FCH251572R7; PAN-SFP-PLUS-SR MU90S5H, MU60V2X, MU20TFU, MU211ZP; NIM-2GE-CU-SFP FOC250624L6, FOC250624QT, FOC24400EXW, FOC21452FDH, FOC263359EC; JNP-SFP-25G-SR S1A1M31A751BAB, S1A1M31A7517WR, S1A1M31A751BVZ, S1A1M31A751BFQ; NVME 22293A26BAF6, 22293A447730; NIM-ES2-8 SFOC194350CH, SFOC24484UR9
**Import Australia UPS (t#1ZJW78500423326317):** PWR-80W-AC DAB192754P8, DAB194450WP; MS120-8LP Q2BX-YLYW-556A; STACK-T4-50CM LCC2920G58F; XEON-6342 2A2109BF5F0F8383RKXA, 2A2134BF51728CD8RKXA, 2A226ABF4F28BBEARKXA, 2A2076BC995F5353RKXA, 3A4819C0FC3F792ARKXA, 3A481A1EB9C113CFRKXA; C9500-40X FJC254223DA
**Import Australia UPS (t#1ZJW78500431774363):** GLC-BX-D 080322BD001; N9K-C93180YC-EX FDO2252023P, FDO21482ECD, FDO21111F2A; WS-F6K-PFC3B SAD092705FY
**Import Australia Fedex/TNT (t#873308530323):** C9300-24T-A FJC2333S06H, FJC2333S05T; C9200-STACK (20) FVH29100T7W, FVH29100X6E, FVH2910151B, FVH29100XBB, FVH29100VEM, FVH29100TH7, FVH29100VQ1, FVH29100U1P, FVH29100TNL, JAE23180HMY, JAE25360VTV, FDO23290KA5, FDO24260J5C, JAE241108B5, FDO23290LAT, JAE26360XUR, JAE26382LDW, JAE25360VD8, FDO22263HYZ, FDO22040VZ4; C9200L-48P-4X-E FOC2608054S; WS-SUP32-GE-3B SAL12330S7D
**TNT/Fedex (make New in AUS, T#873256774277):** C9300-24T-E-WS FOC2721YA6K, FOC2721Y72K, FOC2648YCCD, FOC2721YA6F; WS-C2960X-48TS-L FOC2111S08L; C9120AXI FJC25432C7X, FJC25432CB7, FJC25432CA2, FJC25432C7S
## Prompt 4 — xxxx-xxxx-xxxx hyphenated codes (shipped/outbound)
| code | context |
|---|---|
| 17-14693-01192 | ship PWR-115W-AC (DAB264260LD) |
| 10-2945-02 | TNT Import — QSFP-40G-SR-BD lot (8 shipped) |
| 10-14642-61136 | return C9300-24P-A (FJC2340S02R) |
| 14-14726-39035 | ship MR36-HW (Q3AJ-49HS-7EJT) |
| 04-14738-90912 | ship IPO-500-EXP-MOD (09FC19500851) |
| 18-14713-58108 | ship 2x CP-840 (TCL251100KM, TCL2511007X) |
| 02-14762-32221 | ship MA-MNT-MR-17-MR36 (CCA45679958) |
| 17-14713-99371 | ship C9136I-ROW (FGL2702LXZV) |
| 05-14746-55384 | ship VIC3-4FXS/DID (FOC14317P5Z) |
| 02-14656-21410 | return WS-SUP32-GE-3B set |
| 23-14741-66101 | ship AIR-ACC1530-PMK1 (JNV23453733) |
| 18-14577-09030 | return 2x C9120AXE-B (FJC27141WFH, FJC24441S1F) |
| 05-14777-19977 | ship 2x PVDM2-64 (FOC123546Z2) |
| 03-14766-07874 | ship GLC-ZX-SM-RGD (SPC1432030T) |
| 08-14781-17608 | ship 2x PWR-C1-440WDC (ART2210FCJL, ART2210FCDR) |
| 27-14612-29455-P01 | AUS — Micron MTFDDAV240TDS |
| 22-14679-31676 | return NIM-2GE-CU-SFP (FOC24331W30) |
| 26-14668-35871 | return NIM-2GE-CU-SFP (FOC23503PHV) |
| 17-14781-37655 | ship EX-UM-4SFPP-MR (XS0221420145) |
| 14-14782-66558 | ship 2x WS-C3560G-24PS-E |
| 18-14787-11692 | ship VIC3-4FXS/DID (FOC18375W7G) |
| 16-14788-52149 | ship VIC3-4FXS/DID (FOC16036FFK) |
| 21-14778-64209 | ship PVDM2-64 (FOC13313DX8) |
| 14-14784-46115 | ship VIC3-4FXS/DID (FOC17361MNE) |
| 13-14800-40754 | ship ASR-9902-2P-KIT |
| 15-14694-40587 | return 2x MR86-HW |
| 03-14718-50867 | return 2x PWR-C1-350WAC-P |
Excluded (incoming, not outbound): 12-14747-30779 (eBay receiving), 24-14738-31605 (return never completed).

View File

@ -0,0 +1,357 @@
# IP Supply — Joe Bui → Daniel Vu Shipment Analysis
Compiled from a full read of `example/1.md` (all 10,546 lines). SNs are quoted exactly as written (including inconsistent lowercase and leading-`S`/`X` prefixes). Items #2, #3, #4 are cross-referenced against the shipped context.
---
## 1. SN Shipped vs SN Kept
### A) SHIPPED (grouped by section — each has a `t#` tracking number or is a confirmed outbound customer/PO shipment)
**1x C9300-24P-E → Network Craze (BTV) PO-P30531 — `t#1ZJW78500211632137`** (PID C9300-24P-E)
FOC2427U033, LIT24223GHD (PWR-C1-715WAC), NID2425W1DA4, NID2421W0MS6, NID2410W0EUZ
**1x C9200CX-12P-2X2G-E → Summit 360 PO-75588 — `t#1ZJW78500335076497`**
FJC28281TSH
**6x MA-MNT-MR-16 → OSI Global PO-25740 — `t#1ZJW78500318099129`**
BCAA0004946, BCAA0004888, BCAA0004880, BCAA0004892, BCAA0004968, BCAA0004878
**15x PWR-C1-715WAC-P → Curvature PO-1360562-I — `t#381683654045`**
ART2217P09A, DCC2304C1DA, DCC2805C48Y, DCC2823C1YN, ART2302P9CR, ART2302P9Y1, ART2302P9T4, ART2250P7JZ, DCC2530C4XE, DCC2541CJUN, DCC2541CGZK, DCC2541CJYF, DCC2541CJY7, ART2350P4N2, ART2302P9PF
**2x N540-RCKMT-19 → Procurri PO-AT 109692 — `t#872413464140`** (no unit SNs listed)
**40x GLC-BX-D / GLC-BX-U → DHD PO-72125 — `t#1ZJW78500335530889`** (PID GLC-BX-D / GLC-BX-U)
ACW2853072T, ACW285306VG, ACW28530746, ACW2853074J, ACW2853074F, ACW285306V2, ACW285306XH, ACW28530757, ACW28530753, ACW2853074B, ACW2853073W, ACW285306XE, ACW285306WU, ACW2853073N, ACW2853072B, ACW28530755, ACW2853072V, ACW285306WW, ACW2853074K, ACW28530747, ACW285306XQ, ACW28530750, ACW2853072M, ACW2853073Q, ACW285306X0, ACW285306WZ, ACW285306X9, ACW285306WN, ACW2853075D, ACW2853074S, ACW28530749, ACW285306VQ, ACW2853075F, ACW2853075E, ACW285306V1, ACW28530751, ACW285306XJ, ACW285306WH, ACW28530754, ACW28530743, NDG21190929, NDG21364652, NDG21190449
**15x C9300-NM-8X → Curvature PO-1360736-I — `t#391683375950`**
FJZ27177878, FOC250718RD, FJZ23141KYB, FOC21351WND, FJZ243504TU, FJZ23090CJC, FJZ24131BYC, FOC2243931X, FJZ23141MB3, FJZ23300DKJ, FOC221047UU, FJZ264676Q6, FJZ2436096X, FOC22442R41, FOC2243539R
**5x JNP-FAN-1RU → Terabit PO-2631 — `t#872509139509`** (new SNs SBCFX…, old SNs listed too)
New: SBCFX3650, SBCFX3630, SBCFX3659, SBCFX3654, SBCGB5835 | Old: WF0216430251, WF0216430522, WF0216430139, WF0216430127, JNV23455010
**TNT Import Economy — `t#872479468286`** (mixed AUS/import box; see also list 3)
PWR-C49E-300AC-F: QCS18391UD9, PAC18385DBZ | 14x C9120AXI: FJC25432C6Y, FJC25432C5G, FJC25432C9T, FJC25432CA1, FJC25432C7Q, FJC26361XQC, FJC25432C7T, FJC25432C5D, FJC25432C3D, FJC25432CAK, FJC25432C4F, FJC25432CBA, FJC25432C73, FJC25432C74 | QSFP-40G-SR-BD lot `10-2945-02` (see list 4) | C9200L-48P-4X-E: JAE263024R0, JAE263023SP
**10x AIR-ACC1530-PMK1 → Curvature PO-1360802-I — `t#381737217720`**
JNV23453768, JNV23453766, JNV23453765, JNV23453767, JNV23453769, JNV23453771, JNV23453773, JNV23453779, JNV23453775, JNV23453777
**7x GLC-BX-U → DHD PO-72125 — `t#1ZJW78500326506931`**
SNDG19490387, SNDG19490597, SNDG19490604, SNDG19490723, SNDG19490316, SNDG19490724, SNDG19491086
**100x C9120AXE-B-REF → CMS PO-0272806 — `t#872507852844`** (28x, PID C9120AXE-B)
FJC27081CA2, FJC24381DM7, FJC24381E8W, FJC24311C2L, FJC24381DV4, FJC24401SHB, FJC250618AX, FJC24331ATB, FJC242916V0, FJC24381DV2, FJC250618KG, FJC2428180H, FJC24381DV9, FJC27102DKZ, FJC24511LK7, FJC24381DUJ, FJC27081CAY, FJC27081CA0, FJC27081CAX, FJC27081CA1, FJC25331KQW, FJC24401SKB, FJC242916W2, FJC242612Y5, FJC242817ZG, FJC270918G9, FJC27043K60, FJC271823UE
**1x C892FSP-K9 → DHD PO-72258 — `t#1ZJW78500336414128`**
FGL1741211R (chassis), DTH2022D4TT (PSU)
**2x FAN-T2 → NetworkTigers PO-260601-5** (no t# but confirmed outbound blind ship)
ACI232100KY, ACI232104V7
**4x MA-MNT-MR-16 → OSI Global PO-25782 — `t#1ZJW78500312990541`**
BCAA0004882, BCAA0004886, BCAA0004685, BCAA0004681
**1x PWR-115W-AC (NIB) → ship** `17-14693-01192` — DAB264260LD (see list 4)
**12x GLC-FE-100FX-RGD → Curvature PO-1360771-C — `t#381735687048`**
ACW25411AVL, ACW25411AVT, ACW25411AVU, ACW25411AVW, ACW25411AVM, ACW29350R80, ACW26092B5L, ACW25401EX5, ACW25401EWT, ACW25401EWS, ACW29350R5B, ACW29350R61
**2x C9300-24P-A → PIVIT PO-256093/255979 — `t#872564644148`**
FCW2426D13K, FOC2432L240 | PWR: LIT23172YE6, LIT241822YE | FAN: NID2425W1DUR, NID2425W1DA9, NID2325W096X, NID2425W1D7F, NID2425W1D7G, NID2410W0EUY
**2x C9200L-48P-4X-E → OSI Global PO-25828 — `t#1ZJW78500338590210`**
JAE254300DF, JAE254300C2 | PWR-C5-1KWAC: DCI25251CV2, ART2526D03V
**3x C9200CX-12P-2X2G-E → OSI Global PO-25601 — `t#1ZJW78500323402987`**
FJC28311W43, FJC29281QU8, FJC28281TVG
**2x C9500X-ACCKIT-19I → Curvature PO-1360186-I — `t#381765230154`**
JNV23455406, JNV23455408
**2x PWR-500-DC-F → DHD PO-72280 — `t#1ZJW78500333086177`**
L101QR002SANP, L375SL00J2AHP
**1x C9500X-ACCKIT-19I → Network Craze PO-P30393 — `t#1ZJW78500126302164`**
JNV23455398
**48x C9120AXE-B-REF → CMS PO-0272591 — `t#872563690107`** (16x)
FJC24381E2G, FJC2420116U, FJC25111HC6, FJC25101JED, FJC271823UB, FJC271823UC, FJC242916W1, FJC270918G7, FJC24381DU8, FJC242916UX, FJC27081CAU, FJC262921GQ, FJC24401SL1, FJC242916V3, FJC262922NZ, FJC26261DEM
**1x C9300-24P-A return** `10-14642-61136` (return context — see list 4): FJC2340S02R + C9300-NM-8X FJZ23370ZN8, PWR LIT23272YEP, PWR ART2327D11J, FAN NID2233W0GF0/NID2233W0GEZ/NID2233W0GCP
**5x AIR-ACC1530-PMK1 → Curvature PO-1360813-I — `t#381764510886`**
JNV23453780, JNV23453782, JNV23453784, JNV23453786, JNV23453788
**3x MA-MNT-MR-17 → PIVIT PO-256336 — `t#1ZJW78501322986956`**
Bracket: JNV23455338, JNV23455340, JNV23455342 | Screw-kit: JNV23455022, JNV23455014, JNV23455024
**10x SFP-10/25G-LR-S → Summit 360 PO-75910 — `t#1ZJW78500335396543`**
SINL2926020T, SINL292605CM, SINL292602ZQ, SINL292605JM, SINL2926035S, SINL292604R8, SINL292602KR, SINL29260305, SINL292605K0, SINL292605J9
**4x NXA-PAC-1100W-PE2 → ChannelWorks/Relutech PO-14846 — `t#872562736621`**
ART2432F5YX, ART2432F5XJ, ART2336F4BH, ART2432F5TX
**14x MEM-C8200L-8GB → PO-23836** (blind ship; arrived pack 1ZW5715F0492097333)
80AD011836824D7C05, 80AD01202343D248EC, 80AD01174932A1F851, 80AD011836824D7BFA, 80AD011836824D7D98, 80AD011836824D7D9C, 80AD01202343D2493A, 80AD01202343D248C4, 80AD01202343D2492F, 80AD01202343D248BD, 80AD011836824D7D9E, 80AD011836824D7C27, 80AD011836824D7C03, 80AD011836824D7C0F
**1x CISCO2511-DC → Strategic Telecom PO-5602**: chassis 251100148 / line16 SN 16203226; DC PSU from CISCO2511-DC 251362180. *(tracking t# ambiguous in source — see caveats)*
**1x C9300-48P-E → OSI Global PO-25874 — `t#1ZJW78500334596641`**
FCW2411G0D2, DCC2352C5BZ (PWR), NID2403W0WGW, NID2403W0WGX, NID2403W0WGR (FAN)
**1x C9K-PWR-930WDC-R + CAB → Alliance Networking PO-13500 — `T#872621854979`**
LIT23142VGU, JNV23455373
**1x AIR-CT2504-50-K9 → InTown Suites PO-1511** (no t#): PSZ19311FL7, PSU dab192152nt
**1x AIR-CT2504-50-K9 → InTown Suites PO-1512** (no t#): PSZ19311BTJ, PSU DAB184550PZ
**1x MEM-4400-8G → Curvature PO-1360805-C — `t#381792604872`**: MT18KSF1G72AZ-1G6E1
**1x WS-C3560CX-8XPD-S → Network Craze PO-P30632 — `t#1ZJW78500234219032`**: FOC1943W1GE
**9x MA-MNT-MR-17 → CXtec PO-D1238993 — `t#1ZJW78501207890357`**
Bracket: JNV23455339, JNV23455341, JNV23455343, JNV23455345, JNV23455347, JNV23455301, JNV23455303, JNV23455302, JNV23455304 | Screw-kit: JNV23455230, JNV23455403, JNV23455401, JNV23455399, JNV23455397, JNV23455409, JNV23455411, JNV23455407, JNV23455405
**4x MA-MNT-MR-17 → Network Craze PO-P30602 — `t#1ZJW78500234078220`**
Bracket: JNV23455344, JNV23455346, JNV23455348, JNV23455337 | Screw-kit: JNV23455243, JNV23455241, JNV23455245, JNV23455247
**1x C3KX-PWR-440WDC → Tri Tower PO46160 — `t#872619441290`**: ART1851F57V
**1x NXA-FAN-160CFM-PE → ChannelWorks PO-14860 — `t#872619260338`**: ACI2302029Q
**1x DS-C9132T-K9 → PIVIT PO-256275 — `t#872615037247`**: JPG2303002Z; PSU ART2311FQQ2/ART2311FQQ3; FAN DCH2309RBW6/DCH2309RBU1/DCH2309RBWC/DCH2309RBUK
**2x ISR4431/K9 → Procurri PO-AT 110114 — `t#872670021815`**: FJC2216A0RJ, FJC2216A0NW; PSU LIT220636NW/LIT220636LD
**1x FACE-C9200L-48P-4X → P3 Systems PO-16605 — `t#872669917879`**: jnv23455079
**2x ONS-SI-GE-EX → OSI PO-25799 — `t#1ZJW78500319978178`**: OPC14490579, OPC14490266
**10x CP-840-BUN-K9 → Curvature PO-1359761-C — `t#381809072478`**: TCL283100SH, TCL283101CD, TCL283100SB, TCL283100W4, TCL283101KW, TCL28310148, TCL2831014A, TCL283101D7, TCL2831015E, TCL283101EK; CP-840-BAT VK24061166691, VK24061166724, VK24061161095, VK24061166799, VK24061167605, VK24061167630, VK24061166466, VK24061166507, VK24061166679, VK24061160412
**1x ISR1100-4G → Cuadrant PO-20029** (no t#): FCH2324L000, PSU DAB2630Y34N
**5x MA-MNT-MR-16 → OSI PO-25913 — `t#1ZJW78500308687566`**: BCAA0004684, BCAA0004696, BCAA0004690, BCAA0004686, BCAA0004697
**2x WS-C3850-24XU-E → Curvature PO-1360972-I — `t#381820077820`**: FCW2049D07B, FOC2022X1EY; PWR LIT1928181A/LIT193501E6
**1x C9200CX-12P-2X2G-E → Summit 360 PO-75588 — `t#1ZJW78500306899233`**: FJC28311W9J
**8x ASR1001-X-PWR-DC → Whaley PO-11876 — `t#1ZJW78500312491643`**: DTN1826W0B4, DTN2423W00N, DTN1826W0A8, DTN2423W02W, DTN2423W03U, DTN1826W0BK, DTN2423W00R, DTN1826W08S
**1x C9200L-48P-4X-E → PIVIT PO-256499 — `t#872810044838`**: JAE254300TZ; PWR ART2518D02V
**4x C9K-PWR-930WDC-R → Alliance PO-13501 — `t#872814251278`**: LIT23292H6V, LIT23142UVQ, LIT23142VEP, LIT23292GZ5; CAB JNV23455327, JNV23455376, JNV23455375, JNV23455374
**2x PWR-C1-440WDC → OSI PO-26035 — `t#1ZJW78500310136665`**: ART2210FCD6, ART2045F6AN
**1x MR36-HW → ship** `14-14726-39035`: Q3AJ-49HS-7EJT; MA-MNT-MR-17-MR36 bracket JNV23455306, screw JNV23455232
**1x IPO-500-EXP-MOD (ship)** `04-14738-90912`: 09FC19500851
**32x C9120AXE-B-REF → CMS PO-0272591 — `t#872795599363`** (14x): FJC2629226K, FJC26261MLJ, FJC271823UD, FGL2812LK6C, FJC253813PF, FJC253813N5, FJC242817ZV, FJC242916WK, FJC270918FR, FJC262921HB, FJC26291G8Q, FJC262921KW, FJC24511N52, FJC26261DF0
**2x CP-840 (ship)** `18-14713-58108`: TCL251100KM, TCL2511007X; BAT VK20111200194, VK21010862870
**1x NCS-5501 → Alturna PO-3029066 — `t#1ZJW78500421598260`**: FOC2240R0AT; PSU POG22138QA8/POG22138Q4A; FAN fox2240p4lk/fox2240p4ld
**2x CP-840-BUN-K9 → Curvature PO-1359761-C — `t#381920793553`**: TCL2831014P, TCL2831013M; BAT VK24061167091, VK24061167651; CP-800-USBCH BCAA0003455, BCAA0003454
**2x C9300-NM-2Y → PICS POUS021998 — `t#1ZJW78501219294694`**: FVH27311HXT, FVH271412E4
**12x MA-MNT-MR-17 → Network Craze PO-P30763 — `t#1ZJW78500212235289`**: Bracket JNV23455305/07/09/11/13/15/17/19/21/23/08/10; Screw-kit JNV23455248/42/46/38/44/31/29/40/35/33/06/04
**22x GLC-BX-U → DHD PO-72125 — `t#1ZJW78500303941272`**: ACW260317J1, ACW253814ZC, ACW22420VHX, ACW22360JX5, ACW22280L75, ACW25300BBS, ACW260317HU, ACW25300BC6, ACW260317HP, ACW25300BBL, ACW260317HR, ACW254116WP, ACW260317HL, ACW253814Z8, ACW260317HM, ACW253814ZK, ACW260317HK, SACW223704XE, SACW222004N6, SACW22320433, SNDG21330838, SNDG21316271
**1x C9136I-ROW (ship)** `17-14713-99371`: FGL2702LXZV
**1x C9K-T1-FANTRAY → CentricsIT PO-PPO161840 — `t#872868140028`**: ACI240502AL
**1x VIC3-4FXS/DID (ship)** `05-14746-55384`: FOC14317P5Z
**3x CW9166I-MR → OSI PO-26130 — `t#1ZJW78500339659109`**: WNH27251NKD, WNH27132J0E, WNH272727QY
**5x FAN-T2 → P3 Systems PO-16613 — `t#873284420010`**: NID2322W0YG1, NID2322W0X6Z, NID2321W0J16, NID2322W0X6V, NID2543W12ZR
**1x IE-3300-8T2S-E → LSI PO189062 — `T#872974553634`**: FCW2538YNGA
**2x C9300-NM-8X → DHD PO-72397 — `t#1ZJW78500305025320`**: FOC21351W2T, FOC22311G6R
**1x ISR4321-AX/K9 → OSI PO-26102 — `t#1ZJW78500329476692`**: FLM2235W0XR; PSU PST22398CLI
**12x C9300-NM-8X → OSI PO-26122 — `t#1ZJW78500313613714`**: FOC241359GB, FJZ2519DM8M, FJZ25192EFE, FOC22363USG, FJZ262617SE, FOC25201VKC, FOC25201W04, FOC25201VE1, FOC23325Q9G, FOC25212688, FOC233148RN, FOC2521268E
**2x C9200CX-12P-2X2G-E → CXtec PO-D1239684 — `t#1ZJW78501218435506`**: FJC28311VYA, FJC28281TSG
**30x GLC-LH-SMD → Curvature PO-1361175-I — `t#381970115329`**: SAVJ205030ZH, SAVJ205030G6, SAVJ205030J5, SAVJ205030ZF, SAVJ205030JC, SAVJ205030JB, SAVJ205030S2, SAVJ205030ZZ, SAVJ205030S6, SAVJ19523TYX, SAVJ205030HN, SAVJ205030ZX, SAVJ205030Z5, SAVJ205237CE, SAVJ21093372, SAVJ19523187, SAVJ1952318U, SAVJ1952319G, SAVJ19523W6F, SAVJ2018345P, SAVJ1952325Z, SAVJ1952317W, SAVJ19523174, SAVJ212830YC, SAVJ205030G1, SAVJ205030H6, SAVJ205030LX, SFNS17290RWE, SFNS17290RVC, SFNS17290KG0
**20x PWR-C1-715WAC-P → Curvature PO-1360734-I — `t#381939491149`**: LIT2732C7SJ, LIT2724CTA3, LIT2724CT2Q, LIT2724CTDQ, LIT2724CTMN, LIT2736A692, LIT2736A6LG, LIT2724CTSH, LIT2736A69F, LIT2736A6WK, DCC2503CGS5, DCC2322CA43, DCC2322CA2N, ART2250P7J0, DCC2322C9VT, ART2302P40J, DCC2709C15T, DCC2326C947, DCC2326C1G2, ART2252PEMA
**2x N540-PWR400-A → Choice IT PO-9963 — `t#872971860863`**: LIT2833A9PW, LIT2633A3FW
**1x CW9162I-MR + brackets → OSI PO-26157 — `T#1ZJW78500329611133`**: Q5AA-JNPC-SYDC
**11x JNP-FAN-1RU → Terabit PO-2637 — `t#873024723380`**: SBCFX3636, SBCFX3638, SBCFX3629, SBCFX3637, SBCFX3631, SBCFX3633, SBCFX3626, SBCFX3630, SBCFX3628, SBCFX3627, SBCFX3639
**2x C9300L-48P-4G → Optimum Data PO148044 — `T#873023423180`**: jnv23454010, jnv23454014
**2x PAN-SFP-PLUS-SR → Curvature PO-1361282-C — `t#381995817390`**: AS21M9F, MUJ1XLG
**1x DS-C9132T-K9 → PIVIT PO-256275 — `t#873022693197`**: JPG242700E5; PSU ART2343F4DR/ART2428F2FS; FAN DCH2450R9W5/DCH2450R9VX/DCH2425RWR8/DCH2425RWQ9
**4x C892FSP-K9 → DHD PO-72376 — `t#1ZJW78500318706338`**: FGL17392360, FGL1740222S, FGL1740224X, FGL1740224E; PSU DTH2022D4L2/DTH2022D4QA/DTH2022D4SH/DTH2022D4M3
**5x C8300-1N1S-6T → PICS POUK014158 — `t#6557395235`**: FLM27161325, FLM280411TT, FLM2508115B, FLM26031027, FLM284510NN; PWR-CC1-250WAC LIT2627AYQP, LIT2537AWA5, LIT2633A4Z9, LIT2633A5CK, LIT24303K7W, LIT2633A4J9, LIT2541CP77, LIT2541CQPG, LIT2633A9WZ, LIT24303KP9
**4x C9300-NM-2Y → Network Craze PO-P30694 — `t#1ZJW78500210812382`**: SFVH30060WU5, SFVH30060X35, SFVH30060WS4, SFVH30060X1N
**1x MA-MNT-MR-16 → Network Craze PO-P30630 — `t#1ZJW78500231876748`**: BCAA0004691
**5x PWR-C5-125WAC → Curvature PO-1361370-I — `t#382045593645`**: LIT224646LA, LIT25132RKW, LIT252022AP, LIT25132Q3X, ART2515DD6L
**2x MA-MNT-MR-16 → OSI PO-25945 — `t#1ZJW78500314925760`**: BCAA0004693, BCAA0004692
**1x SFP-10/25G-LR-S → Network Craze PO-P30877 — `t#1ZJW78500114844379`**: INL292605CE
**1x SFP-10/25G-LR-S → Network Craze PO-P30665 — `t#1ZJW78500208198186`**: INL2926049T
**100x C9120AXE-B-REF → CMS PO-0272806 — `t#872714085922`** (21x): FJC27102CXY, FJC27102CSS, FJC27041KKE, FJC27081CAW, FJC27041K9U, FJC27041K89, FJC25081ALQ, FJC27081CA3, FJC25101JGZ, FJC25061LWW, FJC24381DUC, FJC26291GP3, FJC242916V4, FJC24381DWD, FJC250618B7, FJC25101JJW, FJC24381E5G, FJC25101JJV, FJC27102CXZ, FJC24441S1F, FJC27141WFH
**1x ISR4431-VSEC/K9 → OSI PO-25916 — `t#1ZJW78500338315053`**: FJC1938D0HL; PVDM4-64 FOC20325TWL; PWR LIT192606KN/LIT192606UZ
**1x GLC-ZX-SM-RGD (ship)** `03-14766-07874`: SPC1432030T
**2x PVDM2-64 (ship)** `05-14777-19977`: FOC123546Z2 (then FOC12372KLW)
**2x C9130AXI-K → ship (PP Invoice 151709)**: KWC25280F99, KWC25280F98 (then KWC25280FO6, KWC25280FIZ)
**10x MA-ANT-27 → Curvature PO-1361163-C — `t#382071721362`**: JNV23455500, JNV23455498, JNV23455502, JNV23455504, JNV23455506, JNV23455508, JNV23455510, IPSNSN00406, IPSNSN00407
**10x C9500-ACCKITH-19I → Curvature PO-1361430-I — `t#1ZJW78500317898428`**: BCAA0001977, BCAA0001825, BCAA0001834, BCAA0001832, BCAA0001978, BCAA0001830, BCAA0001836, BCAA0001987, BCAA0001983, BCAA0001833
**1x PWR-C1-1100WAC + 1x C9300-48P-E → OSI PO-26259 — `t#1ZJW78500334123151`**: DTN2045V211; C9300-48P-E FOC2502L5ZA; FAN DCH2315RLG8/ACI29021K92/DCZ274108RR; PWR LIT2732C77P
**8x WS-C2960L-16TS-LL → Network Craze PO-P30928 — `t#1ZJW78500208572815`**: FCW2117A3H3, FCW2222A3HU, FCW2204A60Q, FCW2212A3JN, FCW2212A3HN, FCW2121A5NT, FCW2204A60T, FCW2212A3MP
**2x EX3400-24P → OSI PO-26193 — `t#1ZJW78500309505796`**: NW3619150623, NW0221080057; PSU 1EDX933091Y/1EDX93308VC/1EDX8450CFG/1EDXA390HGS; FAN VS0219111481/VS0219111477/VS0218393632/VS0218393630
**1x C9300-24P-E → ACE IT PO-21047**: FOC2432L2T3; PWR LIT2724CTPM; FAN DCH2340R5US; NID2325W09CK, NID2421W0MS7
**2x ASR1001-X-PWR-DC → DHD PO-72431 — `t#1ZJW78500111913844`**: DTN1826W0JX, DTN2617W05R
**15x MV32-HW → Curvature PO-1361001-C — `t#1ZJW78500129707203` + `t#382142640681`**: Q2PV-2885-9WH5, Q2PV-599H-DNSD, Q2PV-C6EG-KB2J, Q2PV-A6NU-EQX8, Q2PV-DL34-7UEG, Q2PV-WP5A-5HGM, Q2PV-KQL5-GFS3, Q2PV-VPCA-HBCB, Q2PV-LQ7U-3RFS, Q2PV-GQMH-F5LJ, Q2PV-5ZQ9-5DD8, Q2PV-VW84-729Z, Q2PV-GFSK-KDPC, Q2PV-3X3G-2QU2, Q2PV-EJ4U-9EG4
**2x PWR-C1-440WDC (ship)** `08-14781-17608`: ART2210FCJL, ART2210FCDR
**20x GLC-LH-SM + 4x C9300L-STACK-KIT + 12x PWR-C1-1100WAC-P → Recurrent PO-P54352 — `t#873227845989`**: GLC-LH-SM OPA13145127, OPC11490602, OPC13452818, OPA12488354, OPA12461629, FNS1246118P, OPC12501727, OPA15380628, OPC12471538, OPC11440385, OPC10271313, FNS12240PMS, FNS1227077E, OPC11241282, AGC1131U7JP, AGC1346UBQH, OPA14370699, FNS11300241, OPA12486510, OPC11241469 | STACK-KIT FOC26247L3T, FOC25340E9D, FOC26484MCW, FOC254126JM, FOC25412EY3, FOC26213ULK, FOC25331HE5, FOC26247F8J + STACK-T3-50CM LCC2427PS6Q/LCC2427PR5P/LCC2427PJYI/LCC2427QZDW | PWR ART2231FKAQ, DCC2241DAJ1, QCS24053HRC, ART2231FKDK, DCC2511DY95, DCC2412D6E2, DCC2241DAE8, QCS24083FK4, ART2231FKDC, DCC2241DAGY, DCC2241DAG9, DCC2241DAH2
**15x MR86-HW → OSI PO-25694 — `t#1ZJW78500303053437`**: Q3AE-AY5R-YV22, JNV23455414, Q3AE-2MKJ-V78D, Q3AE-3PRL-VL68, Q3AE-52NF-QY6E, Q3AE-RNQP-HXGT, Q3AE-9FKB-VBJ7, Q3AE-DDPF-UH9A; mounts cca45680063/065/073/151/165/163/149/157 (DO NOT ship Q3AE-DMMG-D9KV)
**1x ISR4321-AX/K9 → OSI PO-26102 — `t#1ZJW78500329796793`**: FLM2043W0ZU
**1x ACS-1900-RM-19 → Townsend PO-25552 — `t#873227456345`** (kit, no unit SN)
**13x PWR-C1-1100WAC-P → Curvature PO-1361500-C — `t#382097988184`**: DCC2412D6F5, DCC2726D600, DCC2511DUJY, ART2445P0ZL, DCC2731D677, DCC2604DF8R, DCC2412D6GA, DCC2512D6P3, DCC2452DLB1, DCC2444D5UA, DCC2224D0D3, QCS24083FF7, DCC2509DLKS
**4x STACK-T4-1M → NetworkTigers PO-260617-9**: LCC2250H0D6, LCC2249HJR3, LCC2250H0CT, LCC2249HJR4
**1x C9300-NM-2Y → Network Craze PO-P30954 — `t#1ZJW78500230187182`**: SFVH30060X33
**1x C9124AXE-B → Curvature PO-1361442-C — `t#382097919736`**: FJC2718199M
**12x SFP-10/25G-LR-S → Knowledge Computers PO-131683 — `t#873226458385`**: MZH2823028E, MZH28230287, MZH282301J2, MZH282301J1, MZH282301J0, MZH282301J7, MZH282301JG, MZH282301HH, MZH282301H3, MZH282301H4, MZH282301H5, MZH282301H6
**5x C8300-1N1S-6T → PICS POUS022509 — `t#873226023861`** (4x): FLM283610SG, FLM2837104P, FLM251010SH, FLM290910DT; PWR-CC1-250WAC LIT2537AVZ7/LIT2537AWF6/LIT2627CLY6/LIT2633A462/LIT2627CP7W/LIT2627AYNU/LIT2537AWAE/LIT2633A4MX
**4x PVDM3-256 → Datacom PO-8202**: FOC15201M24, FOC19496SRC, FOC16064Z1Z, FOC1542480P
**1x JNP-FAN-1RU → Terabit PO-2637 — `t#873282114550`**: SBCFX3604 (old WF0219412892)
**3x N6K-C6001-FAN-B → Curvature PO-1361562-C — `t#382123623753`**: FOX2052PDHN, FOX2052PDE0, FOX2052PDGZ
**4x C9300-24T-E-WS → (TNT) — `T#873256774277`**: FOC2721YA6K, FOC2721Y72K, FOC2648YCCD, FOC2721YA6F; pwr-c1-350wac-p DCC2647B46Z/DCC2721B0TT/DCC2721B0UH/DCC2721B0U3
**2x DS-C9148T-48PETK9 + PSU/FAN/rail → OSI PO-26316 — `t#1ZJW78500308687477`**: JPG2603008A, JPG27110023; PSU ART2311FQQ1/ART2311FQZW/ART2350F0R3/ART2451F407; FAN DCH2450R9UE/DCH2425RWR5/DCH21320U0Z/DCH2450R9U6/DCH2309RBTW/DCH2309RBUD/DCH2309RBTY/DCH2309RBW9; rail JNV23455439/JNV23455441
**1x C9120AXE-B → OSI PO-26027 — `T#1ZJW78500319054862`**: FJC26261DFS
**3x C9300-24P-A → Trifecta PO-309537 — `t#873281900663` + `T#873324218515`**: FCW2426D13K, FJB2314B091, FJC2322S0UF; PWR LIT2724CTLN/LIT2724CTED/LIT2724CT36; FAN DCH2302RED8/DCH2302R76V/DCH2431REXC
**2x NXK-ACC-KIT-1RU → Procurri PO-AT 111099 — `t#873281039577`**: JNV23454848, JNV23454854
**1x PWR-500-DC-F → DHD PO-72280 — `t#1ZJW78500305395654`**: L101SL000QAPP
**50x SFP-10G-LR → Curvature PO-1361607-I — `T#382142894893`**: acw260519b3, acw2605199k, acw2605199g, acw260519ak, acw2605199u, acw26091r0c, acw26091qyz, acw26091r01, acw26091qz5, acw26091qzu, acw252937ad, acw252935bd, acw252934vc, acw252933w4, acw252933w0, acw26110xwb, acw26110xwj, acw26110xxe, acw26110xx2, acw26110xwr, acw25291p6p, acw25291p75, acw25291p6s, acw25291p6a, acw25291p69, acw26110zdk, acw26110zet, acw26110zdl, acw26110zdb, acw26110zf1, acw2615189s, acw261518an, acw261518ak, acw2620042v, acw26200423, acw26192t8x, acw26192t8z, acw26192t8l, acw26192t8j, acw26192t92, acw26110yam, acw26110yap, acw26110ybw, acw26110yhe, acw26110yjb, acw26041w0q, acw26041w13, acw26041w1s, acw26041w1p, acw26041w0a
**6x C9300-NM-8X + 10x SFP-10G-SR → Tredent PO-25754 — `t#1ZJW78500321246427`**: FJZ27411UR0, FJZ2626149P, FJZ2626146T, FOC230984VG, FJZ26261486, FJZ26251J9J; SFP SAVD1926A3ME, SAVD1909A2R2, SAVD1909A2P8, SAVD1926A5ZS, SAVD1926A0BV, SAVD1919AKHX, SAVD1926A3D7, SAVD1926A5ZG, SAVD1926A0NR, SAVD1926A0PJ
**1x C8200-1N-4T → DHD PO-72509 — `t#1ZJW78500334942418`**: FGL2703L7EF
**1x MA-ANT-27 → Curvature PO-1361163-C — `t#382139053391`**: JNV23455420
**1x EX-UM-4SFPP-MR (ship)** `17-14781-37655`: XS0221420145
**2x WS-C3560G-24PS-E (ship)** `14-14782-66558` (no unit SN)
**2x C9300-48P-E + PWR → Data Networks PO-20158986 — `t#873411659986`**: FOC2530LZV6, FOC2720Y3K3; PWR SDCC2550C1TS, SLIT2842AJKX
**20x C9300-NM-8X → Curvature PO-1361627-I — `T#1ZJW78500336683230`**: FOC22447FB0, FOC252520ZB, FJZ250407V3, FOC25062DMY, FJZ23141JWL, FJZ2444179V, FOC26452WP0, FOC222897W6, FOC221049XD, FOC24145T67, FOC24151HA4, FJZ2626139Z, FJZ26251ARU, FOC22442VA0, FOC22273U5U, FOC23246NZY, FJZ23321MWY, FOC230984BC, FJZ23150JUP, FOC23285F2V; faceplates JNV23454926/924/922/920/917/919
**VIC3-4FXS/DID (ship)** `18-14787-11692`: FOC18375W7G; `16-14788-52149`: FOC16036FFK; `14-14784-46115`: FOC17361MNE
**1x PVDM2-64 (ship)** `21-14778-64209`: FOC13313DX8
**3x IE-4010-16S12P → LSI PO 189133 — `t#873471359207`**: FDO2311J4UU, FDO2326J0U6, FDO2233U0GV; PWR-RGD-AC-DC DTH2214024Q/DTH212901F7/DTH1941006H/DTH2230002D/DTH2230002G/MP5B10700CB
**4x ISR4321-V/K9 + 2x ISR4331-V/K9 → CXtec PO-D1240982 — `t#1ZJW78501211711523`**: FLM2146V035, FLM2137V0YL, FLM2221W0C8, FLM2229V188; PSU JNV23452553/2555/2545/2547/2549/2551
**2x GLC-T-RGD → Curvature PO-1361001-C — `t#873468937279`**: ACW28350BNZ, ACW28350C51
**5x SFP-10G-T-X → ChannelWorks PO-14948 — `t#873468898750`**: ACW29080WSJ, ACW29080J4D, ACW29021EB6, ACW29081JY1, ACW29080FQL
**10x GLC-LH-SM → GlobalNet Pro PO-16524**: SOPC15471170, SOPC15471254, SOPC15471186, SOPC15471175, SOPC15471394, SOPC15471495, SOPC15471241, SOPC15471196, SOPC15471373, SOPC15471239
**6x MR46-HW → PO-23854**: Q3AC-EM2G-7YQB, Q3AC-HR8D-SKQQ, Q3AC-ZGGB-7KH8, Q3AC-FHVK-PUFD, Q3AC-EC8J-J5L4, Q3AC-MUBP-Z6H6
**1x C8200-RM-19 → DHD PO-72509 — `t#1ZJW78500300871915`** (rack ear, no unit SN)
**2x MEM-C8300-32GB → RetroHardware PO-12770 — `t#1ZJW78500304585709`**: S802C0F21122DBA344A, S802C0F21122DBA3464
**11x CW9166I-MR → INTELECA PO-20105 — `t#873509498579`**: KWC27180B7W, KWC28490GA9, KWC270908BV, KWC2712060N, KWC27170EHK, KWC28490G6D, KWC27110UCV, KWC270908AP, KWC27110U5B, KWC27110UB0, KWC27110UCX
**8x CS-DS-C32S-FAN-E + 4x CS-DS-CAC-650W-E → Procurri PO-DS 111503 — `t#873526593860`**: FAN DCH2309RBUK/DCH2309RBW6/DCH2309RBWC/DCH2309RBU1; PSU ART2311FQQ3/ART2504F89E/ART2311FQQ2/ART2504F8H7
**2x AIR-AP1562I-S-K9 (ship, Invoice 151717)**: FGL2346L5CG, FGL2431N019
**3x PWR-C49-300AC → Curvature PO-1361761-C — `t#873528345784`**: PAC14370LDZ, QCS1252B0A0, QCS1214B0CA
**1x C9300X-24Y-A → Network Craze PO-P30894 — `t#1ZJW78500214710752`**: FOC2631YKE6; PWR DCC2717C5H0; FAN DCZ262502FD/DCZ262502H7/DCZ262502G4
**1x C9300-48U-A → PIVIT PO-256970 — `t#1ZJW78500218523966`**: FJC25171FJT; PWR DTN2131V1MW; FAN DCH2245RP2S; NID2550W0SD2; DCZ28420RJH
**32x J9150D → Curvature PO-1361323-C — `t#873507576964`**: CN82KBV1M1, CN82KBV0S8, CN82KBV51F, CN82KBV0MN, CN82KBV35B, CN82KBV1G0, CN82KBV5K9, CN82KBV19X, CN82KBV32Y, CN82KBV5TJ, CN83KBV52D, CN82KBV0DQ, CN82KBV7C0, CN82KBV19D, CN82KBV154, CN82KBV5N0, CN82KBV0JP, CN81KBV0KN, CN82KBV4YD, CN82KBV21P, CN82KBV57K, CN82KBV5TH, CN82KBV0MM, MY95KC90HW, MY81KC9070, CN13KJV57L, MY81KC910C, MY81KC907P, MY81KC908C, MY81KC90JG, MY81KC907N, MY81KC90CM
**2x QSFP-100G-SR1.2 → Recurrent PO-P54386 — `t#873526921770`**: INL27130FVW, INL270405X3
**13x MA-MNT-MR-16 → CXtec PO-D1239490 — `t#1ZJW78501215940537`**: CCA45680155, CCA45680167, CCA45680161, CCA45680061, CCA45680153, CCA45680075, CCA45680168, CCA45680159, CCA45680059, CCA45680067, CCA45680069, CCA45680071, BCAA0004947
**1x C9120AXE-B → OSI PO-26415 — `t#1ZJW78500303834941`**: FJC27141WLU
**3x CISCO891-K9 → OSI PO-26501 — `T#1zjw78500320830467`**: FTX183383M8, FTX18518357, FGL211323HR; PSU dth1918d2l7/dth2022d4q8/dth2022d4lz
**3x C9200CX-12P-2X2G-E → OSI PO-26496 — `t#1ZJW78500334411250`**: FJC28281TXR, FJC28311W14, FJC28311W9J
**6x C9300-NM-2Y → Trifecta PO-309601 — `T#873575620567`**: FJZ251911DH, FOC24165Z60, FJZ25030VJC, FOC24125U5Z, FJZ25030TZG
**1x C9200CX-12P-2X2G-A → Network Craze PO-P31125 — `t#1ZJW78500231236840`**: FJC272310VC
**42x PWR-C1-715WAC-P → Curvature PO-1361774-I — `T#873530090274`** (36x): LIT2736A6LU, LIT2732C75W, LIT2736A6ES, LIT2724CT3F, LIT2726A3G4, LIT2736A6GK, LIT2724CT6P, LIT2732C7FH, LIT2726A3Y9, LIT2732C7DV, LIT2732C7GC, LIT2736A6PC, LIT2726A35H, LIT2726A3TY, LIT2724CT9G, LIT2724CT29, LIT2724CT95, LIT2732C77V, LIT2736A6TU, LIT2726A3JE, LIT2732C7J6, LIT2736A6LH, LIT2732C7AY, LIT2732C765, LIT2732C77S, LIT2736A65F, LIT2736A6JD, LIT2732C7SR, LIT2724CT9T, LIT2736A6T2, LIT2724CTEP, LIT2732C7JQ, LIT2724CTPX, LIT2732C7GN, LIT2726A3VJ, DCC2913C1MB
**1 button for IE-4010-16S12P → LSI PO 189133 — `t#1ZJW78504436235307`**: FDO244114ZK (module button, partial)
**Return shipments (outbound returns):** WS-SUP32-GE-3B set SAL09402986/SAL09402B2Q/SAL094027D8 (`02-14656-21410`); C9120AXE-B FJC27141WFH + FJC24441S1F (`18-14577-09030`); NIM-2GE-CU-SFP FOC24331W30 (`22-14679-31676`); NIM-2GE-CU-SFP FOC23503PHV (`26-14668-35871`); 2x MR86-HW Q3AE-VWFF-6MGG, Q3AE-EGKE-B3X2 (`15-14694-40587`); 2x PWR-C1-350WAC-P DCC2304B04R, DCC2503B41T (`03-14718-50867`)
*SHIPPED total: ~145 shipment/return sections, ~700+ individual serial numbers.*
### B) KEPT / NOT SHIPPED
- QSFP-40G-SR-BD lot `10-2945-02` (TNT Import) — "total 8 units only, the rest keep in US", "*not here yet"
- 4x JNP-SFP-25G-SR (TNT Import) — "*not here yet"
- C9200-STACK ×28 + STACK-T4-50CM ×10 — "keep aside, will ship AUS" (see list 3)
- C9120AXE-B FJC28081EXK — repeatedly "already shipped to CMS last wk" (historical, do not reship)
- Top-cover kept aside (for C9120AXE-B FJC27151F4C return)
- QSFP-40G-LR4 SPC23020002 — "can't find it here"
- C9120AXE staged/keep aside (ship tomorrow): FJC27141WLU, FJC27081CEG, FJC262921J0, FJC2629225Y
- M.2 SSDs / EM7455/EM7430 355813079879315 — held for next shipment
- WS-C2960X-48TS-L FOC2111S08L — NOB, staged for AUS
- MR86-HW Q3AE-DMMG-D9KV — "Do NOT ship"
*KEPT total: ~12 explicit kept/held item groups.*
---
## 2. Items With an Issue (faulty / troubleshooting / damaged / return)
| PID | SN | Reason |
|---|---|---|
| C9300-24P-A / C9300-24P-E | FCW2303DHNT | "line 2 van bi loi" (still faulty); earlier "khong boot duoc: BOOT FAIL W..." |
| C9120AXE-B | FJC27041KKE | "TS24 ... reconnect again so we can recover them" |
| C9120AXE-E | FGL2812LK6C | "TS12" troubleshooting (1/0 USEB) |
| C9120AXE-B | FJC27102CXZ | "line 21 ... chuyen sang line khac giup e" (no console) |
| SUP32 / PFC2 / MSFC2 | SAL1338Z2TC / SAL1339ZJD8 / SAL1339ZGPX | "bi loi -Traceback= ... Error75" |
| C9300-24P-E | FCW2238S0BA | "line 5 bi loi" |
| NIM-2GE-CU-SFP | FOC23503PHV, FOC24331W30 | bad → returned |
| MR86-HW | Q3AE-EGKE-B3X2, Q3AE-SZD8-3PPD, Q3AE-6S3U-Y6N5, Q3AE-VWFF-6MGG, Q3AE-8P53-P74T, Q3AE-SFXM-7CK2 | "failed test" / LED orange |
| C8300-1N1S-6T | FLM290910DT | "RFID module broken, can't fix" |
| C8300-1N1S-6T | FLM280510M6 | "can't fix ... RFID module broken" |
| WS-SUP32-GE-3B | SAL09402986 | "damaged power connector" (returned) |
| MR86-HW (6 units, no SN) | — | "LED orange when tested" (OSI PO-25694) |
| N5K-C5596UP | (TS17/TS21) | "PSU/FAN/LED ok but no console" |
| C1111-8PLTEEAWB (2) | — | "no console after restart" |
| WS-C3750X-48P/24P-S | batch | "most were bad, scratches" |
| DS-C9148T-K9 | JPG2603008A | "can't unbend chassis/ports" |
| CISCO2821 | FTX1134A22A, FTX0950C0GD, FTX1418AKP2 | missing FAN/PSU/faceplate |
| ISR4331/K9 | FLM1940W0QB | "PASS WITH WARNING" |
| WS-C3560G-24PS-S | FOC1124Z45D | "no console, no faceplate" |
| ISR4331 (x2) | — | "PSU missing power cab" |
| MR86-HW (return) | Q3AE-VWFF-6MGG, Q3AE-EGKE-B3X2 | returned faulty (`15-14694-40587`) |
| C9120AXE-B (return) | FGL2812LPJG, FJC27151F4C | return (`18-14577-09030`); one "without top cover from AUS" |
*ISSUE total: ~24 item groups / 30+ individual faulty/troubleshooting SNs.*
---
## 3. SN Transferred to AUS / Australia
**C9200-STACK batch ("keep aside, will ship AUS" → Import Australia UPS `t#1ZJW78500439236591`):**
SFP-10G-LRM: FNS22050YND, FNS23051B30, FNS23051B32, FNS230514Q4, FNS23051B20, OPM23170H22
C9200-STACK (28): JAE234004PC, JAE234004T3, JAE25360Z1S, JAE25360ZXV, JAE25360ZXW, JAE25360WQ5, JAE25210K5G, JAE25360U84, JAE25360U7W, JAE26440B4B, JAE26402RM2, FDO24260JB2, JAE2610039Z, JAE25210GZQ, FDO19321P8C, JAE25360VU0, FDO20120G3J, JAE23051476, JAE230601NH, FVH27510R69, JAE25161CTG, FDO24250TUS, FDO24250T0B, FDO24260G32
STACK-T4-50CM (10): LCC2428GA36, LCC2618GGEJ, MOP2732A3H9, LCC2246GX01, LCC2625GFEE, MOC261800C4, MOC261801GT, MOC261700NW, MOP2732A3EX, LCC2320GCQW
AIR-AP1562E-B-K9 FJC263610EP; EX-SFP-10GE-SR SCJ19YP0M8, SCJ19YP0V4, SCJ19YP02N, SAZH1TSH
**Import Australia UPS `t#1ZJW78500415223596`:** UCSX-TPM-002C FCH270476WX, FCH26177EG0, FCH270476HD, FCH251572R7; PAN-SFP-PLUS-SR MU90S5H, MU60V2X, MU20TFU, MU211ZP; NIM-2GE-CU-SFP FOC250624L6, FOC250624QT, FOC24400EXW, FOC21452FDH, FOC263359EC; JNP-SFP-25G-SR S1A1M31A751BAB, S1A1M31A7517WR, S1A1M31A751BVZ, S1A1M31A751BFQ; UCSX-210C-M7 (3x); NVME-SSD 22293A26BAF6, 22293A447730; SHOKZ JNV23455443; NIM-ES2-8 SFOC194350CH, SFOC24484UR9
**Import Australia UPS (no t#):** UCSX-ML-V5Q50G FCH272977RT, FCH272270UZ, FCH272978JZ; MTFDDAV240TDS 213931914B8A, 213931914A5B, 214131CE6FB9; EM7455/EM7430 355813079879315
**Import Australia UPS `t#1ZJW78500423326317`:** PWR-80W-AC DAB192754P8, DAB194450WP; CMPCT-DIN-MNT JNV23455437; MS120-8LP Q2BX-YLYW-556A, PSU HSQD76M73B1; STACK-T4-50CM LCC2920G58F; SATA-SSD-240GB 11S00VN646Y0M127A83181; XEON-6342 2A2109BF5F0F8383RKXA, 2A2134BF51728CD8RKXA, 2A226ABF4F28BBEARKXA, 2A2076BC995F5353RKXA, 3A4819C0FC3F792ARKXA, 3A481A1EB9C113CFRKXA; C9500-40X FJC254223DA + PWR DCI2504M35A/DCI2536M3VC + fans DCH2526Y0LH/0LM/0LN/0LK/0LR
**Import Australia UPS `t#1ZJW78500431774363`:** GLC-BX-D S080322BD001 / 080322BD001; NEX-PLAYGROUND NPG02C100668500; N9K-C93180YC-EX FDO2252023P, FDO21482ECD, FDO21111F2A + NXA-FAN-30CFM-F NID2146H5I7/I8/I9/I4, NID2250H1B1/B7/197/19B + NCV2520R0QG/R0GW/R073/R0DN; WS-F6K-PFC3B SAD092705FY
**Import Australia Fedex/TNT `t#873308530323`:** NXA-FAN-30CFM-B NID1922I1SG/1SC/1S0, NID2135I0RK, NID1922I1SE, NID2106I25F, NID2108I38W, DCH2237R3N6, NID2103I35Z, DCH21260BFD, DCH21281B4X, DCH21370ELG; C9300-24T-A FJC2333S06H, FJC2333S05T; C9200-STACK (20) FVH29100T7W, FVH29100X6E, FVH2910151B, FVH29100XBB, FVH29100VEM, FVH29100TH7, FVH29100VQ1, FVH29100U1P, FVH29100TNL, JAE23180HMY, JAE25360VTV, FDO23290KA5, FDO24260J5C, JAE241108B5, FDO23290LAT, JAE26360XUR, JAE26382LDW, JAE25360VD8, FDO22263HYZ, FDO22040VZ4; C9200L-48P-4X-E FOC2608054S; WS-SUP32-GE-3B SAL12330S7D + MSFC2A SAL12330JA2 + PFC3B SAL12330MGV
**Import Australia UPS `t#1ZJW78500401856894`:** C3850-NM-2-40G FOC192205F3; C9300-24T-A FOC2217Q02J
**TNT / Fedex (make New in AUS) `T#873256774277`:** 4x C9300-24T-E-WS FOC2721YA6K, FOC2721Y72K, FOC2648YCCD, FOC2721YA6F; WS-C2960X-48TS-L FOC2111S08L; XL710-QDA2 7981133843, 951013; UCSC-PSUV2-1050DC LIT23223D4H, LIT23223AH7; C9120AXI FJC25432C7X, FJC25432CB7, FJC25432CA2, FJC25432C7S; C3850-NM-4-10G foc23262kmy, fjz233020cu
**Other AUS notes:** SSD-SATA-960G "already on the shipment to AUS"; N6K-C6001-FAN-B "already shipped to AUS"; MR86-HW / AIR-AP-BRACKET "recently arrived from AUS".
*AUS-transfer total: ~10 Import-Australia shipments, 150+ serial numbers.*
---
## 4. SN in xxxx-xxxx-xxxx Format — Shipped/Outbound Context
| Hyphenated code | Section / context | Outbound? |
|---|---|---|
| `17-14693-01192` | ship PWR-115W-AC (DAB264260LD) | Yes |
| `10-2945-02` | TNT Import — QSFP-40G-SR-BD lot (8 shipped) | Yes |
| `10-14642-61136` | return C9300-24P-A (FJC2340S02R) | Yes |
| `14-14726-39035` | ship MR36-HW (Q3AJ-49HS-7EJT) | Yes |
| `04-14738-90912` | ship IPO-500-EXP-MOD (09FC19500851) | Yes |
| `18-14713-58108` | ship 2x CP-840 (TCL251100KM, TCL2511007X) | Yes |
| `02-14762-32221` | ship MA-MNT-MR-17-MR36 (CCA45679958) | Yes |
| `17-14713-99371` | ship C9136I-ROW (FGL2702LXZV) | Yes |
| `05-14746-55384` | ship VIC3-4FXS/DID (FOC14317P5Z) | Yes |
| `02-14656-21410` | return WS-SUP32-GE-3B set | Yes |
| `23-14741-66101` | ship AIR-ACC1530-PMK1 (JNV23453733) | Yes |
| `18-14577-09030` | return 2x C9120AXE-B (FJC27141WFH, FJC24441S1F) | Yes |
| `05-14777-19977` | ship 2x PVDM2-64 (FOC123546Z2) | Yes |
| `03-14766-07874` | ship GLC-ZX-SM-RGD (SPC1432030T) | Yes |
| `08-14781-17608` | ship 2x PWR-C1-440WDC (ART2210FCJL, ART2210FCDR) | Yes |
| `27-14612-29455-P01` | AUS — Micron MTFDDAV240TDS | Yes |
| `22-14679-31676` | return NIM-2GE-CU-SFP (FOC24331W30) | Yes |
| `26-14668-35871` | return NIM-2GE-CU-SFP (FOC23503PHV) | Yes |
| `17-14781-37655` | ship EX-UM-4SFPP-MR (XS0221420145) | Yes |
| `14-14782-66558` | ship 2x WS-C3560G-24PS-E | Yes |
| `18-14787-11692` | ship VIC3-4FXS/DID (FOC18375W7G) | Yes |
| `16-14788-52149` | ship VIC3-4FXS/DID (FOC16036FFK) | Yes |
| `21-14778-64209` | ship PVDM2-64 (FOC13313DX8) | Yes |
| `14-14784-46115` | ship VIC3-4FXS/DID (FOC17361MNE) | Yes |
| `13-14800-40754` | ship ASR-9902-2P-KIT | Yes |
| `15-14694-40587` | return 2x MR86-HW | Yes |
| `03-14718-50867` | return 2x PWR-C1-350WAC-P | Yes |
Excluded (incoming, not outbound): `12-14747-30779` (eBay receiving), `24-14738-31605` (return never completed).
*Hyphenated-shipped total: 27 codes.*
---
## Caveats
- CISCO2511-DC PO-5602 vs DS-C9132T-K9 PO-256275: in the source, these two sections appear next to the tracking numbers `t#872615359982` and `t#872615037247`. The SN data for each section is unambiguous, but which t# belongs to which section is uncertain, so the pairing is flagged for verification.
- Many "Please help to ship/return" single-item sections have no UPS/FedEx `t#` in Joe's message (label pre-supplied); treated as outbound per domain rules.
- Lowercase SNs and leading `S`/`X` prefixes are quoted exactly as in the source.

View File

@ -0,0 +1,285 @@
## You said
List all SN shipped , and the one kept
## Gemini said
Based on the communications regarding PO-0272806 (100x C9120AXE-B-REF to CMS Communications, Inc.), here are the serial numbers that were successfully shipped, as well as the unit that appears to have been kept.
SNs Shipped (Total: 69 Units)
A total of 69 units were shipped across three separate batches:
Batch 1 (Shipped May 29, 2026 - 20 Units):
The following 20 serial numbers were shipped: FJC270918FN, FJC27041K7D, FJC270918HB, FJC2428184B, FJC242612Y8, FJC25331KK9, FJC242612YA, FJC242612Y9, FJC27111BBT, FJC234916D2, FJC242916WL, FJC24401SKH, FJC250618K5, FJC24381E9Q, FJC28081EXK, FJC250618KB, FJC24401SL0, FJC25331M5V, FJC242916VZ, and FJC24381DWC
.
Batch 2 (Shipped June 1, 2026 - 28 Units):
The following 28 serial numbers were shipped: FJC27081CA2, FJC24381DM7, FJC24381E8W, FJC24311C2L, FJC24381DV4, FJC24401SHB, FJC250618AX, FJC24331ATB, FJC242916V0, FJC24381DV2, FJC250618KG, FJC2428180H, FJC24381DV9, FJC27102DKZ, FJC24511LK7, FJC24381DUJ, FJC27081CAY, FJC27081CA0, FJC27081CAX, FJC27081CA1, FJC25331KQW, FJC24401SKB, FJC242916W2, FJC242612Y5, FJC242817ZG, FJC270918G9, FJC27043K60, and FJC271823UE
.
Batch 3 (Shipped June 5, 2026 - 21 Units):
The following 21 serial numbers were shipped: FJC27102CXY, FJC27102CSS, FJC27041KKE, FJC27081CAW, FJC27041K9U, FJC27041K89, FJC25081ALQ, FJC27081CA3, FJC25101JGZ, FJC25061LWW, FJC24381DUC, FJC26291GP3, FJC242916V4, FJC24381DWD, FJC250618B7, FJC25101JJW, FJC24381E5G, FJC25101JJV, FJC27102CXZ, FJC24441S1F, and FJC27141WFH
.
The SN Kept
FJC28081EXK
This serial number was initially documented as shipped in the first batch on May 29, 2026
. It was subsequently excluded from later shipments (such as PO-0272591 on June 2) with the explicit note that it had "already shipped to CMS last wk"
. However, on June 8, an internal message noted to "connect to TS if you find this" regarding FJC28081EXK, indicating that despite being marked as shipped, it was likely kept or accidentally left behind in the facility
.
(Note: SN
FJC27041KKE
was also temporarily kept aside on June 2 for recovery, but it was ultimately shipped out in the third batch on June 5
.)
## You said
List all items with issue (SN + PID if possible)
## Gemini said
Here are the items reported with issues, categorized by the type of problem:
Boot, System, and Connectivity Failures
PID:
C9300-24P-A |
SN:
FCW2303DHNT - This unit experienced a boot failure and displayed a "BOOT FAIL" error.
PID:
SUP32 |
SN:
SAL1338Z2TC - This unit generated a traceback error (Error75).
PID:
pfc2 |
SN:
SAL1339ZJD8 - This component generated a traceback error alongside the SUP32.
PID:
msfc2 |
SN:
SAL1339ZGPX - This component generated a traceback error alongside the SUP32.
PID:
N5K-C5596UP |
SN:
Not specified - The PSU, fans, and LEDs powered on successfully, but the unit failed to provide any console output.
Faulty or Failed Testing
PID:
NIM-2GE-CU-SFP |
SN:
FOC23503PHV - This module was explicitly marked as bad and faulty.
PID:
NIM-2GE-CU-SFP |
SN:
FOC24331W30 - This module was explicitly marked as bad and faulty.
PID:
NIM-2GE-CU |
SN:
Not specified (2 units) - These modules were tested in an ISR4331 and were suspected to be bad.
PID:
MV86-HW / MR86-HW |
SNs:
Q3AE-EGKE-B3X2, Q3AE-SZD8-3PPD, Q3AE-6S3U-Y6N5, Q3AE-VWFF-6MGG, Q3AE-8P53-P74T, Q3AE-SFXM-7CK2 - These units failed testing because they booted but the LED remained orange when an Ethernet cable was connected.
PID:
MR86-HW |
SN:
Q3AE-DMMG-D9KV - There was an explicit instruction not to ship this specific unit, indicating a probable fault.
Physical Damage (Broken Parts, Bent Chassis, Scratches)
PID:
C8300-1N1S-6T |
SN:
FLM290910DT - This chassis had a broken RFID module that could not be fixed.
PID:
C8300-1N1S-6T |
SN:
FLM280510M6 - This unit had a broken RFID module that could not be fixed, and the aesthetics were deemed unfixable.
PID:
DS-C9148T-K9 |
SN:
JPG2603008A - This switch suffered from a bent chassis and bent ports that could not be repaired.
PID:
DS-CAC-650W-E |
SNs:
ART2311FQQ3, ART2504F89E, ART2311FQQ2, ART2504F8H7 - These units were noted as having broken handles that required physical replacement.
PID:
WS-SUP32-GE-3B (Component) |
SN:
SAL09402986 - This component was returned with a damaged power connector.
PID:
IE-4010-16S12P |
SN:
FDO244114ZK - This unit was missing a black plastic mode button, which required a full module replacement to fix.
PID:
C9120AXE |
SN:
FJC27141WMG - This chassis was set aside due to physical condition and described as "most ugly".
PID:
C3850-NM-4-10G |
SNs:
FOC23262KMY, FJZ233020CU - These modules were described as "most ugly" due to physical condition.
PID:
WS-C3750X-48P-S |
SNs:
Not specified - Most units in a large batch were reported as bad with many scratches on the top of the chassis.
Missing Internal Components
PID:
CISCO2821 |
SN:
FTX1134A22A - This unit was missing its FAN, PSU, and FACEPLATE.
PID:
CISCO2821 |
SN:
FTX0950C0GD - This unit was missing its PSU and FACEPLATE.
PID:
CISCO2821 |
SN:
FTX1418AKP2 - This unit was missing its PSU and FAN.
PID:
ISR4331 |
SNs:
Not specified (2 units) - These units were missing the internal power cables that connect the PSU to the motherboard.
## You said
List all SN transferred to AUS/Australia
## Gemini said
Based on the records in aJoe.md, here are the serial numbers explicitly noted as being kept aside to ship or transferred to Australia (AUS):
C9200-STACK Modules (June 12 Records)
These units were instructed to be kept aside to ship to Australia
:
JAE234004PC
JAE234004T3
JAE25360Z1S
JAE25360ZXV
JAE25360ZXW
JAE25360WQ5
JAE25210K5G
JAE25360U84
JAE25360U7W
JAE26440B4B
JAE26402RM2
FDO24260JB2
JAE2610039Z
JAE25210GZQ
FDO19321P8C
JAE25360VU0
FDO20120G3J
JAE23051476
JAE230601NH
FVH27510R69
JAE25161CTG
FDO24250TUS
FDO24250T0B
FDO24260G32
STACK-T4-50CM Modules (June 12 Records)
These stacking cables were also set aside for the Australia shipment
:
LCC2428GA36
LCC2618GGEJ
MOP2732A3H9
LCC2246GX01
LCC2625GFEE
MOC261800C4
MOC261801GT
MOC261700NW
MOP2732A3EX
LCC2320GCQW
Additional C9200-STACK Modules (June 19 Records)
These serial numbers were documented under the "Import Australia Fedex / TNT" shipment log to increase the overall stack module transfer total
:
FVH29100T7W
FVH29100X6E
FVH2910151B
FVH29100XBB
FVH29100VEM
FVH29100TH7
FVH29100VQ1
FVH29100U1P
FVH29100TNL
JAE23180HMY
JAE25360VTV
FDO23290KA5
FDO24260J5C
JAE241108B5
FDO23290LAT
JAE26360XUR
JAE26382LDW
JAE25360VD8
FDO22263HYZ
FDO22040VZ4
(Note: It is also mentioned that a batch of
SSD-SATA-960G
units was successfully sent out on a shipment to Australia, though their specific serial numbers were not recorded in the text
.)
## You said
List all SN in this format that has been shipped:
xxxx-xxxx-xxxx
## Gemini said
Based on the provided document, here are all the serial numbers matching the
xxxx-xxxx-xxxx
format that have been explicitly shipped
:
MR36-HW
Q3AJ-49HS-7EJT
CW9162I-MR
Q5AA-JNPC-SYDC
MV32-HW
Q2PV-2885-9WH5
Q2PV-599H-DNSD
Q2PV-C6EG-KB2J
Q2PV-A6NU-EQX8
Q2PV-DL34-7UEG
Q2PV-WP5A-5HGM
Q2PV-KQL5-GFS3
Q2PV-VPCA-HBCB
Q2PV-LQ7U-3RFS
Q2PV-GQMH-F5LJ
Q2PV-5ZQ9-5DD8
Q2PV-VW84-729Z
Q2PV-GFSK-KDPC
Q2PV-3X3G-2QU2
Q2PV-EJ4U-9EG4
MR86-HW
Q3AE-AY5R-YV22
Q3AE-2MKJ-V78D
Q3AE-3PRL-VL68
Q3AE-52NF-QY6E
Q3AE-RNQP-HXGT
Q3AE-9FKB-VBJ7
Q3AE-DDPF-UH9A
MR46-HW
Q3AC-EM2G-7YQB
Q3AC-HR8D-SKQQ
Q3AC-ZGGB-7KH8
Q3AC-FHVK-PUFD
Q3AC-EC8J-J5L4
Q3AC-MUBP-Z6H6
## You said
Can a 7B model running on RTX 4000 8Gb GPU answer the questions above ?
## Gemini said
Yes, it is technically possible, but it would likely struggle to provide a perfectly accurate and complete answer. Running a 7B model on an 8GB GPU to perform complex data extraction requires significant compromises.
Here is a breakdown of the technical and cognitive hurdles a 7B model would face with this specific task:
1. Hardware Constraints (VRAM Limitations)
An 8GB GPU cannot load a standard 16-bit 7B model, which requires roughly 14GB of VRAM just for the weights. To run it on an RTX 4000 (8GB), you would have to quantize the model (e.g., using 4-bit quantization formats like GGUF, AWQ, or EXL2).
The Weight:
A 4-bit 7B model takes up about 4 to 4.5 GB of VRAM.
The KV Cache (Context Window):
The document you provided is quite long and contains thousands of tokens. The remaining 3.5 GB of VRAM limits how much text you can feed the model at once. If the document exceeds the context limit that fits in your remaining VRAM, the model will throw an Out of Memory (OOM) error.
2. Cognitive Limitations (Accuracy and Recall)
Even if you fit the document into the model's context window, smaller 7B and 8B models (like Llama 3 8B or Mistral 7B) have known limitations with this specific type of heavy data-extraction task:
The "Needle in a Haystack" Problem:
7B models struggle to maintain attention across long texts. When asked to find scattered details (like "items with issues" mixed into dozens of emails), they frequently miss items or hallucinate incorrect associations.
List Truncation:
When asked to list 69 random, non-sequential alphanumeric strings (like the shipped serial numbers), 7B models almost always get "lazy." They will output the first 10 to 15 items and then stop, hallucinate patterns, or summarize the rest instead of completing the exact task.
Pattern Matching:
While a 7B model understands the request to find the
xxxx-xxxx-xxxx
format, an LLM is essentially guessing the next token based on probability. It is far less reliable at scanning for strict formatting rules over a massive text block than a simple Python Regular Expression (Regex) script.
The Verdict
You can run a quantized 7B model on your hardware, and it will give you a plausible-sounding answer. However, for a task requiring 100% accuracy in extracting dozens of random serial numbers from a long, noisy email thread, a 7B model will likely drop data, hallucinate serial numbers, or truncate the lists.
Are you currently trying to set up a local AI environment for processing these types of inventory files?
This conversation was created with Gemini Advanced. Gemini Advanced isn't available for you, but you can start a new chat about this topic.
Learn more
Gemini is AI and can make mistakes.

BIN
example/image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 369 KiB

411
export_mails.py Normal file
View File

@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Export emails from the working input directory to Markdown.

Input:  <script dir>/$WORKING_DIR/$WORKING_INPUT             (defaults: emails/input)
Output: <script dir>/$WORKING_DIR/$WORKING_OUTPUT/<email_id>/ (defaults: emails/output)
    markitdown/body.md    markitdown/<stem>.md
    docling/body.md       docling/<stem>.md

Image OCR: vision LLM (llama-server via SSH tunnel), then PaddleOCR, then
Tesseract (inside the Docling container) as the last resort.
Tunnel: ssh -fNL 11436:127.0.0.1:11436 -J kai@192.168.171.2 root@172.25.57.233
"""
import email, email.policy, http.client, json, logging, os, re, shutil
import subprocess, sys, tempfile, uuid
from pathlib import Path
from tqdm import tqdm
# .env loader
_env = Path(__file__).parent / ".env"
if _env.exists():
    # Minimal KEY=VALUE parser. Blank lines and '#' comments are skipped, and
    # variables already set in the process environment are never overridden.
    for _l in _env.read_text().splitlines():
        _l = _l.strip()
        if not _l or _l.startswith("#") or "=" not in _l:
            continue
        _k, _v = _l.split("=", 1)
        _v = _v.strip()
        # Drop one pair of matching surrounding quotes, so a line such as
        # VISION_PORT="11436" yields 11436 rather than the quoted text
        # (the ports are fed to int() further down in this file).
        if len(_v) >= 2 and _v[0] == _v[-1] and _v[0] in "\"'":
            _v = _v[1:-1]
        os.environ.setdefault(_k.strip(), _v)
# Log at INFO level using a compact "<LEVEL> <message>" line format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)
# Working tree, resolved relative to this script. Each path component can be
# overridden through the environment (which the .env loader above may seed).
_WORKING = os.environ.get("WORKING_DIR", "emails")
MAILDIR = Path(__file__).parent / _WORKING / os.environ.get("WORKING_INPUT", "input")
OUTPUT = Path(__file__).parent / _WORKING / os.environ.get("WORKING_OUTPUT", "output")
# Conversion API endpoints; these are fixed here, not read from the environment.
MARKITDOWN_HOST = "localhost"
MARKITDOWN_PORT = 8282
DOCLING_HOST = "localhost"
DOCLING_PORT = 8383
# Container targeted by the `docker cp` / `docker exec` Tesseract fallback.
DOCLING_CONTAINER = "ai-markdown-api-docling-1"
# Timeout (seconds) passed to the HTTP connection used for conversion uploads.
API_TIMEOUT = 300
# Vision LLM endpoint (environment-overridable).
VISION_HOST = os.environ.get("VISION_HOST", "localhost")
VISION_PORT = int(os.environ.get("VISION_PORT", "11436"))
# PaddleOCR endpoint (environment-overridable).
PADDLE_HOST = os.environ.get("PADDLE_HOST", "localhost")
PADDLE_PORT = int(os.environ.get("PADDLE_PORT", "9999"))
# Lower-case file suffixes handled as images (OCR path instead of document upload).
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp"}
# Suffixes convert_docling accepts; anything else gets an "unsupported" placeholder.
DOCLING_SUPPORTED = {".pdf", ".docx", ".xlsx", ".pptx", ".html", ".htm"} | IMAGE_EXTS
# ---------------------------------------------------------------------------
# Multipart POST helper
# ---------------------------------------------------------------------------
def _post(host, port, path, filepath, params="") -> str:
    """Upload *filepath* as a multipart form field named ``file`` and return the converted text.

    Args:
        host, port: Address of the conversion API.
        path: Request path, e.g. ``/convert``.
        filepath: File to upload (read fully into memory).
        params: Optional raw query string appended after ``?``.

    Returns:
        The response's ``markdown`` value, else its ``content`` value, else the
        stringified JSON object. A non-200 response yields the placeholder
        ``<!-- API <status> -->``.
    """
    marker = uuid.uuid4().hex
    preamble = "".join([
        f"--{marker}\r\n",
        f'Content-Disposition: form-data; name="file"; filename="{filepath.name}"\r\n',
        "Content-Type: application/octet-stream\r\n\r\n",
    ]).encode()
    epilogue = f"\r\n--{marker}--\r\n".encode()
    body = b"".join((preamble, filepath.read_bytes(), epilogue))
    target = f"{path}?{params}" if params else path
    request_headers = {
        "Content-Type": f"multipart/form-data; boundary={marker}",
        "Content-Length": str(len(body)),
    }
    conn = http.client.HTTPConnection(host, port, timeout=API_TIMEOUT)
    try:
        conn.request("POST", target, body=body, headers=request_headers)
        resp = conn.getresponse()
        raw = resp.read()
        if resp.status == 200:
            parsed = json.loads(raw)
            return parsed.get("markdown") or parsed.get("content") or str(parsed)
        return f"<!-- API {resp.status} -->"
    finally:
        conn.close()
def convert_markitdown(src: Path) -> str:
    """Convert *src* through the MarkItDown API (LLM enabled).

    Never raises: any failure is returned as a ``<!-- MarkItDown: ... -->``
    placeholder so the caller can still write an output file.
    """
    try:
        markdown = _post(MARKITDOWN_HOST, MARKITDOWN_PORT, "/convert", src, "use_llm=true")
    except Exception as exc:
        markdown = f"<!-- MarkItDown: {exc} -->"
    return markdown
# ---------------------------------------------------------------------------
# Vision LLM — llama-server via SSH tunnel
# ---------------------------------------------------------------------------
def _vision_available() -> bool:
    """Return True when the vision server answers ``GET /health`` within 3 seconds.

    Only reachability is checked; the response status is not inspected.
    """
    conn = None
    try:
        conn = http.client.HTTPConnection(VISION_HOST, VISION_PORT, timeout=3)
        conn.request("GET", "/health")
        conn.getresponse().read()
        return True
    except Exception:
        return False
    finally:
        # Close on every path: a failed probe must not leave the socket open.
        if conn is not None:
            conn.close()
def _vision_ocr(src: Path) -> str:
    """Send image to llama-server, return markdown. Returns '' on failure.

    The image is converted to RGB, upscaled by an integer factor when narrower
    than 1200 px, re-encoded as JPEG and posted to ``/v1/chat/completions``
    with a table-transcription prompt. Repeated table rows in the reply are
    removed with ``_dedup_rows``.

    Image decoding/encoding errors are logged and reported as '' as well, so
    the caller can fall through to the next OCR engine.
    """
    import base64
    import io
    from PIL import Image

    try:
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block ends.
        with Image.open(src) as opened:
            img = opened.convert("RGB")
        if img.width < 1200:
            scale = max(2, 1200 // img.width)
            img = img.resize((img.width * scale, img.height * scale), Image.LANCZOS)
        buf = io.BytesIO()
        img.save(buf, "JPEG", quality=92)
    except Exception as e:
        log.warning("vision error: %s", e)
        return ""

    b64 = base64.standard_b64encode(buf.getvalue()).decode()
    payload = json.dumps({
        "messages": [{"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            {"type": "text", "text": (
                "Read every cell in this table carefully. "
                "Output a Markdown table with ALL rows. "
                "Do not skip or duplicate any row. "
                "Return only the Markdown table."
            )},
        ]}],
        "max_tokens": 700,
        "temperature": 0,
    }).encode()
    conn = http.client.HTTPConnection(VISION_HOST, VISION_PORT, timeout=120)
    try:
        conn.request("POST", "/v1/chat/completions", body=payload,
                     headers={"Content-Type": "application/json",
                              "Content-Length": str(len(payload))})
        resp = conn.getresponse()
        d = json.loads(resp.read())
        if resp.status != 200:
            log.warning("vision %s: %s", resp.status, str(d)[:80])
            return ""
        content = d.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        return _dedup_rows(content)
    except Exception as e:
        log.warning("vision error: %s", e)
        return ""
    finally:
        conn.close()
def _dedup_rows(md: str) -> str:
    """Remove table rows (lines starting with ``|``) that repeat an earlier line.

    Lines are compared after stripping surrounding whitespace. Non-table lines
    are always kept, and the surviving lines are re-joined with ``\\n``.
    """
    seen_keys = set()
    kept = []
    for raw_line in md.splitlines():
        stripped = raw_line.strip()
        is_repeated_row = stripped.startswith("|") and stripped in seen_keys
        if not is_repeated_row:
            seen_keys.add(stripped)
            kept.append(raw_line)
    return "\n".join(kept)
def _is_data_image(src: Path) -> bool:
    """True for wide landscape images likely to contain table/text data.

    An image qualifies when it is at least 300x60 px with a width/height ratio
    of 1.5 or more. Files that cannot be opened as images return False.
    """
    from PIL import Image
    try:
        # Context manager releases the file handle once the size is read.
        with Image.open(src) as img:
            w, h = img.size
    except Exception:
        return False
    # h >= 60 is tested before the division, so w / h cannot divide by zero.
    return w >= 300 and h >= 60 and w / h >= 1.5
# ---------------------------------------------------------------------------
# PaddleOCR fallback — lightweight HTTP call to paddleocr_server on AI server
# ---------------------------------------------------------------------------
def _paddle_available() -> bool:
    """Return True when the PaddleOCR server answers ``GET /`` within 2 seconds.

    Only reachability is checked; the response status is not inspected.
    """
    conn = None
    try:
        conn = http.client.HTTPConnection(PADDLE_HOST, PADDLE_PORT, timeout=2)
        conn.request("GET", "/")
        conn.getresponse().read()
        return True
    except Exception:
        return False
    finally:
        # Close on every path: a failed probe must not leave the socket open.
        if conn is not None:
            conn.close()
def _paddle_ocr(src: Path) -> str:
    """OCR *src* through the PaddleOCR HTTP service.

    The file is sent base64-encoded as ``{"image_base64": ...}`` to ``/ocr``.
    Returns the ``markdown`` field of the JSON reply, or '' when the field is
    absent or the request fails (failures are logged as warnings).
    """
    from base64 import standard_b64encode

    encoded_image = standard_b64encode(src.read_bytes()).decode()
    request_body = json.dumps({"image_base64": encoded_image}).encode()
    request_headers = {
        "Content-Type": "application/json",
        "Content-Length": str(len(request_body)),
    }
    conn = http.client.HTTPConnection(PADDLE_HOST, PADDLE_PORT, timeout=60)
    try:
        conn.request("POST", "/ocr", body=request_body, headers=request_headers)
        reply = json.loads(conn.getresponse().read())
        return reply.get("markdown", "")
    except Exception as exc:
        log.warning("paddle error: %s", exc)
        return ""
    finally:
        conn.close()
# ---------------------------------------------------------------------------
# Tesseract OCR fallback via Docling docker container
# ---------------------------------------------------------------------------
def _tesseract_ocr(src: Path) -> str:
    """OCR *src* with Docling's Tesseract pipeline inside the Docling container.

    A 3x upscaled PNG copy is written next to *src*, copied into the container
    with ``docker cp`` and converted by a short script run via ``docker exec``.
    Both the local copy and the copy inside the container are removed afterwards.

    Returns:
        The exported Markdown with "Loading weights..." lines filtered out, or
        '' when the copy or the conversion fails.

    Raises:
        subprocess.TimeoutExpired: if the conversion takes longer than 120 s.
    """
    from PIL import Image

    scaled = src.with_suffix(".ocr.png")
    container_path = f"/tmp/{scaled.name}"
    # The image path is handed over as argv[1] instead of being spliced into
    # the source text, so quotes in a file name cannot alter the script.
    script = """
import sys
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
opts = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractCliOcrOptions())
conv = DocumentConverter(format_options={
    InputFormat.IMAGE: ImageFormatOption(pipeline_options=opts),
    InputFormat.PDF: PdfFormatOption(pipeline_options=opts),
})
print(conv.convert(sys.argv[1]).document.export_to_markdown())
"""
    try:
        with Image.open(src) as img:
            img.resize((img.width * 3, img.height * 3), Image.LANCZOS).save(scaled)
        cp = subprocess.run(["docker", "cp", str(scaled), f"{DOCLING_CONTAINER}:{container_path}"],
                            capture_output=True)
        if cp.returncode != 0:
            return ""
        try:
            run = subprocess.run(
                ["docker", "exec", DOCLING_CONTAINER, "python3", "-c", script, container_path],
                capture_output=True, text=True, timeout=120)
        finally:
            # Best-effort cleanup so repeated runs do not fill the container's /tmp.
            subprocess.run(["docker", "exec", DOCLING_CONTAINER, "rm", "-f", container_path],
                           capture_output=True)
    finally:
        # The upscaled copy is only an intermediate; never leave it beside the source.
        scaled.unlink(missing_ok=True)
    if run.returncode != 0:
        # The Markdown is printed by the script's last statement, so a failed
        # run has no usable output on stdout.
        log.warning("tesseract exit %s: %s", run.returncode, run.stderr.strip()[-200:])
        return ""
    lines = [l for l in run.stdout.splitlines() if not l.startswith("Loading weights")]
    return "\n".join(lines).strip()
# ---------------------------------------------------------------------------
# Docling convert (for non-image files)
# ---------------------------------------------------------------------------
def convert_docling(src: Path) -> str:
    """Convert *src* for the Docling output tree.

    Documents are uploaded to the Docling API (LLM disabled). Images go
    through an OCR chain and the first non-empty result wins:
    vision LLM (wide "data" images only), then PaddleOCR, then Tesseract.

    Never raises: unsupported suffixes and failures are returned as
    ``<!-- ... -->`` placeholders.
    """
    suffix = src.suffix.lower()
    if suffix not in DOCLING_SUPPORTED:
        return f"<!-- unsupported: {src.suffix} -->"
    try:
        if suffix not in IMAGE_EXTS:
            return _post(DOCLING_HOST, DOCLING_PORT, "/convert", src, "use_llm=false")
        # 1. Vision LLM, only for images shaped like tables/screenshots
        if _is_data_image(src) and _vision_available() and (text := _vision_ocr(src)):
            return text
        # 2. PaddleOCR
        if _paddle_available() and (text := _paddle_ocr(src)):
            return text
        # 3. Tesseract as the last resort
        return _tesseract_ocr(src)
    except Exception as exc:
        return f"<!-- Docling: {exc} -->"
# ---------------------------------------------------------------------------
# Email helpers
# ---------------------------------------------------------------------------
def _email_id(name: str) -> str:
    """Return the part of a mail file name before its first comma (the whole name if there is none)."""
    identifier, _sep, _rest = name.partition(",")
    return identifier
def _html_body(msg) -> str | None:
for part in msg.walk():
if part.get_content_type() == "text/html":
p = part.get_payload(decode=True)
if p:
return p.decode(part.get_content_charset() or "utf-8", errors="replace")
for part in msg.walk():
if part.get_content_type() == "text/plain":
p = part.get_payload(decode=True)
if p:
return p.decode(part.get_content_charset() or "utf-8", errors="replace")
return None
def _write_md(path: Path, content: str):
    """Write *content* to *path* as UTF-8, creating parent directories, and log the written path."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as fh:
        fh.write(content)
    shown = path.relative_to(OUTPUT.parent)
    log.info(" wrote %s", shown)
def _write_jpg(dest_dir: Path, src: Path):
    """Store image *src* in *dest_dir* as ``<stem>.jpg``.

    JPEG sources are copied unchanged; other formats are converted to RGB and
    re-encoded. If the conversion fails, the original file is copied under its
    own name instead. The resulting path is logged.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / (src.stem + ".jpg")
    if src.suffix.lower() in {".jpg", ".jpeg"}:
        shutil.copy2(src, dest)
    else:
        from PIL import Image
        try:
            # Context manager releases the source file handle after encoding.
            with Image.open(src) as img:
                img.convert("RGB").save(dest, "JPEG")
        except Exception:
            # Not decodable as an image: keep the bytes under the original name.
            shutil.copy2(src, dest_dir / src.name)
            dest = dest_dir / src.name
    log.info(" saved %s", dest.relative_to(OUTPUT.parent))
# ---------------------------------------------------------------------------
# Process one email
# ---------------------------------------------------------------------------
def _named_parts(msg):
    """Yield ``(part, safe_name, payload)`` for each MIME part with a file name and a non-empty payload.

    ``safe_name`` is the file name with every character other than
    alphanumerics, ``.``, ``_`` and ``-`` replaced by ``_``.
    """
    for part in msg.walk():
        fn = part.get_filename() or part.get_param("name")
        if not fn:
            continue
        payload = part.get_payload(decode=True)
        if not payload:
            continue
        safe = "".join(c if c.isalnum() or c in "._-" else "_" for c in fn)
        yield part, safe, payload


def process_email(filepath: Path, msg=None):
    """Export one email to ``OUTPUT/<email_id>/{markitdown,docling}/``.

    Args:
        filepath: Path of the raw message file; its name determines the email id.
        msg: Already-parsed message. When None, *filepath* is parsed here.

    The body (HTML preferred) is written as ``body.md`` in both trees after
    ``cid:`` references are rewritten to the exported attachment names.
    Image attachments are saved as JPEG in both trees, plus an OCR ``.md``
    in the docling tree when OCR yields text. Every other attachment is
    converted by both engines to ``<stem>.md``.
    """
    if msg is None:
        with filepath.open("rb") as f:
            msg = email.message_from_binary_file(f, policy=email.policy.compat32)
    eid = _email_id(filepath.name)
    # str(): under the compat32 policy a header holding raw non-ASCII bytes is
    # returned as a Header object, which cannot be sliced.
    log.info("Processing [%s] %s", eid[:30], str(msg.get("subject", ""))[:60])
    md_dir = OUTPUT / eid / "markitdown"
    doc_dir = OUTPUT / eid / "docling"
    with tempfile.TemporaryDirectory() as _tmp:
        tmp = Path(_tmp)
        # Build CID map and extract all parts to tmp
        cid_map: dict[str, str] = {}
        for part, safe, payload in _named_parts(msg):
            (tmp / safe).write_bytes(payload)
            ext = Path(safe).suffix.lower()
            # Images are exported as <stem>.jpg, so inline references must use that name.
            out_name = Path(safe).stem + ".jpg" if ext in IMAGE_EXTS else safe
            cid = part.get("Content-ID", "").strip("<>")
            if cid:
                cid_map[cid] = out_name

        def fix_cid(html: str) -> str:
            # Replace cid:<id> with the exported file name; unknown ids are left untouched.
            return re.sub(r'cid:([^\s"\'>\)]+)',
                          lambda m: cid_map.get(m.group(1), f"cid:{m.group(1)}"), html)

        # Body
        body_html = _html_body(msg)
        if body_html:
            html_fixed = fix_cid(body_html)
            html_file = tmp / "body.html"
            html_file.write_text(html_fixed, encoding="utf-8")
            _write_md(md_dir / "body.md", convert_markitdown(html_file))
            docling_body = convert_docling(html_file)
            # Fill each "<!-- image -->" placeholder with the next <img src> of the
            # HTML, in document order ('image' once the sources run out).
            srcs = re.findall(r'<img[^>]+src=["\']?([^\s"\'>\)]+)', html_fixed, re.IGNORECASE)
            it = iter(srcs)
            docling_body = re.sub(r'<!-- image -->',
                                  lambda _: f"![]({next(it, 'image')})", docling_body)
            _write_md(doc_dir / "body.md", docling_body)
        else:
            log.warning(" no body")

        # Attachments
        for _part, safe, payload in _named_parts(msg):
            att = tmp / safe
            # Written again on purpose: body.html (or a same-named part) may
            # have replaced this file in tmp since the first pass.
            att.write_bytes(payload)
            ext = Path(safe).suffix.lower()
            stem = Path(safe).stem
            if ext in IMAGE_EXTS:
                _write_jpg(md_dir, att)
                _write_jpg(doc_dir, att)
                ocr = convert_docling(att)
                # Skip empty results and "<!-- ... -->" failure placeholders.
                if ocr and not ocr.startswith("<!--"):
                    _write_md(doc_dir / f"{stem}.md", ocr)
            else:
                _write_md(md_dir / f"{stem}.md", convert_markitdown(att))
                _write_md(doc_dir / f"{stem}.md", convert_docling(att))
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Export every file in MAILDIR, in sorted order, with a progress bar.

    Exits with an error message when MAILDIR does not exist. A failure on one
    email is logged and does not stop the run.
    """
    if not MAILDIR.exists():
        sys.exit(f"Maildir not found: {MAILDIR}")
    files = [f for f in sorted(MAILDIR.iterdir()) if f.is_file()]
    log.info("Found %d emails — vision: %s:%s", len(files), VISION_HOST, VISION_PORT)
    with tqdm(files, unit="email", desc="Exporting", ncols=80) as bar:
        for fp in bar:
            try:
                with fp.open("rb") as f:
                    msg = email.message_from_binary_file(f, policy=email.policy.compat32)
                # str(): under the compat32 policy a header holding raw non-ASCII
                # bytes is returned as a Header object, which cannot be sliced.
                bar.set_postfix_str(str(msg.get("subject", ""))[:40], refresh=False)
                process_email(fp, msg)
            except Exception as e:
                log.error("Failed %s: %s", fp.name[:40], e)
    log.info("Done → %s", OUTPUT)


if __name__ == "__main__":
    main()

View File

@ -10,6 +10,10 @@
reverse_proxy api-docling:8000
}
handle_path /api/unlimited-ocr/* {
reverse_proxy api-unlimited-ocr:8000
}
# Static files with SPA fallback
handle {
root * /srv

279
frontend/email-convert.html Normal file
View File

@ -0,0 +1,279 @@
<!DOCTYPE html>
<html lang="vi">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Email Convert — MarkItDown vs Docling</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css" rel="stylesheet" />
<link href="/style.css" rel="stylesheet" />
<script>
/**
 * Alpine.js component for the email-convert page.
 *
 * Holds the server-side file browser state (root listing plus a map of
 * expanded directories), the currently selected file, the conversion options
 * and one result pane per engine (`md` = MarkItDown, `dl` = Docling).
 */
function App() {
  return {
    basePath: '/workspace/emailfiles',
    rootEntries: [],
    expanded: {},           // directory path -> its listed entries (present only while expanded)
    browserLoading: false,
    selectedPath: null,
    selectedName: '',
    useLlm: true,
    dlFormat: 'markdown',
    converting: false,
    md: pane('md', 'MarkItDown', 'bg-primary', 'text-primary'),
    dl: pane('dl', 'Docling', 'bg-success', 'text-success'),

    /** Flattened, depth-annotated list of the entries currently visible in the tree. */
    get visibleEntries() {
      const out = []
      const walk = (entries, depth) => {
        for (const e of entries) {
          out.push({ ...e, depth })
          if (e.type === 'dir' && this.expanded[e.path])
            walk(this.expanded[e.path], depth + 1)
        }
      }
      walk(this.rootEntries, 0)
      return out
    },

    async init() { await this.loadDir(this.basePath, true) },

    /**
     * List `path` on the server. For a non-root directory that is already
     * expanded this collapses it instead. Failures are logged to the console
     * and leave the current listing unchanged.
     */
    async loadDir(path, isRoot = false) {
      if (!isRoot && this.expanded[path]) {
        const copy = { ...this.expanded }; delete copy[path]; this.expanded = copy; return
      }
      if (isRoot) this.browserLoading = true
      try {
        const res = await fetch(`/api/markitdown/browse?path=${encodeURIComponent(path)}`)
        if (!res.ok) throw new Error(`browse failed: HTTP ${res.status}`)
        const d = await res.json()
        if (d.type === 'dir') {
          if (isRoot) this.rootEntries = d.entries
          else this.expanded = { ...this.expanded, [path]: d.entries }
        }
      } catch (err) {
        // Previously swallowed silently; surface it for debugging.
        console.error('Could not list', path, err)
      } finally {
        this.browserLoading = false
      }
    },

    async changeBase() { this.expanded = {}; this.rootEntries = []; await this.loadDir(this.basePath, true) },

    /** Click handler: toggle a directory, or select a file. */
    onEntry(e) {
      if (e.type === 'dir') this.loadDir(e.path)
      else { this.selectedPath = e.path; this.selectedName = e.name }
    },

    /** Bootstrap-icon classes for an entry, chosen by type and extension. */
    fileIcon(e) {
      if (e.type === 'dir') return this.expanded[e.path] ? 'bi-folder2-open text-warning' : 'bi-folder2 text-warning'
      const m = { '.pdf':'bi-file-earmark-pdf text-danger','.docx':'bi-file-earmark-word text-primary',
        '.xlsx':'bi-file-earmark-excel text-success','.html':'bi-file-earmark-code text-info',
        '.htm':'bi-file-earmark-code text-info','.md':'bi-markdown text-secondary',
        '.txt':'bi-file-earmark-text text-secondary','.png':'bi-file-earmark-image text-info',
        '.jpg':'bi-file-earmark-image','.jpeg':'bi-file-earmark-image',
        '.csv':'bi-file-earmark-spreadsheet text-success' }
      return m[e.ext] || 'bi-file-earmark text-secondary'
    },

    /** Convert the selected file with both engines in parallel and fill both panes. */
    async convert() {
      if (!this.selectedPath || this.converting) return
      this.converting = true
      for (const p of [this.md, this.dl])
        Object.assign(p, { loading: true, done: false, content: '', error: null, status: 'Đang xử lý...' })
      const body = { path: this.selectedPath, use_llm: this.useLlm }
      // allSettled: one engine failing must not hide the other engine's result.
      const [r1, r2] = await Promise.allSettled([
        api('/api/markitdown/convert-path', body),
        api('/api/docling/convert-path', { ...body, output_format: this.dlFormat }),
      ])
      applyResult(this.md, r1); applyResult(this.dl, r2)
      this.converting = false
    },

    /** Download the content of pane `key` as `<selected name without extension>_<key>.md`. */
    download(key) {
      const p = this[key], base = this.selectedName.replace(/\.[^.]+$/, '')
      const a = Object.assign(document.createElement('a'), {
        href: URL.createObjectURL(new Blob([p.content], { type: 'text/markdown' })),
        download: `${base}_${key}.md`
      })
      a.click(); URL.revokeObjectURL(a.href)
    },
  }
}
/**
 * Build the initial state object for one result pane.
 * The first four fields identify and style the pane; the rest is
 * per-conversion state, starting out idle and empty on the "raw" tab.
 */
function pane(id, label, badgeClass, spinnerClass) {
  const identity = { id, label, badgeClass, spinnerClass }
  const idleState = {
    tab: 'raw',
    loading: false,
    done: false,
    error: null,
    content: '',
    preview: '',
    llmEnabled: false,
    ms: 0,
    status: '',
  }
  return { ...identity, ...idleState }
}
/**
 * POST `body` as JSON to `url`.
 *
 * @returns {Promise<{data: any, ms: number}>} parsed JSON reply and the
 *   elapsed time in milliseconds, measured up to the arrival of the response
 *   headers.
 * @throws {Error} on a non-2xx response; the message is the reply's `detail`
 *   when present, otherwise the status text or `HTTP <status>`.
 */
async function api(url, body) {
  const t0 = performance.now()
  const res = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body) })
  const ms = Math.round(performance.now() - t0)
  if (!res.ok) {
    const e = await res.json().catch(() => ({ detail: res.statusText }))
    const detail = e && e.detail
    // `detail` may be missing or structured (presumably a validation-error
    // list from the API — confirm against the services); stringify it so the
    // UI never shows "[object Object]" or an empty message.
    let message = `HTTP ${res.status}`
    if (typeof detail === 'string' && detail) message = detail
    else if (detail) message = JSON.stringify(detail)
    throw new Error(message)
  }
  return { data: await res.json(), ms }
}
/**
 * Copy one `Promise.allSettled` outcome into a result pane.
 *
 * Fulfilled: stores the text (`markdown`, else `content`), its rendered
 * preview, the LLM flag and the timing. Rejected: stores the error message
 * and clears everything left over from a previous conversion.
 * Either way the pane ends up `loading: false, done: true`.
 */
function applyResult(pane, result) {
  if (result.status === 'fulfilled') {
    const { data, ms } = result.value
    pane.content = data.markdown || data.content || ''
    // NOTE(review): the preview is bound with x-html, and this call does not
    // sanitize the converted document's HTML — verify whether a sanitizer is
    // needed before rendering untrusted email content.
    pane.preview = marked.parse(pane.content)
    pane.llmEnabled = !!data.llm_enabled
    pane.ms = ms; pane.status = `✅ ${ms} ms`; pane.error = null
  } else {
    const reason = result.reason
    const message = reason && reason.message !== undefined ? reason.message : String(reason)
    pane.content = ''; pane.error = message; pane.status = `❌ ${message}`
    // Without these resets the LLM badge, preview and timing of the previous
    // successful run would remain attached to a failed one.
    pane.preview = ''; pane.llmEnabled = false; pane.ms = 0
  }
  pane.loading = false; pane.done = true
}
</script>
</head>
<body>
<div id="app-nav"></div>
<script src="/layout.js"></script>
<div x-data="App()" x-init="init()">
<div class="container-xl py-4">
<div class="row g-3">
<!-- ── File browser ──────────────────────────────────────── -->
<div class="col-lg-4">
<div class="card shadow-sm h-100">
<div class="card-header py-2 d-flex align-items-center gap-2">
<i class="bi bi-hdd-fill text-secondary"></i>
<span class="fw-semibold small flex-grow-1">Duyệt file trên server</span>
<button class="btn btn-sm btn-outline-secondary py-0 px-2" @click="changeBase()">
<i class="bi bi-arrow-clockwise"></i>
</button>
</div>
<div class="px-3 pt-2 pb-1 border-bottom">
<select class="form-select form-select-sm" x-model="basePath" @change="changeBase()">
<option value="/workspace/emailfiles">emailfiles/</option>
<option value="/workspace/emailfiles/output">emailfiles/output/</option>
<option value="/workspace">/ (root project)</option>
</select>
<div class="path-badge text-muted mt-1" x-text="basePath"></div>
</div>
<div class="card-body p-0">
<div class="FileBrowser px-1 py-1">
<div x-show="browserLoading" class="text-center text-muted py-4 small">
<div class="spinner-border spinner-border-sm mb-2"></div><br>Đang tải...
</div>
<template x-if="!browserLoading">
<div>
<template x-for="e in visibleEntries" :key="e.path">
<div class="FileEntry"
:class="[e.type, selectedPath === e.path ? 'selected' : '', 'indent-' + Math.min(e.depth, 4)]"
@click="onEntry(e)" :title="e.path">
<i class="bi flex-shrink-0" :class="fileIcon(e)"></i>
<span class="text-truncate" x-text="e.name"></span>
</div>
</template>
</div>
</template>
</div>
</div>
</div>
</div>
<!-- ── Controls + Results ───────────────────────────────── -->
<div class="col-lg-8">
<!-- Controls bar -->
<div class="card shadow-sm mb-3">
<div class="card-body py-2">
<div class="d-flex flex-wrap align-items-center gap-2">
<i class="bi bi-file-earmark-text fs-5 text-secondary"></i>
<div class="flex-grow-1 overflow-hidden">
<div class="fw-semibold small text-truncate" :class="selectedPath ? '' : 'text-muted'"
x-text="selectedName || 'Chưa chọn file'"></div>
<div class="path-badge text-muted" x-text="selectedPath"></div>
</div>
<div class="d-flex align-items-center gap-2 ms-auto flex-shrink-0">
<label class="form-label mb-0 small fw-medium">Docling</label>
<select class="form-select form-select-sm" x-model="dlFormat" style="width:auto">
<option value="markdown">Markdown</option>
<option value="json">JSON</option>
<option value="html">HTML</option>
<option value="text">Plain Text</option>
</select>
<div class="form-check form-switch mb-0">
<input class="form-check-input" type="checkbox" x-model="useLlm" id="LlmToggle" />
<label class="form-check-label small fw-medium" for="LlmToggle">LLM</label>
</div>
<button class="btn btn-primary btn-sm" :disabled="!selectedPath || converting" @click="convert()">
<span x-show="converting" class="spinner-border spinner-border-sm me-1"></span>
<i x-show="!converting" class="bi bi-play-fill me-1"></i>Chuyển đổi
</button>
</div>
</div>
</div>
</div>
<!-- Status -->
<div class="row g-2 mb-3" x-show="md.done || dl.done || converting">
<template x-for="key in ['md','dl']" :key="key">
<div class="col-6">
<div class="card" :class="'border-' + $data[key].spinnerClass.replace('text-','') + '-subtle'">
<div class="card-body py-2 px-3 d-flex align-items-center gap-2">
<span class="badge rounded-circle p-1" :class="$data[key].badgeClass">&nbsp;</span>
<span class="small fw-semibold flex-grow-1" x-text="$data[key].label"></span>
<span x-show="$data[key].loading" class="spinner-border spinner-border-sm" :class="$data[key].spinnerClass"></span>
<small class="text-muted" x-text="$data[key].status"></small>
</div>
</div>
</div>
</template>
</div>
<!-- Result cards -->
<div class="row g-3">
<template x-for="key in ['md','dl']" :key="key">
<div class="col-md-6">
<div class="card shadow-sm h-100">
<div class="card-header d-flex justify-content-between align-items-center py-2">
<span class="fw-semibold small d-flex align-items-center gap-2">
<span class="badge rounded-circle p-1" :class="$data[key].badgeClass">&nbsp;</span>
<span x-text="$data[key].label"></span>
<span x-show="$data[key].llmEnabled" class="badge bg-primary-subtle text-primary" style="font-size:.65rem">🤖 LLM</span>
</span>
<div class="d-flex align-items-center gap-1">
<button x-show="$data[key].content" class="btn btn-outline-secondary btn-sm py-0 px-2"
@click="download(key)"><i class="bi bi-download"></i></button>
<ul class="nav nav-tabs card-header-tabs border-0">
<li class="nav-item">
<a class="nav-link py-1 px-2 small" :class="{ active: $data[key].tab==='raw' }"
href="#" @click.prevent="$data[key].tab='raw'">Raw</a>
</li>
<li class="nav-item">
<a class="nav-link py-1 px-2 small" :class="{ active: $data[key].tab==='preview' }"
href="#" @click.prevent="$data[key].tab='preview'">Preview</a>
</li>
</ul>
</div>
</div>
<div class="card-body p-0">
<div x-show="$data[key].loading" class="text-center text-muted py-5 small">
<div class="spinner-border spinner-border-sm mb-2" :class="$data[key].spinnerClass"></div>
<br>Đang xử lý...
</div>
<div x-show="!$data[key].loading && $data[key].error"
class="alert alert-danger m-3 small" x-text="$data[key].error"></div>
<pre x-show="!$data[key].loading && !$data[key].error && $data[key].tab==='raw'"
class="ResultPre p-3 m-0" x-text="$data[key].content"></pre>
<div x-show="!$data[key].loading && !$data[key].error && $data[key].tab==='preview'"
class="PreviewPane" x-html="$data[key].preview"></div>
<div x-show="!$data[key].loading && !$data[key].error && !$data[key].content"
class="text-center text-muted py-5 small">
<i class="bi bi-hdd fs-3 d-block mb-2"></i>Chọn file bên trái
</div>
</div>
</div>
</div>
</template>
</div>
</div>
</div>
</div>
</div>
<script src="https://cdn.jsdelivr.net/npm/marked@13/marked.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.14.9/dist/cdn.min.js"></script>
</body>
</html>

File diff suppressed because it is too large Load Diff

33
frontend/layout.js Normal file
View File

@ -0,0 +1,33 @@
/* Shared navbar: replaces the #app-nav placeholder and highlights the tab for the current URL. */
(() => {
  // Drop one trailing slash so "/foo/" and "/foo" compare equal; the bare root stays "/".
  const currentPath = location.pathname.replace(/\/$/, '') || '/';

  // The root link is active only on an exact match; any other link is active
  // when the current path starts with its href minus the ".html" suffix.
  const isActive = (href) => {
    if (href === '/') {
      return currentPath === '/';
    }
    return currentPath.startsWith(href.replace('.html', ''));
  };

  const tabs = [
    { href: '/', icon: 'bi-upload', label: 'Upload File' },
    { href: '/email-convert.html', icon: 'bi-envelope', label: 'Email Convert' },
  ];

  // Build one <li> per tab. The markup inside the template literals is kept
  // flush-left on purpose so the generated HTML string stays unchanged.
  const pieces = [];
  for (const { href, icon, label } of tabs) {
    const activeClass = isActive(href) ? ' active' : '';
    pieces.push(`<li class="nav-item">
<a class="nav-link py-1 px-3 small${activeClass}" href="${href}">
<i class="bi ${icon} me-1"></i>${label}
</a>
</li>`);
  }
  const navItems = pieces.join('');

  const navbar = `
<nav class="navbar navbar-light bg-white border-bottom px-4 py-2 d-flex justify-content-between">
<div class="d-flex align-items-center gap-3">
<span class="navbar-brand fw-bold mb-0">
MarkItDown <span class="text-muted fw-normal">vs</span> Docling
<span class="text-muted fw-normal">vs</span> Unlimited-OCR
</span>
<ul class="nav nav-pills">${navItems}</ul>
</div>
<span class="badge bg-primary-subtle text-primary">Demo LLM Input Processing</span>
</nav>`;

  // Pages without the placeholder are left untouched.
  const mount = document.getElementById('app-nav');
  if (!mount) {
    return;
  }
  mount.outerHTML = navbar;
})();

63
frontend/style.css Normal file
View File

@ -0,0 +1,63 @@
/* ── Shared styles ─────────────────────────────────────────── */
body { background: #f8f9fa; }
/* Upload zone */
/* Dashed box that acts as the click target; border and background changes are animated over .2s */
#UploadZone {
border: 2px dashed #dee2e6;
border-radius: .5rem;
padding: 3rem 1.5rem;
text-align: center;
cursor: pointer;
transition: border-color .2s, background .2s;
}
/* Same highlight on mouse hover and while the element carries the .dragover class */
#UploadZone:hover, #UploadZone.dragover {
border-color: #0d6efd;
background: #f0f6ff;
}
/* The native file input nested in the zone is never shown */
#UploadZone input[type="file"] { display: none; }
/* Result panes */
/* Raw output: height capped at 460px with scrolling; long lines wrap instead of overflowing sideways */
.ResultPre {
max-height: 460px;
overflow: auto;
white-space: pre-wrap;
word-break: break-word;
font-size: .78rem;
background: #f8f9fa;
}
/* github-markdown-css handles table/code/blockquote — only constrain height */
.PreviewPane {
max-height: 460px;
overflow: auto;
font-size: .85rem;
}
/* Server file browser */
/* Fixed-height list that scrolls vertically */
.FileBrowser {
height: 560px;
overflow-y: auto;
font-size: .82rem;
}
/* One row: icon and label on a single line, overflow cut with an ellipsis, text not selectable */
.FileEntry {
display: flex;
align-items: center;
gap: .4rem;
padding: .25rem .5rem;
border-radius: .3rem;
cursor: pointer;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
user-select: none;
}
/* Row states: hover, current selection, and the dir / file colour variants */
.FileEntry:hover { background: #e9ecef; }
.FileEntry.selected { background: #cfe2ff; color: #084298; font-weight: 500; }
.FileEntry.dir { color: #495057; }
.FileEntry.file { color: #212529; }
/* Left-padding steps; presumably one class per nesting depth of the file tree (confirm against the markup) */
.indent-1 { padding-left: 1.4rem; }
.indent-2 { padding-left: 2.6rem; }
.indent-3 { padding-left: 3.8rem; }
.indent-4 { padding-left: 5rem; }
/* Small monospace label that may break anywhere so long paths do not overflow */
.path-badge { font-size: .72rem; font-family: monospace; word-break: break-all; }

18
laravel-app/.editorconfig Normal file
View File

@ -0,0 +1,18 @@
root = true
[*]
charset = utf-8
end_of_line = lf
indent_size = 4
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
[*.md]
trim_trailing_whitespace = false
[*.{yml,yaml}]
indent_size = 2
[{compose,docker-compose}.{yml,yaml}]
indent_size = 4

65
laravel-app/.env.example Normal file
View File

@ -0,0 +1,65 @@
APP_NAME=Laravel
APP_ENV=local
APP_KEY=
APP_DEBUG=true
APP_URL=http://localhost
APP_LOCALE=en
APP_FALLBACK_LOCALE=en
APP_FAKER_LOCALE=en_US
APP_MAINTENANCE_DRIVER=file
# APP_MAINTENANCE_STORE=database
# PHP_CLI_SERVER_WORKERS=4
BCRYPT_ROUNDS=12
LOG_CHANNEL=stack
LOG_STACK=single
LOG_DEPRECATIONS_CHANNEL=null
LOG_LEVEL=debug
DB_CONNECTION=sqlite
# DB_HOST=127.0.0.1
# DB_PORT=3306
# DB_DATABASE=laravel
# DB_USERNAME=root
# DB_PASSWORD=
SESSION_DRIVER=database
SESSION_LIFETIME=120
SESSION_ENCRYPT=false
SESSION_PATH=/
SESSION_DOMAIN=null
BROADCAST_CONNECTION=log
FILESYSTEM_DISK=local
QUEUE_CONNECTION=database
CACHE_STORE=database
# CACHE_PREFIX=
MEMCACHED_HOST=127.0.0.1
REDIS_CLIENT=phpredis
REDIS_HOST=127.0.0.1
REDIS_PASSWORD=null
REDIS_PORT=6379
MAIL_MAILER=log
MAIL_SCHEME=null
MAIL_HOST=127.0.0.1
MAIL_PORT=2525
MAIL_USERNAME=null
MAIL_PASSWORD=null
MAIL_FROM_ADDRESS="hello@example.com"
MAIL_FROM_NAME="${APP_NAME}"
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_DEFAULT_REGION=us-east-1
AWS_BUCKET=
AWS_USE_PATH_STYLE_ENDPOINT=false
VITE_APP_NAME="${APP_NAME}"

11
laravel-app/.gitattributes vendored Normal file
View File

@ -0,0 +1,11 @@
* text=auto eol=lf
*.blade.php diff=html
*.css diff=css
*.html diff=html
*.md diff=markdown
*.php diff=php
/.github export-ignore
CHANGELOG.md export-ignore
.styleci.yml export-ignore

27
laravel-app/.gitignore vendored Normal file
View File

@ -0,0 +1,27 @@
*.log
.DS_Store
.env
.env.backup
.env.production
.phpactor.json
.phpunit.result.cache
/.codex
/.cursor/
/.idea
/.nova
/.phpunit.cache
/.vscode
/.zed
/auth.json
/node_modules
/public/build
/public/fonts-manifest.dev.json
/public/hot
/public/storage
/storage/*.key
/storage/pail
/vendor
_ide_helper.php
Homestead.json
Homestead.yaml
Thumbs.db

2
laravel-app/.npmrc Normal file
View File

@ -0,0 +1,2 @@
ignore-scripts=true
audit=true

58
laravel-app/README.md Normal file
View File

@ -0,0 +1,58 @@
<p align="center"><a href="https://laravel.com" target="_blank"><img src="https://raw.githubusercontent.com/laravel/art/master/logo-lockup/5%20SVG/2%20CMYK/1%20Full%20Color/laravel-logolockup-cmyk-red.svg" width="400" alt="Laravel Logo"></a></p>
<p align="center">
<a href="https://github.com/laravel/framework/actions"><img src="https://github.com/laravel/framework/workflows/tests/badge.svg" alt="Build Status"></a>
<a href="https://packagist.org/packages/laravel/framework"><img src="https://img.shields.io/packagist/dt/laravel/framework" alt="Total Downloads"></a>
<a href="https://packagist.org/packages/laravel/framework"><img src="https://img.shields.io/packagist/v/laravel/framework" alt="Latest Stable Version"></a>
<a href="https://packagist.org/packages/laravel/framework"><img src="https://img.shields.io/packagist/l/laravel/framework" alt="License"></a>
</p>
## About Laravel
Laravel is a web application framework with expressive, elegant syntax. We believe development must be an enjoyable and creative experience to be truly fulfilling. Laravel takes the pain out of development by easing common tasks used in many web projects, such as:
- [Simple, fast routing engine](https://laravel.com/docs/routing).
- [Powerful dependency injection container](https://laravel.com/docs/container).
- Multiple back-ends for [session](https://laravel.com/docs/session) and [cache](https://laravel.com/docs/cache) storage.
- Expressive, intuitive [database ORM](https://laravel.com/docs/eloquent).
- Database agnostic [schema migrations](https://laravel.com/docs/migrations).
- [Robust background job processing](https://laravel.com/docs/queues).
- [Real-time event broadcasting](https://laravel.com/docs/broadcasting).
Laravel is accessible, powerful, and provides tools required for large, robust applications.
## Learning Laravel
Laravel has the most extensive and thorough [documentation](https://laravel.com/docs) and video tutorial library of all modern web application frameworks, making it a breeze to get started with the framework.
In addition, [Laracasts](https://laracasts.com) contains thousands of video tutorials on a range of topics including Laravel, modern PHP, unit testing, and JavaScript. Boost your skills by digging into our comprehensive video library.
You can also watch bite-sized lessons with real-world projects on [Laravel Learn](https://laravel.com/learn), where you will be guided through building a Laravel application from scratch while learning PHP fundamentals.
## Agentic Development
Laravel's predictable structure and conventions make it ideal for AI coding agents like Claude Code, Cursor, and GitHub Copilot. Install [Laravel Boost](https://laravel.com/docs/ai) to supercharge your AI workflow:
```bash
composer require laravel/boost --dev
php artisan boost:install
```
Boost provides your agent 15+ tools and skills that help agents build Laravel applications while following best practices.
## Contributing
Thank you for considering contributing to the Laravel framework! The contribution guide can be found in the [Laravel documentation](https://laravel.com/docs/contributions).
## Code of Conduct
In order to ensure that the Laravel community is welcoming to all, please review and abide by the [Code of Conduct](https://laravel.com/docs/contributions#code-of-conduct).
## Security Vulnerabilities
If you discover a security vulnerability within Laravel, please send an e-mail to Taylor Otwell via [taylor@laravel.com](mailto:taylor@laravel.com). All security vulnerabilities will be promptly addressed.
## License
The Laravel framework is open-sourced software licensed under the [MIT license](https://opensource.org/licenses/MIT).

View File

@ -0,0 +1,49 @@
<?php
namespace App\Http\Controllers;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Http;
use Spatie\RouteAttributes\Attributes\Any;
use Spatie\RouteAttributes\Attributes\Middleware;
use Spatie\RouteAttributes\Attributes\Where;
/**
 * Reverse proxy for the conversion back-ends.
 *
 * Requests to /api/{service}/{path} are forwarded to the base URL registered
 * for {service} in SERVICES, and the upstream body, status code and
 * Content-Type are relayed back to the caller.
 */
#[Middleware('web')]
class ApiProxyController extends Controller
{
    /** Upstream base URL for each accepted {service} route segment. */
    private const SERVICES = [
        'markitdown'    => 'http://api-markitdown:8000',
        'docling'       => 'http://api-docling:8000',
        'unlimited-ocr' => 'http://api-unlimited-ocr:8000',
    ];

    /**
     * Forward the current request to an upstream service.
     *
     * When the request carries an uploaded `file`, it is re-sent as a
     * multipart POST together with the other scalar form fields. Otherwise
     * the raw body is forwarded with the original HTTP method and
     * Content-Type.
     *
     * @param  Request $request Incoming request to forward.
     * @param  string  $service Key into SERVICES; unknown keys abort with 404.
     * @param  string  $path    Remainder of the URL, appended to the upstream base URL.
     * @return \Illuminate\Http\Response|\Illuminate\Http\JsonResponse
     *         The relayed upstream response, or a 502 JSON error when the
     *         upstream cannot be reached.
     */
    #[Any('/api/{service}/{path?}')]
    #[Where('path', '.*')]
    public function proxy(Request $request, string $service, string $path = '')
    {
        $base = self::SERVICES[$service] ?? abort(404, "Unknown service: $service");
        $url  = rtrim($base, '/') . '/' . ltrim($path, '/');

        // The query string is passed through verbatim.
        if ($qs = $request->getQueryString()) {
            $url .= '?' . $qs;
        }

        // Long timeout: conversions can be slow.
        $http = Http::timeout(300)->withoutVerifying();

        try {
            if ($request->hasFile('file')) {
                $file = $request->file('file');

                // Forward the remaining form fields too; previously only the
                // file part reached the upstream. The CSRF token is internal
                // to this app, and non-scalar values are skipped because the
                // HTTP client treats array values as pre-built multipart parts.
                $fields = array_filter(
                    $request->post(),
                    fn ($value, $name) => $name !== '_token' && is_scalar($value),
                    ARRAY_FILTER_USE_BOTH,
                );

                // Hand over a stream so large uploads are not copied into memory.
                $response = $http
                    ->attach('file', fopen($file->getRealPath(), 'rb'), $file->getClientOriginalName())
                    ->post($url, $fields);
            } else {
                $contentType = $request->header('Content-Type', 'application/json');
                $response = $http
                    ->withHeaders(['Content-Type' => $contentType])
                    ->withBody($request->getContent(), $contentType)
                    ->send($request->method(), $url);
            }
        } catch (\Illuminate\Http\Client\ConnectionException $e) {
            // Unreachable or timed-out upstream: report a gateway error
            // without leaking the internal URL carried by the exception.
            return response()->json(['error' => "Upstream service unavailable: $service"], 502);
        }

        // The client Response::header() has no default parameter and returns
        // an empty string for a missing header, so the fallback is applied here.
        $upstreamType = $response->header('Content-Type') ?: 'application/json';

        return response($response->body(), $response->status())
            ->header('Content-Type', $upstreamType);
    }
}

View File

@ -0,0 +1,125 @@
<?php
namespace App\Http\Controllers;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\DB;
use Spatie\RouteAttributes\Attributes\Delete;
use Spatie\RouteAttributes\Attributes\Get;
use Spatie\RouteAttributes\Attributes\Middleware;
use Spatie\RouteAttributes\Attributes\Patch;
use Spatie\RouteAttributes\Attributes\Post;
/**
 * JSON endpoints for batch conversion jobs kept in the `batch_jobs` table of
 * the `db_markitdown` connection.
 */
#[Middleware('web')]
class BatchJobController extends Controller
{
    /** Database connection that holds the `batch_jobs` table. */
    private function db()
    {
        return DB::connection('db_markitdown');
    }

    /**
     * List the 20 most recently created jobs, without their file lists or results.
     *
     * @return \Illuminate\Http\JsonResponse
     */
    #[Get('/batch-jobs')]
    public function index()
    {
        $jobs = $this->db()->table('batch_jobs')
            ->orderByDesc('created_at')
            ->limit(20)
            ->get()
            ->map(fn($j) => [
                'id'          => $j->id,
                'dir_name'    => $j->dir_name,
                'dir_path'    => $j->dir_path,
                'output_base' => $j->output_base,
                'total'       => $j->total,
                'done_count'  => $j->done_count,
                'finished'    => (bool)$j->finished,
                'settings'    => json_decode($j->settings, true),
                'created_at'  => $j->created_at,
            ]);

        return response()->json($jobs);
    }

    /**
     * Create a job from a validated payload and return its new id.
     *
     * `total` is the number of entries in `files`; results start as an empty
     * JSON object and the job starts unfinished.
     *
     * @return \Illuminate\Http\JsonResponse {"id": int}
     */
    #[Post('/batch-jobs')]
    public function store(Request $request)
    {
        $data = $request->validate([
            'dir_path'    => 'required|string',
            'dir_name'    => 'required|string',
            'output_base' => 'nullable|string',
            'settings'    => 'nullable|array',
            'files'       => 'required|array',
        ]);

        $id = $this->db()->table('batch_jobs')->insertGetId([
            'dir_path'    => $data['dir_path'],
            'dir_name'    => $data['dir_name'],
            'output_base' => $data['output_base'] ?? null,
            'settings'    => json_encode($data['settings'] ?? []),
            'files'       => json_encode($data['files']),
            'results'     => json_encode((object)[]),
            'total'       => \count($data['files']),
            'done_count'  => 0,
            'finished'    => false,
            'created_at'  => now(),
            'updated_at'  => now(),
        ]);

        return response()->json(['id' => $id]);
    }

    /**
     * Return one job including its decoded settings, files and results.
     *
     * @return \Illuminate\Http\JsonResponse 404 with {"error": "Not found"} when the id is unknown.
     */
    #[Get('/batch-jobs/{id}')]
    public function show(int $id)
    {
        $job = $this->db()->table('batch_jobs')->where('id', $id)->first();
        if (!$job) {
            return response()->json(['error' => 'Not found'], 404);
        }

        return response()->json([
            'id'          => $job->id,
            'dir_path'    => $job->dir_path,
            'dir_name'    => $job->dir_name,
            'output_base' => $job->output_base,
            'settings'    => json_decode($job->settings, true),
            'files'       => json_decode($job->files, true),
            'results'     => json_decode($job->results, true),
            'total'       => $job->total,
            'done_count'  => $job->done_count,
            'finished'    => (bool)$job->finished,
            'created_at'  => $job->created_at,
        ]);
    }

    /**
     * Record the result for one file and/or change the job's finished flag.
     *
     * Request input: `path` (results key), `entry` (value stored under that
     * key) and an optional boolean `finished`. `done_count` is recomputed as
     * the number of result entries that carry an `md` or `dl` key.
     *
     * @return \Illuminate\Http\JsonResponse {"ok": true, "done_count": int}, or 404 when the id is unknown.
     */
    #[Patch('/batch-jobs/{id}')]
    public function update(Request $request, int $id)
    {
        $job = $this->db()->table('batch_jobs')->where('id', $id)->first();
        if (!$job) {
            return response()->json(['error' => 'Not found'], 404);
        }

        // Fall back to an empty map if the stored value is missing or not a JSON object/array.
        $results = json_decode((string)$job->results, true);
        if (!\is_array($results)) {
            $results = [];
        }

        $path  = $request->input('path');
        $entry = $request->input('entry', []);

        // Only string/int paths are valid array keys; anything else would raise a TypeError.
        if ((\is_string($path) || \is_int($path)) && $path && $entry) {
            $results[$path] = $entry;
        }

        // A PATCH that omits `finished` keeps the stored flag; previously it
        // was reset to false, reopening a finished job on a late result write.
        $finished = $request->has('finished')
            ? $request->boolean('finished')
            : (bool)$job->finished;

        $doneCount = \count(array_filter(
            $results,
            fn($r) => \is_array($r) && (isset($r['md']) || isset($r['dl']))
        ));

        $this->db()->table('batch_jobs')->where('id', $id)->update([
            // Cast to object so an empty map is stored as {} like store() does, not [].
            'results'    => json_encode((object)$results),
            'done_count' => $doneCount,
            'finished'   => $finished,
            'updated_at' => now(),
        ]);

        return response()->json(['ok' => true, 'done_count' => $doneCount]);
    }

    /**
     * Delete a job. Responds with {"ok": true} whether or not a row existed.
     *
     * @return \Illuminate\Http\JsonResponse
     */
    #[Delete('/batch-jobs/{id}')]
    public function destroy(int $id)
    {
        $this->db()->table('batch_jobs')->where('id', $id)->delete();

        return response()->json(['ok' => true]);
    }
}

View File

@ -0,0 +1,8 @@
<?php
namespace App\Http\Controllers;
abstract class Controller
{
//
}

View File

@ -0,0 +1,44 @@
<?php
namespace App\Http\Controllers;
use Illuminate\Http\Request;
use Spatie\RouteAttributes\Attributes\Get;
use Spatie\RouteAttributes\Attributes\Middleware;
/**
 * Serves the UI pages and a read-only JSON endpoint for files under /workspace.
 */
#[Middleware('web')]
class PageController extends Controller
{
    /** Only this directory and paths below it may be read through readFile(). */
    private const WORKSPACE_ROOT = '/workspace';

    /** Render the `index` view. */
    #[Get('/')]
    public function index()
    {
        return view('index');
    }

    /**
     * Render the `email_convert` view with the configured working directory.
     *
     * NOTE(review): env() is called outside a config file here. Per the
     * Laravel configuration docs, once the config is cached it only sees real
     * environment variables, not values from .env. Consider a config key.
     */
    #[Get('/email-convert')]
    public function emailConvert()
    {
        $workingDir = env('WORKING_DIR', 'working');

        return view('email_convert', compact('workingDir'));
    }

    /**
     * Return the content of a file under /workspace as UTF-8 text.
     *
     * Query parameter `path` names the file. The path is resolved with
     * realpath() before the containment check, so `..` segments and symlinks
     * cannot escape the workspace.
     *
     * @return \Illuminate\Http\JsonResponse {"path", "content"} on success;
     *         403 when the path is missing, malformed or outside the workspace;
     *         404 when it does not resolve to a regular file.
     */
    #[Get('/workspace-file')]
    public function readFile(Request $request)
    {
        $path = $request->query('path', '');

        // ?path[]=… yields an array and a NUL byte makes realpath() throw; both are rejected here.
        $abs = (is_string($path) && !str_contains($path, "\0")) ? realpath($path) : false;

        // Require the root itself or a path below "<root>/". A bare prefix
        // test would also accept siblings such as /workspace-private/secret.
        $insideWorkspace = $abs !== false
            && ($abs === self::WORKSPACE_ROOT || str_starts_with($abs, self::WORKSPACE_ROOT . '/'));

        if (!$insideWorkspace) {
            return response()->json(['error' => 'Access denied'], 403);
        }
        if (!is_file($abs)) {
            return response()->json(['error' => 'File not found'], 404);
        }

        $raw = file_get_contents($abs);

        // Transcode from the detected source encoding so json_encode() receives valid UTF-8.
        $sourceEncoding = mb_detect_encoding($raw, 'UTF-8,ISO-8859-1,Windows-1252', true) ?: 'UTF-8';
        $content = mb_convert_encoding($raw, 'UTF-8', $sourceEncoding);

        return response()->json(['path' => $abs, 'content' => $content]);
    }
}

View File

@ -0,0 +1,32 @@
<?php
namespace App\Models;
// use Illuminate\Contracts\Auth\MustVerifyEmail;
use Database\Factories\UserFactory;
use Illuminate\Database\Eloquent\Attributes\Fillable;
use Illuminate\Database\Eloquent\Attributes\Hidden;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Foundation\Auth\User as Authenticatable;
use Illuminate\Notifications\Notifiable;
/**
 * Eloquent user model built on the framework's authenticatable base class.
 *
 * Mass-assignable attributes: name, email, password.
 * Attributes hidden on serialization: password, remember_token.
 */
#[Fillable(['name', 'email', 'password'])]
#[Hidden(['password', 'remember_token'])]
class User extends Authenticatable
{
    /** @use HasFactory<UserFactory> */
    use HasFactory;
    use Notifiable;

    /**
     * Attribute cast map for this model.
     *
     * @return array<string, string>
     */
    protected function casts(): array
    {
        $casts = [];
        $casts['email_verified_at'] = 'datetime';
        $casts['password'] = 'hashed';

        return $casts;
    }
}

View File

@ -0,0 +1,24 @@
<?php
namespace App\Providers;
use Illuminate\Support\ServiceProvider;
class AppServiceProvider extends ServiceProvider
{
/**
* Register any application services.
*/
public function register(): void
{
//
}
/**
* Bootstrap any application services.
*/
public function boot(): void
{
//
}
}

18
laravel-app/artisan Executable file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env php
<?php
use Illuminate\Foundation\Application;
use Symfony\Component\Console\Input\ArgvInput;
// Record the script start time (float seconds from microtime) in a constant.
define('LARAVEL_START', microtime(true));
// Register the Composer autoloader...
require __DIR__.'/vendor/autoload.php';
// Bootstrap Laravel and handle the command...
/** @var Application $app */
$app = require_once __DIR__.'/bootstrap/app.php';
// Run the command described by the CLI arguments and keep its status code.
$status = $app->handleCommand(new ArgvInput);
// Exit the process with that status.
exit($status);

View File

@ -0,0 +1,21 @@
<?php
use Illuminate\Foundation\Application;
use Illuminate\Foundation\Configuration\Exceptions;
use Illuminate\Foundation\Configuration\Middleware;
use Illuminate\Http\Request;
// Build and return the application instance; the base path is the parent of this directory.
return Application::configure(basePath: dirname(__DIR__))
// Route files for web and console commands, plus the health-check path.
->withRouting(
web: __DIR__.'/../routes/web.php',
commands: __DIR__.'/../routes/console.php',
health: '/up',
)
// No middleware customisation yet.
->withMiddleware(function (Middleware $middleware): void {
//
})
->withExceptions(function (Exceptions $exceptions): void {
// Render exceptions as JSON for any request whose path matches api/*.
$exceptions->shouldRenderJsonWhen(
fn (Request $request) => $request->is('api/*'),
);
})->create();

View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,7 @@
<?php
use App\Providers\AppServiceProvider;
return [
AppServiceProvider::class,
];

87
laravel-app/composer.json Normal file
View File

@ -0,0 +1,87 @@
{
"$schema": "https://getcomposer.org/schema.json",
"name": "laravel/laravel",
"type": "project",
"description": "The skeleton application for the Laravel framework.",
"keywords": ["laravel", "framework"],
"license": "MIT",
"require": {
"php": "^8.3",
"laravel/framework": "^13.17",
"laravel/tinker": "^3.0.2",
"spatie/laravel-route-attributes": "^1.28"
},
"require-dev": {
"fakerphp/faker": "^1.24.1",
"laravel/pail": "^1.2.7",
"laravel/pao": "^1.1.2",
"laravel/pint": "^1.29.3",
"mockery/mockery": "^1.6.12",
"nunomaduro/collision": "^8.9.4",
"phpunit/phpunit": "^12.5.30"
},
"autoload": {
"psr-4": {
"App\\": "app/",
"Database\\Factories\\": "database/factories/",
"Database\\Seeders\\": "database/seeders/"
}
},
"autoload-dev": {
"psr-4": {
"Tests\\": "tests/"
}
},
"scripts": {
"setup": [
"composer install",
"@php -r \"file_exists('.env') || copy('.env.example', '.env');\"",
"@php artisan key:generate",
"@php artisan migrate --force",
"npm install --ignore-scripts",
"npm run build"
],
"dev": [
"Composer\\Config::disableProcessTimeout",
"npx concurrently -c \"#93c5fd,#c4b5fd,#fb7185,#fdba74\" \"php artisan serve\" \"php artisan queue:listen --tries=1 --timeout=0\" \"php artisan pail --timeout=0\" \"npm run dev\" --names=server,queue,logs,vite --kill-others"
],
"test": [
"@php artisan config:clear --ansi @no_additional_args",
"@php artisan test"
],
"post-autoload-dump": [
"Illuminate\\Foundation\\ComposerScripts::postAutoloadDump",
"@php artisan package:discover --ansi"
],
"post-update-cmd": [
"@php artisan vendor:publish --tag=laravel-assets --ansi --force"
],
"post-root-package-install": [
"@php -r \"file_exists('.env') || copy('.env.example', '.env');\""
],
"post-create-project-cmd": [
"@php artisan key:generate --ansi",
"@php -r \"file_exists('database/database.sqlite') || touch('database/database.sqlite');\"",
"@php artisan migrate --graceful --ansi"
],
"pre-package-uninstall": [
"Illuminate\\Foundation\\ComposerScripts::prePackageUninstall"
]
},
"extra": {
"laravel": {
"dont-discover": []
}
},
"config": {
"optimize-autoloader": true,
"preferred-install": "dist",
"sort-packages": true,
"allow-plugins": {
"pestphp/pest-plugin": true,
"php-http/discovery": true
}
},
"minimum-stability": "stable",
"prefer-stable": true
}

8537
laravel-app/composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

126
laravel-app/config/app.php Normal file
View File

@ -0,0 +1,126 @@
<?php
return [
/*
|--------------------------------------------------------------------------
| Application Name
|--------------------------------------------------------------------------
|
| This value is the name of your application, which will be used when the
| framework needs to place the application's name in a notification or
| other UI elements where an application name needs to be displayed.
|
*/
'name' => env('APP_NAME', 'Laravel'),
/*
|--------------------------------------------------------------------------
| Application Environment
|--------------------------------------------------------------------------
|
| This value determines the "environment" your application is currently
| running in. This may determine how you prefer to configure various
| services the application utilizes. Set this in your ".env" file.
|
*/
'env' => env('APP_ENV', 'production'),
/*
|--------------------------------------------------------------------------
| Application Debug Mode
|--------------------------------------------------------------------------
|
| When your application is in debug mode, detailed error messages with
| stack traces will be shown on every error that occurs within your
| application. If disabled, a simple generic error page is shown.
|
*/
'debug' => (bool) env('APP_DEBUG', false),
/*
|--------------------------------------------------------------------------
| Application URL
|--------------------------------------------------------------------------
|
| This URL is used by the console to properly generate URLs when using
| the Artisan command line tool. You should set this to the root of
| the application so that it's available within Artisan commands.
|
*/
'url' => env('APP_URL', 'http://localhost'),
/*
|--------------------------------------------------------------------------
| Application Timezone
|--------------------------------------------------------------------------
|
| Here you may specify the default timezone for your application, which
| will be used by the PHP date and date-time functions. The timezone
| is set to "UTC" by default as it is suitable for most use cases.
|
*/
'timezone' => 'UTC',
/*
|--------------------------------------------------------------------------
| Application Locale Configuration
|--------------------------------------------------------------------------
|
| The application locale determines the default locale that will be used
| by Laravel's translation / localization methods. This option can be
| set to any locale for which you plan to have translation strings.
|
*/
'locale' => env('APP_LOCALE', 'en'),
'fallback_locale' => env('APP_FALLBACK_LOCALE', 'en'),
'faker_locale' => env('APP_FAKER_LOCALE', 'en_US'),
/*
|--------------------------------------------------------------------------
| Encryption Key
|--------------------------------------------------------------------------
|
| This key is utilized by Laravel's encryption services and should be set
| to a random, 32 character string to ensure that all encrypted values
| are secure. You should do this prior to deploying the application.
|
*/
'cipher' => 'AES-256-CBC',
'key' => env('APP_KEY'),
'previous_keys' => [
...array_filter(
explode(',', (string) env('APP_PREVIOUS_KEYS', ''))
),
],
/*
|--------------------------------------------------------------------------
| Maintenance Mode Driver
|--------------------------------------------------------------------------
|
| These configuration options determine the driver used to determine and
| manage Laravel's "maintenance mode" status. The "cache" driver will
| allow maintenance mode to be controlled across multiple machines.
|
| Supported drivers: "file", "cache"
|
*/
'maintenance' => [
'driver' => env('APP_MAINTENANCE_DRIVER', 'file'),
'store' => env('APP_MAINTENANCE_STORE', 'database'),
],
];

117
laravel-app/config/auth.php Normal file
View File

@ -0,0 +1,117 @@
<?php
use App\Models\User;
return [
/*
|--------------------------------------------------------------------------
| Authentication Defaults
|--------------------------------------------------------------------------
|
| This option defines the default authentication "guard" and password
| reset "broker" for your application. You may change these values
| as required, but they're a perfect start for most applications.
|
*/
'defaults' => [
'guard' => env('AUTH_GUARD', 'web'),
'passwords' => env('AUTH_PASSWORD_BROKER', 'users'),
],
/*
|--------------------------------------------------------------------------
| Authentication Guards
|--------------------------------------------------------------------------
|
| Next, you may define every authentication guard for your application.
| Of course, a great default configuration has been defined for you
| which utilizes session storage plus the Eloquent user provider.
|
| All authentication guards have a user provider, which defines how the
| users are actually retrieved out of your database or other storage
| system used by the application. Typically, Eloquent is utilized.
|
| Supported: "session"
|
*/
'guards' => [
'web' => [
'driver' => 'session',
'provider' => 'users',
],
],
/*
|--------------------------------------------------------------------------
| User Providers
|--------------------------------------------------------------------------
|
| All authentication guards have a user provider, which defines how the
| users are actually retrieved out of your database or other storage
| system used by the application. Typically, Eloquent is utilized.
|
| If you have multiple user tables or models you may configure multiple
| providers to represent the model / table. These providers may then
| be assigned to any extra authentication guards you have defined.
|
| Supported: "database", "eloquent"
|
*/
'providers' => [
'users' => [
'driver' => 'eloquent',
'model' => env('AUTH_MODEL', User::class),
],
// 'users' => [
// 'driver' => 'database',
// 'table' => 'users',
// ],
],
/*
|--------------------------------------------------------------------------
| Resetting Passwords
|--------------------------------------------------------------------------
|
| These configuration options specify the behavior of Laravel's password
| reset functionality, including the table utilized for token storage
| and the user provider that is invoked to actually retrieve users.
|
| The expiry time is the number of minutes that each reset token will be
| considered valid. This security feature keeps tokens short-lived so
| they have less time to be guessed. You may change this as needed.
|
| The throttle setting is the number of seconds a user must wait before
| generating more password reset tokens. This prevents the user from
| quickly generating a very large amount of password reset tokens.
|
*/
'passwords' => [
'users' => [
'provider' => 'users',
'table' => env('AUTH_PASSWORD_RESET_TOKEN_TABLE', 'password_reset_tokens'),
'expire' => 60,
'throttle' => 60,
],
],
/*
|--------------------------------------------------------------------------
| Password Confirmation Timeout
|--------------------------------------------------------------------------
|
| Here you may define the number of seconds before a password confirmation
| window expires and users are asked to re-enter their password via the
| confirmation screen. By default, the timeout lasts for three hours.
|
*/
'password_timeout' => env('AUTH_PASSWORD_TIMEOUT', 10800),
];

View File

@ -0,0 +1,136 @@
<?php
use Illuminate\Support\Str;
return [
/*
|--------------------------------------------------------------------------
| Default Cache Store
|--------------------------------------------------------------------------
|
| This option controls the default cache store that will be used by the
| framework. This connection is utilized if another isn't explicitly
| specified when running a cache operation inside the application.
|
*/
'default' => env('CACHE_STORE', 'database'),
/*
|--------------------------------------------------------------------------
| Cache Stores
|--------------------------------------------------------------------------
|
| Here you may define all of the cache "stores" for your application as
| well as their drivers. You may even define multiple stores for the
| same cache driver to group types of items stored in your caches.
|
| Supported drivers: "array", "database", "file", "memcached",
| "redis", "dynamodb", "storage", "octane",
| "session", "failover", "null"
|
*/
'stores' => [
'array' => [
'driver' => 'array',
'serialize' => false,
],
'database' => [
'driver' => 'database',
'connection' => env('DB_CACHE_CONNECTION'),
'table' => env('DB_CACHE_TABLE', 'cache'),
'lock_connection' => env('DB_CACHE_LOCK_CONNECTION'),
'lock_table' => env('DB_CACHE_LOCK_TABLE'),
],
'file' => [
'driver' => 'file',
'path' => storage_path('framework/cache/data'),
'lock_path' => storage_path('framework/cache/data'),
],
'storage' => [
'driver' => 'storage',
'disk' => env('CACHE_STORAGE_DISK'),
'path' => env('CACHE_STORAGE_PATH', 'framework/cache/data'),
],
'memcached' => [
'driver' => 'memcached',
'persistent_id' => env('MEMCACHED_PERSISTENT_ID'),
'sasl' => [
env('MEMCACHED_USERNAME'),
env('MEMCACHED_PASSWORD'),
],
'options' => [
// Memcached::OPT_CONNECT_TIMEOUT => 2000,
],
'servers' => [
[
'host' => env('MEMCACHED_HOST', '127.0.0.1'),
'port' => env('MEMCACHED_PORT', 11211),
'weight' => 100,
],
],
],
'redis' => [
'driver' => 'redis',
'connection' => env('REDIS_CACHE_CONNECTION', 'cache'),
'lock_connection' => env('REDIS_CACHE_LOCK_CONNECTION', 'default'),
],
'dynamodb' => [
'driver' => 'dynamodb',
'key' => env('AWS_ACCESS_KEY_ID'),
'secret' => env('AWS_SECRET_ACCESS_KEY'),
'region' => env('AWS_DEFAULT_REGION', 'us-east-1'),
'table' => env('DYNAMODB_CACHE_TABLE', 'cache'),
'endpoint' => env('DYNAMODB_ENDPOINT'),
],
'octane' => [
'driver' => 'octane',
],
'failover' => [
'driver' => 'failover',
'stores' => [
'database',
'array',
],
],
],
/*
|--------------------------------------------------------------------------
| Cache Key Prefix
|--------------------------------------------------------------------------
|
| When utilizing the APC, database, memcached, Redis, and DynamoDB cache
| stores, there might be other applications using the same cache. For
| that reason, you may prefix every cache key to avoid collisions.
|
*/
'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-cache-'),
/*
|--------------------------------------------------------------------------
| Serializable Classes
|--------------------------------------------------------------------------
|
| This value determines the classes that can be unserialized from cache
| storage. By default, no PHP classes will be unserialized from your
| cache to prevent gadget chain attacks if your APP_KEY is leaked.
|
*/
'serializable_classes' => false,
];

View File

@ -0,0 +1,224 @@
<?php
use Illuminate\Support\Str;
use Pdo\Mysql;
return [
/*
|--------------------------------------------------------------------------
| Default Database Connection Name
|--------------------------------------------------------------------------
|
| Here you may specify which of the database connections below you wish
| to use as your default connection for database operations. This is
| the connection which will be utilized unless another connection
| is explicitly specified when you execute a query / statement.
|
*/
'default' => env('DB_CONNECTION', 'sqlite'),
/*
|--------------------------------------------------------------------------
| Database Connections
|--------------------------------------------------------------------------
|
| Below are all of the database connections defined for your application.
| An example configuration is provided for each database system which
| is supported by Laravel. You're free to add / remove connections.
|
*/
// The stock connections below (sqlite, mysql, mariadb, pgsql, sqlsrv) all read
// the same DB_* variables, so only the one selected as the default connection
// is meaningfully configured at a time. The three db_* service connections
// further down use their own DB_<SERVICE>_* variables instead.
'connections' => [
'sqlite' => [
'driver' => 'sqlite',
'url' => env('DB_URL'),
// Falls back to database/database.sqlite inside the application.
'database' => env('DB_DATABASE', database_path('database.sqlite')),
'prefix' => '',
'foreign_key_constraints' => env('DB_FOREIGN_KEYS', true),
'busy_timeout' => null,
'journal_mode' => null,
'synchronous' => null,
'transaction_mode' => 'DEFERRED',
],
'mysql' => [
'driver' => 'mysql',
'url' => env('DB_URL'),
'host' => env('DB_HOST', '127.0.0.1'),
'port' => env('DB_PORT', '3306'),
'database' => env('DB_DATABASE', 'laravel'),
'username' => env('DB_USERNAME', 'root'),
'password' => env('DB_PASSWORD', ''),
'unix_socket' => env('DB_SOCKET', ''),
'charset' => env('DB_CHARSET', 'utf8mb4'),
'collation' => env('DB_COLLATION', 'utf8mb4_unicode_ci'),
'prefix' => '',
'prefix_indexes' => true,
'strict' => true,
'engine' => null,
// Build the PDO options only when pdo_mysql is loaded; array_filter() then
// drops the SSL CA entry again when MYSQL_ATTR_SSL_CA is unset or empty.
'options' => extension_loaded('pdo_mysql') ? array_filter([
Mysql::ATTR_SSL_CA => env('MYSQL_ATTR_SSL_CA'),
]) : [],
],
'mariadb' => [
'driver' => 'mariadb',
'url' => env('DB_URL'),
'host' => env('DB_HOST', '127.0.0.1'),
'port' => env('DB_PORT', '3306'),
'database' => env('DB_DATABASE', 'laravel'),
'username' => env('DB_USERNAME', 'root'),
'password' => env('DB_PASSWORD', ''),
'unix_socket' => env('DB_SOCKET', ''),
'charset' => env('DB_CHARSET', 'utf8mb4'),
'collation' => env('DB_COLLATION', 'utf8mb4_unicode_ci'),
'prefix' => '',
'prefix_indexes' => true,
'strict' => true,
'engine' => null,
// Same conditional SSL CA option as the mysql connection.
'options' => extension_loaded('pdo_mysql') ? array_filter([
Mysql::ATTR_SSL_CA => env('MYSQL_ATTR_SSL_CA'),
]) : [],
],
'pgsql' => [
'driver' => 'pgsql',
'url' => env('DB_URL'),
'host' => env('DB_HOST', '127.0.0.1'),
'port' => env('DB_PORT', '5432'),
'database' => env('DB_DATABASE', 'laravel'),
'username' => env('DB_USERNAME', 'root'),
'password' => env('DB_PASSWORD', ''),
'charset' => env('DB_CHARSET', 'utf8'),
'prefix' => '',
'prefix_indexes' => true,
'search_path' => 'public',
'sslmode' => env('DB_SSLMODE', 'prefer'),
],
// ── AI Markdown services ──────────────────────────────────
// One PostgreSQL connection per conversion-service database. The default
// hosts are the docker-compose service names (db-markitdown, db-docling,
// db-unlimited-ocr), which presumably resolve only inside the compose network.
// Unlike 'pgsql' above, these ignore DB_URL and the shared DB_* variables, and
// charset / search_path / sslmode are fixed rather than env-driven.
// NOTE(review): username and password fall back to 'admin' / 'admin'. That
// looks intended for the local demo stack; confirm that any real deployment
// always overrides the DB_*_USER / DB_*_PASS variables.
'db_markitdown' => [
'driver' => 'pgsql',
'host' => env('DB_MARKITDOWN_HOST', 'db-markitdown'),
'port' => env('DB_MARKITDOWN_PORT', '5432'),
'database' => env('DB_MARKITDOWN_NAME', 'markitdown'),
'username' => env('DB_MARKITDOWN_USER', 'admin'),
'password' => env('DB_MARKITDOWN_PASS', 'admin'),
'charset' => 'utf8',
'prefix' => '',
'search_path' => 'public',
'sslmode' => 'prefer',
],
'db_docling' => [
'driver' => 'pgsql',
'host' => env('DB_DOCLING_HOST', 'db-docling'),
'port' => env('DB_DOCLING_PORT', '5432'),
'database' => env('DB_DOCLING_NAME', 'docling'),
'username' => env('DB_DOCLING_USER', 'admin'),
'password' => env('DB_DOCLING_PASS', 'admin'),
'charset' => 'utf8',
'prefix' => '',
'search_path' => 'public',
'sslmode' => 'prefer',
],
'db_unlimited_ocr' => [
'driver' => 'pgsql',
'host' => env('DB_UNLIMITED_OCR_HOST', 'db-unlimited-ocr'),
'port' => env('DB_UNLIMITED_OCR_PORT', '5432'),
'database' => env('DB_UNLIMITED_OCR_NAME', 'unlimited_ocr'),
'username' => env('DB_UNLIMITED_OCR_USER', 'admin'),
'password' => env('DB_UNLIMITED_OCR_PASS', 'admin'),
'charset' => 'utf8',
'prefix' => '',
'search_path' => 'public',
'sslmode' => 'prefer',
],
'sqlsrv' => [
'driver' => 'sqlsrv',
'url' => env('DB_URL'),
'host' => env('DB_HOST', 'localhost'),
'port' => env('DB_PORT', '1433'),
'database' => env('DB_DATABASE', 'laravel'),
'username' => env('DB_USERNAME', 'root'),
'password' => env('DB_PASSWORD', ''),
'charset' => env('DB_CHARSET', 'utf8'),
'prefix' => '',
'prefix_indexes' => true,
// 'encrypt' => env('DB_ENCRYPT', 'yes'),
// 'trust_server_certificate' => env('DB_TRUST_SERVER_CERTIFICATE', 'false'),
],
],
/*
|--------------------------------------------------------------------------
| Migration Repository Table
|--------------------------------------------------------------------------
|
| This table keeps track of all the migrations that have already run for
| your application. Using this information, we can determine which of
| the migrations on disk haven't actually been run on the database.
|
*/
'migrations' => [
'table' => 'migrations',
'update_date_on_publish' => true,
],
/*
|--------------------------------------------------------------------------
| Redis Databases
|--------------------------------------------------------------------------
|
| Redis is an open source, fast, and advanced key-value store that also
| provides a richer body of commands than a typical key-value system
| such as Memcached. You may define your connection settings here.
|
*/
// Redis client selection, client-level options, and two named connections.
'redis' => [
'client' => env('REDIS_CLIENT', 'phpredis'),
'options' => [
'cluster' => env('REDIS_CLUSTER', 'redis'),
// Key prefix defaults to the slugged APP_NAME followed by "-database-".
'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-database-'),
'persistent' => env('REDIS_PERSISTENT', false),
],
'default' => [
'url' => env('REDIS_URL'),
'host' => env('REDIS_HOST', '127.0.0.1'),
'username' => env('REDIS_USERNAME'),
'password' => env('REDIS_PASSWORD'),
'port' => env('REDIS_PORT', '6379'),
'database' => env('REDIS_DB', '0'),
'max_retries' => env('REDIS_MAX_RETRIES', 3),
'backoff_algorithm' => env('REDIS_BACKOFF_ALGORITHM', 'decorrelated_jitter'),
'backoff_base' => env('REDIS_BACKOFF_BASE', 100),
'backoff_cap' => env('REDIS_BACKOFF_CAP', 1000),
],
// Same server, credentials and retry settings as 'default'; only the
// database index differs (REDIS_CACHE_DB, default 1, instead of REDIS_DB,
// default 0).
'cache' => [
'url' => env('REDIS_URL'),
'host' => env('REDIS_HOST', '127.0.0.1'),
'username' => env('REDIS_USERNAME'),
'password' => env('REDIS_PASSWORD'),
'port' => env('REDIS_PORT', '6379'),
'database' => env('REDIS_CACHE_DB', '1'),
'max_retries' => env('REDIS_MAX_RETRIES', 3),
'backoff_algorithm' => env('REDIS_BACKOFF_ALGORITHM', 'decorrelated_jitter'),
'backoff_base' => env('REDIS_BACKOFF_BASE', 100),
'backoff_cap' => env('REDIS_BACKOFF_CAP', 1000),
],
],
];

View File

@ -0,0 +1,80 @@
<?php
return [
/*
|--------------------------------------------------------------------------
| Default Filesystem Disk
|--------------------------------------------------------------------------
|
| Here you may specify the default filesystem disk that should be used
| by the framework. The "local" disk, as well as a variety of cloud
| based disks are available to your application for file storage.
|
*/
'default' => env('FILESYSTEM_DISK', 'local'),
/*
|--------------------------------------------------------------------------
| Filesystem Disks
|--------------------------------------------------------------------------
|
| Below you may configure as many filesystem disks as necessary, and you
| may even configure multiple disks for the same driver. Examples for
| most supported storage drivers are configured here for reference.
|
| Supported drivers: "local", "ftp", "sftp", "s3"
|
*/
// Three disks: a private local disk, a web-facing local disk, and S3.
// NOTE(review): 'throw' / 'report' are false on every disk, which presumably
// means failed operations neither raise nor get reported; confirm callers
// check return values.
'disks' => [
'local' => [
'driver' => 'local',
// Files live under storage/app/private.
'root' => storage_path('app/private'),
'serve' => true,
'throw' => false,
'report' => false,
],
'public' => [
'driver' => 'local',
'root' => storage_path('app/public'),
// Public URL is APP_URL (trailing slash removed) plus "/storage"; this is
// the path the 'links' entry of this file maps onto storage/app/public.
'url' => rtrim(env('APP_URL', 'http://localhost'), '/').'/storage',
'visibility' => 'public',
'throw' => false,
'report' => false,
],
's3' => [
'driver' => 's3',
// Everything comes from AWS_* variables; only the path-style flag has a
// default, so the disk is unusable until those variables are set.
'key' => env('AWS_ACCESS_KEY_ID'),
'secret' => env('AWS_SECRET_ACCESS_KEY'),
'region' => env('AWS_DEFAULT_REGION'),
'bucket' => env('AWS_BUCKET'),
'url' => env('AWS_URL'),
'endpoint' => env('AWS_ENDPOINT'),
'use_path_style_endpoint' => env('AWS_USE_PATH_STYLE_ENDPOINT', false),
'throw' => false,
'report' => false,
],
],
/*
|--------------------------------------------------------------------------
| Symbolic Links
|--------------------------------------------------------------------------
|
| Here you may configure the symbolic links that will be created when the
| `storage:link` Artisan command is executed. The array keys should be
| the locations of the links and the values should be their targets.
|
*/
'links' => [
public_path('storage') => storage_path('app/public'),
],
];

View File

@ -0,0 +1,132 @@
<?php
use Monolog\Handler\NullHandler;
use Monolog\Handler\StreamHandler;
use Monolog\Handler\SyslogUdpHandler;
use Monolog\Processor\PsrLogMessageProcessor;
return [
/*
|--------------------------------------------------------------------------
| Default Log Channel
|--------------------------------------------------------------------------
|
| This option defines the default log channel that is utilized to write
| messages to your logs. The value provided here should match one of
| the channels present in the list of "channels" configured below.
|
*/
'default' => env('LOG_CHANNEL', 'stack'),
/*
|--------------------------------------------------------------------------
| Deprecations Log Channel
|--------------------------------------------------------------------------
|
| This option controls the log channel that should be used to log warnings
| regarding deprecated PHP and library features. This allows you to get
| your application ready for upcoming major versions of dependencies.
|
*/
'deprecations' => [
'channel' => env('LOG_DEPRECATIONS_CHANNEL', 'null'),
'trace' => env('LOG_DEPRECATIONS_TRACE', false),
],
/*
|--------------------------------------------------------------------------
| Log Channels
|--------------------------------------------------------------------------
|
| Here you may configure the log channels for your application. Laravel
| utilizes the Monolog PHP logging library, which includes a variety
| of powerful log handlers and formatters that you're free to use.
|
| Available drivers: "single", "daily", "slack", "syslog",
| "errorlog", "monolog", "custom", "stack"
|
*/
// All channels share the LOG_LEVEL variable; only their fallback level differs
// ('critical' for slack, 'debug' everywhere else that sets a level).
'channels' => [
'stack' => [
'driver' => 'stack',
// LOG_STACK is a comma-separated list of channel names; default is just
// the 'single' channel.
'channels' => explode(',', (string) env('LOG_STACK', 'single')),
'ignore_exceptions' => false,
],
'single' => [
'driver' => 'single',
'path' => storage_path('logs/laravel.log'),
'level' => env('LOG_LEVEL', 'debug'),
'replace_placeholders' => true,
],
'daily' => [
'driver' => 'daily',
'path' => storage_path('logs/laravel.log'),
'level' => env('LOG_LEVEL', 'debug'),
// Number of daily files to keep (LOG_DAILY_DAYS, default 14).
'days' => env('LOG_DAILY_DAYS', 14),
'replace_placeholders' => true,
],
'slack' => [
'driver' => 'slack',
'url' => env('LOG_SLACK_WEBHOOK_URL'),
'username' => env('LOG_SLACK_USERNAME', env('APP_NAME', 'Laravel')),
'emoji' => env('LOG_SLACK_EMOJI', ':boom:'),
'level' => env('LOG_LEVEL', 'critical'),
'replace_placeholders' => true,
],
'papertrail' => [
'driver' => 'monolog',
'level' => env('LOG_LEVEL', 'debug'),
'handler' => env('LOG_PAPERTRAIL_HANDLER', SyslogUdpHandler::class),
'handler_with' => [
'host' => env('PAPERTRAIL_URL'),
'port' => env('PAPERTRAIL_PORT'),
// Built from the same two variables as host/port above.
'connectionString' => 'tls://'.env('PAPERTRAIL_URL').':'.env('PAPERTRAIL_PORT'),
],
'processors' => [PsrLogMessageProcessor::class],
],
'stderr' => [
'driver' => 'monolog',
'level' => env('LOG_LEVEL', 'debug'),
'handler' => StreamHandler::class,
'handler_with' => [
'stream' => 'php://stderr',
],
'formatter' => env('LOG_STDERR_FORMATTER'),
'processors' => [PsrLogMessageProcessor::class],
],
'syslog' => [
'driver' => 'syslog',
'level' => env('LOG_LEVEL', 'debug'),
'facility' => env('LOG_SYSLOG_FACILITY', LOG_USER),
'replace_placeholders' => true,
],
'errorlog' => [
'driver' => 'errorlog',
'level' => env('LOG_LEVEL', 'debug'),
'replace_placeholders' => true,
],
// Monolog NullHandler channel; it is the default target of the
// 'deprecations' setting in this file.
'null' => [
'driver' => 'monolog',
'handler' => NullHandler::class,
],
// Only a path is configured here (no driver).
// NOTE(review): presumably the framework's fallback when the configured
// channel cannot be created; confirm against the logging documentation.
'emergency' => [
'path' => storage_path('logs/laravel.log'),
],
],
];

118
laravel-app/config/mail.php Normal file
View File

@ -0,0 +1,118 @@
<?php
return [
/*
|--------------------------------------------------------------------------
| Default Mailer
|--------------------------------------------------------------------------
|
| This option controls the default mailer that is used to send all email
| messages unless another mailer is explicitly specified when sending
| the message. All additional mailers can be configured within the
| "mailers" array. Examples of each type of mailer are provided.
|
*/
'default' => env('MAIL_MAILER', 'log'),
/*
|--------------------------------------------------------------------------
| Mailer Configurations
|--------------------------------------------------------------------------
|
| Here you may configure all of the mailers used by your application plus
| their respective settings. Several examples have been configured for
| you and you are free to add your own as your application requires.
|
| Laravel supports a variety of mail "transport" drivers that can be used
| when delivering an email. You may specify which one you're using for
| your mailers below. You may also add additional mailers if needed.
|
| Supported: "smtp", "sendmail", "mailgun", "ses", "ses-v2",
| "postmark", "resend", "log", "array",
| "failover", "roundrobin"
|
*/
// Named mailers. The 'default' entry of this file falls back to 'log', so no
// mail leaves the application unless MAIL_MAILER selects another mailer.
'mailers' => [
'smtp' => [
'transport' => 'smtp',
'scheme' => env('MAIL_SCHEME'),
'url' => env('MAIL_URL'),
'host' => env('MAIL_HOST', '127.0.0.1'),
'port' => env('MAIL_PORT', 2525),
'username' => env('MAIL_USERNAME'),
'password' => env('MAIL_PASSWORD'),
'timeout' => null,
// EHLO domain defaults to the host part of APP_URL.
'local_domain' => env('MAIL_EHLO_DOMAIN', parse_url((string) env('APP_URL', 'http://localhost'), PHP_URL_HOST)),
],
// ses / postmark / resend declare only a transport here.
// NOTE(review): their credentials presumably come from the third-party
// services config (POSTMARK_API_KEY, RESEND_API_KEY, AWS_*); confirm.
'ses' => [
'transport' => 'ses',
],
'postmark' => [
'transport' => 'postmark',
// 'message_stream_id' => env('POSTMARK_MESSAGE_STREAM_ID'),
// 'client' => [
// 'timeout' => 5,
// ],
],
'resend' => [
'transport' => 'resend',
],
'sendmail' => [
'transport' => 'sendmail',
'path' => env('MAIL_SENDMAIL_PATH', '/usr/sbin/sendmail -bs -i'),
],
'log' => [
'transport' => 'log',
'channel' => env('MAIL_LOG_CHANNEL'),
],
'array' => [
'transport' => 'array',
],
// Lists 'smtp' first and 'log' second as its member mailers.
'failover' => [
'transport' => 'failover',
'mailers' => [
'smtp',
'log',
],
'retry_after' => 60,
],
// Lists 'ses' and 'postmark' as its member mailers.
'roundrobin' => [
'transport' => 'roundrobin',
'mailers' => [
'ses',
'postmark',
],
'retry_after' => 60,
],
],
/*
|--------------------------------------------------------------------------
| Global "From" Address
|--------------------------------------------------------------------------
|
| You may wish for all emails sent by your application to be sent from
| the same address. Here you may specify a name and address that is
| used globally for all emails that are sent by your application.
|
*/
'from' => [
'address' => env('MAIL_FROM_ADDRESS', 'hello@example.com'),
'name' => env('MAIL_FROM_NAME', env('APP_NAME', 'Laravel')),
],
];

View File

@ -0,0 +1,129 @@
<?php
return [
/*
|--------------------------------------------------------------------------
| Default Queue Connection Name
|--------------------------------------------------------------------------
|
| Laravel's queue supports a variety of backends via a single, unified
| API, giving you convenient access to each backend using identical
| syntax for each. The default queue connection is defined below.
|
*/
'default' => env('QUEUE_CONNECTION', 'database'),
/*
|--------------------------------------------------------------------------
| Queue Connections
|--------------------------------------------------------------------------
|
| Here you may configure the connection options for every queue backend
| used by your application. An example configuration is provided for
| each backend supported by Laravel. You're also free to add more.
|
| Drivers: "sync", "database", "beanstalkd", "sqs", "redis",
| "deferred", "background", "failover", "null"
|
*/
// Queue backends. The 'default' entry of this file falls back to 'database'.
// retry_after values are cast to int because they are read from env().
'connections' => [
'sync' => [
'driver' => 'sync',
],
'database' => [
'driver' => 'database',
// No fallback value is given for DB_QUEUE_CONNECTION.
// NOTE(review): presumably an unset value means "use the default database
// connection"; confirm against the queue documentation.
'connection' => env('DB_QUEUE_CONNECTION'),
'table' => env('DB_QUEUE_TABLE', 'jobs'),
'queue' => env('DB_QUEUE', 'default'),
'retry_after' => (int) env('DB_QUEUE_RETRY_AFTER', 90),
'after_commit' => false,
],
'beanstalkd' => [
'driver' => 'beanstalkd',
'host' => env('BEANSTALKD_QUEUE_HOST', 'localhost'),
'queue' => env('BEANSTALKD_QUEUE', 'default'),
'retry_after' => (int) env('BEANSTALKD_QUEUE_RETRY_AFTER', 90),
'block_for' => 0,
'after_commit' => false,
],
'sqs' => [
'driver' => 'sqs',
'key' => env('AWS_ACCESS_KEY_ID'),
'secret' => env('AWS_SECRET_ACCESS_KEY'),
// The default prefix is a placeholder ("your-account-id"); SQS_PREFIX must
// be set before this connection can work.
'prefix' => env('SQS_PREFIX', 'https://sqs.us-east-1.amazonaws.com/your-account-id'),
'queue' => env('SQS_QUEUE', 'default'),
'suffix' => env('SQS_SUFFIX'),
'region' => env('AWS_DEFAULT_REGION', 'us-east-1'),
'after_commit' => false,
],
'redis' => [
'driver' => 'redis',
// Names a Redis connection (default: 'default').
'connection' => env('REDIS_QUEUE_CONNECTION', 'default'),
'queue' => env('REDIS_QUEUE', 'default'),
'retry_after' => (int) env('REDIS_QUEUE_RETRY_AFTER', 90),
'block_for' => null,
'after_commit' => false,
],
'deferred' => [
'driver' => 'deferred',
],
'background' => [
'driver' => 'background',
],
// Lists 'database' first and 'deferred' second as its member connections.
'failover' => [
'driver' => 'failover',
'connections' => [
'database',
'deferred',
],
],
],
/*
|--------------------------------------------------------------------------
| Job Batching
|--------------------------------------------------------------------------
|
| The following options configure the database and table that store job
| batching information. These options can be updated to any database
| connection and table which has been defined by your application.
|
*/
'batching' => [
'database' => env('DB_CONNECTION', 'sqlite'),
'table' => 'job_batches',
],
/*
|--------------------------------------------------------------------------
| Failed Queue Jobs
|--------------------------------------------------------------------------
|
| These options configure the behavior of failed queue job logging so you
| can control how and where failed jobs are stored. Laravel ships with
| support for storing failed jobs in a simple file or in a database.
|
| Supported drivers: "database-uuids", "dynamodb", "file", "null"
|
*/
'failed' => [
'driver' => env('QUEUE_FAILED_DRIVER', 'database-uuids'),
'database' => env('DB_CONNECTION', 'sqlite'),
'table' => 'failed_jobs',
],
];

View File

@ -0,0 +1,46 @@
<?php
return [
/*
* Automatic registration of routes will only happen if this setting is `true`
*/
'enabled' => true,
/*
* Controllers in these directories that have routing attributes
* will automatically be registered.
*
* Optionally, you can specify group configuration by using key/values
*/
'directories' => [
// Active entry: scan app/Http/Controllers with no group options attached.
app_path('Http/Controllers'),
// Disabled example below: a directory used as the array key, mapped to group
// options (URL prefix, middleware, and file-name include/exclude patterns).
/*
app_path('Http/Controllers/Api') => [
'prefix' => 'api',
'middleware' => 'api',
// only register routes in files that match the patterns
'patterns' => ['*Controller.php'],
// do not register routes in files that match the patterns
'not_patterns' => [],
],
*/
],
/*
* This middleware will be applied to all routes.
*/
'middleware' => [
\Illuminate\Routing\Middleware\SubstituteBindings::class,
],
/*
* When enabled, implicitly scoped bindings will be enabled by default.
* You can override this behaviour by using the `ScopeBindings` attribute, and passing `false` to it.
*
* Possible values:
* - null: use the default behaviour
* - true: enable implicitly scoped bindings for all routes
* - false: disable implicitly scoped bindings for all routes
*/
'scope-bindings' => null,
];

View File

@ -0,0 +1,38 @@
<?php
return [
/*
|--------------------------------------------------------------------------
| Third Party Services
|--------------------------------------------------------------------------
|
| This file is for storing the credentials for third party services such
| as Mailgun, Postmark, AWS and more. This file provides the de facto
| location for this type of information, allowing packages to have
| a conventional file to locate the various service credentials.
|
*/
// Every value here is read from the environment with no fallback, except the
// SES region, which defaults to us-east-1.
// NOTE(review): postmark / resend / ses presumably back the mail transports of
// the same names; confirm against the mail configuration.
'postmark' => [
'key' => env('POSTMARK_API_KEY'),
],
'resend' => [
'key' => env('RESEND_API_KEY'),
],
'ses' => [
'key' => env('AWS_ACCESS_KEY_ID'),
'secret' => env('AWS_SECRET_ACCESS_KEY'),
'region' => env('AWS_DEFAULT_REGION', 'us-east-1'),
],
'slack' => [
// Bot token and default channel, grouped under the 'notifications' key.
'notifications' => [
'bot_user_oauth_token' => env('SLACK_BOT_USER_OAUTH_TOKEN'),
'channel' => env('SLACK_BOT_USER_DEFAULT_CHANNEL'),
],
],
];

View File

@ -0,0 +1,233 @@
<?php
use Illuminate\Support\Str;
return [
/*
|--------------------------------------------------------------------------
| Default Session Driver
|--------------------------------------------------------------------------
|
| This option determines the default session driver that is utilized for
| incoming requests. Laravel supports a variety of storage options to
| persist session data. Database storage is a great default choice.
|
| Supported: "file", "cookie", "database", "memcached",
| "redis", "dynamodb", "array"
|
*/
'driver' => env('SESSION_DRIVER', 'database'),
/*
|--------------------------------------------------------------------------
| Session Lifetime
|--------------------------------------------------------------------------
|
| Here you may specify the number of minutes that you wish the session
| to be allowed to remain idle before it expires. If you want them
| to expire immediately when the browser is closed then you may
| indicate that via the expire_on_close configuration option.
|
*/
'lifetime' => (int) env('SESSION_LIFETIME', 120),
'expire_on_close' => env('SESSION_EXPIRE_ON_CLOSE', false),
/*
|--------------------------------------------------------------------------
| Session Encryption
|--------------------------------------------------------------------------
|
| This option allows you to easily specify that all of your session data
| should be encrypted before it's stored. All encryption is performed
| automatically by Laravel and you may use the session like normal.
|
*/
'encrypt' => env('SESSION_ENCRYPT', false),
/*
|--------------------------------------------------------------------------
| Session File Location
|--------------------------------------------------------------------------
|
| When utilizing the "file" session driver, the session files are placed
| on disk. The default storage location is defined here; however, you
| are free to provide another location where they should be stored.
|
*/
'files' => storage_path('framework/sessions'),
/*
|--------------------------------------------------------------------------
| Session Database Connection
|--------------------------------------------------------------------------
|
| When using the "database" or "redis" session drivers, you may specify a
| connection that should be used to manage these sessions. This should
| correspond to a connection in your database configuration options.
|
*/
'connection' => env('SESSION_CONNECTION'),
/*
|--------------------------------------------------------------------------
| Session Database Table
|--------------------------------------------------------------------------
|
| When using the "database" session driver, you may specify the table to
| be used to store sessions. Of course, a sensible default is defined
| for you; however, you're welcome to change this to another table.
|
*/
'table' => env('SESSION_TABLE', 'sessions'),
/*
|--------------------------------------------------------------------------
| Session Cache Store
|--------------------------------------------------------------------------
|
| When using one of the framework's cache driven session backends, you may
| define the cache store which should be used to store the session data
| between requests. This must match one of your defined cache stores.
|
| Affects: "dynamodb", "memcached", "redis"
|
*/
'store' => env('SESSION_STORE'),
/*
|--------------------------------------------------------------------------
| Session Sweeping Lottery
|--------------------------------------------------------------------------
|
| Some session drivers must manually sweep their storage location to get
| rid of old sessions from storage. Here are the chances that it will
| happen on a given request. By default, the odds are 2 out of 100.
|
*/
'lottery' => [2, 100],
/*
|--------------------------------------------------------------------------
| Session Cookie Name
|--------------------------------------------------------------------------
|
| Here you may change the name of the session cookie that is created by
| the framework. Typically, you should not need to change this value
| since doing so does not grant a meaningful security improvement.
|
*/
// Cookie name: SESSION_COOKIE when set, otherwise the slugged APP_NAME plus
// "-session" (the 'laravel' fallback for APP_NAME gives "laravel-session").
'cookie' => env(
'SESSION_COOKIE',
Str::slug((string) env('APP_NAME', 'laravel')).'-session'
),
/*
|--------------------------------------------------------------------------
| Session Cookie Path
|--------------------------------------------------------------------------
|
| The session cookie path determines the path for which the cookie will
| be regarded as available. Typically, this will be the root path of
| your application, but you're free to change this when necessary.
|
*/
'path' => env('SESSION_PATH', '/'),
/*
|--------------------------------------------------------------------------
| Session Cookie Domain
|--------------------------------------------------------------------------
|
| This value determines the domain and subdomains the session cookie is
| available to. By default, the cookie will be available to the root
| domain without subdomains. Typically, this shouldn't be changed.
|
*/
'domain' => env('SESSION_DOMAIN'),
/*
|--------------------------------------------------------------------------
| HTTPS Only Cookies
|--------------------------------------------------------------------------
|
| By setting this option to true, session cookies will only be sent back
| to the server if the browser has a HTTPS connection. This will keep
| the cookie from being sent to you when it can't be done securely.
|
*/
'secure' => env('SESSION_SECURE_COOKIE'),
/*
|--------------------------------------------------------------------------
| HTTP Access Only
|--------------------------------------------------------------------------
|
| Setting this value to true will prevent JavaScript from accessing the
| value of the cookie and the cookie will only be accessible through
| the HTTP protocol. It's unlikely you should disable this option.
|
*/
'http_only' => env('SESSION_HTTP_ONLY', true),
/*
|--------------------------------------------------------------------------
| Same-Site Cookies
|--------------------------------------------------------------------------
|
| This option determines how your cookies behave when cross-site requests
| take place, and can be used to mitigate CSRF attacks. By default, we
| will set this value to "lax" to permit secure cross-site requests.
|
| See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Set-Cookie#samesitesamesite-value
|
| Supported: "lax", "strict", "none", null
|
*/
'same_site' => env('SESSION_SAME_SITE', 'lax'),
/*
|--------------------------------------------------------------------------
| Partitioned Cookies
|--------------------------------------------------------------------------
|
| Setting this value to true will tie the cookie to the top-level site for
| a cross-site context. Partitioned cookies are accepted by the browser
| when flagged "secure" and the Same-Site attribute is set to "none".
|
*/
'partitioned' => env('SESSION_PARTITIONED_COOKIE', false),
/*
|--------------------------------------------------------------------------
| Session Serialization
|--------------------------------------------------------------------------
|
| This value controls the serialization strategy for session data, which
| is JSON by default. Setting this to "php" allows the storage of PHP
| objects in the session but can make an application vulnerable to
| "gadget chain" serialization attacks if the APP_KEY is leaked.
|
| Supported: "json", "php"
|
*/
'serialization' => 'json',
];

1
laravel-app/database/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.sqlite*

View File

@ -0,0 +1,45 @@
<?php
namespace Database\Factories;
use App\Models\User;
use Illuminate\Database\Eloquent\Factories\Factory;
use Illuminate\Support\Facades\Hash;
use Illuminate\Support\Str;
/**
 * Model factory producing User attribute sets.
 *
 * @extends Factory<User>
 */
class UserFactory extends Factory
{
    /**
     * Hashed password shared by every model this factory builds.
     *
     * It is computed on first use and then reused.
     */
    protected static ?string $password;

    /**
     * Build the default attribute set for a new user.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $defaults = [];

        $defaults['name'] = fake()->name();
        $defaults['email'] = fake()->unique()->safeEmail();
        $defaults['email_verified_at'] = now();

        // Hash the literal password once and reuse it afterwards.
        if (! isset(static::$password)) {
            static::$password = Hash::make('password');
        }
        $defaults['password'] = static::$password;

        $defaults['remember_token'] = Str::random(10);

        return $defaults;
    }

    /**
     * State in which the email address has not been verified.
     */
    public function unverified(): static
    {
        $clearVerification = function (array $attributes): array {
            return ['email_verified_at' => null];
        };

        return $this->state($clearVerification);
    }
}

View File

@ -0,0 +1,49 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the users, password_reset_tokens and sessions tables, in that order.
     */
    public function up(): void
    {
        $this->createUsersTable();
        $this->createPasswordResetTokensTable();
        $this->createSessionsTable();
    }

    /**
     * Drop the three tables created by up(), skipping any that do not exist.
     */
    public function down(): void
    {
        foreach (['users', 'password_reset_tokens', 'sessions'] as $tableName) {
            Schema::dropIfExists($tableName);
        }
    }

    /**
     * Account table: identity, credentials, remember token and timestamps.
     */
    private function createUsersTable(): void
    {
        Schema::create('users', function (Blueprint $users): void {
            $users->id();
            $users->string('name');
            $users->string('email')->unique();
            $users->timestamp('email_verified_at')->nullable();
            $users->string('password');
            $users->rememberToken();
            $users->timestamps();
        });
    }

    /**
     * Password reset tokens, keyed by email address.
     */
    private function createPasswordResetTokensTable(): void
    {
        Schema::create('password_reset_tokens', function (Blueprint $tokens): void {
            $tokens->string('email')->primary();
            $tokens->string('token');
            $tokens->timestamp('created_at')->nullable();
        });
    }

    /**
     * Session rows; user_id is nullable and indexed.
     */
    private function createSessionsTable(): void
    {
        Schema::create('sessions', function (Blueprint $sessions): void {
            $sessions->string('id')->primary();
            $sessions->foreignId('user_id')->nullable()->index();
            $sessions->string('ip_address', 45)->nullable();
            $sessions->text('user_agent')->nullable();
            $sessions->longText('payload');
            $sessions->integer('last_activity')->index();
        });
    }
};

View File

@ -0,0 +1,35 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the cache table and its companion cache_locks table.
     */
    public function up(): void
    {
        // Cached values, keyed by cache key, with an indexed expiration.
        Schema::create('cache', function (Blueprint $entries): void {
            $entries->string('key')->primary();
            $entries->mediumText('value');
            $entries->bigInteger('expiration')->index();
        });

        // Lock rows: one per key, recording the owner and an indexed expiration.
        Schema::create('cache_locks', function (Blueprint $locks): void {
            $locks->string('key')->primary();
            $locks->string('owner');
            $locks->bigInteger('expiration')->index();
        });
    }

    /**
     * Drop both tables again, skipping any that do not exist.
     */
    public function down(): void
    {
        foreach (['cache', 'cache_locks'] as $tableName) {
            Schema::dropIfExists($tableName);
        }
    }
};

View File

@ -0,0 +1,59 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the jobs, job_batches and failed_jobs tables, in that order.
     */
    public function up(): void
    {
        $this->createJobsTable();
        $this->createJobBatchesTable();
        $this->createFailedJobsTable();
    }

    /**
     * Drop the three queue tables, skipping any that do not exist.
     */
    public function down(): void
    {
        foreach (['jobs', 'job_batches', 'failed_jobs'] as $tableName) {
            Schema::dropIfExists($tableName);
        }
    }

    /**
     * Pending jobs; the time columns are unsigned integers.
     */
    private function createJobsTable(): void
    {
        Schema::create('jobs', function (Blueprint $jobs): void {
            $jobs->id();
            $jobs->string('queue')->index();
            $jobs->longText('payload');
            $jobs->unsignedSmallInteger('attempts');
            $jobs->unsignedInteger('reserved_at')->nullable();
            $jobs->unsignedInteger('available_at');
            $jobs->unsignedInteger('created_at');
        });
    }

    /**
     * Batch bookkeeping: job counters, failed job ids, options and time columns.
     */
    private function createJobBatchesTable(): void
    {
        Schema::create('job_batches', function (Blueprint $batches): void {
            $batches->string('id')->primary();
            $batches->string('name');
            $batches->integer('total_jobs');
            $batches->integer('pending_jobs');
            $batches->integer('failed_jobs');
            $batches->longText('failed_job_ids');
            $batches->mediumText('options')->nullable();
            $batches->integer('cancelled_at')->nullable();
            $batches->integer('created_at');
            $batches->integer('finished_at')->nullable();
        });
    }

    /**
     * Failed jobs with a unique uuid, plus a composite lookup index.
     */
    private function createFailedJobsTable(): void
    {
        Schema::create('failed_jobs', function (Blueprint $failures): void {
            $failures->id();
            $failures->string('uuid')->unique();
            $failures->string('connection');
            $failures->string('queue');
            $failures->longText('payload');
            $failures->longText('exception');
            $failures->timestamp('failed_at')->useCurrent();
            $failures->index(['connection', 'queue', 'failed_at']);
        });
    }
};

View File

@ -0,0 +1,25 @@
<?php
namespace Database\Seeders;
use App\Models\User;
use Illuminate\Database\Console\Seeds\WithoutModelEvents;
use Illuminate\Database\Seeder;
class DatabaseSeeder extends Seeder
{
    use WithoutModelEvents;

    /**
     * Seed the database with one user created through the User factory.
     */
    public function run(): void
    {
        // Fixed attributes for the seeded account; the factory fills in the rest.
        $testUser = [
            'name' => 'Test User',
            'email' => 'test@example.com',
        ];

        User::factory()->create($testUser);
    }
}

1515
laravel-app/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

16
laravel-app/package.json Normal file
View File

@ -0,0 +1,16 @@
{
"$schema": "https://www.schemastore.org/package.json",
"private": true,
"type": "module",
"scripts": {
"build": "vite build",
"dev": "vite"
},
"devDependencies": {
"@tailwindcss/vite": "^4.0.0",
"concurrently": "^9.0.1",
"laravel-vite-plugin": "^3.1",
"tailwindcss": "^4.0.0",
"vite": "^8.0.0"
}
}

36
laravel-app/phpunit.xml Normal file
View File

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
bootstrap="vendor/autoload.php"
colors="true"
>
<testsuites>
<testsuite name="Unit">
<directory>tests/Unit</directory>
</testsuite>
<testsuite name="Feature">
<directory>tests/Feature</directory>
</testsuite>
</testsuites>
<source>
<include>
<directory>app</directory>
</include>
</source>
<php>
<env name="APP_ENV" value="testing"/>
<env name="APP_MAINTENANCE_DRIVER" value="file"/>
<env name="BCRYPT_ROUNDS" value="4"/>
<env name="BROADCAST_CONNECTION" value="null"/>
<env name="CACHE_STORE" value="array"/>
<env name="DB_CONNECTION" value="sqlite"/>
<env name="DB_DATABASE" value=":memory:"/>
<env name="DB_URL" value=""/>
<env name="MAIL_MAILER" value="array"/>
<env name="QUEUE_CONNECTION" value="sync"/>
<env name="SESSION_DRIVER" value="array"/>
<env name="PULSE_ENABLED" value="false"/>
<env name="TELESCOPE_ENABLED" value="false"/>
<env name="NIGHTWATCH_ENABLED" value="false"/>
</php>
</phpunit>

View File

@ -0,0 +1,25 @@
<IfModule mod_rewrite.c>
<IfModule mod_negotiation.c>
Options -MultiViews -Indexes
</IfModule>
RewriteEngine On
# Handle Authorization Header
RewriteCond %{HTTP:Authorization} .
RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization}]
# Handle X-XSRF-Token Header
RewriteCond %{HTTP:x-xsrf-token} .
RewriteRule .* - [E=HTTP_X_XSRF_TOKEN:%{HTTP:X-XSRF-Token}]
# Redirect Trailing Slashes If Not A Folder...
RewriteCond %{REQUEST_FILENAME} !-d
RewriteCond %{REQUEST_URI} (.+)/$
RewriteRule ^ %1 [L,R=301]
# Send Requests To Front Controller...
RewriteCond %{REQUEST_FILENAME} !-d
RewriteCond %{REQUEST_FILENAME} !-f
RewriteRule ^ index.php [L]
</IfModule>

View File

View File

@ -0,0 +1,20 @@
<?php
// Front controller: every request rewritten to index.php is handled by the script below.
use Illuminate\Foundation\Application;
use Illuminate\Http\Request;

// Record the request start time as a float timestamp.
define('LARAVEL_START', microtime(true));

// If a maintenance file exists under storage/framework, include it before anything else.
if (file_exists($maintenance = __DIR__.'/../storage/framework/maintenance.php')) {
    require $maintenance;
}

// Register the Composer autoloader.
require __DIR__.'/../vendor/autoload.php';

// Load the application instance from bootstrap/app.php and let it handle the captured request.
/** @var Application $app */
$app = require_once __DIR__.'/../bootstrap/app.php';
$app->handleRequest(Request::capture());

View File

@ -0,0 +1,2 @@
User-agent: *
Disallow:

View File

@ -0,0 +1,9 @@
@import 'tailwindcss';
@source '../../vendor/laravel/framework/src/Illuminate/Pagination/resources/views/*.blade.php';
@source '../../storage/framework/views/*.php';
@theme {
--font-sans: 'Instrument Sans', ui-sans-serif, system-ui, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji',
'Segoe UI Symbol', 'Noto Color Emoji';
}

View File

@ -0,0 +1 @@
//

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,427 @@
@extends('layouts.app')
@section('title', 'Upload File — AI Markdown Demo')
@section('content')
{{-- ── Upload card ──────────────────────────────────────────────── --}}
<div class="card shadow-sm mb-4">
<div class="card-body">
<h6 class="card-title fw-semibold mb-3">Tải lên tài liệu để so sánh</h6>
{{-- Mode tabs --}}
<ul class="nav nav-tabs mb-3" id="modeTabs">
<li class="nav-item">
<a class="nav-link py-1 px-3 small active" href="#" data-mode="file">
<i class="bi bi-file-earmark-text me-1"></i>File
</a>
</li>
<li class="nav-item">
<a class="nav-link py-1 px-3 small" href="#" data-mode="youtube">
<i class="bi bi-youtube me-1 text-danger"></i>YouTube
</a>
</li>
</ul>
{{-- File zone --}}
<div id="fileZone">
<div class="drop-zone mb-2" id="dropZone">
<input type="file" id="fileInput" style="display:none"
accept=".pdf,.docx,.xlsx,.pptx,.html,.htm,.csv,.txt,.jpg,.jpeg,.png,.tiff,.tif,.bmp,.md,.epub,.zip,.asciidoc,.adoc,.webp" />
<i class="bi bi-file-earmark-text fs-1 text-secondary"></i>
<p class="text-muted mt-2 mb-1">Kéo thả hoặc click để chọn file</p>
<div id="fileInfo" class="fw-semibold text-primary small"></div>
</div>
<div class="d-flex flex-wrap gap-1">
@foreach(['PDF','DOCX','XLSX','PPTX','HTML','CSV','TXT','JPG/PNG','EPUB','TIFF','ASCIIDoc'] as $ext)
<span class="badge bg-secondary-subtle text-secondary">{{ $ext }}</span>
@endforeach
</div>
</div>
{{-- YouTube zone --}}
<div id="youtubeZone" style="display:none">
<div class="input-group">
<span class="input-group-text bg-danger text-white"><i class="bi bi-youtube"></i></span>
<input type="url" class="form-control" id="ytUrl" placeholder="https://www.youtube.com/watch?v=..." />
<button class="btn btn-outline-secondary" id="ytClear"><i class="bi bi-x"></i></button>
</div>
<div class="form-text mt-1">
<i class="bi bi-info-circle me-1"></i>
<b>MarkItDown</b>: yt-dlp &nbsp;|&nbsp;
<b>Docling</b>: transcript markdown &nbsp;|&nbsp;
<b>Unlimited-OCR</b>: <span class="text-warning-emphasis">không hỗ trợ URL</span>
</div>
</div>
{{-- Controls --}}
<div class="d-flex flex-wrap align-items-center gap-3 mt-3">
<div class="d-flex align-items-center gap-2">
<label class="form-label mb-0 small fw-medium">Docling format</label>
<select class="form-select form-select-sm" id="dlFormat" style="width:auto">
<option value="markdown">Markdown</option>
<option value="json">JSON</option>
<option value="html">HTML</option>
<option value="text">Plain Text</option>
</select>
</div>
<div class="form-check form-switch mb-0">
<input class="form-check-input" type="checkbox" id="llmToggle" checked />
<label class="form-check-label small fw-medium" for="llmToggle">LLM</label>
</div>
<button class="btn btn-primary btn-sm" id="btnConvert" disabled>
<i class="bi bi-play-fill me-1"></i>Chuyển đổi & So sánh
</button>
<button class="btn btn-outline-secondary btn-sm" id="btnClear">
<i class="bi bi-x-circle me-1"></i>Xoá
</button>
</div>
{{-- Prompt panel --}}
<div id="promptPanel" class="mt-3">
<div class="p-3 rounded border bg-primary-subtle">
<div class="d-flex justify-content-between align-items-center mb-2">
<span class="small fw-semibold text-primary"><i class="bi bi-pencil-square me-1"></i>Custom LLM Prompt</span>
<button class="btn btn-link btn-sm p-0 text-primary" id="btnClearPrompt">Xoá</button>
</div>
<textarea class="form-control form-control-sm" id="llmPrompt" rows="3"
placeholder="Để trống = dùng default prompt..."></textarea>
</div>
</div>
</div>
</div>
{{-- ── Status row ───────────────────────────────────────────────── --}}
<div class="row g-3 mb-4" id="statusRow" style="display:none!important">
@foreach([['md','MarkItDown','bg-primary','text-primary'],['dl','Docling','bg-success','text-success'],['uo','Unlimited-OCR','bg-danger','text-danger']] as [$k,$label,$bg,$tc])
<div class="col-md-4">
<div class="card h-100">
<div class="card-body">
<h6 class="card-title d-flex align-items-center gap-2">
<span class="badge rounded-circle p-1 {{ $bg }}">&nbsp;</span>{{ $label }}
</h6>
<div class="d-flex align-items-center gap-2">
<div class="spinner-border spinner-border-sm {{ $tc }} status-spinner-{{ $k }}" style="display:none"></div>
<small class="text-muted status-text-{{ $k }}">Đang chờ...</small>
</div>
<div class="row g-2 mt-2 status-stats-{{ $k }}" style="display:none">
<div class="col-4"><div class="border rounded text-center py-2">
<div class="fw-bold stat-ms-{{ $k }}"></div>
<div class="text-muted" style="font-size:.7rem">ms</div>
</div></div>
<div class="col-4"><div class="border rounded text-center py-2">
<div class="fw-bold stat-len-{{ $k }}"></div>
<div class="text-muted" style="font-size:.7rem">ký tự</div>
</div></div>
<div class="col-4"><div class="border rounded text-center py-2">
<div class="fw-bold stat-lines-{{ $k }}"></div>
<div class="text-muted" style="font-size:.7rem">dòng</div>
</div></div>
</div>
</div>
</div>
</div>
@endforeach
</div>
{{-- ── Result cards ─────────────────────────────────────────────── --}}
<div class="row g-3 mb-4">
@foreach([['md','MarkItDown','bg-primary','text-primary'],['dl','Docling','bg-success','text-success'],['uo','Unlimited-OCR','bg-danger','text-danger']] as [$k,$label,$bg,$tc])
<div class="col-md-4">
<div class="card shadow-sm h-100">
<div class="card-header d-flex justify-content-between align-items-center py-2">
<span class="fw-semibold small d-flex align-items-center gap-2">
<span class="badge rounded-circle p-1 {{ $bg }}">&nbsp;</span>
{{ $label }}
<span class="badge bg-primary-subtle text-primary llm-badge-{{ $k }}" style="display:none;font-size:.65rem">🤖 LLM</span>
</span>
<div class="d-flex align-items-center gap-1">
@if($k === 'md')
<button class="btn btn-outline-warning btn-sm py-0 px-2" id="btnCleanup" style="display:none">
Làm đẹp
</button>
@endif
<button class="btn btn-outline-secondary btn-sm py-0 px-2 btn-dl-{{ $k }}" style="display:none">
<i class="bi bi-download"></i> .md
</button>
@include('partials.pane-tabs', ['k' => $k])
</div>
</div>
<div class="card-body p-0">
@include('partials.pane-body', ['k' => $k, 'tc' => $tc])
</div>
</div>
</div>
@endforeach
</div>
{{-- ── History ──────────────────────────────────────────────────── --}}
<h6 class="fw-semibold mb-3">Lịch sử chuyển đổi gần đây</h6>
<div class="row g-3">
@foreach([['md','MarkItDown','bg-primary'],['dl','Docling','bg-success'],['uo','Unlimited-OCR','bg-danger']] as [$k,$label,$bg])
<div class="col-md-4">
<div class="card shadow-sm">
<div class="card-header py-2 small fw-semibold d-flex align-items-center gap-2">
<span class="badge rounded-circle p-1 {{ $bg }}">&nbsp;</span>{{ $label }}
</div>
<div class="history-{{ $k }}">
<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>
</div>
</div>
</div>
@endforeach
</div>
@endsection
@section('scripts')
<script>
$.ajaxSetup({ headers: { 'X-CSRF-TOKEN': $('meta[name="csrf-token"]').attr('content') } })
$(function () {
// ── State ────────────────────────────────────────────────────
// Active input mode: 'file' or 'youtube' (set by the mode tabs).
let mode = 'file'
// The File chosen through the picker or by drag-and-drop; null until one is set.
let currentFile = null
// Latest converted text per backend key (md / dl / uo); used for download and cleanup.
const results = { md: '', dl: '', uo: '' }
// Currently selected tab ('raw' or 'preview') per backend key.
const tabs = { md: 'raw', dl: 'raw', uo: 'raw' }
// ── Init ─────────────────────────────────────────────────────
loadHistory()
// Restore the prompt text and the LLM switch persisted in localStorage.
const savedPrompt = localStorage.getItem('llm_prompt')
if (savedPrompt) $('#llmPrompt').val(savedPrompt)
if (localStorage.getItem('llm_enabled') === '0') $('#llmToggle').prop('checked', false)
// Sync the prompt panel's visibility with the restored switch state.
togglePromptPanel()
// ── Mode tabs ────────────────────────────────────────────────
// Switch between the file and YouTube input zones.
$('#modeTabs a').on('click', function (evt) {
  evt.preventDefault()
  const $clicked = $(this)
  mode = $clicked.data('mode')

  // Move the active marker to the clicked tab.
  $('#modeTabs a').removeClass('active')
  $clicked.addClass('active')

  // Show only the zone that matches the selected mode.
  const isFile = mode === 'file'
  const isYoutube = mode === 'youtube'
  $('#fileZone').toggle(isFile)
  $('#youtubeZone').toggle(isYoutube)

  updateConvertBtn()
})
// ── File input ───────────────────────────────────────────────
// Clicking anywhere in the drop zone opens the native file picker.
$('#dropZone').on('click', function (e) {
  // The hidden input is a child of the drop zone, so the click synthesised on it
  // bubbles back to this handler; ignore it, otherwise the handler re-enters itself.
  if (e.target.id === 'fileInput') return
  document.getElementById('fileInput').click()
})
// A file chosen through the picker.
$('#fileInput').on('change', function () { setFile(this.files[0]) })
// Drag-and-drop: preventDefault on dragover is required for the drop event to fire.
$('#dropZone').on('dragover', function (e) { e.preventDefault(); $(this).addClass('drag-over') })
$('#dropZone').on('dragleave', function () { $(this).removeClass('drag-over') })
$('#dropZone').on('drop', function (e) {
  e.preventDefault()
  $(this).removeClass('drag-over')
  // Only the first dropped file is used; setFile ignores a missing file.
  const dropped = e.originalEvent.dataTransfer
  setFile(dropped ? dropped.files[0] : undefined)
})
// Remember the selected file and show "<name> (<size>)" in the drop zone.
// A missing file (cancelled picker, empty drop) is ignored.
function setFile(f) {
  if (f) {
    const BYTES_PER_MB = 1048576
    currentFile = f
    // Below 1 MiB the size is shown in KB, otherwise in MB; one decimal either way.
    const size = f.size >= BYTES_PER_MB
      ? (f.size / BYTES_PER_MB).toFixed(1) + ' MB'
      : (f.size / 1024).toFixed(1) + ' KB'
    $('#fileInfo').text(`${f.name} (${size})`)
    updateConvertBtn()
  }
}
// ── YouTube ──────────────────────────────────────────────────
// Re-evaluate the Convert button whenever the URL field changes.
$('#ytUrl').on('input', () => updateConvertBtn())
// The clear button empties the URL field and re-evaluates the Convert button.
$('#ytClear').on('click', () => {
  $('#ytUrl').val('')
  updateConvertBtn()
})
// Enable Convert only when the active mode has input: a file in file mode,
// a non-blank URL otherwise.
function updateConvertBtn() {
  let hasInput
  if (mode === 'file') {
    hasInput = Boolean(currentFile)
  } else {
    hasInput = Boolean($('#ytUrl').val().trim())
  }
  $('#btnConvert').prop('disabled', !hasInput)
}
// ── LLM toggle ───────────────────────────────────────────────
// Persist the LLM switch as '1' / '0' and show or hide the prompt panel to match.
$('#llmToggle').on('change', function () {
  const flag = this.checked ? '1' : '0'
  localStorage.setItem('llm_enabled', flag)
  togglePromptPanel()
})
// The custom-prompt panel is visible only while the LLM switch is checked.
function togglePromptPanel() {
  const llmOn = $('#llmToggle').is(':checked')
  $('#promptPanel').toggle(llmOn)
}
// Empty the custom prompt textarea.
$('#btnClearPrompt').on('click', () => {
  $('#llmPrompt').val('')
})
// ── Clear ────────────────────────────────────────────────────
// Reset every input and all three result panes.
$('#btnClear').on('click', function () {
  currentFile = null
  $('#fileInfo').text('')
  // Reset the native input too, so picking the same file again still fires `change`.
  $('#fileInput').val('')
  $('#ytUrl').val('')
  updateConvertBtn()
  // The leading semicolon is required: this file omits statement terminators, and
  // without it the array literal is parsed as an index into updateConvertBtn()'s
  // (undefined) return value, which throws before any pane is reset.
  ;['md', 'dl', 'uo'].forEach(k => resetPane(k))
  $('#statusRow').hide()
})
// ── Convert ──────────────────────────────────────────────────
// Send the current input to all three backends in parallel and render each
// outcome in its own pane once every request has settled.
$('#btnConvert').on('click', function () {
  // Snapshot the form once so every backend receives the same options.
  const useLlm = $('#llmToggle').is(':checked')
  const prompt = $('#llmPrompt').val().trim() || null
  const dlFmt = $('#dlFormat').val()
  const isYoutube = mode === 'youtube'
  const ytUrl = $('#ytUrl').val()

  $('#statusRow').show()
  $('#btnConvert').prop('disabled', true).html('<span class="spinner-border spinner-border-sm me-1"></span>Đang xử lý...')
  ;['md', 'dl', 'uo'].forEach(k => startPane(k))

  // Build "<base>?use_llm=…[&…extra][&llm_prompt=…]" for the file-upload endpoints.
  const mkUrl = (base, extra = {}) => {
    const p = new URLSearchParams({ use_llm: useLlm, ...extra })
    if (prompt) p.set('llm_prompt', prompt)
    return base + '?' + p.toString()
  }
  // POST the selected file as multipart form data.
  const postFile = url => {
    const fd = new FormData()
    fd.append('file', currentFile)
    return $.ajax({ url, method: 'POST', data: fd, processData: false, contentType: false })
  }
  // POST a JSON body (used by the URL endpoints).
  const postJson = (url, payload) =>
    $.ajax({ url, method: 'POST', contentType: 'application/json', data: JSON.stringify(payload) })

  // Single start timestamp, taken before any request is issued.
  const t0 = performance.now()

  const requests = {
    md: isYoutube
      ? postJson('/api/markitdown/convert-url', { url: ytUrl, use_llm: useLlm, llm_prompt: prompt })
      : postFile(mkUrl('/api/markitdown/convert')),
    dl: isYoutube
      ? postJson('/api/docling/convert-url', { url: ytUrl, output_format: dlFmt, use_llm: useLlm, llm_prompt: prompt })
      : postFile(mkUrl('/api/docling/convert', { output_format: dlFmt })),
    // This page sends no URL request to Unlimited-OCR; fail its pane locally instead.
    uo: isYoutube
      ? Promise.reject({ responseJSON: { detail: 'Unlimited-OCR không hỗ trợ YouTube URL' } })
      : postFile(mkUrl('/api/unlimited-ocr/convert')),
  }

  // Turn each request into a promise that never rejects, so one failing backend
  // cannot prevent the other panes from rendering.
  const settled = k => Promise.resolve(requests[k]).then(
    data => ({ k, ok: true, data, ms: Math.round(performance.now() - t0) }),
    e => ({ k, ok: false, err: e?.responseJSON?.detail || e?.statusText || 'Lỗi không xác định' })
  )

  Promise.all(['md', 'dl', 'uo'].map(settled)).then(all => {
    all.forEach(({ k, ok, data, ms, err }) => {
      if (ok) finishPane(k, data, ms)
      else errorPane(k, err)
    })
    $('#btnConvert').prop('disabled', false).html('<i class="bi bi-play-fill me-1"></i>Chuyển đổi & So sánh')
    loadHistory()
    // Remember the prompt for the next visit.
    localStorage.setItem('llm_prompt', $('#llmPrompt').val())
  })
})
// ── Pane helpers ─────────────────────────────────────────────
// Return pane `k` to its idle state: forget the result, show the idle
// placeholder and hide every other pane and status element.
function resetPane(k) {
  results[k] = ''
  tabs[k] = 'raw'

  const hidden = [
    `.pane-loading-${k}, .pane-error-${k}, .pane-raw-${k}, .pane-preview-${k}`,
    `.status-spinner-${k}, .status-stats-${k}, .llm-badge-${k}, .btn-dl-${k}`,
  ]
  $(`.pane-idle-${k}`).show()
  hidden.forEach(selector => $(selector).hide())

  $(`.status-text-${k}`).text('Đang chờ...')
  // The cleanup button exists only in the MarkItDown card.
  if (k === 'md') {
    $('#btnCleanup').hide()
  }
}
// Put pane `k` into its loading state while its request is in flight.
function startPane(k) {
  const toHide = ['idle', 'error', 'raw', 'preview']
    .map(part => `.pane-${part}-${k}`)
    .join(', ')
  $(toHide).hide()
  $(`.pane-loading-${k}`).show()
  $(`.status-spinner-${k}`).show()
  $(`.status-text-${k}`).text('Đang xử lý...')
}
// Render a successful conversion in pane `k`.
//   data: response body; the text is read from `markdown`, falling back to `content`
//   ms:   elapsed time in milliseconds shown in the status card
function finishPane(k, data, ms) {
  const raw = data?.markdown || data?.content || ''
  // Non-string payloads are serialised so the length/line statistics below cannot throw.
  const content = typeof raw === 'string' ? raw : JSON.stringify(raw, null, 2)
  const usedLlm = !!data?.llm_enabled
  results[k] = content

  $(`.pane-loading-${k}`).hide()
  $(`.pane-raw-${k}`).text(content).show()
  // The preview is rendered now but stays hidden until its tab is selected.
  $(`.pane-preview-${k}`).html(mdRender(content)).hide()
  tabs[k] = 'raw'
  $(`.tab-raw-${k}`).addClass('active')
  $(`.tab-preview-${k}`).removeClass('active')

  $(`.status-spinner-${k}`).hide()
  $(`.status-text-${k}`).text(`✅ ${ms} ms` + (usedLlm ? ' 🤖' : ''))
  $(`.status-stats-${k}`).show()
  $(`.stat-ms-${k}`).text(ms.toLocaleString())
  $(`.stat-len-${k}`).text(content.length.toLocaleString())
  $(`.stat-lines-${k}`).text(content.split('\n').length)

  // toggle() rather than show(): a badge or button left visible by an earlier run
  // must disappear when this run did not use the LLM or produced no content.
  $(`.llm-badge-${k}`).toggle(usedLlm)
  $(`.btn-dl-${k}`).toggle(!!content)
  if (k === 'md') $('#btnCleanup').toggle(!!content)
}
// Show a failed conversion in pane `k` with the message `msg`.
function errorPane(k, msg) {
  // Drop the previous run's result so the download and cleanup actions cannot
  // operate on content that no longer matches what the pane shows.
  results[k] = ''
  $(`.pane-loading-${k}`).hide()
  $(`.pane-error-${k}`).text(msg).show()
  $(`.status-spinner-${k}, .status-stats-${k}, .llm-badge-${k}, .btn-dl-${k}`).hide()
  if (k === 'md') $('#btnCleanup').hide()
  $(`.status-text-${k}`).text('❌ ' + msg)
}
// ── Tab switching ─────────────────────────────────────────────
// Delegated handler for the Raw / Preview tabs of every result card.
$(document).on('click', '[data-tab]', function (evt) {
  evt.preventDefault()
  const $tab = $(this)
  const pane = $tab.data('pane')
  const view = $tab.data('tab')
  tabs[pane] = view

  // Clear both tabs and both bodies, then activate the requested view.
  $(`.tab-raw-${pane}, .tab-preview-${pane}`).removeClass('active')
  $(`.pane-raw-${pane}, .pane-preview-${pane}`).hide()
  $(`.tab-${view}-${pane}`).addClass('active')
  $(`.pane-${view}-${pane}`).show()
})
// ── Download ─────────────────────────────────────────────────
// Delegated handler for the per-pane download buttons: saves the pane's
// current result as "<source name>_<pane>.md".
$(document).on('click', '[class*="btn-dl-"]', function () {
  // The pane key is encoded in the button's btn-dl-<key> class.
  const [, k] = $(this).attr('class').match(/btn-dl-(\w+)/)
  const baseName = (currentFile?.name || 'output').replace(/\.[^.]+$/, '')
  const blob = new Blob([results[k]], { type: 'text/markdown' })

  // Download through a temporary anchor pointing at an object URL.
  const link = document.createElement('a')
  link.href = URL.createObjectURL(blob)
  link.download = `${baseName}_${k}.md`
  link.click()
  URL.revokeObjectURL(link.href)
})
// ── Cleanup ──────────────────────────────────────────────────
// Send the MarkItDown result to the cleanup endpoint and replace the pane
// content with the returned text.
$('#btnCleanup').on('click', function () {
  if (!results.md) return
  const $btn = $(this)
  // Keep the button's own markup so it is restored exactly as it was rendered.
  const label = $btn.html()
  // Trimmed like the Convert handler, so a whitespace-only prompt counts as "no prompt".
  const prompt = $('#llmPrompt').val().trim() || null

  $btn.prop('disabled', true).html('<span class="spinner-border spinner-border-sm"></span>')
  $.ajax({
    url: '/api/markitdown/cleanup',
    method: 'POST', contentType: 'application/json',
    data: JSON.stringify({ text: results.md, prompt }),
  }).done(function (d) {
    // Ignore a response without text rather than overwriting the result with undefined.
    if (typeof d?.text !== 'string') return
    results.md = d.text
    $('.pane-raw-md').text(d.text)
    $('.pane-preview-md').html(mdRender(d.text))
  }).fail(function (e) {
    alert('Cleanup thất bại: ' + (e?.responseJSON?.detail || 'Lỗi'))
  }).always(function () {
    $btn.prop('disabled', false).html(label)
  })
})
// ── History ──────────────────────────────────────────────────
// Fetch the most recent conversions of each backend and render them into
// the matching .history-<key> container.
function loadHistory() {
  const endpoints = {
    md: '/api/markitdown/history?limit=8',
    dl: '/api/docling/history?limit=8',
    uo: '/api/unlimited-ocr/history?limit=8',
  }
  // File names and types originate from uploads, so they are HTML-escaped before
  // being interpolated into markup.
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }
  const esc = value => String(value ?? '').replace(/[&<>"']/g, ch => entities[ch])

  Object.entries(endpoints).forEach(([k, url]) => {
    $.getJSON(url).done(function (items) {
      const $el = $(`.history-${k}`)
      // Treat a non-array response the same as an empty history.
      if (!Array.isArray(items) || !items.length) {
        $el.html('<div class="text-center text-muted py-3 small">Chưa có lịch sử</div>')
        return
      }
      const rows = items.map(item => {
        const t = item.created_at ? new Date(item.created_at).toLocaleTimeString('vi-VN') : ''
        const llm = item.llm_enabled ? '<span class="badge bg-primary-subtle text-primary">🤖 LLM</span>' : ''
        const ft = item.file_type ? `<span class="badge bg-secondary-subtle text-secondary">${esc(item.file_type)}</span>` : ''
        return `<li class="list-group-item d-flex justify-content-between align-items-center py-2 px-3">
<span class="small fw-medium text-truncate me-2" style="max-width:60%">${esc(item.filename)}</span>
<span class="d-flex gap-1 align-items-center flex-shrink-0">
${ft} ${llm}
<span class="text-muted" style="font-size:.7rem">${esc(t)}</span>
</span>
</li>`
      }).join('')
      $el.html(`<ul class="list-group list-group-flush">${rows}</ul>`)
    })
  })
}
})
</script>
@endsection

View File

@ -0,0 +1,102 @@
<!DOCTYPE html>
<html lang="vi">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>@yield('title', 'AI Markdown Demo')</title>
<meta name="csrf-token" content="{{ csrf_token() }}" />
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css" rel="stylesheet" />
<link href="https://cdn.jsdelivr.net/npm/github-markdown-css@5/github-markdown-light.css" rel="stylesheet" />
<style>
body { background: #f8f9fa; }
.ResultPre {
max-height: 460px; overflow: auto; white-space: pre-wrap;
word-break: break-word; font-size: .78rem; background: #f8f9fa;
}
/* github-markdown-css handles all preview styling */
.PreviewPane { max-height: 460px; overflow: auto; padding: 1rem; font-size: .85rem; }
.drop-zone {
border: 2px dashed #dee2e6; border-radius: .5rem;
padding: 3rem 1.5rem; text-align: center; cursor: pointer;
transition: border-color .2s, background .2s;
}
.drop-zone:hover, .drop-zone.drag-over { border-color: #0d6efd; background: #f0f6ff; }
.FileBrowser { height: 560px; overflow-y: auto; font-size: .82rem; }
.file-entry {
display: flex; align-items: center; gap: .4rem;
padding: .25rem .5rem; border-radius: .3rem; cursor: pointer;
user-select: none; overflow: visible; position: relative;
}
.file-entry > span.text-truncate { min-width:0; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.file-entry:hover { background: #e9ecef; }
.file-entry.selected { background: #cfe2ff; color: #084298; font-weight: 500; }
.file-entry.input-sel { background: #cfe2ff; color: #084298; font-weight: 500; }
.file-entry.output-sel { background: #d1e7dd; color: #0a3622; }
.act-active { background: rgba(255,255,255,.2) !important; }
.tree-actions { display:none; gap:2px; align-items:center; flex-shrink:0; margin-left:auto; }
.file-entry:hover .tree-actions { display:flex; }
.btn-tree-act {
border: none; background: none; padding: 1px 5px; border-radius: 3px;
font-size: .75rem; cursor: pointer; color: #6c757d; line-height:1.4;
}
.btn-tree-act:hover { background: #dee2e6; color: #212529; }
.btn-set-output:hover { color: #0a3622; background: #d1e7dd; }
.btn-preview-md:hover { color: #084298; background: #cfe2ff; }
.path-badge { font-size: .72rem; font-family: monospace; word-break: break-all; color: #6c757d; }
.indent-1 { padding-left: 1.4rem; }
.indent-2 { padding-left: 2.6rem; }
.indent-3 { padding-left: 3.8rem; }
.indent-4 { padding-left: 5rem; }
</style>
@yield('head')
</head>
<body>
<nav class="navbar navbar-light bg-white border-bottom px-4 py-2 d-flex justify-content-between">
<div class="d-flex align-items-center gap-3">
<span class="navbar-brand fw-bold mb-0">
MarkItDown <span class="text-muted fw-normal">vs</span> Docling
<span class="text-muted fw-normal">vs</span> Unlimited-OCR
</span>
<ul class="nav nav-pills">
<li class="nav-item">
<a class="nav-link py-1 px-3 small {{ request()->is('/') ? 'active' : '' }}" href="/">
<i class="bi bi-upload me-1"></i>Upload File
</a>
</li>
<li class="nav-item">
<a class="nav-link py-1 px-3 small {{ request()->is('email-convert') ? 'active' : '' }}" href="/email-convert">
<i class="bi bi-envelope me-1"></i>Email Convert
</a>
</li>
</ul>
</div>
<span class="badge bg-primary-subtle text-primary">Demo LLM Input Processing</span>
</nav>
<div class="container-xl py-4">
@yield('content')
</div>
<script src="https://cdn.jsdelivr.net/npm/jquery@3.7.1/dist/jquery.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11/build/styles/github.min.css" />
<script src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11/build/highlight.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/markdown-it@14/dist/markdown-it.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/dompurify@3/dist/purify.min.js"></script>
<script>
// Shared markdown-it instance used by every page for the preview panes.
const _md = markdownit({
  html: true,
  linkify: true,
  typographer: true,
  breaks: true,
  // Fenced code blocks: highlight with highlight.js when the language is known,
  // otherwise emit the HTML-escaped source.
  highlight(source, lang) {
    const wrap = inner => '<pre class="hljs"><code>' + inner + '</code></pre>'
    if (lang && hljs.getLanguage(lang)) {
      try {
        return wrap(hljs.highlight(source, { language: lang, ignoreIllegals: true }).value)
      } catch {}
    }
    return wrap(_md.utils.escapeHtml(source))
  }
})
// Render markdown to HTML and pass the result through DOMPurify before it is inserted.
const mdRender = s => {
  const rendered = _md.render(s || '')
  return DOMPurify.sanitize(rendered, { USE_PROFILES: { html: true } })
}
</script>
@yield('scripts')
</body>
</html>

View File

@ -0,0 +1,11 @@
{{-- @include('partials.pane-body', ['k' => $k, 'tc' => $tc, 'idle' => '...']) --}}
@php $idle ??= 'Tải file lên để xem kết quả'; @endphp
<div class="text-center text-muted py-5 small pane-idle-{{ $k }}">
<i class="bi bi-upload fs-3 d-block mb-2"></i>{{ $idle }}
</div>
<div class="text-center text-muted py-5 small pane-loading-{{ $k }}" style="display:none">
<div class="spinner-border spinner-border-sm mb-2 {{ $tc }}"></div><br>Đang xử lý...
</div>
<div class="alert alert-danger m-3 small pane-error-{{ $k }}" style="display:none"></div>
<pre class="ResultPre p-3 m-0 pane-raw-{{ $k }}" style="display:none"></pre>
<div class="PreviewPane markdown-body pane-preview-{{ $k }}" style="display:none"></div>

View File

@ -0,0 +1,11 @@
{{-- @include('partials.pane-tabs', ['k' => $k]) --}}
<ul class="nav nav-tabs card-header-tabs border-0">
<li class="nav-item">
<a class="nav-link py-1 px-2 small active tab-raw-{{ $k }}"
href="#" data-pane="{{ $k }}" data-tab="raw">Raw</a>
</li>
<li class="nav-item">
<a class="nav-link py-1 px-2 small tab-preview-{{ $k }}"
href="#" data-pane="{{ $k }}" data-tab="preview">Preview</a>
</li>
</ul>

View File

@ -0,0 +1,8 @@
<?php
use Illuminate\Foundation\Inspiring;
use Illuminate\Support\Facades\Artisan;
// Console command `inspire`: prints one quote from Inspiring::quote() as a comment line.
Artisan::command('inspire', function () {
    $quote = Inspiring::quote();

    $this->comment($quote);
})->purpose('Display an inspiring quote');

View File

@ -0,0 +1,4 @@
<?php
// Routes are declared via PHP attributes on each controller.
// See: spatie/laravel-route-attributes

4
laravel-app/storage/app/.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
*
!private/
!public/
!.gitignore

View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,9 @@
compiled.php
config.php
down
events.scanned.php
maintenance.php
routes.php
routes.scanned.php
schedule-*
services.json

View File

@ -0,0 +1,3 @@
*
!data/
!.gitignore

View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,2 @@
*
!.gitignore

2
laravel-app/storage/logs/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -0,0 +1,19 @@
<?php
namespace Tests\Feature;
// use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
class ExampleTest extends TestCase
{
    /**
     * A GET request to "/" responds with HTTP 200.
     */
    public function test_the_application_returns_a_successful_response(): void
    {
        $this->get('/')->assertStatus(200);
    }
}

View File

@ -0,0 +1,10 @@
<?php
namespace Tests;
use Illuminate\Foundation\Testing\TestCase as BaseTestCase;
/**
 * Base class for the application's tests.
 *
 * It extends the framework's TestCase and currently adds nothing of its own.
 */
abstract class TestCase extends BaseTestCase
{
    //
}

View File

@ -0,0 +1,16 @@
<?php
namespace Tests\Unit;
use PHPUnit\Framework\TestCase;
class ExampleTest extends TestCase
{
    /**
     * Placeholder unit test: asserts that true is true.
     */
    public function test_that_true_is_true(): void
    {
        self::assertTrue(true);
    }
}

View File

@ -0,0 +1,24 @@
import { defineConfig } from 'vite';
import laravel from 'laravel-vite-plugin';
import { bunny } from 'laravel-vite-plugin/fonts';
import tailwindcss from '@tailwindcss/vite';
// Vite build configuration for the Laravel app.
export default defineConfig({
    plugins: [
        laravel({
            // Entry points handed to the Laravel plugin.
            input: ['resources/css/app.css', 'resources/js/app.js'],
            refresh: true,
            // Font declaration: Instrument Sans in weights 400, 500 and 600 via the bunny() helper.
            fonts: [
                bunny('Instrument Sans', {
                    weights: [400, 500, 600],
                }),
            ],
        }),
        tailwindcss(),
    ],
    server: {
        watch: {
            // Keep the dev-server watcher away from storage/framework/views.
            ignored: ['**/storage/framework/views/**'],
        },
    },
});

View File

@ -1,219 +0,0 @@
import os
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from markitdown import MarkItDown
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
# Module-level LLM configuration, read from the environment at import time.
# An empty OLLAMA_BASE_URL is normalised to None.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL") or None
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llava")
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
# Runtime state maintained by _init_llm(): whether an LLM-backed converter is
# active, and the client it was built with.
LLM_ACTIVE = False
_llm_client = None
# md_plain is the converter without an LLM; md aliases it until _init_llm()
# swaps in an LLM-backed instance.
md_plain = MarkItDown()
md = md_plain
def _init_llm(base_url: str | None, model: str) -> bool:
    """Point the module-level converter at an LLM endpoint, or switch it off.

    Args:
        base_url: Base URL passed to the OpenAI client; a falsy value disables
            the LLM and resets ``md`` to the plain converter.
        model: Model name stored in ``OLLAMA_MODEL`` and given to MarkItDown.

    Returns:
        True when the LLM-backed converter was created, False when the LLM was
        disabled or initialisation raised.
    """
    global OLLAMA_BASE_URL, OLLAMA_MODEL, LLM_ACTIVE, _llm_client, md

    if not base_url:
        OLLAMA_BASE_URL = None
        OLLAMA_MODEL = model
        LLM_ACTIVE = False
        _llm_client = None
        md = md_plain
        return False

    try:
        llm = _openai.OpenAI(base_url=base_url, api_key="ollama")
        OLLAMA_BASE_URL, OLLAMA_MODEL, _llm_client = base_url, model, llm
        md = MarkItDown(llm_client=llm, llm_model=model)
        LLM_ACTIVE = True
        logger.info("MarkItDown: LLM enabled via %s (model=%s)", base_url, model)
    except Exception as exc:
        # Initialisation is best-effort: log and report failure to the caller.
        logger.warning("MarkItDown: LLM init failed (%s)", exc)
        LLM_ACTIVE = False
        return False
    return True
_init_llm(OLLAMA_BASE_URL, OLLAMA_MODEL)
DEFAULT_CLEANUP_PROMPT = """You are a technical document formatter. \
The text below was extracted from a multi-column PDF using OCR and is poorly structured: \
columns are merged, headers are mixed with values, and content is out of order.
Your task:
1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)
2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |
3. Keep bullet lists where appropriate (e.g. ports, certifications)
4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)
5. Preserve all technical values exactly do not paraphrase specs
Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."""
import re as _re
def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured LLM to reformat ``text`` and return the cleaned result.

    Args:
        text: Extracted text to clean up.
        prompt: System prompt; ``DEFAULT_CLEANUP_PROMPT`` is used when falsy.
        model: Model name; ``OLLAMA_MODEL`` is used when falsy.

    Returns:
        The model's reply with a leading/trailing code fence stripped. ``text``
        is returned unchanged when no client is configured, the input is blank,
        the reply is empty, or the request raises.
    """
    if not (_llm_client and text.strip()):
        return text

    system_prompt = prompt or DEFAULT_CLEANUP_PROMPT
    chosen_model = model or OLLAMA_MODEL
    try:
        completion = _llm_client.chat.completions.create(
            model=chosen_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0,
        )
        cleaned = completion.choices[0].message.content or text
        # Strip an opening fence (optionally tagged "markdown"), then a closing fence.
        for fence in (r"^```(?:markdown)?\s*\n?", r"\n?```\s*$"):
            cleaned = _re.sub(fence, "", cleaned.strip())
        return cleaned.strip() or text
    except Exception as exc:
        # Cleanup is best-effort: on any failure hand back the original text.
        logger.warning("MarkItDown: cleanup failed (%s)", exc)
        return text
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
}
YOUTUBE_PATTERN = _re.compile(
r"(https?://)?(www\.)?(youtube\.com/watch|youtu\.be/|youtube\.com/shorts/)"
)
def _filename_from_url(url: str) -> str:
    """Derive a history filename ("<slug>.md") from a URL.

    The ``v`` query parameter is preferred, so ``watch?v=<id>`` URLs yield the
    video id instead of the literal path segment "watch". Otherwise the last
    path segment, then the host, then "youtube" is used.
    """
    from urllib.parse import parse_qs, urlsplit

    parts = urlsplit(url.strip())
    video_id = parse_qs(parts.query).get("v", [""])[0]
    last_segment = parts.path.rstrip("/").rsplit("/", 1)[-1]
    slug = video_id or last_segment or parts.netloc or "youtube"
    return f"{slug}.md"


async def convert_url(
    url: str,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert the resource at ``url`` with MarkItDown and store the result.

    Args:
        url: URL handed to the converter.
        db: Session used to persist the ``Conversion`` row.
        use_llm: Per-call switch; the LLM is used only if it is also active
            globally (``LLM_ACTIVE``).
        llm_prompt: Optional custom prompt; when given, a one-off converter is
            built with it.

    Returns:
        The persisted and refreshed ``Conversion`` record (file_type "youtube").

    Raises:
        HTTPException: 500 when conversion or persistence fails.
    """
    use_llm_now = LLM_ACTIVE and use_llm
    if use_llm_now and llm_prompt:
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # The installed MarkItDown rejected the llm_prompt keyword; use the shared instance.
            converter = md
    elif use_llm_now:
        converter = md
    else:
        converter = md_plain

    # NOTE(review): converter.convert() is a synchronous call made from a coroutine.
    try:
        try:
            result = converter.convert(url)
            actual_llm = use_llm_now
        except Exception as llm_err:
            llm_server_error = (
                "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
            )
            if not (use_llm_now and llm_server_error):
                raise
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            result = md_plain.convert(url)
            actual_llm = False
    except Exception as err:
        # Covers the retry as well, so a failing fallback is reported like any
        # other conversion error instead of escaping as a raw exception.
        raise HTTPException(status_code=500, detail=str(err)) from err

    try:
        record = Conversion(
            filename=_filename_from_url(url),
            file_type="youtube",
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
def _allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file with MarkItDown and store the result.

    Args:
        file: The upload; its extension must pass _allowed_file.
        db: Session used to persist the Conversion row.
        use_llm: Per-call switch; the LLM is used only if LLM_ACTIVE is also set.
        llm_prompt: Optional prompt for a one-off LLM-backed converter.

    Returns:
        The persisted and refreshed Conversion record.

    Raises:
        HTTPException: 422 for a disallowed extension; 500 when conversion or
            the database write fails.
    """
    if not _allowed_file(file.filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    _, suffix = os.path.splitext(file.filename)
    file_type = suffix.lstrip(".").lower()

    # The converter works on a path, so spool the upload to a temp file
    # that keeps the original extension.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    # LLM only if enabled globally AND requested for this call.
    llm_wanted = LLM_ACTIVE and use_llm
    converter = md_plain
    if llm_wanted:
        converter = md
        if llm_prompt:
            # A custom prompt needs its own MarkItDown instance.
            try:
                converter = MarkItDown(
                    llm_client=_llm_client,
                    llm_model=OLLAMA_MODEL,
                    llm_prompt=llm_prompt,
                )
            except TypeError:
                # older markitdown versions may not support llm_prompt kwarg
                converter = md

    try:
        try:
            result = converter.convert(tmp_path)
            actual_llm = llm_wanted
        except Exception as llm_err:
            server_side = "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
            if not (llm_wanted and server_side):
                raise
            # Ollama OOM / 500 — retry as a plain conversion without the LLM.
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            result = md_plain.convert(tmp_path)
            actual_llm = False

        record = Conversion(
            filename=file.filename,
            file_type=file_type,
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        os.unlink(tmp_path)
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to *limit* Conversion rows ordered by created_at, newest first."""
    newest_first = select(Conversion).order_by(Conversion.created_at.desc()).limit(limit)
    rows = await db.execute(newest_first)
    return rows.scalars().all()

View File

@ -0,0 +1,11 @@
"""Central config — single source of truth cho LLM/service settings.
Đổi base_url / model chỉ cần sửa file này."""
import os
# llama-server (Qwen2.5-VL) qua SSH tunnel; Docker reach host qua host.docker.internal
LLM_BASE_URL = "http://host.docker.internal:11436/v1"
LLM_MODEL = "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF"
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
# Vision OCR gom về vision-service (container api-vision). Ảnh → POST {VISION_URL}/ocr
VISION_URL = os.getenv("VISION_URL", "http://api-vision:8000")

View File

@ -13,22 +13,26 @@ class UrlRequest(BaseModel):
use_llm: bool = True
llm_prompt: str | None = None
class PathRequest(BaseModel):
    # Request body of POST /convert-path.
    # File path; the route resolves it and only accepts locations under /workspace.
    path: str
    # Passed to docling_service.convert_path as the output format.
    output_format: str = "markdown"
    # Passed to docling_service.convert_path as use_llm.
    use_llm: bool = True
    # Passed to docling_service.convert_path as llm_prompt.
    llm_prompt: str | None = None
class SettingsRequest(BaseModel):
ollama_base_url: str | None = None
ollama_model: str = "llava"
cleanup_model: str | None = None
class SettingsResponse(BaseModel):
llm_enabled: bool
ollama_base_url: str | None
ollama_model: str
llm_base_url: str | None
llm_model: str
cleanup_model: str | None = None
default_prompt: str | None = None
SUPPORTED_INPUT_FORMATS = sorted([
"pdf", "docx", "xlsx", "pptx",
"html", "htm", "jpg", "jpeg", "png",
"tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc"
"tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc", "elm"
])
SUPPORTED_OUTPUT_FORMATS = ["markdown", "json", "html", "text"]
@ -38,34 +42,34 @@ SUPPORTED_OUTPUT_FORMATS = ["markdown", "json", "html", "text"]
def get_settings():
return SettingsResponse(
llm_enabled=docling_service.LLM_ACTIVE,
ollama_base_url=docling_service.OLLAMA_BASE_URL,
ollama_model=docling_service.OLLAMA_MODEL,
llm_base_url=docling_service.LLM_BASE_URL,
llm_model=docling_service.LLM_MODEL,
cleanup_model=docling_service.CLEANUP_MODEL or None,
default_prompt=docling_service.DEFAULT_ENRICH_PROMPT,
)
@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
docling_service._init_llm(req.ollama_base_url or None, req.ollama_model)
docling_service._init_llm()
docling_service.CLEANUP_MODEL = req.cleanup_model or ""
return SettingsResponse(
llm_enabled=docling_service.LLM_ACTIVE,
ollama_base_url=docling_service.OLLAMA_BASE_URL,
ollama_model=docling_service.OLLAMA_MODEL,
llm_base_url=docling_service.LLM_BASE_URL,
llm_model=docling_service.LLM_MODEL,
cleanup_model=docling_service.CLEANUP_MODEL or None,
default_prompt=docling_service.DEFAULT_ENRICH_PROMPT,
)
@router.get("/health", response_model=HealthResponse)
def health():
from app.services.DoclingService import LLM_ACTIVE, OLLAMA_MODEL
from app.services.DoclingService import LLM_ACTIVE, LLM_MODEL
ocr = "tesseract" if _ocr_available() else "none"
return HealthResponse(
status="ok",
supported_formats=SUPPORTED_INPUT_FORMATS,
output_formats=SUPPORTED_OUTPUT_FORMATS,
llm_enabled=LLM_ACTIVE,
llm_model=OLLAMA_MODEL if LLM_ACTIVE else None,
llm_model=LLM_MODEL if LLM_ACTIVE else None,
ocr_engine=ocr,
)
@ -125,6 +129,26 @@ async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db))
)
@router.post("/convert-path")
async def convert_path(req: PathRequest, db: AsyncSession = Depends(get_db)):
    """Convert a file that already lives under /workspace.

    The requested path is resolved with os.path.realpath (following symlinks)
    and must be /workspace itself or a location inside it.

    Raises:
        HTTPException: 403 when the resolved path is outside /workspace;
            404 when it is not an existing regular file.
    """
    import os
    from fastapi import HTTPException as _HTTPException

    workspace_root = "/workspace"
    abs_path = os.path.realpath(req.path)
    # Compare against "/workspace/" rather than the bare prefix: a plain
    # startswith("/workspace") would also admit siblings like "/workspace-old".
    if abs_path != workspace_root and not abs_path.startswith(workspace_root + os.sep):
        raise _HTTPException(status_code=403, detail="Access denied")
    if not os.path.isfile(abs_path):
        raise _HTTPException(status_code=404, detail="File not found")

    record = await docling_service.convert_path(
        abs_path,
        db,
        req.output_format,
        use_llm=req.use_llm,
        llm_prompt=req.llm_prompt,
    )
    return ConvertResponse(
        id=record.id,
        filename=record.filename,
        output_format=record.output_format,
        content=record.content,
        page_count=record.page_count,
        llm_enabled=record.llm_enabled,
    )
@router.get("/history", response_model=list[ConversionRecord])
async def history(limit: int = 20, db: AsyncSession = Depends(get_db)):
records = await docling_service.get_history(db, limit)

View File

@ -0,0 +1,632 @@
import os
import re
import json
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
from app.config import LLM_BASE_URL, LLM_MODEL, CLEANUP_MODEL, VISION_URL
def _build_converter() -> DocumentConverter:
    """Create the Docling converter used for PDF and image input.

    OCR through the Tesseract CLI is requested; if building those options
    raises, a warning is logged and OCR is turned off instead.
    """
    try:
        pdf_opts = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractCliOcrOptions())
        logger.info("Docling: OCR enabled via Tesseract CLI")
    except Exception as exc:
        logger.warning("Docling: Tesseract unavailable (%s) — OCR disabled", exc)
        pdf_opts = PdfPipelineOptions(do_ocr=False)

    # ImageFormatOption also uses StandardPdfPipeline — give it the same
    # options so docling does not fall back to RapidOCR / PP-OCRv6.
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        InputFormat.IMAGE: ImageFormatOption(pipeline_options=pdf_opts),
    }
    return DocumentConverter(format_options=format_options)
# Module-wide Docling converter, built once when this module is imported.
converter = _build_converter()
# OpenAI-compatible client and readiness flag; both are assigned by _init_llm().
_llm_client = None
LLM_ACTIVE = False
def _init_llm() -> bool:
    """Create the OpenAI-compatible client and record the result in LLM_ACTIVE.

    Returns:
        True when the client was constructed, False when construction raised.
    """
    global LLM_ACTIVE, _llm_client
    try:
        _llm_client = _openai.OpenAI(base_url=LLM_BASE_URL, api_key="none")
    except Exception as exc:
        logger.warning("Docling: LLM init failed (%s)", exc)
        LLM_ACTIVE = False
    else:
        LLM_ACTIVE = True
        logger.info("Docling: LLM enabled via %s (model=%s)", LLM_BASE_URL, LLM_MODEL)
    return LLM_ACTIVE
_init_llm()
# System prompt that _llm_enrich falls back to when the caller supplies none.
DEFAULT_ENRICH_PROMPT = (
"You are a document cleaning assistant. "
"Fix OCR errors, normalise whitespace, and improve the Markdown structure. "
"Return ONLY the raw Markdown text — no code fences, no commentary, no explanation."
)
def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
"""Text cleanup pass — DISABLED under OCRFlux (OCR-only model always emits its
JSON schema regardless of prompt, which corrupts non-image text). Passthrough."""
return markdown
if not _llm_client or not markdown.strip():
return markdown
# Rough token estimate: 1 token ≈ 4 chars. Reserve ~800 tokens for system + response.
max_chars = (4096 - 800) * 4
truncated = markdown[:max_chars]
if len(markdown) > max_chars:
logger.warning("_llm_enrich: truncated input from %d to %d chars", len(markdown), max_chars)
try:
resp = _llm_client.chat.completions.create(
model=LLM_MODEL,
messages=[
{"role": "system", "content": system_prompt or DEFAULT_ENRICH_PROMPT},
{"role": "user", "content": truncated},
],
temperature=0,
)
result = resp.choices[0].message.content or truncated
# llava tends to wrap output in code fences regardless of instructions — strip them
result = re.sub(r"^```(?:markdown)?\s*\n?", "", result.strip())
result = re.sub(r"\n?```\s*$", "", result.strip())
return result.strip() or markdown
except Exception as e:
logger.warning("Docling: LLM enrichment failed (%s) — returning raw output", e)
return markdown
async def convert_url(
    url: str,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> "Conversion":
    """Build a Markdown document for a video URL via yt-dlp and convert it with Docling.

    Metadata (title, channel, duration, upload date, views, chapters) and the
    first available "vi" or "en" subtitle track are assembled into Markdown,
    written to a temp file, run through the Docling converter and persisted.

    Args:
        url: Video URL handed to yt-dlp.
        db: Session used to persist the Conversion row.
        output_format: "markdown", "json" or "html" use the Docling export;
            any other value stores the assembled Markdown as-is.
        use_llm: Whether to apply _llm_enrich for markdown/text output.
        llm_prompt: Optional system prompt for _llm_enrich.

    Returns:
        The persisted and refreshed Conversion record.

    Raises:
        HTTPException: 500 when yt-dlp is not installed or conversion/DB
            write fails; 422 when yt-dlp cannot extract the URL.
    """
    import urllib.request

    try:
        import yt_dlp  # noqa: PLC0415
    except ImportError:
        raise HTTPException(status_code=500, detail="yt-dlp not installed")

    ydl_opts = {
        "quiet": True,
        "skip_download": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": ["vi", "en"],
        "outtmpl": "%(id)s.%(ext)s",
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"yt-dlp error: {e}")

    title = info.get("title", "YouTube Video")
    description = info.get("description", "") or ""
    channel = info.get("channel", info.get("uploader", ""))
    duration = info.get("duration_string", "")
    upload_date = info.get("upload_date", "")
    view_count = info.get("view_count")
    chapters = info.get("chapters") or []

    # Build markdown from available metadata
    lines = [f"# {title}", ""]
    meta_rows = []
    if channel:
        meta_rows.append(f"**Kênh:** {channel}")
    if duration:
        meta_rows.append(f"**Thời lượng:** {duration}")
    if upload_date and len(upload_date) == 8:
        # yt-dlp reports YYYYMMDD; show it as YYYY-MM-DD.
        meta_rows.append(f"**Ngày đăng:** {upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}")
    if view_count is not None:
        meta_rows.append(f"**Lượt xem:** {view_count:,}")
    meta_rows.append(f"**URL:** {url}")
    lines.extend(meta_rows)
    lines.append("")

    # Subtitles/transcript: manual tracks win over automatic captions, "vi" over "en".
    subtitles = info.get("subtitles") or {}
    auto_subtitles = info.get("automatic_captions") or {}
    transcript_text = None
    for lang in ("vi", "en"):
        tracks = subtitles.get(lang) or auto_subtitles.get(lang)
        if not tracks:
            continue
        for track in tracks:
            ext = track.get("ext")
            if ext not in ("json3", "srv3", "ttml", "vtt"):
                continue
            try:
                with urllib.request.urlopen(track["url"], timeout=15) as r:
                    raw = r.read().decode("utf-8", errors="ignore")
            except Exception as e:
                # Best effort: try the next track rather than failing the request.
                logger.warning("Docling: subtitle download failed (%s)", e)
                continue
            transcript_text = _subtitle_to_text(ext, raw)
            break
        if transcript_text:
            break

    if transcript_text:
        lines += ["## Transcript / Phụ đề", "", transcript_text, ""]
    elif description:
        lines += ["## Mô tả", "", description[:3000], ""]
    else:
        lines += ["## Ghi chú", "", "_Không có transcript hoặc mô tả._", ""]

    if chapters:
        lines += ["## Chapters", ""]
        for ch in chapters:
            start = ch.get("start_time", 0)
            m, s = divmod(int(start), 60)
            lines.append(f"- **{m:02d}:{s:02d}** — {ch.get('title', '')}")
        lines.append("")

    markdown_text = "\n".join(lines)

    # Write temp file and run through Docling
    video_id = info.get("id", "youtube")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp:
        tmp.write(markdown_text)
        tmp_path = tmp.name

    try:
        result = converter.convert(tmp_path)
        doc = result.document
        page_count = None
        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        else:
            content = markdown_text

        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        record = Conversion(
            filename=f"{video_id}.md",
            file_type="youtube",
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        os.unlink(tmp_path)


def _subtitle_to_text(ext: str, raw: str) -> str:
    """Reduce a downloaded subtitle payload to plain text; return *raw* if it cannot be parsed."""
    if ext == "vtt":
        # Strip cue timings, numeric cue ids and inline tags.
        cleaned = re.sub(r"\d{2}:\d{2}:\d{2}\.\d+ --> .*", "", raw)
        cleaned = re.sub(r"^\d+$", "", cleaned, flags=re.MULTILINE)
        cleaned = re.sub(r"<[^>]+>", "", cleaned)
        return re.sub(r"\n{3,}", "\n\n", cleaned).strip()

    if ext == "json3":
        # json3 payloads carry the caption text in events[].segs[].utf8.
        try:
            events = json.loads(raw).get("events") or []
            pieces = [
                "".join(seg.get("utf8", "") for seg in (event.get("segs") or []))
                for event in events
            ]
        except (ValueError, AttributeError, TypeError):
            return raw
        text = " ".join(piece.strip() for piece in pieces if piece.strip())
        return text or raw

    # srv3 / ttml are XML: drop the markup and decode entities.
    import html as _html

    text = _html.unescape(re.sub(r"<[^>]+>", " ", raw))
    text = re.sub(r"\s+", " ", text).strip()
    return text or raw
# -----------------------------------------------------------------
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
"html", "htm", "jpg", "jpeg", "png",
"tiff", "tif", "bmp", "md", "txt", "asciidoc", "adoc", "elm"
}
OUTPUT_FORMATS = {"markdown", "json", "html", "text"}
def _allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file with Docling and store the result.

    Args:
        file: The upload; its extension must pass _allowed_file.
        db: Session used to persist the Conversion row.
        output_format: One of OUTPUT_FORMATS.
        use_llm: Whether to apply _llm_enrich for markdown/text output.
        llm_prompt: Optional system prompt for _llm_enrich.

    Returns:
        The persisted and refreshed Conversion record.

    Raises:
        HTTPException: 422 for a disallowed extension or output format;
            500 when conversion or the database write fails.
    """
    if not _allowed_file(file.filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(
            status_code=422,
            detail=f"Output format not supported. Supported: {', '.join(sorted(OUTPUT_FORMATS))}"
        )

    _, suffix = os.path.splitext(file.filename)
    file_type = suffix.lstrip(".").lower()

    # Docling works on a path, so spool the upload to a temp file that keeps
    # the original extension.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        doc = converter.convert(tmp_path).document
        page_count = len(pages) if (pages := getattr(doc, "pages", None)) else None

        if output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        else:
            # "markdown" and "text" both start from the Markdown export.
            content = doc.export_to_markdown()
            if output_format == "text":
                # Strip heading markers, then bold and italic emphasis.
                content = re.sub(r"#{1,6}\s?", "", content)
                content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
                content = re.sub(r"\*(.+?)\*", r"\1", content)

        # LLM enrichment — only for markdown / text output, and only if requested.
        llm_used = bool(_llm_client and use_llm and output_format in ("markdown", "text"))
        if llm_used:
            content = _llm_enrich(content, system_prompt=llm_prompt or None)

        record = Conversion(
            filename=file.filename,
            file_type=file_type,
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        os.unlink(tmp_path)
def _is_email_file(path: str, file_type: str) -> bool:
"""Detect email files by extension OR by reading the first few bytes."""
if file_type in ("eml", "msg"):
return True
try:
with open(path, "rb") as f:
head = f.read(2048).decode("utf-8", errors="ignore")
email_headers = (
"from:", "to:", "subject:", "date:", "mime-version:", "message-id:",
"received:", "return-path:", "delivered-to:", "envelope-to:",
"x-mailer:", "content-type:", "x-spam-", "reply-to:",
)
lines = [l.lower() for l in head.splitlines()[:30]]
hits = sum(1 for l in lines if any(l.startswith(h) for h in email_headers))
return hits >= 2
except Exception:
return False
_IMAGE_MIME = {"image/jpeg", "image/jpg", "image/png", "image/gif", "image/bmp", "image/webp", "image/tiff"}
_IMAGE_EXT = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".tif"}
_DOCLING_EXT = {".pdf", ".docx", ".xlsx", ".pptx", ".html", ".htm",
".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp",
".md", ".txt", ".asciidoc", ".adoc", ".eml"}
def _eml_to_html(path: str) -> tuple[str, list[tuple[str, bytes]]]:
"""Parse email: extract HTML body, embed inline CID images as base64 data-URIs.
Returns (html_string, ordered_images) where ordered_images is [(mime, raw_bytes), ...]
in document appearance order used to replace <!-- image --> placeholders later."""
import email as _email, email.policy as _policy, re as _re, base64 as _b64
with open(path, "rb") as f:
msg = _email.message_from_binary_file(f, policy=_policy.compat32)
html_body = None
text_body = None
cid_uri: dict[str, str] = {} # cid → data-URI
cid_raw: dict[str, tuple[str, bytes]] = {} # cid → (mime, bytes)
loose: list[tuple[str, bytes, str]] = [] # (mime, bytes, filename)
for part in msg.walk():
ct = part.get_content_type()
data = part.get_payload(decode=True)
if not data:
continue
if ct in _IMAGE_MIME or ct.startswith("image/"):
cid = part.get("Content-ID", "").strip("<>")
mime = ct if "/" in ct else "image/jpeg"
b64 = _b64.standard_b64encode(data).decode()
uri = f"data:{mime};base64,{b64}"
if cid:
cid_uri[cid] = uri
cid_raw[cid] = (mime, data)
else:
fname = part.get_filename() or ""
loose.append((mime, data, fname or "image"))
continue
charset = part.get_content_charset() or "utf-8"
decoded = data.decode(charset, errors="replace")
if ct == "text/html" and html_body is None:
html_body = decoded
elif ct == "text/plain" and text_body is None:
text_body = decoded
# Replace cid: src refs, tracking order of appearance
inline_images: list[tuple[str, bytes]] = []
def _replace_cid(html: str) -> str:
def _sub(m):
cid = m.group(2)
if cid in cid_raw:
inline_images.append(cid_raw[cid])
return m.group(1) + cid_uri[cid] + m.group(3)
return m.group(0)
return _re.sub(r'(src=["\'])cid:([^\s"\'>\)]+)(["\'])', _sub, html)
subject = msg.get("Subject", "")
sender = msg.get("From", "")
date = msg.get("Date", "")
header = f"<h2>{subject}</h2><p><b>From:</b> {sender}<br><b>Date:</b> {date}</p><hr>"
loose_html = ""
if loose:
parts = []
for mime, data, alt in loose:
b64 = _b64.standard_b64encode(data).decode()
uri = f"data:{mime};base64,{b64}"
parts.append(f'<figure><img src="{uri}" alt="{alt}" style="max-width:100%"><figcaption>{alt}</figcaption></figure>')
loose_html = "\n<hr><h3>Attachments</h3>\n" + "\n".join(parts)
if html_body:
html_body = _replace_cid(html_body)
if "<body" in html_body.lower():
result = re.sub(r"(<body[^>]*>)", r"\1" + header, html_body, count=1, flags=re.IGNORECASE)
html_str = result.rstrip().rstrip("</body></html>").rstrip() + loose_html + "\n</body></html>"
else:
html_str = f"<html><body>{header}{html_body}{loose_html}</body></html>"
elif text_body:
escaped = text_body.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
html_str = f"<html><body>{header}<pre>{escaped}</pre>{loose_html}</body></html>"
else:
html_str = f"<html><body>{header}<p><em>No body content found.</em></p>{loose_html}</body></html>"
ordered_images = inline_images + [(m, d) for m, d, _ in loose]
return html_str, ordered_images
def _ocr_image_llm(mime: str, data: bytes) -> str:
    """Send one image to the vision service and return the recognised text.

    The image is posted as a multipart upload to ``{VISION_URL}/ocr`` and the
    "text" field of the JSON reply is returned. Any failure is logged and
    reported as an empty string.
    """
    import httpx

    upload = {"file": ("image", data, mime or "application/octet-stream")}
    try:
        response = httpx.post(f"{VISION_URL}/ocr", files=upload, timeout=300)
        response.raise_for_status()
        payload = response.json()
        return payload.get("text", "")
    except Exception as exc:
        logger.warning("Docling: vision OCR failed (%s)", exc)
        return ""
def _replace_image_placeholders(markdown: str, images: list[tuple[str, bytes]]) -> str:
"""Replace Docling <!-- image --> placeholders with OCRFlux output.
Each image is OCR'd; falls back to *[Image]* when nothing usable is extracted."""
if not images or "<!-- image -->" not in markdown:
return markdown
img_iter = iter(images)
def _sub(_m):
try:
mime, data = next(img_iter)
except StopIteration:
return "*[Image]*"
return _ocr_image_llm(mime, data) or "*[Image]*"
return re.sub(r"<!-- image -->", _sub, markdown)
def _detect_ext(path: str, declared_ext: str) -> str:
"""Return best-guess extension for file. Falls back to magic bytes."""
if declared_ext and f".{declared_ext}" in _DOCLING_EXT:
return f".{declared_ext}"
try:
with open(path, "rb") as f:
hdr = f.read(16)
magic = {
b"\x89PNG": ".png", b"\xff\xd8\xff": ".jpg",
b"GIF8": ".gif", b"BM": ".bmp",
b"II*\x00": ".tiff", b"MM\x00*": ".tiff",
b"%PDF": ".pdf",
b"PK\x03\x04": ".docx", # zip-based (docx/xlsx/pptx — good enough)
}
for sig, ext in magic.items():
if hdr.startswith(sig):
return ext
except Exception:
pass
return f".{declared_ext}" if declared_ext else ""
async def convert_path(
    path: str,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert a file on disk and store the result.

    Emails are first rendered to a temporary HTML file; files with a missing or
    unsupported extension are copied to a temp file named after their detected
    type. Standalone images go straight to the vision OCR service when the LLM
    is requested; everything else (and any OCR failure) goes through Docling.

    Args:
        path: Path of the file to convert.
        db: Session used to persist the Conversion row.
        output_format: One of OUTPUT_FORMATS.
        use_llm: Whether OCR / LLM enrichment may be used.
        llm_prompt: Optional system prompt for _llm_enrich.

    Returns:
        The persisted and refreshed Conversion record.

    Raises:
        HTTPException: 422 for an unsupported output format or file type;
            500 when email parsing, conversion or the database write fails.
    """
    import shutil

    filename = os.path.basename(path)
    file_type = os.path.splitext(filename)[1].lstrip(".").lower()
    if output_format not in OUTPUT_FORMATS:
        raise HTTPException(status_code=422, detail=f"Output format not supported: {output_format}")

    # MIME type sent with a standalone image, keyed by extension.
    image_mime = {
        "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
        "gif": "image/gif", "bmp": "image/bmp", "webp": "image/webp",
        "tif": "image/tiff", "tiff": "image/tiff",
    }

    actual_path = path
    tmp_paths: list[str] = []  # every temp file created here; removed in the outer finally
    email_images: list[tuple[str, bytes]] = []
    try:
        # ── Email / Maildir ──────────────────────────────────────────
        if _is_email_file(path, file_type):
            file_type = "eml"
            try:
                html_content, email_images = _eml_to_html(path)
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".html", mode="w", encoding="utf-8"
                ) as tmp_html:
                    tmp_paths.append(tmp_html.name)
                    tmp_html.write(html_content)
                actual_path = tmp_html.name
            except Exception as e:
                raise HTTPException(status_code=500, detail=f"Email parse error: {e}")
        # ── No / wrong extension → copy to temp file with detected ext ─
        elif not file_type or f".{file_type}" not in _DOCLING_EXT:
            detected = _detect_ext(path, file_type)
            if not detected or detected not in _DOCLING_EXT:
                raise HTTPException(
                    status_code=422,
                    detail=f"Unsupported file format: '{file_type or 'no extension'}'"
                )
            with tempfile.NamedTemporaryFile(delete=False, suffix=detected) as tmp_copy:
                tmp_paths.append(tmp_copy.name)
            shutil.copy2(path, tmp_copy.name)
            actual_path = tmp_copy.name
            file_type = detected.lstrip(".")

        # ── Standalone image file → OCR service directly (skip docling/Tesseract) ─
        if (f".{file_type}" in _IMAGE_EXT and use_llm and _llm_client
                and output_format in ("markdown", "text")):
            try:
                with open(actual_path, "rb") as fh:
                    image_bytes = fh.read()
                ocr_text = _ocr_image_llm(image_mime.get(file_type, "image/jpeg"), image_bytes)
                if ocr_text:
                    record = Conversion(
                        filename=filename, file_type=file_type,
                        output_format=output_format, content=ocr_text,
                        page_count=1, llm_enabled=True,
                    )
                    db.add(record)
                    await db.commit()
                    await db.refresh(record)
                    return record
            except Exception as e:
                # Reset the session so the Docling fallback below can still commit.
                await db.rollback()
                logger.warning("Docling: OCRFlux image path failed (%s) — falling back", e)

        try:
            result = converter.convert(actual_path)
            doc = result.document
            page_count = len(doc.pages) if hasattr(doc, "pages") and doc.pages else None
            if output_format == "markdown":
                content = doc.export_to_markdown()
            elif output_format == "json":
                content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
            elif output_format == "html":
                content = doc.export_to_html()
            else:
                # "text": Markdown export with heading and emphasis markers removed.
                content = doc.export_to_markdown()
                content = re.sub(r"#{1,6}\s?", "", content)
                content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
                content = re.sub(r"\*(.+?)\*", r"\1", content)

            llm_used = False
            if output_format in ("markdown", "text"):
                images_replaced = False
                # Step 1: replace <!-- image --> placeholders via Vision LLM OCR
                if email_images and use_llm and "<!-- image -->" in content:
                    content = _replace_image_placeholders(content, email_images)
                    images_replaced = True
                    llm_used = bool(_llm_client)
                # Step 2: fallback — any remaining <!-- image --> → *[Image]*
                if "<!-- image -->" in content:
                    content = content.replace("<!-- image -->", "*[Image]*")
                # Step 3: text LLM enrichment — SKIP when we just OCR'd images, because the
                # free-form cleanup model tends to summarise away the extracted tables.
                if _llm_client and use_llm and not images_replaced:
                    content = _llm_enrich(content, system_prompt=llm_prompt or None)
                    llm_used = True

            record = Conversion(
                filename=filename,
                file_type=file_type,
                output_format=output_format,
                content=content,
                page_count=page_count,
                llm_enabled=llm_used,
            )
            db.add(record)
            await db.commit()
            await db.refresh(record)
            return record
        except HTTPException:
            raise
        except Exception as e:
            await db.rollback()
            raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Single cleanup point for every exit path.
        for tmp_path in tmp_paths:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
async def get_conversion(conversion_id: int, db: AsyncSession) -> Conversion:
    """Fetch the Conversion with id *conversion_id*.

    Raises:
        HTTPException: 404 when no such row exists.
    """
    query = select(Conversion).where(Conversion.id == conversion_id)
    record = (await db.execute(query)).scalar_one_or_none()
    if record:
        return record
    raise HTTPException(status_code=404, detail="Conversion not found")
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Return up to *limit* Conversion rows ordered by created_at, newest first."""
    newest_first = select(Conversion).order_by(Conversion.created_at.desc()).limit(limit)
    rows = await db.execute(newest_first)
    return rows.scalars().all()
async def delete_conversion(conversion_id: int, db: AsyncSession) -> dict:
    """Delete the Conversion with id *conversion_id* and return a confirmation message.

    Raises:
        HTTPException: 404 when no such row exists.
    """
    query = select(Conversion).where(Conversion.id == conversion_id)
    record = (await db.execute(query)).scalar_one_or_none()
    if not record:
        raise HTTPException(status_code=404, detail="Conversion not found")

    await db.delete(record)
    await db.commit()
    message = f"Conversion {conversion_id} deleted"
    return {"message": message}

View File

@ -5,5 +5,6 @@ python-multipart
asyncpg
sqlalchemy[asyncio]
openai
httpx
yt-dlp
ffmpeg

View File

@ -0,0 +1,11 @@
"""Central config — single source of truth cho LLM/service settings.
Đổi base_url / model chỉ cần sửa file này."""
import os
# llama-server (Qwen2.5-VL) qua SSH tunnel; Docker reach host qua host.docker.internal
LLM_BASE_URL = "http://host.docker.internal:11436/v1"
LLM_MODEL = "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF"
CLEANUP_MODEL = os.getenv("CLEANUP_MODEL", "")
# Vision OCR gom về vision-service (container api-vision). Ảnh → POST {VISION_URL}/ocr
VISION_URL = os.getenv("VISION_URL", "http://api-vision:8000")

View File

@ -19,14 +19,12 @@ class CleanupResponse(BaseModel):
text: str
class SettingsRequest(BaseModel):
ollama_base_url: str | None = None
ollama_model: str = "llava"
cleanup_model: str | None = None
class SettingsResponse(BaseModel):
llm_enabled: bool
ollama_base_url: str | None
ollama_model: str
llm_base_url: str | None
llm_model: str
cleanup_model: str | None = None
default_prompt: str | None = None
@ -38,7 +36,7 @@ def health():
return HealthResponse(
status="ok",
llm_enabled=markitdown_service.LLM_ACTIVE,
llm_model=markitdown_service.OLLAMA_MODEL if markitdown_service.LLM_ACTIVE else None,
llm_model=markitdown_service.LLM_MODEL if markitdown_service.LLM_ACTIVE else None,
)
@ -66,11 +64,11 @@ async def convert_url(
@router.get("/models")
def list_models():
if not markitdown_service.OLLAMA_BASE_URL:
if not markitdown_service.LLM_BASE_URL:
return {"models": []}
try:
import httpx, re
base = re.sub(r"/v1/?$", "", markitdown_service.OLLAMA_BASE_URL.rstrip("/"))
base = re.sub(r"/v1/?$", "", markitdown_service.LLM_BASE_URL.rstrip("/"))
resp = httpx.get(f"{base}/api/tags", timeout=5)
resp.raise_for_status()
names = [m["name"] for m in resp.json().get("models", [])]
@ -82,20 +80,20 @@ def list_models():
def get_settings():
return SettingsResponse(
llm_enabled=markitdown_service.LLM_ACTIVE,
ollama_base_url=markitdown_service.OLLAMA_BASE_URL,
ollama_model=markitdown_service.OLLAMA_MODEL,
llm_base_url=markitdown_service.LLM_BASE_URL,
llm_model=markitdown_service.LLM_MODEL,
cleanup_model=markitdown_service.CLEANUP_MODEL or None,
default_prompt=markitdown_service.DEFAULT_CLEANUP_PROMPT,
)
@router.post("/settings", response_model=SettingsResponse)
def update_settings(req: SettingsRequest):
markitdown_service._init_llm(req.ollama_base_url or None, req.ollama_model)
markitdown_service._init_llm()
markitdown_service.CLEANUP_MODEL = req.cleanup_model or ""
return SettingsResponse(
llm_enabled=markitdown_service.LLM_ACTIVE,
ollama_base_url=markitdown_service.OLLAMA_BASE_URL,
ollama_model=markitdown_service.OLLAMA_MODEL,
llm_base_url=markitdown_service.LLM_BASE_URL,
llm_model=markitdown_service.LLM_MODEL,
cleanup_model=markitdown_service.CLEANUP_MODEL or None,
default_prompt=markitdown_service.DEFAULT_CLEANUP_PROMPT,
)
@ -108,6 +106,66 @@ async def cleanup(req: CleanupRequest):
return CleanupResponse(text=cleaned)
class PathRequest(BaseModel):
    # Request body of POST /convert-path.
    # File path; the route resolves it and only accepts locations under /workspace.
    path: str
    # Passed to markitdown_service.convert_path as use_llm.
    use_llm: bool = True
    # Passed to markitdown_service.convert_path as llm_prompt.
    llm_prompt: str | None = None
@router.get("/browse")
def browse(path: str = Query("/workspace")):
    """List a directory, or describe a single file, under /workspace.

    The requested path is resolved with os.path.realpath (following symlinks)
    and must be /workspace itself or a location inside it.

    Returns:
        For a file: ``{"type": "file", "path", "name"}``. For a directory:
        ``{"type": "dir", "path", "entries"}`` with entries sorted by name,
        each carrying name, path, type and (for files) the lower-cased ext.

    Raises:
        HTTPException: 403 when the path is outside /workspace or cannot be
            listed; 404 when it does not exist.
    """
    import os

    workspace_root = "/workspace"
    abs_path = os.path.realpath(path)
    # Compare against "/workspace/" rather than the bare prefix: a plain
    # startswith("/workspace") would also admit siblings like "/workspace-old".
    if abs_path != workspace_root and not abs_path.startswith(workspace_root + os.sep):
        raise HTTPException(status_code=403, detail="Access denied")
    if not os.path.exists(abs_path):
        raise HTTPException(status_code=404, detail="Path not found")
    if os.path.isfile(abs_path):
        return {"type": "file", "path": abs_path, "name": os.path.basename(abs_path)}

    entries = []
    try:
        for name in sorted(os.listdir(abs_path)):
            full = os.path.join(abs_path, name)
            entries.append({
                "name": name,
                "path": full,
                "type": "dir" if os.path.isdir(full) else "file",
                "ext": os.path.splitext(name)[1].lower() if os.path.isfile(full) else None,
            })
    except PermissionError:
        raise HTTPException(status_code=403, detail="Permission denied")
    return {"type": "dir", "path": abs_path, "entries": entries}
@router.post("/convert-path")
async def convert_path(req: PathRequest, db: AsyncSession = Depends(get_db)):
    """Convert a file that already lives under /workspace.

    The requested path is resolved with os.path.realpath (following symlinks)
    and must be /workspace itself or a location inside it.

    Raises:
        HTTPException: 403 when the resolved path is outside /workspace;
            404 when it is not an existing regular file.
    """
    import os

    workspace_root = "/workspace"
    abs_path = os.path.realpath(req.path)
    # Compare against "/workspace/" rather than the bare prefix: a plain
    # startswith("/workspace") would also admit siblings like "/workspace-old".
    if abs_path != workspace_root and not abs_path.startswith(workspace_root + os.sep):
        raise HTTPException(status_code=403, detail="Access denied")
    if not os.path.isfile(abs_path):
        raise HTTPException(status_code=404, detail="File not found")

    record = await markitdown_service.convert_path(
        abs_path, db, use_llm=req.use_llm, llm_prompt=req.llm_prompt
    )
    return record
class WriteFileRequest(BaseModel):
    # Request body of POST /write-file.
    # Destination path; the route only accepts locations under /workspace.
    path: str
    # Text written to the file as UTF-8.
    content: str
@router.post("/write-file")
def write_file(req: WriteFileRequest):
    """Write text to a file under /workspace, creating parent directories as needed.

    The requested path is resolved with os.path.realpath (following symlinks)
    and must lie inside /workspace. An existing file is overwritten.

    Returns:
        ``{"path": <resolved path>, "bytes": <UTF-8 size of the content>}``.

    Raises:
        HTTPException: 403 when the resolved path is outside /workspace;
            400 when it names a directory.
    """
    import os

    workspace_root = "/workspace"
    abs_path = os.path.realpath(req.path)
    # Compare against "/workspace/" rather than the bare prefix: a plain
    # startswith("/workspace") would also admit siblings like "/workspace-old".
    if abs_path != workspace_root and not abs_path.startswith(workspace_root + os.sep):
        raise HTTPException(status_code=403, detail="Access denied")
    # Opening a directory for writing would surface as an unhandled error.
    if os.path.isdir(abs_path):
        raise HTTPException(status_code=400, detail="Path is a directory")

    os.makedirs(os.path.dirname(abs_path), exist_ok=True)
    with open(abs_path, "w", encoding="utf-8") as f:
        f.write(req.content)
    return {"path": abs_path, "bytes": len(req.content.encode())}
@router.get("/history", response_model=list[ConversionRecord])
async def history(limit: int = 20, db: AsyncSession = Depends(get_db)):
records = await markitdown_service.get_history(db, limit)

View File

@ -0,0 +1,370 @@
import os
import tempfile
import logging
from fastapi import UploadFile, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from markitdown import MarkItDown
from app.models.ConvertModel import Conversion
logger = logging.getLogger(__name__)
import openai as _openai
from app.config import LLM_BASE_URL, LLM_MODEL, CLEANUP_MODEL, VISION_URL
# Readiness flag and OpenAI-compatible client; both are assigned by _init_llm().
LLM_ACTIVE = False
_llm_client = None
# Converter with no LLM attached.
md_plain = MarkItDown()
# Default converter; _init_llm() rebinds it to an LLM-backed MarkItDown on success.
md = md_plain
def _init_llm() -> bool:
    """Create the OpenAI-compatible client and the LLM-backed MarkItDown converter.

    On success the module-level ``_llm_client`` and ``md`` are rebound and
    LLM_ACTIVE is set; on any error a warning is logged and LLM_ACTIVE is cleared.

    Returns:
        True when both objects were constructed, False otherwise.
    """
    global LLM_ACTIVE, _llm_client, md
    try:
        _llm_client = _openai.OpenAI(base_url=LLM_BASE_URL, api_key="none")
        md = MarkItDown(llm_client=_llm_client, llm_model=LLM_MODEL)
    except Exception as exc:
        logger.warning("MarkItDown: LLM init failed (%s)", exc)
        LLM_ACTIVE = False
    else:
        LLM_ACTIVE = True
        logger.info("MarkItDown: LLM enabled via %s (model=%s)", LLM_BASE_URL, LLM_MODEL)
    return LLM_ACTIVE
_init_llm()
DEFAULT_CLEANUP_PROMPT = """You are a technical document formatter. \
The text below was extracted from a multi-column PDF using OCR and is poorly structured: \
columns are merged, headers are mixed with values, and content is out of order.
Your task:
1. Identify the logical sections (e.g. PERFORMANCE, MEMORY, STORAGE, CONNECTIVITY, etc.)
2. Under each section, format specs as a clean two-column Markdown table: | Spec | Value |
3. Keep bullet lists where appropriate (e.g. ports, certifications)
4. Remove duplicate lines and OCR artifacts (e.g. stray "---", lone "|", empty rows)
5. Preserve all technical values exactly do not paraphrase specs
Return ONLY the cleaned Markdown. No code fences, no commentary, no preamble."""
import re as _re
def llm_cleanup(text: str, prompt: str | None = None, model: str | None = None) -> str:
    """Ask the configured chat model to restructure ``text`` as clean Markdown.

    Args:
        text: Extracted text to clean up.
        prompt: System prompt; ``DEFAULT_CLEANUP_PROMPT`` is used when falsy.
        model: Model name; ``LLM_MODEL`` is used when falsy.

    Returns:
        The model's answer with a surrounding Markdown code fence removed.
        ``text`` is returned unchanged when no LLM client exists, when ``text``
        is blank, when the answer is empty, or when the request fails (the
        failure is logged as a warning).
    """
    if not (_llm_client and text.strip()):
        return text

    conversation = [
        {"role": "system", "content": prompt or DEFAULT_CLEANUP_PROMPT},
        {"role": "user", "content": text},
    ]
    try:
        completion = _llm_client.chat.completions.create(
            model=model or LLM_MODEL,
            messages=conversation,
            temperature=0,
        )
        cleaned = (completion.choices[0].message.content or text).strip()
        # Drop an opening ``` / ```markdown fence and a closing ``` fence.
        cleaned = _re.sub(r"^```(?:markdown)?\s*\n?", "", cleaned)
        cleaned = _re.sub(r"\n?```\s*$", "", cleaned.strip()).strip()
        return cleaned or text
    except Exception as exc:
        logger.warning("MarkItDown: cleanup failed (%s)", exc)
        return text
# ── Vision-LLM image OCR — consolidated into vision-service (container api-vision) ───────
def _ocr_image_llm(mime: str, data: bytes) -> str:
    """OCR one image by POSTing it to ``{VISION_URL}/ocr``.

    The image is uploaded as a multipart ``file`` field; ``mime`` falls back
    to ``application/octet-stream`` when empty. The ``text`` field of the JSON
    response is returned.

    Returns:
        The OCR text, or ``""`` when the request or response handling fails
        (the failure is logged as a warning).
    """
    import httpx

    upload = {"file": ("image", data, mime or "application/octet-stream")}
    try:
        response = httpx.post(f"{VISION_URL}/ocr", files=upload, timeout=300)
        response.raise_for_status()
        body = response.json()
        return body.get("text", "")
    except Exception as exc:
        logger.warning("MarkItDown: vision OCR failed (%s)", exc)
        return ""
def _ocr_cid_images(markdown: str, cid_map: dict[str, tuple[str, bytes]]) -> str:
"""Replace MarkItDown ![..](cid:CID) image links with OCRFlux OCR of each image.
Falls back to *[Image]* when nothing usable is extracted."""
if not cid_map or "cid:" not in markdown:
return markdown
def _sub(m):
entry = cid_map.get(m.group(1).strip())
if not entry:
return "*[Image]*"
return _ocr_image_llm(*entry) or "*[Image]*"
return _re.sub(r"!\[[^\]]*\]\(cid:([^)]+)\)", _sub, markdown)
# Lower-case file extensions (without the dot) accepted for uploads;
# _allowed_file() checks an upload's extension against this set.
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
}
# Matches YouTube watch, youtu.be and shorts links; the scheme and "www."
# prefix are both optional. Not referenced in the visible part of this module —
# presumably used by a caller elsewhere; verify before removing.
YOUTUBE_PATTERN = _re.compile(
r"(https?://)?(www\.)?(youtube\.com/watch|youtu\.be/|youtube\.com/shorts/)"
)
async def convert_url(
    url: str,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert the resource behind ``url`` to Markdown and store the result.

    Args:
        url: An ``http://`` or ``https://`` URL.
        db: Async database session used to persist the conversion.
        use_llm: Use the LLM-backed converter when the LLM is active.
        llm_prompt: Optional custom prompt for a one-off LLM-backed converter.

    Returns:
        The persisted ``Conversion`` row (stored with ``file_type="youtube"``).

    Raises:
        HTTPException: 422 when ``url`` is not an http(s) URL; 500 when the
            conversion or the database write fails.
    """
    from urllib.parse import urlsplit

    # The same convert() call is used elsewhere in this module with local
    # filesystem paths, so a caller-supplied string must be restricted to real
    # web URLs before it is handed over.
    url = url.strip()
    try:
        scheme = urlsplit(url).scheme.lower()
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        raise HTTPException(
            status_code=422,
            detail="Only http:// and https:// URLs can be converted",
        )

    use_llm_now = LLM_ACTIVE and use_llm
    if use_llm_now and llm_prompt:
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=LLM_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # MarkItDown rejected the llm_prompt keyword; use the shared LLM converter.
            converter = md
    elif use_llm_now:
        converter = md
    else:
        converter = md_plain

    # NOTE(review): convert() is a synchronous call inside a coroutine and
    # blocks the event loop for its duration.
    try:
        result = converter.convert(url)
        actual_llm = use_llm_now
    except Exception as llm_err:
        llm_failed = use_llm_now and (
            "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
        )
        if not llm_failed:
            raise HTTPException(status_code=500, detail=str(llm_err)) from llm_err
        logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
        try:
            result = md_plain.convert(url)
        except Exception as plain_err:
            # The retry can fail too; report it the same way as any other
            # conversion failure instead of leaking a raw exception.
            raise HTTPException(status_code=500, detail=str(plain_err)) from plain_err
        actual_llm = False

    # Use last segment of URL as filename
    slug = url.rstrip("/").split("/")[-1].split("?")[0] or "youtube"
    filename = f"{slug}.md"
    try:
        record = Conversion(
            filename=filename,
            file_type="youtube",
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
def _allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
async def convert_file(
    file: UploadFile,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert an uploaded file to Markdown and store the result.

    The upload is written to a temporary file (keeping its extension), passed
    to MarkItDown, and the temporary file is removed afterwards.

    Args:
        file: The uploaded file; its extension must be in ``ALLOWED_EXTENSIONS``.
        db: Async database session used to persist the conversion.
        use_llm: Use the LLM-backed converter when the LLM is active.
        llm_prompt: Optional custom prompt for a one-off LLM-backed converter.

    Returns:
        The persisted ``Conversion`` row.

    Raises:
        HTTPException: 422 when the filename is missing or its type is not
            allowed; 500 when conversion or the database write fails.
    """
    if not file.filename or not _allowed_file(file.filename):
        raise HTTPException(
            status_code=422,
            detail=f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )
    suffix = os.path.splitext(file.filename)[1]
    file_type = suffix.lstrip(".").lower()
    payload = await file.read()

    # Choose converter: LLM only if enabled globally AND requested per-call
    use_llm_now = LLM_ACTIVE and use_llm
    tmp_path = None
    try:
        # Created inside the try so the finally clause removes it on every path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(payload)

        # If custom prompt provided, create a one-off MarkItDown with that prompt
        if use_llm_now and llm_prompt:
            try:
                converter = MarkItDown(
                    llm_client=_llm_client,
                    llm_model=LLM_MODEL,
                    llm_prompt=llm_prompt,
                )
            except TypeError:
                # older markitdown versions may not support llm_prompt kwarg
                converter = md
        elif use_llm_now:
            converter = md
        else:
            converter = md_plain

        # NOTE(review): convert() is a synchronous call inside a coroutine and
        # blocks the event loop for its duration.
        try:
            result = converter.convert(tmp_path)
            actual_llm = use_llm_now
        except Exception as llm_err:
            # LLM OOM / 500 — fallback to plain conversion without LLM
            if use_llm_now and ("500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__):
                logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
                result = md_plain.convert(tmp_path)
                actual_llm = False
            else:
                raise

        record = Conversion(
            filename=file.filename,
            file_type=file_type,
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_err:
                # A cleanup failure must not replace the real result or error.
                logger.warning("MarkItDown: could not remove temp file %s (%s)", tmp_path, cleanup_err)
def _is_email_file(path: str, file_type: str) -> bool:
if file_type in ("eml", "msg"):
return True
try:
with open(path, "rb") as f:
head = f.read(2048).decode("utf-8", errors="ignore")
email_headers = (
"from:", "to:", "subject:", "date:", "mime-version:", "message-id:",
"received:", "return-path:", "delivered-to:", "envelope-to:",
"x-mailer:", "content-type:", "x-spam-", "reply-to:",
)
lines = [l.lower() for l in head.splitlines()[:30]]
return sum(1 for l in lines if any(l.startswith(h) for h in email_headers)) >= 2
except Exception:
return False
# File extension → MIME type for images that are sent straight to vision OCR.
_IMAGE_MIME_TYPES = {
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "bmp": "image/bmp",
    "webp": "image/webp",
    "tiff": "image/tiff",
    "tif": "image/tiff",
}


def _decode_mime_header(value) -> str:
    """Return an email header as text, decoding RFC 2047 encoded words when possible."""
    from email.errors import MessageError
    from email.header import decode_header, make_header

    if not value:
        return ""
    raw = str(value)
    try:
        return str(make_header(decode_header(raw)))
    except (LookupError, ValueError, MessageError):
        # Malformed encoded word or unknown charset: show the raw header instead.
        return raw


def _email_to_html(path: str) -> tuple[str, dict[str, tuple[str, bytes]]]:
    """Render the email at ``path`` as one HTML document plus its inline images.

    Returns:
        ``(html, cid_map)`` where ``html`` is the first HTML body (or the first
        plain-text body wrapped in ``<pre>``) preceded by a Subject/From/Date
        header, and ``cid_map`` maps Content-ID → ``(mime, raw_bytes)`` for
        every image part that carries a Content-ID.
    """
    import email
    import email.policy
    import html

    with open(path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=email.policy.compat32)

    cid_map: dict[str, tuple[str, bytes]] = {}
    html_body = text_body = None
    for part in msg.walk():
        ct = part.get_content_type()
        payload = part.get_payload(decode=True)
        if not payload:
            continue
        if ct.startswith("image/"):
            cid = str(part.get("Content-ID", "")).strip().strip("<>")
            if cid:
                cid_map[cid] = (ct, payload)
            continue
        charset = part.get_content_charset() or "utf-8"
        try:
            decoded = payload.decode(charset, errors="replace")
        except LookupError:
            # Charset label Python does not know: decode as UTF-8 rather than
            # giving up on the whole message.
            decoded = payload.decode("utf-8", errors="replace")
        if ct == "text/html" and html_body is None:
            html_body = decoded
        elif ct == "text/plain" and text_body is None:
            text_body = decoded

    # Header values and the plain-text body are text, not markup: escape them
    # so content such as "<user@example.com>" is not swallowed as an HTML tag.
    subject = html.escape(_decode_mime_header(msg.get("Subject", "")))
    sender = html.escape(_decode_mime_header(msg.get("From", "")))
    date = html.escape(_decode_mime_header(msg.get("Date", "")))
    header = f"<h2>{subject}</h2><p><b>From:</b> {sender}<br><b>Date:</b> {date}</p><hr>"
    body = html_body or (f"<pre>{html.escape(text_body)}</pre>" if text_body else "<em>No body</em>")

    if html_body and "<body" in html_body.lower():
        # A callable replacement inserts the header literally; as a replacement
        # *string*, backslashes in the subject would be read as regex escapes.
        html_out = _re.sub(
            r"(<body[^>]*>)",
            lambda m: m.group(1) + header,
            html_body,
            count=1,
            flags=_re.IGNORECASE,
        )
    else:
        html_out = f"<html><body>{header}{body}</body></html>"
    return html_out, cid_map


async def convert_path(path: str, db: AsyncSession, use_llm: bool = True, llm_prompt: str | None = None) -> Conversion:
    """Convert the file at ``path`` to Markdown and store the result.

    Handling depends on the file:
      * Standalone images (when the LLM is active and requested) are sent
        directly to vision OCR; if that yields nothing or fails, processing
        continues with MarkItDown.
      * Emails (by extension or by sniffing headers) are rendered to a
        temporary HTML file that MarkItDown converts; inline ``cid:`` images
        in the result are replaced by their OCR text when the LLM is in use.
      * Everything else is passed to MarkItDown as-is.

    Args:
        path: Filesystem path of the file to convert.
        db: Async database session used to persist the conversion.
        use_llm: Use the LLM-backed converter when the LLM is active.
        llm_prompt: Optional custom prompt for a one-off LLM-backed converter.

    Returns:
        The persisted ``Conversion`` row.

    Raises:
        HTTPException: 500 when conversion or the database write fails.
    """
    filename = os.path.basename(path)
    file_type = os.path.splitext(filename)[1].lstrip(".").lower()

    # Standalone image → OCRFlux directly (bypass MarkItDown's caption prompt,
    # which OCRFlux answers with raw JSON wrapped in "# Description:")
    image_mime = _IMAGE_MIME_TYPES.get(file_type)
    if image_mime and LLM_ACTIVE and use_llm and _llm_client:
        try:
            with open(path, "rb") as image_file:
                image_bytes = image_file.read()
            ocr_text = _ocr_image_llm(image_mime, image_bytes)
            if ocr_text:
                record = Conversion(filename=filename, file_type=file_type,
                                    markdown=ocr_text, llm_enabled=True)
                db.add(record)
                await db.commit()
                await db.refresh(record)
                return record
        except Exception as e:
            logger.warning("MarkItDown: OCRFlux image path failed (%s) — falling back", e)
            # A failed write leaves the session unusable until it is rolled
            # back, and the fallback below writes through the same session.
            await db.rollback()

    tmp_html_path = None
    actual_path = path
    cid_map: dict[str, tuple[str, bytes]] = {}  # cid → (mime, raw_bytes) for vision OCR
    try:
        # Email files (with or without .eml extension) — extract HTML body → temp .html for MarkItDown
        if _is_email_file(path, file_type):
            file_type = "eml"
            try:
                html_out, cid_map = _email_to_html(path)
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".html", mode="w", encoding="utf-8"
                ) as tmp_html:
                    tmp_html_path = tmp_html.name
                    tmp_html.write(html_out)
                actual_path = tmp_html_path
            except Exception as e:
                logger.warning("Email parse failed, falling back to raw: %s", e)

        use_llm_now = LLM_ACTIVE and use_llm
        if use_llm_now and llm_prompt:
            try:
                converter = MarkItDown(llm_client=_llm_client, llm_model=LLM_MODEL, llm_prompt=llm_prompt)
            except TypeError:
                # MarkItDown rejected the llm_prompt keyword; use the shared LLM converter.
                converter = md
        elif use_llm_now:
            converter = md
        else:
            converter = md_plain

        # NOTE(review): convert() and the OCR helper are synchronous calls
        # inside a coroutine and block the event loop while they run.
        try:
            result = converter.convert(actual_path)
            actual_llm = use_llm_now
        except Exception as llm_err:
            llm_failed = use_llm_now and (
                "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
            )
            if not llm_failed:
                raise HTTPException(status_code=500, detail=str(llm_err)) from llm_err
            logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
            # Retry on the same input as the first attempt (the rendered HTML
            # for emails), not on the raw source file.
            result = md_plain.convert(actual_path)
            actual_llm = False

        content = result.text_content
        # Email inline images: replace ![](cid:..) links with Vision-LLM OCR text
        if cid_map and use_llm_now and "cid:" in content:
            content = _ocr_cid_images(content, cid_map)
            actual_llm = True

        record = Conversion(filename=filename, file_type=file_type, markdown=content, llm_enabled=actual_llm)
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except HTTPException:
        raise
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_html_path is not None:
            try:
                os.unlink(tmp_html_path)
            except OSError:
                # Best-effort cleanup; never mask the real result or error.
                pass
async def get_history(db: AsyncSession, limit: int = 20) -> list[Conversion]:
    """Fetch stored conversions ordered by ``created_at`` descending.

    Args:
        db: Async database session to query.
        limit: Maximum number of rows to return.

    Returns:
        The matching ``Conversion`` rows, newest first.
    """
    newest_first = select(Conversion).order_by(Conversion.created_at.desc()).limit(limit)
    rows = await db.execute(newest_first)
    return rows.scalars().all()

Some files were not shown because too many files have changed in this diff Show More