update youtube link
This commit is contained in:
parent
22cc0d0857
commit
6ba704865f
|
|
@ -78,7 +78,7 @@ services:
|
||||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
|
||||||
volumes:
|
volumes:
|
||||||
- db_markitdown_data:/var/lib/postgresql/data
|
- db_markitdown_data:/var/lib/postgresql/data
|
||||||
- ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
|
- ./db/init_markitdown.sql:/docker-entrypoint-initdb.d/init_markitdown.sql:ro
|
||||||
ports:
|
ports:
|
||||||
- "5432:5432"
|
- "5432:5432"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
|
@ -95,7 +95,7 @@ services:
|
||||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
|
||||||
volumes:
|
volumes:
|
||||||
- db_docling_data:/var/lib/postgresql/data
|
- db_docling_data:/var/lib/postgresql/data
|
||||||
- ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro
|
- ./db/init_docling.sql:/docker-entrypoint-initdb.d/init_markitdown.sql:ro
|
||||||
healthcheck:
|
healthcheck:
|
||||||
<<: *healthcheck-defaults
|
<<: *healthcheck-defaults
|
||||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
|
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,12 @@ from pydantic import BaseModel
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
class UrlRequest(BaseModel):
    # JSON body accepted by the POST /convert-url route of this router.

    # Address of the resource to convert; passed to the service unchanged.
    url: str
    # Export format requested from the Docling service.
    output_format: str = "markdown"
    # Whether the service may post-process the result with the LLM.
    use_llm: bool = True
    # Optional prompt override forwarded to the service.
    llm_prompt: str | None = None
|
||||||
|
|
||||||
class SettingsRequest(BaseModel):
|
class SettingsRequest(BaseModel):
|
||||||
ollama_base_url: str | None = None
|
ollama_base_url: str | None = None
|
||||||
ollama_model: str = "llava"
|
ollama_model: str = "llava"
|
||||||
|
|
@ -88,6 +94,24 @@ async def convert(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/convert-url", response_model=ConvertResponse)
async def convert_url(
    req: UrlRequest,
    db: AsyncSession = Depends(get_db),
):
    # Delegate the conversion to the Docling service, then project the
    # stored record onto the public response model.
    stored = await docling_service.convert_url(
        req.url,
        db,
        req.output_format,
        use_llm=req.use_llm,
        llm_prompt=req.llm_prompt,
    )
    response_fields = (
        "id",
        "filename",
        "output_format",
        "content",
        "page_count",
        "llm_enabled",
    )
    return ConvertResponse(
        **{field: getattr(stored, field) for field in response_fields}
    )
|
||||||
|
|
||||||
|
|
||||||
@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
|
@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
|
||||||
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
|
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
|
||||||
record = await docling_service.get_conversion(conversion_id, db)
|
record = await docling_service.get_conversion(conversion_id, db)
|
||||||
|
|
|
||||||
|
|
@ -94,6 +94,148 @@ def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _subtitle_to_text(raw: str, ext: str) -> str:
    """Reduce a downloaded subtitle payload to plain transcript text.

    Args:
        raw: Decoded body of the subtitle track.
        ext: Track extension as reported by yt-dlp ("vtt", "json3", ...).

    Returns:
        The transcript with timing/markup removed. For json3 payloads that
        cannot be parsed, the raw payload is returned unchanged.
    """
    import html  # noqa: PLC0415

    if ext == "vtt":
        text = raw.replace("\r\n", "\n")
        if text.lstrip("\ufeff").startswith("WEBVTT"):
            # Drop the header block (everything before the first blank line).
            _head, sep, body = text.partition("\n\n")
            if sep:
                text = body
        text = re.sub(r"\d{2}:\d{2}:\d{2}\.\d+ --> .*", "", text)
        text = re.sub(r"^\d+$", "", text, flags=re.MULTILINE)
        text = re.sub(r"<[^>]+>", "", text)
        return re.sub(r"\n{3,}", "\n\n", text).strip()

    if ext == "json3":
        # Assumes the YouTube json3 layout events[].segs[].utf8 -- if the
        # payload does not match, fall back to the raw text.
        try:
            events = json.loads(raw).get("events") or []
            pieces = [
                seg.get("utf8", "")
                for event in events
                for seg in event.get("segs") or []
            ]
        except (ValueError, AttributeError, TypeError):
            return raw
        return re.sub(r"\n{3,}", "\n\n", "".join(pieces)).strip() or raw

    # Remaining accepted formats (srv3, ttml) are XML: strip tags, unescape.
    text = html.unescape(re.sub(r"<[^>]+>", " ", raw))
    return re.sub(r"[ \t]+", " ", text).strip()


async def convert_url(
    url: str,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> "Conversion":
    """Build a markdown document from a video URL and convert it with Docling.

    Metadata (title, channel, duration, ...) and subtitle tracks are read
    through yt-dlp without downloading the media. A Vietnamese transcript is
    preferred over an English one; when no transcript can be fetched the
    description (first 3000 characters) is used instead. The assembled
    markdown is written to a temporary file, run through ``converter`` and
    stored as a ``Conversion`` row.

    Args:
        url: Video page URL handed to yt-dlp.
        db: Async session used to persist the result.
        output_format: "markdown", "json" or "html"; any other value stores
            the assembled markdown as-is.
        use_llm: Allow LLM enrichment (only applied for markdown/text output
            and when an LLM client is configured).
        llm_prompt: Optional system prompt for the enrichment step.

    Returns:
        The persisted ``Conversion`` record.

    Raises:
        HTTPException: 500 if yt-dlp is missing or conversion/persistence
            fails, 422 if yt-dlp cannot extract the URL.
    """
    import asyncio  # noqa: PLC0415
    import logging  # noqa: PLC0415
    import urllib.request  # noqa: PLC0415

    try:
        import yt_dlp  # noqa: PLC0415
    except ImportError as e:
        raise HTTPException(status_code=500, detail="yt-dlp not installed") from e

    log = logging.getLogger(__name__)

    ydl_opts = {
        "quiet": True,
        "skip_download": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": ["vi", "en"],
        "outtmpl": "%(id)s.%(ext)s",
    }

    def _extract():
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(url, download=False)

    # Network-bound and synchronous: run off the event loop.
    try:
        info = await asyncio.to_thread(_extract)
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"yt-dlp error: {e}") from e
    if not info:
        raise HTTPException(status_code=422, detail="yt-dlp error: no metadata returned")

    # `or` (not a .get default) so keys present with a None value also fall back.
    title = info.get("title") or "YouTube Video"
    description = info.get("description") or ""
    channel = info.get("channel") or info.get("uploader") or ""
    duration = info.get("duration_string") or ""
    upload_date = info.get("upload_date") or ""
    view_count = info.get("view_count")
    chapters = info.get("chapters") or []

    # Build markdown from available metadata
    lines = [f"# {title}", ""]
    if channel:
        lines.append(f"**Kênh:** {channel}")
    if duration:
        lines.append(f"**Thời lượng:** {duration}")
    if len(upload_date) == 8:
        lines.append(f"**Ngày đăng:** {upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}")
    if view_count is not None:
        lines.append(f"**Lượt xem:** {view_count:,}")
    lines.append(f"**URL:** {url}")
    lines.append("")

    # Subtitles/transcript
    subtitles = info.get("subtitles") or {}
    auto_subtitles = info.get("automatic_captions") or {}

    def _download(track_url: str) -> str:
        with urllib.request.urlopen(track_url, timeout=15) as resp:
            return resp.read().decode("utf-8", errors="ignore")

    # vtt first: it is the format with the most complete clean-up below.
    preferred_exts = ("vtt", "json3", "srv3", "ttml")
    transcript_text = None
    for lang in ("vi", "en"):
        tracks = subtitles.get(lang) or auto_subtitles.get(lang) or []
        candidates = sorted(
            (t for t in tracks if t.get("ext") in preferred_exts),
            key=lambda t: preferred_exts.index(t["ext"]),
        )
        for track in candidates:
            track_url = track.get("url") or ""
            # Only fetch over HTTP(S); urlopen would otherwise honour file:// etc.
            if not track_url.startswith(("http://", "https://")):
                continue
            try:
                raw = await asyncio.to_thread(_download, track_url)
            except Exception:
                # Best effort: a failed track must not fail the whole request.
                log.warning(
                    "Subtitle download failed (lang=%s, ext=%s)",
                    lang, track.get("ext"), exc_info=True,
                )
                continue
            transcript_text = _subtitle_to_text(raw, track["ext"])
            if transcript_text:
                break
        if transcript_text:
            break

    if transcript_text:
        lines += ["## Transcript / Phụ đề", "", transcript_text, ""]
    elif description:
        lines += ["## Mô tả", "", description[:3000], ""]
    else:
        lines += ["## Ghi chú", "", "_Không có transcript hoặc mô tả._", ""]

    if chapters:
        lines += ["## Chapters", ""]
        for ch in chapters:
            minutes, seconds = divmod(int(ch.get("start_time") or 0), 60)
            lines.append(f"- **{minutes:02d}:{seconds:02d}** — {ch.get('title', '')}")
        lines.append("")

    markdown_text = "\n".join(lines)

    # Write temp file and run through Docling
    video_id = info.get("id") or "youtube"
    with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp:
        tmp.write(markdown_text)
        tmp_path = tmp.name

    try:
        # NOTE(review): converter.convert and _llm_enrich are synchronous and
        # block the event loop; they are left inline because the thread-safety
        # of the shared converter/client is not established here.
        result = converter.convert(tmp_path)
        doc = result.document
        page_count = None

        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        else:
            content = markdown_text

        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        from app.models.ConvertModel import Conversion  # noqa: PLC0415

        record = Conversion(
            filename=f"{video_id}.md",
            file_type="youtube",
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except HTTPException:
        # Keep the status/detail chosen by whoever raised it.
        await db.rollback()
        raise
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        os.unlink(tmp_path)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------
|
# -----------------------------------------------------------------
|
||||||
ALLOWED_EXTENSIONS = {
|
ALLOWED_EXTENSIONS = {
|
||||||
"pdf", "docx", "xlsx", "pptx",
|
"pdf", "docx", "xlsx", "pptx",
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,5 @@ python-multipart
|
||||||
asyncpg
|
asyncpg
|
||||||
sqlalchemy[asyncio]
|
sqlalchemy[asyncio]
|
||||||
openai
|
openai
|
||||||
|
yt-dlp
|
||||||
|
ffmpeg
|
||||||
|
|
@ -96,6 +96,22 @@
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
<h6 class="card-title fw-semibold mb-3">Tải lên tài liệu để so sánh</h6>
|
<h6 class="card-title fw-semibold mb-3">Tải lên tài liệu để so sánh</h6>
|
||||||
|
|
||||||
|
<!-- Input mode tabs -->
|
||||||
|
<ul class="nav nav-tabs mb-3" id="InputModeTabs">
|
||||||
|
<li class="nav-item">
|
||||||
|
<a class="nav-link active py-1 px-3 small" href="#" onclick="SwitchInputMode('file',this);return false">
|
||||||
|
<i class="bi bi-file-earmark-text me-1"></i>File
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
<li class="nav-item">
|
||||||
|
<a class="nav-link py-1 px-3 small" href="#" onclick="SwitchInputMode('youtube',this);return false">
|
||||||
|
<i class="bi bi-youtube me-1 text-danger"></i>YouTube
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<!-- File upload pane -->
|
||||||
|
<div id="FilePane">
|
||||||
<div id="UploadZone">
|
<div id="UploadZone">
|
||||||
<input type="file" id="FileInput"
|
<input type="file" id="FileInput"
|
||||||
accept=".pdf,.docx,.xlsx,.pptx,.html,.htm,.csv,.txt,.jpg,.jpeg,.png,.tiff,.tif,.bmp,.md,.epub,.zip,.asciidoc,.adoc" />
|
accept=".pdf,.docx,.xlsx,.pptx,.html,.htm,.csv,.txt,.jpg,.jpeg,.png,.tiff,.tif,.bmp,.md,.epub,.zip,.asciidoc,.adoc" />
|
||||||
|
|
@ -117,10 +133,29 @@
|
||||||
<span class="badge bg-secondary-subtle text-secondary">TIFF</span>
|
<span class="badge bg-secondary-subtle text-secondary">TIFF</span>
|
||||||
<span class="badge bg-secondary-subtle text-secondary">ASCIIDoc</span>
|
<span class="badge bg-secondary-subtle text-secondary">ASCIIDoc</span>
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- YouTube URL pane -->
|
||||||
|
<div id="YoutubePane" class="d-none">
|
||||||
|
<div class="input-group">
|
||||||
|
<span class="input-group-text bg-danger text-white"><i class="bi bi-youtube"></i></span>
|
||||||
|
<input type="url" class="form-control" id="YoutubeUrl"
|
||||||
|
placeholder="https://www.youtube.com/watch?v=..."
|
||||||
|
oninput="OnYoutubeInput()" />
|
||||||
|
<button class="btn btn-outline-secondary" onclick="document.getElementById('YoutubeUrl').value='';OnYoutubeInput()">
|
||||||
|
<i class="bi bi-x"></i>
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="form-text mt-1">
|
||||||
|
<i class="bi bi-info-circle me-1"></i>
|
||||||
|
<b>MarkItDown</b>: dùng <code>yt-dlp</code> native |
|
||||||
|
<b>Docling</b>: trích transcript → convert markdown
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Controls row -->
|
<!-- Controls row -->
|
||||||
<div class="d-flex flex-wrap align-items-center gap-3 mt-3">
|
<div class="d-flex flex-wrap align-items-center gap-3 mt-3">
|
||||||
<div class="d-flex align-items-center gap-2">
|
<div class="d-flex align-items-center gap-2" id="DoclingFmtWrap">
|
||||||
<label class="form-label mb-0 small fw-medium" for="DoclingFormat">Docling format</label>
|
<label class="form-label mb-0 small fw-medium" for="DoclingFormat">Docling format</label>
|
||||||
<select class="form-select form-select-sm" id="DoclingFormat" style="width:auto">
|
<select class="form-select form-select-sm" id="DoclingFormat" style="width:auto">
|
||||||
<option value="markdown">Markdown</option>
|
<option value="markdown">Markdown</option>
|
||||||
|
|
@ -444,9 +479,31 @@
|
||||||
const DoclingUrl = '/api/docling';
|
const DoclingUrl = '/api/docling';
|
||||||
|
|
||||||
let CurrentFile = null;
|
let CurrentFile = null;
|
||||||
|
let CurrentYoutubeUrl = '';
|
||||||
|
let InputMode = 'file'; // 'file' | 'youtube'
|
||||||
let MdContent = '';
|
let MdContent = '';
|
||||||
let DlContent = '';
|
let DlContent = '';
|
||||||
|
|
||||||
|
// ── Input mode toggle ─────────────────────────────────────────
|
||||||
|
// Activate the clicked tab, show the matching input pane, and enable the
// Convert button only if that mode already has something to convert.
function SwitchInputMode(Mode, Link) {
    InputMode = Mode;

    for (const Tab of document.querySelectorAll('#InputModeTabs .nav-link')) {
        Tab.classList.remove('active');
    }
    Link.classList.add('active');

    const IsFileMode = Mode === 'file';
    SetDisplay('FilePane', IsFileMode);
    SetDisplay('YoutubePane', Mode === 'youtube');

    // Re-evaluate button state for the newly selected mode.
    const PendingInput = IsFileMode ? CurrentFile : CurrentYoutubeUrl;
    document.getElementById('ConvertBtn').disabled = !PendingInput;
}
|
||||||
|
|
||||||
|
// Cache the trimmed URL field value and enable Convert only when it is non-empty.
function OnYoutubeInput() {
    const UrlField = document.getElementById('YoutubeUrl');
    const ConvertButton = document.getElementById('ConvertBtn');
    CurrentYoutubeUrl = UrlField.value.trim();
    ConvertButton.disabled = CurrentYoutubeUrl === '';
}
|
||||||
|
|
||||||
// ── File input ────────────────────────────────────────────────
|
// ── File input ────────────────────────────────────────────────
|
||||||
const UploadZone = document.getElementById('UploadZone');
|
const UploadZone = document.getElementById('UploadZone');
|
||||||
const FileInput = document.getElementById('FileInput');
|
const FileInput = document.getElementById('FileInput');
|
||||||
|
|
@ -492,7 +549,8 @@
|
||||||
|
|
||||||
// ── Conversion ────────────────────────────────────────────────
|
// ── Conversion ────────────────────────────────────────────────
|
||||||
async function RunConversion() {
|
async function RunConversion() {
|
||||||
if (!CurrentFile) return;
|
if (InputMode === 'file' && !CurrentFile) return;
|
||||||
|
if (InputMode === 'youtube' && !CurrentYoutubeUrl) return;
|
||||||
|
|
||||||
document.getElementById('ConvertBtn').disabled = true;
|
document.getElementById('ConvertBtn').disabled = true;
|
||||||
SetDisplay('ProgressRow', true);
|
SetDisplay('ProgressRow', true);
|
||||||
|
|
@ -509,10 +567,16 @@
|
||||||
const UseLlm = document.getElementById('LlmToggle').checked;
|
const UseLlm = document.getElementById('LlmToggle').checked;
|
||||||
const CustomPrompt = document.getElementById('LlmPrompt').value.trim();
|
const CustomPrompt = document.getElementById('LlmPrompt').value.trim();
|
||||||
|
|
||||||
const [MdResult, DlResult] = await Promise.allSettled([
|
let MdPromise, DlPromise;
|
||||||
ConvertMarkItDown(CurrentFile, UseLlm, CustomPrompt),
|
if (InputMode === 'youtube') {
|
||||||
ConvertDocling(CurrentFile, DoclingFmt, UseLlm, CustomPrompt),
|
MdPromise = ConvertMarkItDownUrl(CurrentYoutubeUrl, UseLlm, CustomPrompt);
|
||||||
]);
|
DlPromise = ConvertDoclingUrl(CurrentYoutubeUrl, DoclingFmt, UseLlm, CustomPrompt);
|
||||||
|
} else {
|
||||||
|
MdPromise = ConvertMarkItDown(CurrentFile, UseLlm, CustomPrompt);
|
||||||
|
DlPromise = ConvertDocling(CurrentFile, DoclingFmt, UseLlm, CustomPrompt);
|
||||||
|
}
|
||||||
|
|
||||||
|
const [MdResult, DlResult] = await Promise.allSettled([MdPromise, DlPromise]);
|
||||||
|
|
||||||
document.getElementById('ConvertBtn').disabled = false;
|
document.getElementById('ConvertBtn').disabled = false;
|
||||||
ShowCompare(MdResult, DlResult);
|
ShowCompare(MdResult, DlResult);
|
||||||
|
|
@ -532,6 +596,19 @@
|
||||||
return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
|
return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// POST a URL to the MarkItDown backend and return its markdown.
// Resolves to { Content, Ms, LlmEnabled }; rejects with an Error whose
// message is the server-provided detail (or the HTTP status) on failure.
async function ConvertMarkItDownUrl(Url, UseLlm, CustomPrompt) {
    const T0 = performance.now();
    const Res = await fetch(MarkItDownUrl + '/convert-url', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url: Url, use_llm: UseLlm, llm_prompt: CustomPrompt || null }),
    });
    const Ms = Math.round(performance.now() - T0);

    if (!Res.ok) {
        let Detail = Res.statusText;
        try {
            const ErrBody = await Res.json();
            if (ErrBody && ErrBody.detail) {
                // Validation errors carry a structured (non-string) detail.
                Detail = typeof ErrBody.detail === 'string'
                    ? ErrBody.detail
                    : JSON.stringify(ErrBody.detail);
            }
        } catch (_) {
            // Error body was not JSON (e.g. a proxy error page); keep the status text.
        }
        throw new Error(Detail || ('HTTP ' + Res.status));
    }

    const Data = await Res.json();
    return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
}
|
||||||
|
|
||||||
async function ConvertDocling(File_, Fmt, UseLlm, CustomPrompt) {
|
async function ConvertDocling(File_, Fmt, UseLlm, CustomPrompt) {
|
||||||
const Form = new FormData();
|
const Form = new FormData();
|
||||||
Form.append('file', File_);
|
Form.append('file', File_);
|
||||||
|
|
@ -545,6 +622,19 @@
|
||||||
return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
|
return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// POST a URL to the Docling backend and return the converted content.
// Resolves to { Content, Ms, Pages, LlmEnabled }; rejects with an Error whose
// message is the server-provided detail (or the HTTP status) on failure.
async function ConvertDoclingUrl(Url, Fmt, UseLlm, CustomPrompt) {
    const T0 = performance.now();
    const Res = await fetch(DoclingUrl + '/convert-url', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url: Url, output_format: Fmt, use_llm: UseLlm, llm_prompt: CustomPrompt || null }),
    });
    const Ms = Math.round(performance.now() - T0);

    if (!Res.ok) {
        let Detail = Res.statusText;
        try {
            const ErrBody = await Res.json();
            if (ErrBody && ErrBody.detail) {
                // Validation errors carry a structured (non-string) detail.
                Detail = typeof ErrBody.detail === 'string'
                    ? ErrBody.detail
                    : JSON.stringify(ErrBody.detail);
            }
        } catch (_) {
            // Error body was not JSON (e.g. a proxy error page); keep the status text.
        }
        throw new Error(Detail || ('HTTP ' + Res.status));
    }

    const Data = await Res.json();
    return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
}
|
||||||
|
|
||||||
// Show or hide the element with the given id by toggling the 'd-none' class.
function SetDisplay(Id, Visible) {
    const Target = document.getElementById(Id);
    const ShouldHide = !Visible;
    Target.classList.toggle('d-none', ShouldHide);
}
|
}
|
||||||
|
|
@ -664,11 +754,13 @@
|
||||||
function ClearResults() {
|
function ClearResults() {
|
||||||
MdContent = ''; DlContent = '';
|
MdContent = ''; DlContent = '';
|
||||||
CurrentFile = null;
|
CurrentFile = null;
|
||||||
|
CurrentYoutubeUrl = '';
|
||||||
SetDisplay('ProgressRow', false);
|
SetDisplay('ProgressRow', false);
|
||||||
SetDisplay('CompareBanner', false);
|
SetDisplay('CompareBanner', false);
|
||||||
document.getElementById('MdBody').innerHTML = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
|
document.getElementById('MdBody').innerHTML = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
|
||||||
document.getElementById('DlBody').innerHTML = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
|
document.getElementById('DlBody').innerHTML = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
|
||||||
document.getElementById('FileName').textContent = '';
|
document.getElementById('FileName').textContent = '';
|
||||||
|
document.getElementById('YoutubeUrl').value = '';
|
||||||
document.getElementById('ConvertBtn').disabled = true;
|
document.getElementById('ConvertBtn').disabled = true;
|
||||||
document.getElementById('CleanupBtn').classList.add('d-none');
|
document.getElementById('CleanupBtn').classList.add('d-none');
|
||||||
document.getElementById('MdDownloadBtn').classList.add('d-none');
|
document.getElementById('MdDownloadBtn').classList.add('d-none');
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
ffmpeg libmagic1 \
|
ffmpeg \
|
||||||
|
libmagic1 \
|
||||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,11 @@ from app.services import MarkitdownService as markitdown_service
|
||||||
from app.database import get_db
|
from app.database import get_db
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
class UrlRequest(BaseModel):
    # JSON body accepted by the POST /convert-url route of this router.

    # Address of the resource to convert; passed to the service unchanged.
    url: str
    # Whether the service may use the LLM-backed converter.
    use_llm: bool = True
    # Optional prompt override forwarded to the service.
    llm_prompt: str | None = None
|
||||||
|
|
||||||
class CleanupRequest(BaseModel):
|
class CleanupRequest(BaseModel):
|
||||||
text: str
|
text: str
|
||||||
prompt: str | None = None
|
prompt: str | None = None
|
||||||
|
|
@ -48,6 +53,17 @@ async def convert(
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/convert-url", response_model=ConvertResponse)
async def convert_url(req: UrlRequest, db: AsyncSession = Depends(get_db)):
    # Thin wrapper: the MarkItDown service converts and persists; the stored
    # record is returned and serialized through ConvertResponse.
    return await markitdown_service.convert_url(
        req.url,
        db,
        use_llm=req.use_llm,
        llm_prompt=req.llm_prompt,
    )
|
||||||
|
|
||||||
|
|
||||||
@router.get("/models")
|
@router.get("/models")
|
||||||
def list_models():
|
def list_models():
|
||||||
if not markitdown_service.OLLAMA_BASE_URL:
|
if not markitdown_service.OLLAMA_BASE_URL:
|
||||||
|
|
|
||||||
|
|
@ -84,6 +84,62 @@ ALLOWED_EXTENSIONS = {
|
||||||
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
|
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Recognizes YouTube watch / short-link / shorts URLs, with or without scheme.
YOUTUBE_PATTERN = _re.compile(
    r"(https?://)?(www\.)?(youtube\.com/watch|youtu\.be/|youtube\.com/shorts/)"
)


def _url_slug(url: str) -> str:
    """Derive a short, per-resource name from *url* for use as a filename stem.

    Uses the ``v`` query parameter when present (YouTube watch URLs), else
    the last path segment, else the host, else ``"youtube"``.
    """
    from urllib.parse import parse_qs, urlsplit  # noqa: PLC0415

    parts = urlsplit(url)
    video_ids = parse_qs(parts.query).get("v")
    if video_ids:
        return video_ids[0]
    last_segment = parts.path.rstrip("/").rsplit("/", 1)[-1]
    return last_segment or parts.netloc or "youtube"


async def convert_url(
    url: str,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert the resource at *url* with MarkItDown and persist the result.

    Args:
        url: HTTP(S) URL to convert. A scheme-less YouTube URL is accepted
            and upgraded to ``https://``.
        db: Async session used to store the ``Conversion`` row.
        use_llm: Use the LLM-backed converter when the LLM is active.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The persisted ``Conversion`` record.

    Raises:
        HTTPException: 422 for a non-HTTP(S) input, 500 when conversion or
            persistence fails.
    """
    url = url.strip()
    # The converter also accepts local paths and non-HTTP URIs; this value
    # comes straight from the request body, so only allow http(s).
    if not url.lower().startswith(("http://", "https://")):
        if YOUTUBE_PATTERN.match(url):
            url = f"https://{url}"
        else:
            raise HTTPException(status_code=422, detail="Only http(s) URLs are supported")

    use_llm_now = LLM_ACTIVE and use_llm
    converter = md if use_llm_now else md_plain
    if use_llm_now and llm_prompt:
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # Presumably an installed MarkItDown that rejects one of these
            # keyword arguments -- fall back to the shared LLM converter.
            converter = md

    # NOTE(review): converter.convert is synchronous and blocks the event
    # loop while it fetches the URL; it is left inline because thread-safety
    # of the shared converter instances is not established here.
    try:
        result = converter.convert(url)
        actual_llm = use_llm_now
    except Exception as llm_err:
        llm_failed = use_llm_now and (
            "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
        )
        if not llm_failed:
            raise HTTPException(status_code=500, detail=str(llm_err)) from llm_err
        logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
        try:
            result = md_plain.convert(url)
        except Exception as plain_err:
            # Report the retry failure with the same shape as other failures.
            raise HTTPException(status_code=500, detail=str(plain_err)) from plain_err
        actual_llm = False

    filename = f"{_url_slug(url)}.md"

    try:
        record = Conversion(
            filename=filename,
            file_type="youtube",
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||||
|
|
||||||
|
|
||||||
def _allowed_file(filename: str) -> bool:
|
def _allowed_file(filename: str) -> bool:
|
||||||
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
|
||||||
|
|
|
||||||
|
|
@ -6,3 +6,4 @@ asyncpg
|
||||||
sqlalchemy[asyncio]
|
sqlalchemy[asyncio]
|
||||||
openai
|
openai
|
||||||
httpx
|
httpx
|
||||||
|
yt-dlp
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue