update youtube link

This commit is contained in:
Kai Ton 2026-06-26 07:57:43 +00:00
parent 22cc0d0857
commit 6ba704865f
10 changed files with 361 additions and 27 deletions

View File

@ -78,7 +78,7 @@ services:
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
volumes:
- db_markitdown_data:/var/lib/postgresql/data
- ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
- ./db/init_markitdown.sql:/docker-entrypoint-initdb.d/init_markitdown.sql:ro
ports:
- "5432:5432"
healthcheck:
@ -95,7 +95,7 @@ services:
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
volumes:
- db_docling_data:/var/lib/postgresql/data
- ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro
- ./db/init_docling.sql:/docker-entrypoint-initdb.d/init_markitdown.sql:ro
healthcheck:
<<: *healthcheck-defaults
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]

View File

@ -7,6 +7,12 @@ from pydantic import BaseModel
router = APIRouter()
class UrlRequest(BaseModel):
    """JSON body accepted by the ``/convert-url`` endpoint."""

    # URL that the service should fetch and convert.
    url: str
    # Export format requested from the converter; markdown unless overridden.
    output_format: str = "markdown"
    # Whether the LLM enrichment pass may run on the converted content.
    use_llm: bool = True
    # Optional custom prompt forwarded to the LLM enrichment pass.
    llm_prompt: str | None = None
class SettingsRequest(BaseModel):
ollama_base_url: str | None = None
ollama_model: str = "llava"
@ -88,6 +94,24 @@ async def convert(
)
@router.post("/convert-url", response_model=ConvertResponse)
async def convert_url(
    req: UrlRequest,
    db: AsyncSession = Depends(get_db),
):
    """Convert the URL given in the request body and return the stored result.

    Delegates the work to ``docling_service.convert_url`` and copies the
    fields of the returned record onto a ``ConvertResponse``.
    """
    stored = await docling_service.convert_url(
        req.url,
        db,
        req.output_format,
        use_llm=req.use_llm,
        llm_prompt=req.llm_prompt,
    )
    # Response fields are read straight off the stored record, in this order.
    field_names = (
        "id",
        "filename",
        "output_format",
        "content",
        "page_count",
        "llm_enabled",
    )
    return ConvertResponse(**{name: getattr(stored, name) for name in field_names})
@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
record = await docling_service.get_conversion(conversion_id, db)

View File

@ -94,6 +94,148 @@ def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
def _subtitle_to_text(raw: str, ext: str) -> str:
    """Reduce a downloaded subtitle payload to plain transcript text.

    Args:
        raw: Decoded body of the subtitle track.
        ext: Track format as reported by yt-dlp (json3, srv3, ttml or vtt).

    Returns:
        The caption text without timing or markup; an empty string when
        nothing usable could be extracted.
    """
    import html  # noqa: PLC0415

    if ext == "json3":
        # json3 is a JSON document: events -> segs -> "utf8" text fragments.
        try:
            events = json.loads(raw).get("events") or []
            cues = [
                "".join(seg.get("utf8") or "" for seg in event.get("segs") or []).strip()
                for event in events
            ]
        except (ValueError, AttributeError, TypeError):
            return ""
        text = "\n".join(cue for cue in cues if cue)
    else:
        text = raw
        if ext == "vtt":
            # Drop the WebVTT header block, cue timing lines and numeric cue ids.
            text = re.sub(r"^(WEBVTT|Kind:|Language:).*$", "", text, flags=re.MULTILINE)
            text = re.sub(r"\d{2}:\d{2}:\d{2}\.\d+ --> .*", "", text)
            text = re.sub(r"^\d+$", "", text, flags=re.MULTILINE)
        else:
            # srv3 / ttml are XML: keep paragraph boundaries as line breaks.
            text = re.sub(r"</p>|<br\s*/?>", "\n", text)
        text = html.unescape(re.sub(r"<[^>]+>", "", text))
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _fetch_transcript(info: dict) -> str | None:
    """Download the first usable vi/en subtitle track and return its text.

    Manual subtitles are preferred over automatic captions for each language.
    Download failures are logged and the next track is tried.
    """
    import logging  # noqa: PLC0415
    import urllib.request  # noqa: PLC0415

    log = logging.getLogger(__name__)
    manual = info.get("subtitles") or {}
    automatic = info.get("automatic_captions") or {}
    for lang in ("vi", "en"):
        for track in manual.get(lang) or automatic.get(lang) or []:
            ext = track.get("ext")
            track_url = track.get("url") or ""
            if ext not in ("json3", "srv3", "ttml", "vtt"):
                continue
            # Only follow http(s) links; urlopen would also accept file:// etc.
            if not track_url.startswith(("http://", "https://")):
                continue
            try:
                with urllib.request.urlopen(track_url, timeout=15) as resp:
                    raw = resp.read().decode("utf-8", errors="ignore")
            except (OSError, ValueError) as exc:
                log.warning("Subtitle download failed (%s/%s): %s", lang, ext, exc)
                continue
            text = _subtitle_to_text(raw, ext)
            if text:
                return text
    return None


def _video_markdown(info: dict, url: str, transcript: str | None) -> str:
    """Render video metadata, transcript (or description) and chapters as markdown."""
    title = info.get("title", "YouTube Video")
    description = info.get("description", "") or ""
    channel = info.get("channel", info.get("uploader", ""))
    duration = info.get("duration_string", "")
    upload_date = info.get("upload_date", "")
    view_count = info.get("view_count")
    chapters = info.get("chapters") or []

    lines = [f"# {title}", ""]
    if channel:
        lines.append(f"**Kênh:** {channel}")
    if duration:
        lines.append(f"**Thời lượng:** {duration}")
    # upload_date is only reformatted when it has the 8-character YYYYMMDD shape.
    if upload_date and len(upload_date) == 8:
        lines.append(f"**Ngày đăng:** {upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}")
    if view_count is not None:
        lines.append(f"**Lượt xem:** {view_count:,}")
    lines.append(f"**URL:** {url}")
    lines.append("")

    if transcript:
        lines += ["## Transcript / Phụ đề", "", transcript, ""]
    elif description:
        lines += ["## Mô tả", "", description[:3000], ""]
    else:
        lines += ["## Ghi chú", "", "_Không có transcript hoặc mô tả._", ""]

    if chapters:
        lines += ["## Chapters", ""]
        for ch in chapters:
            # A missing/None start_time is treated as 0 instead of crashing int().
            minutes, seconds = divmod(int(ch.get("start_time") or 0), 60)
            hours, minutes = divmod(minutes, 60)
            if hours:
                stamp = f"{hours}:{minutes:02d}:{seconds:02d}"
            else:
                stamp = f"{minutes:02d}:{seconds:02d}"
            lines.append(f"- **{stamp}** — {ch.get('title', '')}")
        lines.append("")
    return "\n".join(lines)


async def convert_url(
    url: str,
    db: AsyncSession,
    output_format: str = "markdown",
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> "Conversion":
    """Fetch a YouTube (or any URL) transcript via yt-dlp, then convert with Docling.

    Args:
        url: Address handed to yt-dlp for metadata and subtitle discovery.
        db: Async session used to persist the resulting record.
        output_format: "markdown", "json" or "html" are exported by Docling;
            any other value stores the generated markdown as-is.
        use_llm: Allow the LLM enrichment pass (markdown/text output only).
        llm_prompt: Optional system prompt for the enrichment pass.

    Returns:
        The persisted ``Conversion`` record.

    Raises:
        HTTPException: 500 when yt-dlp is not installed or conversion/persisting
            fails; 422 when yt-dlp cannot extract information for ``url``.
    """
    import asyncio  # noqa: PLC0415

    try:
        import yt_dlp  # noqa: PLC0415
    except ImportError as exc:
        raise HTTPException(status_code=500, detail="yt-dlp not installed") from exc

    ydl_opts = {
        "quiet": True,
        "skip_download": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": ["vi", "en"],
        "outtmpl": "%(id)s.%(ext)s",
    }

    def _extract():
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(url, download=False)

    # yt-dlp and the subtitle download are blocking network calls; run them in
    # a worker thread so the event loop keeps serving other requests.
    try:
        info = await asyncio.to_thread(_extract)
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"yt-dlp error: {e}") from e
    if not info:
        raise HTTPException(status_code=422, detail="yt-dlp error: no information extracted")

    transcript_text = await asyncio.to_thread(_fetch_transcript, info)
    markdown_text = _video_markdown(info, url, transcript_text)

    # Write temp file and run through Docling
    video_id = info.get("id", "youtube")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8")
    tmp_path = tmp.name
    try:
        # The write sits inside the try so the file is removed even if it fails.
        with tmp:
            tmp.write(markdown_text)

        # NOTE(review): converter.convert is still called on the event-loop
        # thread, as before; confirm the converter is thread-safe before
        # offloading it as well.
        result = converter.convert(tmp_path)
        doc = result.document
        page_count = None

        if output_format == "markdown":
            content = doc.export_to_markdown()
        elif output_format == "json":
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format == "html":
            content = doc.export_to_html()
        else:
            content = markdown_text

        llm_used = False
        if _llm_client and use_llm and output_format in ("markdown", "text"):
            content = _llm_enrich(content, system_prompt=llm_prompt or None)
            llm_used = True

        from app.models.ConvertModel import Conversion  # noqa: PLC0415

        record = Conversion(
            filename=f"{video_id}.md",
            file_type="youtube",
            output_format=output_format,
            content=content,
            page_count=page_count,
            llm_enabled=llm_used,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        os.unlink(tmp_path)
# -----------------------------------------------------------------
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",

View File

@ -5,3 +5,5 @@ python-multipart
asyncpg
sqlalchemy[asyncio]
openai
yt-dlp
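# NOTE(review): the PyPI distribution named "ffmpeg" does not provide the ffmpeg
# executable that yt-dlp shells out to; confirm the binary is installed through
# the OS package manager in this service's Dockerfile.
ffmpeg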

View File

@ -96,6 +96,22 @@
<div class="card-body">
<h6 class="card-title fw-semibold mb-3">Tải lên tài liệu để so sánh</h6>
<!-- Input mode tabs -->
<ul class="nav nav-tabs mb-3" id="InputModeTabs">
<li class="nav-item">
<a class="nav-link active py-1 px-3 small" href="#" onclick="SwitchInputMode('file',this);return false">
<i class="bi bi-file-earmark-text me-1"></i>File
</a>
</li>
<li class="nav-item">
<a class="nav-link py-1 px-3 small" href="#" onclick="SwitchInputMode('youtube',this);return false">
<i class="bi bi-youtube me-1 text-danger"></i>YouTube
</a>
</li>
</ul>
<!-- File upload pane -->
<div id="FilePane">
<div id="UploadZone">
<input type="file" id="FileInput"
accept=".pdf,.docx,.xlsx,.pptx,.html,.htm,.csv,.txt,.jpg,.jpeg,.png,.tiff,.tif,.bmp,.md,.epub,.zip,.asciidoc,.adoc" />
@ -117,10 +133,29 @@
<span class="badge bg-secondary-subtle text-secondary">TIFF</span>
<span class="badge bg-secondary-subtle text-secondary">ASCIIDoc</span>
</div>
</div>
<!-- YouTube URL pane -->
<div id="YoutubePane" class="d-none">
<div class="input-group">
<span class="input-group-text bg-danger text-white"><i class="bi bi-youtube"></i></span>
<input type="url" class="form-control" id="YoutubeUrl"
placeholder="https://www.youtube.com/watch?v=..."
oninput="OnYoutubeInput()" />
<button class="btn btn-outline-secondary" onclick="document.getElementById('YoutubeUrl').value='';OnYoutubeInput()">
<i class="bi bi-x"></i>
</button>
</div>
<div class="form-text mt-1">
<i class="bi bi-info-circle me-1"></i>
<b>MarkItDown</b>: dùng <code>yt-dlp</code> native &nbsp;|&nbsp;
<b>Docling</b>: trích transcript → convert markdown
</div>
</div>
<!-- Controls row -->
<div class="d-flex flex-wrap align-items-center gap-3 mt-3">
<div class="d-flex align-items-center gap-2">
<div class="d-flex align-items-center gap-2" id="DoclingFmtWrap">
<label class="form-label mb-0 small fw-medium" for="DoclingFormat">Docling format</label>
<select class="form-select form-select-sm" id="DoclingFormat" style="width:auto">
<option value="markdown">Markdown</option>
@ -444,9 +479,31 @@
const DoclingUrl = '/api/docling';
let CurrentFile = null;
let CurrentYoutubeUrl = '';
let InputMode = 'file'; // 'file' | 'youtube'
let MdContent = '';
let DlContent = '';
// ── Input mode toggle ─────────────────────────────────────────
// Activate the clicked input-mode tab, show its pane, and re-evaluate whether
// the convert button may be pressed for that mode.
function SwitchInputMode(Mode, Link) {
  InputMode = Mode;
  const IsFileMode = Mode === 'file';

  for (const Tab of document.querySelectorAll('#InputModeTabs .nav-link')) {
    Tab.classList.remove('active');
  }
  Link.classList.add('active');

  SetDisplay('FilePane', IsFileMode);
  SetDisplay('YoutubePane', Mode === 'youtube');

  // The button stays disabled until the active mode has something to convert.
  const PendingInput = IsFileMode ? CurrentFile : CurrentYoutubeUrl;
  document.getElementById('ConvertBtn').disabled = !PendingInput;
}
// Remember the trimmed YouTube URL and enable the convert button only when it
// is non-empty.
function OnYoutubeInput() {
  const UrlField = document.getElementById('YoutubeUrl');
  CurrentYoutubeUrl = UrlField.value.trim();
  const HasUrl = Boolean(CurrentYoutubeUrl);
  document.getElementById('ConvertBtn').disabled = !HasUrl;
}
// ── File input ────────────────────────────────────────────────
const UploadZone = document.getElementById('UploadZone');
const FileInput = document.getElementById('FileInput');
@ -492,7 +549,8 @@
// ── Conversion ────────────────────────────────────────────────
async function RunConversion() {
if (!CurrentFile) return;
if (InputMode === 'file' && !CurrentFile) return;
if (InputMode === 'youtube' && !CurrentYoutubeUrl) return;
document.getElementById('ConvertBtn').disabled = true;
SetDisplay('ProgressRow', true);
@ -509,10 +567,16 @@
const UseLlm = document.getElementById('LlmToggle').checked;
const CustomPrompt = document.getElementById('LlmPrompt').value.trim();
const [MdResult, DlResult] = await Promise.allSettled([
ConvertMarkItDown(CurrentFile, UseLlm, CustomPrompt),
ConvertDocling(CurrentFile, DoclingFmt, UseLlm, CustomPrompt),
]);
let MdPromise, DlPromise;
if (InputMode === 'youtube') {
MdPromise = ConvertMarkItDownUrl(CurrentYoutubeUrl, UseLlm, CustomPrompt);
DlPromise = ConvertDoclingUrl(CurrentYoutubeUrl, DoclingFmt, UseLlm, CustomPrompt);
} else {
MdPromise = ConvertMarkItDown(CurrentFile, UseLlm, CustomPrompt);
DlPromise = ConvertDocling(CurrentFile, DoclingFmt, UseLlm, CustomPrompt);
}
const [MdResult, DlResult] = await Promise.allSettled([MdPromise, DlPromise]);
document.getElementById('ConvertBtn').disabled = false;
ShowCompare(MdResult, DlResult);
@ -532,6 +596,19 @@
return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
}
// POST a URL to the MarkItDown service and resolve with the converted markdown,
// the round-trip time in milliseconds and whether the LLM pass was applied.
// Rejects with an Error carrying the server's `detail` (or the HTTP status).
async function ConvertMarkItDownUrl(Url, UseLlm, CustomPrompt) {
  const T0 = performance.now();
  const Res = await fetch(MarkItDownUrl + '/convert-url', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url: Url, use_llm: UseLlm, llm_prompt: CustomPrompt || null }),
  });
  const Ms = Math.round(performance.now() - T0);

  if (!Res.ok) {
    let Detail = '';
    try {
      const Body = await Res.json();
      const Raw = Body ? Body.detail : null;
      if (typeof Raw === 'string') {
        Detail = Raw;
      } else if (Raw !== null && Raw !== undefined) {
        // Validation errors arrive as an array/object; keep them readable.
        Detail = JSON.stringify(Raw);
      }
    } catch (ParseErr) {
      // Error body was not JSON (e.g. a proxy HTML page); fall back to the status.
    }
    throw new Error(Detail || Res.statusText || ('HTTP ' + Res.status));
  }

  const Data = await Res.json();
  return { Content: Data.markdown, Ms, LlmEnabled: Data.llm_enabled };
}
async function ConvertDocling(File_, Fmt, UseLlm, CustomPrompt) {
const Form = new FormData();
Form.append('file', File_);
@ -545,6 +622,19 @@
return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
}
// POST a URL to the Docling service and resolve with the converted content,
// the round-trip time in milliseconds, the page count and whether the LLM pass
// was applied. Rejects with an Error carrying the server's `detail` (or the
// HTTP status).
async function ConvertDoclingUrl(Url, Fmt, UseLlm, CustomPrompt) {
  const T0 = performance.now();
  const Res = await fetch(DoclingUrl + '/convert-url', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url: Url,
      output_format: Fmt,
      use_llm: UseLlm,
      llm_prompt: CustomPrompt || null,
    }),
  });
  const Ms = Math.round(performance.now() - T0);

  if (!Res.ok) {
    let Detail = '';
    try {
      const Body = await Res.json();
      const Raw = Body ? Body.detail : null;
      if (typeof Raw === 'string') {
        Detail = Raw;
      } else if (Raw !== null && Raw !== undefined) {
        // Validation errors arrive as an array/object; keep them readable.
        Detail = JSON.stringify(Raw);
      }
    } catch (ParseErr) {
      // Error body was not JSON (e.g. a proxy HTML page); fall back to the status.
    }
    throw new Error(Detail || Res.statusText || ('HTTP ' + Res.status));
  }

  const Data = await Res.json();
  return { Content: Data.content, Ms, Pages: Data.page_count, LlmEnabled: Data.llm_enabled };
}
// Show or hide the element with the given id by toggling its `d-none` class.
function SetDisplay(Id, Visible) {
  const Hidden = !Visible;
  const Target = document.getElementById(Id);
  Target.classList.toggle('d-none', Hidden);
}
@ -664,11 +754,13 @@
function ClearResults() {
MdContent = ''; DlContent = '';
CurrentFile = null;
CurrentYoutubeUrl = '';
SetDisplay('ProgressRow', false);
SetDisplay('CompareBanner', false);
document.getElementById('MdBody').innerHTML = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
document.getElementById('DlBody').innerHTML = '<div class="text-center text-muted py-5 small"><i class="bi bi-upload fs-3 d-block mb-2"></i>Tải file lên để xem kết quả</div>';
document.getElementById('FileName').textContent = '';
document.getElementById('YoutubeUrl').value = '';
document.getElementById('ConvertBtn').disabled = true;
document.getElementById('CleanupBtn').classList.add('d-none');
document.getElementById('MdDownloadBtn').classList.add('d-none');

View File

@ -6,7 +6,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
WORKDIR /app
RUN apt-get update && apt-get install -y \
ffmpeg libmagic1 \
ffmpeg \
libmagic1 \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .

View File

@ -5,6 +5,11 @@ from app.services import MarkitdownService as markitdown_service
from app.database import get_db
from pydantic import BaseModel
class UrlRequest(BaseModel):
    """JSON body accepted by the ``/convert-url`` endpoint."""

    # URL that the service should fetch and convert.
    url: str
    # Whether the LLM-backed converter may be used.
    use_llm: bool = True
    # Optional custom prompt forwarded to the LLM-backed converter.
    llm_prompt: str | None = None
class CleanupRequest(BaseModel):
text: str
prompt: str | None = None
@ -48,6 +53,17 @@ async def convert(
return record
@router.post("/convert-url", response_model=ConvertResponse)
async def convert_url(
    req: UrlRequest,
    db: AsyncSession = Depends(get_db),
):
    """Convert the URL given in the request body and return the stored record.

    The work is delegated to ``markitdown_service.convert_url``; its result is
    returned unchanged and serialised through ``ConvertResponse``.
    """
    return await markitdown_service.convert_url(
        req.url,
        db,
        use_llm=req.use_llm,
        llm_prompt=req.llm_prompt,
    )
@router.get("/models")
def list_models():
if not markitdown_service.OLLAMA_BASE_URL:

View File

@ -84,6 +84,62 @@ ALLOWED_EXTENSIONS = {
"html", "csv", "txt", "jpg", "jpeg", "png", "zip", "epub"
}
# Recognises YouTube links: an optional http(s) scheme and optional "www."
# prefix followed by a youtube.com/watch, youtu.be/ or youtube.com/shorts/ path.
# The pattern has no anchors of its own; use .match() to test the start of a URL.
YOUTUBE_PATTERN = _re.compile(
r"(https?://)?(www\.)?(youtube\.com/watch|youtu\.be/|youtube\.com/shorts/)"
)
async def convert_url(
    url: str,
    db: AsyncSession,
    use_llm: bool = True,
    llm_prompt: str | None = None,
) -> Conversion:
    """Convert a YouTube URL with MarkItDown and persist the result.

    Args:
        url: YouTube link to convert. It must match ``YOUTUBE_PATTERN``; a
            missing scheme is completed with ``https://``.
        db: Async session used to persist the resulting record.
        use_llm: Allow the LLM-backed converter when the LLM is active.
        llm_prompt: Optional custom prompt for the LLM-backed converter.

    Returns:
        The persisted ``Conversion`` record.

    Raises:
        HTTPException: 422 when ``url`` is not a YouTube link; 500 when the
            conversion or the database write fails.
    """
    from urllib.parse import parse_qs, urlparse  # noqa: PLC0415

    # The converter accepts local paths and arbitrary URIs as well as web
    # URLs, so untrusted input must be restricted to YouTube links before it
    # gets there.
    url = (url or "").strip()
    matched = YOUTUBE_PATTERN.match(url)
    if matched is None:
        raise HTTPException(status_code=422, detail="Only YouTube URLs are supported")
    if not matched.group(1):
        # Without a scheme the converter would treat the string as a file path.
        url = f"https://{url}"

    use_llm_now = LLM_ACTIVE and use_llm

    if use_llm_now and llm_prompt:
        try:
            converter = MarkItDown(
                llm_client=_llm_client,
                llm_model=OLLAMA_MODEL,
                llm_prompt=llm_prompt,
            )
        except TypeError:
            # Fall back to the shared LLM converter when the constructor
            # rejects these keyword arguments.
            converter = md
    elif use_llm_now:
        converter = md
    else:
        converter = md_plain

    try:
        result = converter.convert(url)
        actual_llm = use_llm_now
    except Exception as llm_err:
        llm_failed = use_llm_now and (
            "500" in str(llm_err) or "InternalServerError" in type(llm_err).__name__
        )
        if not llm_failed:
            raise HTTPException(status_code=500, detail=str(llm_err)) from llm_err
        logger.warning("MarkItDown: LLM failed (%s), retrying without LLM", llm_err)
        try:
            result = md_plain.convert(url)
        except Exception as plain_err:
            # The retry can fail too; report it like any other conversion error.
            raise HTTPException(status_code=500, detail=str(plain_err)) from plain_err
        actual_llm = False

    # Name the record after the video id: the "v" query parameter for watch
    # links, otherwise the last path segment (youtu.be/<id>, /shorts/<id>).
    parsed = urlparse(url)
    video_id = parse_qs(parsed.query).get("v", [""])[0]
    slug = video_id or parsed.path.rstrip("/").rsplit("/", 1)[-1]
    slug = _re.sub(r"[^A-Za-z0-9_-]", "", slug) or "youtube"
    filename = f"{slug}.md"

    try:
        record = Conversion(
            filename=filename,
            file_type="youtube",
            markdown=result.text_content,
            llm_enabled=actual_llm,
        )
        db.add(record)
        await db.commit()
        await db.refresh(record)
        return record
    except Exception as e:
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e
def _allowed_file(filename: str) -> bool:
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively;
    a name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

View File

@ -6,3 +6,4 @@ asyncpg
sqlalchemy[asyncio]
openai
httpx
yt-dlp