diff --git a/db/init.sql b/db/init_markitdown.sql
similarity index 100%
rename from db/init.sql
rename to db/init_markitdown.sql
diff --git a/docker-compose.yml b/docker-compose.yml
index 42ac8ec..9b378b3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -78,7 +78,7 @@ services:
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
volumes:
- db_markitdown_data:/var/lib/postgresql/data
- - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
+ - ./db/init_markitdown.sql:/docker-entrypoint-initdb.d/init_markitdown.sql:ro
ports:
- "5432:5432"
healthcheck:
@@ -95,7 +95,7 @@ services:
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
volumes:
- db_docling_data:/var/lib/postgresql/data
- - ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro
+ - ./db/init_docling.sql:/docker-entrypoint-initdb.d/init_docling.sql:ro
healthcheck:
<<: *healthcheck-defaults
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
diff --git a/docling-service/app/controllers/ConvertController.py b/docling-service/app/controllers/ConvertController.py
index 6df3183..b924fa2 100644
--- a/docling-service/app/controllers/ConvertController.py
+++ b/docling-service/app/controllers/ConvertController.py
@@ -7,6 +7,12 @@ from pydantic import BaseModel
router = APIRouter()
+class UrlRequest(BaseModel):
+ """Request body accepted by the POST /convert-url endpoint."""
+ # Address of the video/page whose metadata and transcript should be converted.
+ url: str
+ # Requested export format; defaults to "markdown".
+ output_format: str = "markdown"
+ # Whether LLM post-processing is requested; enabled by default.
+ use_llm: bool = True
+ # Optional custom prompt forwarded to the service as llm_prompt; None when not supplied.
+ llm_prompt: str | None = None
+
class SettingsRequest(BaseModel):
ollama_base_url: str | None = None
ollama_model: str = "llava"
@@ -88,6 +94,24 @@ async def convert(
)
+@router.post("/convert-url", response_model=ConvertResponse)
+async def convert_url(
+    req: UrlRequest,
+    db: AsyncSession = Depends(get_db),
+):
+    """Convert the resource at ``req.url`` and return the stored conversion.
+
+    Delegates to ``docling_service.convert_url`` and copies the fields of the
+    returned record into a ``ConvertResponse``.
+    """
+    conversion = await docling_service.convert_url(
+        req.url,
+        db,
+        req.output_format,
+        use_llm=req.use_llm,
+        llm_prompt=req.llm_prompt,
+    )
+    # Mirror the record attribute-by-attribute into the response model.
+    response_fields = (
+        "id",
+        "filename",
+        "output_format",
+        "content",
+        "page_count",
+        "llm_enabled",
+    )
+    payload = {name: getattr(conversion, name) for name in response_fields}
+    return ConvertResponse(**payload)
+
+
@router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
record = await docling_service.get_conversion(conversion_id, db)
diff --git a/docling-service/app/services/DoclingService.py b/docling-service/app/services/DoclingService.py
index f6fa537..c370c6e 100644
--- a/docling-service/app/services/DoclingService.py
+++ b/docling-service/app/services/DoclingService.py
@@ -94,6 +94,148 @@ def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
+# Subtitle languages in order of preference, and the track formats that can be reduced to text.
+_TRANSCRIPT_LANGS = ("vi", "en")
+_TRANSCRIPT_EXTS = ("json3", "srv3", "ttml", "vtt")
+
+
+def _extract_video_info(yt_dlp_module, url: str):
+    """Run yt-dlp's metadata extraction for *url* without downloading media (blocking)."""
+    ydl_opts = {
+        "quiet": True,
+        "skip_download": True,
+        "writesubtitles": True,
+        "writeautomaticsub": True,
+        "subtitleslangs": list(_TRANSCRIPT_LANGS),
+        "outtmpl": "%(id)s.%(ext)s",
+    }
+    with yt_dlp_module.YoutubeDL(ydl_opts) as ydl:
+        return ydl.extract_info(url, download=False)
+
+
+def _download_text(url: str, timeout: float = 15) -> str:
+    """Fetch *url* over http(s) and return the body decoded as UTF-8 (blocking)."""
+    import urllib.request  # noqa: PLC0415
+    from urllib.parse import urlsplit  # noqa: PLC0415
+
+    # Track URLs come from extractor output; urlopen would also follow file:// and ftp://.
+    if urlsplit(url).scheme not in ("http", "https"):
+        raise ValueError(f"refusing non-http(s) subtitle URL: {url!r}")
+    with urllib.request.urlopen(url, timeout=timeout) as resp:
+        return resp.read().decode("utf-8", errors="ignore")
+
+
+def _subtitle_to_text(raw: str, ext: str) -> str:
+    """Reduce a raw subtitle payload of format *ext* to plain text."""
+    import html  # noqa: PLC0415
+
+    if ext == "json3":
+        # NOTE(review): assumes the json3 layout {"events": [{"segs": [{"utf8": "..."}]}]}
+        # - confirm against a live track.
+        events = json.loads(raw).get("events") or []
+        cues = (
+            "".join(seg.get("utf8") or "" for seg in event.get("segs") or [])
+            for event in events
+        )
+        return "\n".join(cue.strip() for cue in cues if cue.strip())
+
+    if ext == "vtt":
+        # Drop the "WEBVTT" header block, then cue timings, cue numbers and inline tags.
+        cleaned = re.sub(r"\A\ufeff?WEBVTT.*?(?:\n\s*\n|\Z)", "", raw, flags=re.DOTALL)
+        cleaned = re.sub(r"\d{2}:\d{2}:\d{2}\.\d+ --> .*", "", cleaned)
+        cleaned = re.sub(r"^\d+$", "", cleaned, flags=re.MULTILINE)
+        cleaned = re.sub(r"<[^>]+>", "", cleaned)
+        return re.sub(r"\n{3,}", "\n\n", cleaned).strip()
+
+    # srv3 / ttml are XML: paragraph ends and <br/> become newlines, other markup is dropped.
+    cleaned = re.sub(r"<br\s*/?>|</p>", "\n", raw)
+    cleaned = html.unescape(re.sub(r"<[^>]+>", "", cleaned))
+    stripped = (line.strip() for line in cleaned.splitlines())
+    return "\n".join(line for line in stripped if line)
+
+
+def _fetch_transcript(info: dict) -> str | None:
+    """Return plain text of the first usable subtitle track, or None (blocking)."""
+    import logging  # noqa: PLC0415
+
+    log = logging.getLogger(__name__)
+    subtitles = info.get("subtitles") or {}
+    auto_subtitles = info.get("automatic_captions") or {}
+
+    for lang in _TRANSCRIPT_LANGS:
+        # Manually authored subtitles win over automatic captions for the same language.
+        tracks = subtitles.get(lang) or auto_subtitles.get(lang) or []
+        for track in tracks:
+            ext = track.get("ext")
+            if ext not in _TRANSCRIPT_EXTS:
+                continue
+            try:
+                text = _subtitle_to_text(_download_text(track["url"]), ext)
+            except Exception as exc:  # best effort: a bad track must not fail the request
+                log.warning("Skipping %s subtitle track (%s): %s", lang, ext, exc)
+                continue
+            if text:
+                return text
+    return None
+
+
+def _build_video_markdown(info: dict, url: str, transcript_text: str | None) -> str:
+    """Render video metadata, transcript (or description) and chapters as markdown."""
+    title = info.get("title") or "YouTube Video"
+    description = info.get("description", "") or ""
+    channel = info.get("channel") or info.get("uploader") or ""
+    duration = info.get("duration_string", "")
+    upload_date = info.get("upload_date", "")
+    view_count = info.get("view_count")
+    chapters = info.get("chapters") or []
+
+    lines = [f"# {title}", ""]
+    if channel:
+        lines.append(f"**Kênh:** {channel}")
+    if duration:
+        lines.append(f"**Thời lượng:** {duration}")
+    # upload_date is only reformatted when it has the 8-character YYYYMMDD shape.
+    if upload_date and len(upload_date) == 8:
+        lines.append(f"**Ngày đăng:** {upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}")
+    # The "," format spec only works on numbers; skip anything else instead of crashing.
+    if isinstance(view_count, (int, float)):
+        lines.append(f"**Lượt xem:** {view_count:,}")
+    lines.append(f"**URL:** {url}")
+    lines.append("")
+
+    if transcript_text:
+        lines += ["## Transcript / Phụ đề", "", transcript_text, ""]
+    elif description:
+        lines += ["## Mô tả", "", description[:3000], ""]
+    else:
+        lines += ["## Ghi chú", "", "_Không có transcript hoặc mô tả._", ""]
+
+    if chapters:
+        lines += ["## Chapters", ""]
+        for ch in chapters:
+            m, s = divmod(int(ch.get("start_time") or 0), 60)
+            lines.append(f"- **{m:02d}:{s:02d}** — {ch.get('title', '')}")
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+async def convert_url(
+    url: str,
+    db: AsyncSession,
+    output_format: str = "markdown",
+    use_llm: bool = True,
+    llm_prompt: str | None = None,
+) -> "Conversion":
+    """Build a markdown document for a video URL via yt-dlp and convert it with Docling.
+
+    Metadata is extracted with yt-dlp (no media download), a Vietnamese or English
+    subtitle track is reduced to plain text when available, the resulting markdown
+    is run through the Docling converter, and the outcome is stored as a
+    ``Conversion`` row.
+
+    Args:
+        url: Address handed to yt-dlp.
+        db: Session used to persist the conversion.
+        output_format: "markdown", "json" or "html"; any other value stores the
+            generated markdown as-is.
+        use_llm: Run LLM enrichment when a client is configured and the format
+            is "markdown" or "text".
+        llm_prompt: Optional system prompt for the enrichment step.
+
+    Returns:
+        The persisted ``Conversion`` record.
+
+    Raises:
+        HTTPException: 500 if yt-dlp is missing or conversion/persistence fails,
+            422 if yt-dlp cannot extract metadata for ``url``.
+    """
+    import asyncio  # noqa: PLC0415
+
+    try:
+        import yt_dlp  # noqa: PLC0415
+    except ImportError as exc:
+        raise HTTPException(status_code=500, detail="yt-dlp not installed") from exc
+
+    # NOTE(security): ``url`` is caller-supplied and yt-dlp will fetch it from this host;
+    # restrict reachable hosts at the network layer if the service is exposed publicly.
+    try:
+        # yt-dlp does blocking network I/O, so keep it off the event loop.
+        info = await asyncio.to_thread(_extract_video_info, yt_dlp, url)
+    except Exception as e:
+        raise HTTPException(status_code=422, detail=f"yt-dlp error: {e}") from e
+    if not info:
+        raise HTTPException(status_code=422, detail="yt-dlp error: no metadata returned")
+
+    transcript_text = await asyncio.to_thread(_fetch_transcript, info)
+    markdown_text = _build_video_markdown(info, url, transcript_text)
+
+    # Write temp file and run through Docling
+    video_id = info.get("id", "youtube")
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp:
+        tmp.write(markdown_text)
+        tmp_path = tmp.name
+
+    try:
+        result = converter.convert(tmp_path)
+        doc = result.document
+        page_count = None
+
+        if output_format == "markdown":
+            content = doc.export_to_markdown()
+        elif output_format == "json":
+            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
+        elif output_format == "html":
+            content = doc.export_to_html()
+        else:
+            content = markdown_text
+
+        llm_used = False
+        if _llm_client and use_llm and output_format in ("markdown", "text"):
+            content = _llm_enrich(content, system_prompt=llm_prompt or None)
+            llm_used = True
+
+        from app.models.ConvertModel import Conversion
+        record = Conversion(
+            filename=f"{video_id}.md",
+            file_type="youtube",
+            output_format=output_format,
+            content=content,
+            page_count=page_count,
+            llm_enabled=llm_used,
+        )
+        db.add(record)
+        await db.commit()
+        await db.refresh(record)
+        return record
+    except Exception as e:
+        await db.rollback()
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        # The temp file was created with delete=False, so remove it on every path.
+        os.unlink(tmp_path)
+
+
# -----------------------------------------------------------------
ALLOWED_EXTENSIONS = {
"pdf", "docx", "xlsx", "pptx",
diff --git a/docling-service/requirements.txt b/docling-service/requirements.txt
index e023c30..1a82484 100644
--- a/docling-service/requirements.txt
+++ b/docling-service/requirements.txt
@@ -5,3 +5,5 @@ python-multipart
asyncpg
sqlalchemy[asyncio]
openai
+yt-dlp
+# FFmpeg is a system binary (install it in the image, e.g. via apt); the PyPI "ffmpeg" project is not it.
\ No newline at end of file
diff --git a/frontend/index.html b/frontend/index.html
index 36a8f57..d8a34d3 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -96,31 +96,66 @@
Tải lên tài liệu để so sánh
-
-
-
-
Kéo thả hoặc click để chọn file
-
+
+
+
+
+
+
+
+
+
Kéo thả hoặc click để chọn file
+
+
+
+
+ PDF
+ DOCX
+ XLSX
+ PPTX
+ HTML
+ CSV
+ TXT
+ JPG/PNG
+ EPUB
+ TIFF
+ ASCIIDoc
+
-
-
PDF
-
DOCX
-
XLSX
-
PPTX
-
HTML
-
CSV
-
TXT
-
JPG/PNG
-
EPUB
-
TIFF
-
ASCIIDoc
+
+
+
+
+
+
+
+
+
+ MarkItDown: dùng yt-dlp native |
+ Docling: trích transcript → convert markdown
+
-
+