diff --git a/db/init.sql b/db/init_markitdown.sql
similarity index 100%
rename from db/init.sql
rename to db/init_markitdown.sql
diff --git a/docker-compose.yml b/docker-compose.yml
index 42ac8ec..9b378b3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -78,7 +78,7 @@ services:
       POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
     volumes:
       - db_markitdown_data:/var/lib/postgresql/data
-      - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
+      - ./db/init_markitdown.sql:/docker-entrypoint-initdb.d/init_markitdown.sql:ro
     ports:
       - "5432:5432"
     healthcheck:
@@ -95,7 +95,7 @@ services:
       POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-admin}
     volumes:
       - db_docling_data:/var/lib/postgresql/data
-      - ./db/init_docling.sql:/docker-entrypoint-initdb.d/init.sql:ro
+      - ./db/init_docling.sql:/docker-entrypoint-initdb.d/init_docling.sql:ro
     healthcheck:
       <<: *healthcheck-defaults
       test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-admin} -d docling"]
diff --git a/docling-service/app/controllers/ConvertController.py b/docling-service/app/controllers/ConvertController.py
index 6df3183..b924fa2 100644
--- a/docling-service/app/controllers/ConvertController.py
+++ b/docling-service/app/controllers/ConvertController.py
@@ -7,6 +7,13 @@ from pydantic import BaseModel
 
 router = APIRouter()
 
+class UrlRequest(BaseModel):
+    """Request body for POST /convert-url."""
+    url: str
+    output_format: str = "markdown"
+    use_llm: bool = True
+    llm_prompt: str | None = None
+
 class SettingsRequest(BaseModel):
     ollama_base_url: str | None = None
     ollama_model: str = "llava"
@@ -88,6 +95,25 @@ async def convert(
     )
 
 
+@router.post("/convert-url", response_model=ConvertResponse)
+async def convert_url(
+    req: UrlRequest,
+    db: AsyncSession = Depends(get_db),
+):
+    """Convert the video at ``req.url`` and return the stored conversion."""
+    record = await docling_service.convert_url(
+        req.url, db, req.output_format, use_llm=req.use_llm, llm_prompt=req.llm_prompt
+    )
+    return ConvertResponse(
+        id=record.id,
+        filename=record.filename,
+        output_format=record.output_format,
+        content=record.content,
+        page_count=record.page_count,
+        llm_enabled=record.llm_enabled,
+    )
+
+
 @router.get("/conversions/{conversion_id}", response_model=ConvertResponse)
 async def get_conversion(conversion_id: int, db: AsyncSession = Depends(get_db)):
     record = await docling_service.get_conversion(conversion_id, db)
diff --git a/docling-service/app/services/DoclingService.py b/docling-service/app/services/DoclingService.py
index f6fa537..c370c6e 100644
--- a/docling-service/app/services/DoclingService.py
+++ b/docling-service/app/services/DoclingService.py
@@ -94,6 +94,199 @@ def _llm_enrich(markdown: str, system_prompt: str | None = None) -> str:
 
 
 
+_SUBTITLE_LANGS = ("vi", "en")
+# "vtt" is tried first: it is the only format _clean_vtt() reduces to plain text.
+_SUBTITLE_EXT_PREFERENCE = ("vtt", "json3", "srv3", "ttml")
+
+
+def _clean_vtt(raw: str) -> str:
+    """Return the cue text of a WebVTT payload without header, timings or tags."""
+    import re  # noqa: PLC0415
+
+    raw = raw.replace("\r\n", "\n")
+    # The "WEBVTT" header block is separated from the first cue by a blank line.
+    if raw.lstrip("\ufeff").startswith("WEBVTT"):
+        _, sep, body = raw.partition("\n\n")
+        raw = body if sep else ""
+    cleaned = re.sub(r"\d{2}:\d{2}:\d{2}\.\d+ --> .*", "", raw)
+    cleaned = re.sub(r"^\d+$", "", cleaned, flags=re.MULTILINE)
+    cleaned = re.sub(r"<[^>]+>", "", cleaned)
+    return re.sub(r"\n{3,}", "\n\n", cleaned).strip()
+
+
+def _fetch_transcript(info: dict) -> str | None:
+    """Download the first non-empty vi/en caption track listed in yt-dlp *info*.
+
+    Uploaded subtitles are preferred over automatic captions for each language.
+    A track that cannot be fetched is logged and skipped, never fatal.
+    """
+    import logging  # noqa: PLC0415
+    import urllib.request  # noqa: PLC0415
+
+    subtitles = info.get("subtitles") or {}
+    auto_subtitles = info.get("automatic_captions") or {}
+    for lang in _SUBTITLE_LANGS:
+        tracks = subtitles.get(lang) or auto_subtitles.get(lang) or []
+        candidates = sorted(
+            (t for t in tracks if t.get("ext") in _SUBTITLE_EXT_PREFERENCE),
+            key=lambda t: _SUBTITLE_EXT_PREFERENCE.index(t["ext"]),
+        )
+        for track in candidates:
+            try:
+                with urllib.request.urlopen(track["url"], timeout=15) as resp:
+                    raw = resp.read().decode("utf-8", errors="ignore")
+            except Exception as e:  # noqa: BLE001 - best effort, try the next track
+                logging.getLogger(__name__).warning("subtitle fetch failed: %s", e)
+                continue
+            # Only VTT is cleaned; the other formats are passed through raw.
+            text = _clean_vtt(raw) if track["ext"] == "vtt" else raw
+            if text:
+                return text
+    return None
+
+
+async def convert_url(
+    url: str,
+    db: AsyncSession,
+    output_format: str = "markdown",
+    use_llm: bool = True,
+    llm_prompt: str | None = None,
+) -> "Conversion":
+    """Build markdown for a video URL via yt-dlp, convert it with Docling, store it.
+
+    Metadata, a vi/en transcript (or the description as a fallback) and chapters
+    are assembled into markdown, run through ``converter`` and saved as a
+    ``Conversion`` row.
+
+    Raises:
+        HTTPException: 422 for a non-http(s) URL or a yt-dlp failure; 500 when
+            yt-dlp is not installed or conversion/persistence fails.
+    """
+    import asyncio  # noqa: PLC0415
+    from urllib.parse import urlsplit  # noqa: PLC0415
+
+    try:
+        import yt_dlp  # noqa: PLC0415
+    except ImportError as e:
+        raise HTTPException(status_code=500, detail="yt-dlp not installed") from e
+
+    # The URL comes straight from the request body: reject non-web schemes early.
+    if urlsplit(url).scheme not in ("http", "https"):
+        raise HTTPException(status_code=422, detail="Only http(s) URLs are supported")
+
+    ydl_opts = {
+        "quiet": True,
+        "skip_download": True,
+        "writesubtitles": True,
+        "writeautomaticsub": True,
+        "subtitleslangs": list(_SUBTITLE_LANGS),
+        "outtmpl": "%(id)s.%(ext)s",
+    }
+
+    def _extract() -> dict:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            return ydl.extract_info(url, download=False)
+
+    try:
+        # extract_info performs blocking network I/O; keep it off the event loop.
+        info = await asyncio.to_thread(_extract)
+    except Exception as e:
+        raise HTTPException(status_code=422, detail=f"yt-dlp error: {e}") from e
+
+    title = info.get("title") or "YouTube Video"
+    description = info.get("description") or ""
+    channel = info.get("channel") or info.get("uploader") or ""
+    duration = info.get("duration_string") or ""
+    upload_date = info.get("upload_date") or ""
+    view_count = info.get("view_count")
+    chapters = info.get("chapters") or []
+
+    # Build markdown from available metadata
+    lines = [f"# {title}", ""]
+    meta_rows = []
+    if channel:
+        meta_rows.append(f"**Kênh:** {channel}")
+    if duration:
+        meta_rows.append(f"**Thời lượng:** {duration}")
+    if upload_date and len(upload_date) == 8:
+        meta_rows.append(f"**Ngày đăng:** {upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}")
+    if view_count is not None:
+        meta_rows.append(f"**Lượt xem:** {view_count:,}")
+    meta_rows.append(f"**URL:** {url}")
+    lines.extend(meta_rows)
+    lines.append("")
+
+    # Caption download is blocking network I/O as well.
+    transcript_text = await asyncio.to_thread(_fetch_transcript, info)
+    if transcript_text:
+        lines += ["## Transcript / Phụ đề", "", transcript_text, ""]
+    elif description:
+        lines += ["## Mô tả", "", description[:3000], ""]
+    else:
+        lines += ["## Ghi chú", "", "_Không có transcript hoặc mô tả._", ""]
+
+    if chapters:
+        lines += ["## Chapters", ""]
+        for ch in chapters:
+            minutes, seconds = divmod(int(ch.get("start_time") or 0), 60)
+            hours, minutes = divmod(minutes, 60)
+            clock = f"{minutes:02d}:{seconds:02d}"
+            # Only show the hour field when a chapter starts past the first hour.
+            stamp = f"{hours}:{clock}" if hours else clock
+            lines.append(f"- **{stamp}** — {ch.get('title', '')}")
+        lines.append("")
+
+    markdown_text = "\n".join(lines)
+
+    # Write temp file and run through Docling
+    video_id = info.get("id") or "youtube"
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp:
+        tmp.write(markdown_text)
+        tmp_path = tmp.name
+
+    try:
+        result = converter.convert(tmp_path)
+        doc = result.document
+        page_count = None
+
+        if output_format == "markdown":
+            content = doc.export_to_markdown()
+        elif output_format == "json":
+            content = json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
+        elif output_format == "html":
+            content = doc.export_to_html()
+        else:
+            content = markdown_text
+
+        llm_used = False
+        if _llm_client and use_llm and output_format in ("markdown", "text"):
+            content = _llm_enrich(content, system_prompt=llm_prompt or None)
+            llm_used = True
+
+        from app.models.ConvertModel import Conversion
+        record = Conversion(
+            filename=f"{video_id}.md",
+            file_type="youtube",
+            output_format=output_format,
+            content=content,
+            page_count=page_count,
+            llm_enabled=llm_used,
+        )
+        db.add(record)
+        await db.commit()
+        await db.refresh(record)
+        return record
+    except HTTPException:
+        # Keep the status code of errors that are already HTTP-shaped.
+        await db.rollback()
+        raise
+    except Exception as e:
+        await db.rollback()
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        os.unlink(tmp_path)
+
+
 # -----------------------------------------------------------------
 ALLOWED_EXTENSIONS = {
     "pdf", "docx", "xlsx", "pptx",
diff --git a/docling-service/requirements.txt b/docling-service/requirements.txt
index e023c30..1a82484 100644
--- a/docling-service/requirements.txt
+++ b/docling-service/requirements.txt
@@ -5,3 +5,4 @@ python-multipart
 asyncpg
 sqlalchemy[asyncio]
 openai
+yt-dlp
diff --git a/frontend/index.html b/frontend/index.html
index 36a8f57..d8a34d3 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -96,31 +96,66 @@
Tải lên tài liệu để so sánh
-
- - -

Kéo thả hoặc click để chọn file

-
+ + + + +
+
+ + +

Kéo thả hoặc click để chọn file

+
+
+ +
+ PDF + DOCX + XLSX + PPTX + HTML + CSV + TXT + JPG/PNG + EPUB + TIFF + ASCIIDoc +
-
- PDF - DOCX - XLSX - PPTX - HTML - CSV - TXT - JPG/PNG - EPUB - TIFF - ASCIIDoc + +
+
+ + + +
+
+ + MarkItDown: dùng yt-dlp native  |  + Docling: trích transcript → convert markdown +
-
+