feat(ocr): prioritize direct DOCX parse to resolve Google Drive numbering...

feat(ocr): prioritize direct DOCX parse to resolve Google Drive numbering corruption, add text export + regex fallback for PDFs

feat(ocr): prioritize direct DOCX parse to resolve Google Drive numbering...
feat(ocr): prioritize direct DOCX parse to resolve Google Drive numbering corruption, add text export + regex fallback for PDFs
371783cc · Vũ Hoàng Anh · d92639e2 · 371783cc · 371783cc · 371783cc
Commit 371783cc authored Apr 08, 2026 by Vũ Hoàng Anh
13 changed files
--- a/app.py
+++ b/app.py
 """
-Document Converter API Wrapper.
- Any file type → convert via Google Drive OCR/export
+Document Converter API.
+- DOCX → direct parse (preserves original numbering)
+- PDF/Image → Google Drive OCR → text export → regex heading
 """

 import os
 import logging
-import inspect
 import re
 import tempfile

-import httpx
-from fastapi import FastAPI, File, Form, UploadFile
-from fastapi.responses import JSONResponse
-import pymupdf4llm
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse, RedirectResponse
+from fastapi.staticfiles import StaticFiles
 from ocr_drive import get_drive_service, ocr_file
 from docx_converter import convert_docx_to_markdown

@@ -25,6 +24,15 @@ app = FastAPI(
    description="Wrapper API: Any file type -> Google Drive OCR/export",
 )

+# ── Serve static UI ──
+_static_dir = os.path.join(os.path.dirname(__file__), "static")
+if os.path.isdir(_static_dir):
+    app.mount("/static", StaticFiles(directory=_static_dir, html=True), name="static")
+
+@app.get("/")
+async def root():
+    """Send browsers to the bundled static UI, or report that it is missing."""
+    # The /static mount is only registered when the directory exists; without
+    # it a blind redirect just produces an unexplained 404 on another path.
+    if not os.path.isdir(_static_dir):
+        return JSONResponse(
+            status_code=404,
+            content={"error": "Static UI is not installed."},
+        )
+    return RedirectResponse(url="/static/index.html")
+

 def _looks_like_low_information_markdown(md_text: str) -> bool:
    """
@@ -51,49 +59,6 @@ def _looks_like_low_information_markdown(md_text: str) -> bool:
    return len(separator_lines) >= 3 and len(meaningful_lines) <= 1


-def _convert_pdf_markdown_best_effort(doc):
-    """
-    Convert PDF to markdown and retry with OCR when output seems empty.
-    """
-    sig = inspect.signature(pymupdf4llm.to_markdown)
-    params = sig.parameters
-
-    base_kwargs = {}
-    if "show_progress" in params:
-        base_kwargs["show_progress"] = False
-
-    md_text = pymupdf4llm.to_markdown(doc, **base_kwargs)
-    low_information = _looks_like_low_information_markdown(md_text)
-
-    ocr_attempted = False
-    ocr_used = False
-    ocr_error = None
-
-    # Retry once with OCR only when the first pass is likely useless.
-    if low_information and "use_ocr" in params:
-        ocr_attempted = True
-        ocr_kwargs = dict(base_kwargs)
-        ocr_kwargs["use_ocr"] = True
-        if "ocr_language" in params:
-            ocr_kwargs["ocr_language"] = os.getenv("OCR_LANGUAGE", "vie+eng")
-
-        try:
-            md_text_ocr = pymupdf4llm.to_markdown(doc, **ocr_kwargs)
-            if len(md_text_ocr.strip()) > len(md_text.strip()):
-                md_text = md_text_ocr
-                ocr_used = True
-                low_information = _looks_like_low_information_markdown(md_text)
-        except Exception as exc:
-            ocr_error = str(exc)
-
-    return md_text, {
-        "low_information": low_information,
-        "ocr_attempted": ocr_attempted,
-        "ocr_used": ocr_used,
-        "ocr_error": ocr_error,
-    }
-
-
 @app.get("/health")
 async def health():
    return {"status": "ok"}
@@ -104,8 +69,11 @@ async def convert_file(
    files: UploadFile = File(...),
 ):
    """
-    Convert uploaded file to markdown-compatible text using Google Drive OCR/export.
-    This endpoint intentionally routes every file type through Drive processing.
+    Convert uploaded file to markdown.
+
+    Strategy:
+      1. DOCX/DOC → parse trực tiếp (giữ đúng numbering gốc, không qua Drive)
+      2. PDF/Image/Scan → Google Drive OCR → text export → regex heading
    """
    filename = files.filename or "unknown"
    ext = os.path.splitext(filename)[1].lower() or ".bin"
@@ -120,46 +88,65 @@ async def convert_file(
        )

    try:
-        logger.info(f"Processing file via Google Drive OCR/export: {filename}")
+        md_text = ""
+        backend_used = "unknown"
+
+        # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        # STRATEGY 1: DOCX → Parse trực tiếp (không qua Drive)
+        # Giữ đúng numbering gốc (ĐIỀU 1., 4.1., 5.2.1.)
+        # Drive sẽ nhả loạn số + ăn chữ tiếng Việt
+        # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        if ext in (".docx", ".doc"):
+            logger.info(f"DOCX detected — parsing directly (no Drive)")
+            try:
+                if ext == ".docx":
+                    parsed = convert_docx_to_markdown(file_bytes, filename)
+                    md_text = parsed.get("document", {}).get("md_content", "")
+                    backend_used = "direct_docx"
+                    logger.info(f"Direct DOCX parse: {len(md_text)} chars")
+
+                    # Nếu DOCX parse ra ít quá (VD: file scan, ảnh) → cũng fallback Drive  
+                    if md_text and _looks_like_low_information_markdown(md_text):
+                        logger.warning(f"DOCX parse returned low info ({len(md_text)} chars), will try Drive OCR")
+                        md_text = ""
+            except Exception as e:
+                logger.warning(f"Direct DOCX parse failed, will try Drive OCR: {e}")
+                md_text = ""
+
+        # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        # STRATEGY 2: Nếu direct parse thất bại hoặc file không phải DOCX
+        # → Google Drive OCR (text export, KHÔNG phải DOCX export)
+        # Text export giữ đúng numbering, DOCX export nhả loạn số
+        # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        if not md_text:
+            logger.info(f"Falling back to Google Drive OCR: {filename}")

-        # Save uploaded content to a temporary file before Drive upload.
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
                tmp.write(file_bytes)
                tmp_path = tmp.name

            try:
                drive = get_drive_service()
-            docx_bytes = ocr_file(
-                drive,
-                tmp_path,
-                source_mime_type=files.content_type,
-                filename=filename,
-                export_mode="docx",
-            )
-
-            md_text = ""
-            if docx_bytes:
-                try:
-                    parsed = convert_docx_to_markdown(docx_bytes, filename)
-                    md_text = parsed.get("document", {}).get("md_content", "")
-                except Exception as parse_exc:
-                    logger.warning(
-                        f"DOCX parse after Drive export failed, fallback to text export: {parse_exc}"
-                    )

-            if not md_text:
-                md_text = ocr_file(
+                # Export TEXT (không phải DOCX!) — text export giữ đúng numbering
+                raw_text = ocr_file(
                    drive,
                    tmp_path,
                    source_mime_type=files.content_type,
                    filename=filename,
                    export_mode="text",
                )
+
+                if raw_text:
+                    from text_to_markdown import text_to_markdown
+                    md_text = text_to_markdown(raw_text)
+                    backend_used = "google_drive_text"
+                    logger.info(f"Drive text export + regex: {len(md_text)} chars")
            finally:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

-        logger.info(f"Drive OCR/export conversion done: {len(md_text)} chars")
+        logger.info(f"Conversion done: {len(md_text)} chars via {backend_used}")

        response = {
            "document": {
@@ -168,19 +155,17 @@ async def convert_file(
            },
            "meta": {
                "low_information": _looks_like_low_information_markdown(md_text),
-                "ocr_attempted": True,
-                "ocr_used": True,
-                "backend": "google_drive",
+                "backend": backend_used,
                "source_content_type": files.content_type,
            },
        }
        if not md_text:
-            response["warning"] = "Google Drive did not extract text for this file."
+            response["warning"] = "Could not extract text from this file."

        return JSONResponse(content=response)
    except Exception as e:
-        logger.error(f"Drive OCR/export conversion failed: {str(e)}")
+        logger.error(f"Conversion failed: {str(e)}")
        return JSONResponse(
            status_code=500,
-            content={"error": f"Drive OCR/export conversion failed: {str(e)}"},
+            content={"error": f"Conversion failed: {str(e)}"},
        )
--- a/docx_converter.py
+++ b/docx_converter.py
@@ -225,16 +225,36 @@ def _paragraph_to_markdown(
    if re.fullmatch(r"Trang\s+\d+(/\d+)?|Page\s+\d+(/\d+)?|\d+\s*-.*-.*|^\d+$", line, re.IGNORECASE):
        return ""

-    # Apply heading styles
+    # ── Detect heading level ──────────────────────────────
+    # Ưu tiên 1: Đọc outlineLvl trực tiếp từ XML (chính xác nhất, Word tự gắn)
+    # Ưu tiên 2: Đọc từ tên Style (Heading1, Heading2...)
+    # Ưu tiên 3: Regex nhận diện cấu trúc pháp lý VN (Phần, Chương, Điều, Mục...)
+    #            CHỈ áp dụng khi style có chứa "heading" hoặc "title"
+    # → Tuyệt đối KHÔNG bịa heading từ text bôi đậm thông thường.
+
    heading_level = 0
+
+    # ── P1: outlineLvl trong XML (ưu tiên cao nhất) ──
+    if ppr is not None:
+        outline_el = ppr.find("w:outlineLvl", NS)
+        if outline_el is not None:
+            try:
+                raw_lvl = int(outline_el.get(f"{{{WORD_NS}}}val", "9"))
+                if 0 <= raw_lvl <= 5:
+                    heading_level = raw_lvl + 1  # outlineLvl 0 = H1, 1 = H2, ...
+            except ValueError:
+                pass
+
+    # ── P2: Style name (nếu outlineLvl không có) ──
+    if heading_level == 0:
        style_lower = style_name.lower()
-    if "heading" in style_lower:
+        if "heading" in style_lower or "title" in style_lower:
            for i in range(1, 7):
                if str(i) in style_name:
                    heading_level = i
                    break
-        if heading_level == 0 and ("title" in style_lower or "heading" in style_lower):
-            heading_level = 1
+            if heading_level == 0:
+                heading_level = 1  # Generic "Heading" hoặc "Title" → H1

    if heading_level > 0:
        return f"{'#' * heading_level} {line}"
@@ -260,6 +280,17 @@ def _paragraph_to_markdown(
            numbering_map.get(num_id, {}).get(ilvl, {}).get("start", 1) - 1,
        ) + 1

+        # ── CRITICAL: Reset sub-level counters when parent level changes ──
+        # Khi ĐIỀU 4 (ilvl 0) tăng counter, phải reset ilvl 1,2,3...
+        # Nếu không: 4.1 thành 4.10 vì counter ilvl 1 vẫn tiếp tục từ ĐIỀU trước
+        current_ilvl_int = int(ilvl)
+        if num_id in numbering_map:
+            for sub_ilvl in list(numbering_map[num_id].keys()):
+                if int(sub_ilvl) > current_ilvl_int:
+                    sub_key = f"{num_id}_{sub_ilvl}"
+                    if sub_key in list_counters:
+                        del list_counters[sub_key]
+
        # Render the full numbering text
        rendered = _render_lvl_text(
            lvl_text, list_counters, num_id, ilvl, numbering_map
@@ -370,6 +401,9 @@ def convert_docx_to_markdown(file_bytes: bytes, filename: str) -> dict:
        md_content = md_content.replace("\n\n\n", "\n\n")
    md_content = md_content.strip()

+    # ── Post-processing: fix heading levels dựa trên nội dung text ──
+    md_content = _postprocess_legal_headings(md_content)
+
    return {
        "document": {
            "filename": filename,
@@ -378,6 +412,79 @@ def convert_docx_to_markdown(file_bytes: bytes, filename: str) -> dict:
    }


+# ───────────────────────────────────────────────────────────────
+# Post-processing: regex-based heading detection on the markdown
+# ───────────────────────────────────────────────────────────────
+
+# NOTE(review): _postprocess_legal_headings uses its own inline regexes and
+# does not reference any of the four patterns below — confirm whether some
+# other caller needs them.
+
+# A whole line starting with "ĐIỀU n" / "Điều n", optionally preceded by '#'
+# heading markers and wrapped in up to three '*' emphasis markers.
+_RE_DIEU = re.compile(
+    r"^(?:#+\s+)?(?:\*{0,3})((?:ĐIỀU|Điều)\s+\d+[\.\:]?\s*.+?)(?:\*{0,3})\s*$",
+    re.MULTILINE | re.UNICODE,
+)
+# A whole line starting with a two-level number ("X.Y" or "X.Y.") plus text.
+_RE_H2_NUM = re.compile(
+    r"^(?:#+\s+)?(?:\*{0,3})(\d+\.\d+\.?\s+.+?)(?:\*{0,3})\s*$",
+    re.MULTILINE | re.UNICODE,
+)
+# A whole line starting with a three-level number ("X.Y.Z" or "X.Y.Z.") plus text.
+_RE_H3_NUM = re.compile(
+    r"^(?:#+\s+)?(?:\*{0,3})(\d+\.\d+\.\d+\.?\s+.+?)(?:\*{0,3})\s*$",
+    re.MULTILINE | re.UNICODE,
+)
+# A whole line made only of characters from the class below.
+# NOTE(review): the À-Ỹ range spans U+00C0..U+1EF8, which also contains
+# lower-case accented letters, so this is broader than "all caps".
+_RE_ALL_CAPS_LINE = re.compile(
+    r"^(?:#+\s+)?(?:\*{0,3})([A-ZÀ-Ỹ\s\-–:,\.]+)(?:\*{0,3})\s*$",
+    re.MULTILINE | re.UNICODE,
+)
+
+
+def _postprocess_legal_headings(md: str) -> str:
+    """
+    Re-assign markdown heading levels from the text of each line.
+
+    Every non-blank line of at most 150 visible characters is inspected after
+    removing leading '#' markers and '*' emphasis markers:
+
+      - "ĐIỀU n" / "Điều n" (any letter case)    -> "# ..."
+      - "X.Y.Z." numbered clause                 -> "### ..."
+      - "X.Y." numbered clause                   -> "## ..."
+      - all letters upper-case, under 100 chars,
+        not starting with "|" or "-"             -> "# ..."
+
+    The numbered rules are tested before the ALL CAPS rule so that an
+    upper-case sub-clause such as "4.1. GIÁ TRỊ" stays a sub-heading instead
+    of being promoted to H1. A line that matches nothing is returned
+    unchanged, including any heading marker it already had.
+
+    Args:
+        md: Markdown text.
+
+    Returns:
+        The markdown with heading markers rewritten line by line.
+    """
+    # "(?:...|$)" lets a line that is exactly "Điều 3" count as an article heading.
+    article_re = re.compile(
+        r"^(?:ĐIỀU|Điều)\s+\d+(?:[\.\:\s]|$)", re.UNICODE | re.IGNORECASE
+    )
+    h3_re = re.compile(r"^\d+\.\d+\.\d+\.?\s")
+    h2_re = re.compile(r"^\d+\.\d+\.?\s")
+
+    result = []
+    for line in md.split("\n"):
+        stripped = line.strip()
+        if not stripped:
+            result.append(line)
+            continue
+
+        # Drop existing heading markers so the level can be re-evaluated.
+        clean = re.sub(r"^#+\s+", "", stripped)
+        # Drop bold/italic markers to look at the bare text.
+        text_only = re.sub(r"\*{1,3}", "", clean).strip()
+
+        if not text_only or len(text_only) > 150:
+            result.append(line)
+            continue
+
+        if article_re.match(text_only):
+            level = 1
+        elif h3_re.match(text_only):
+            level = 3
+        elif h2_re.match(text_only):
+            level = 2
+        else:
+            letters = [c for c in text_only if c.isalpha()]
+            is_caps_title = (
+                bool(letters)
+                and all(c == c.upper() for c in letters)
+                and len(text_only) < 100
+                # Table rows and list items are never titles.
+                and not text_only.startswith(("|", "-"))
+            )
+            level = 1 if is_caps_title else 0
+
+        result.append(f"{'#' * level} {clean}" if level else line)
+
+    return "\n".join(result)
+
+
 def _extract_headers_footers(z: zipfile.ZipFile) -> List[str]:
    """Extract text from document headers and footers."""
    parts = []

--- a/final_test.txt
+++ b/final_test.txt
 BTesting test/67_formatted.docx
--- a/format_contract.py
+++ b/format_contract.py
+"""
+format_contract.py
+------------------
+Restore heading styles in a DOCX file produced by Google Drive OCR export.
+Google Drive OCR tends to strip both heading styles AND bold formatting.
+
+Heading detection (bold is not required), for text shorter than 120 characters,
+checked in this order:
+  - Regex: "X.Y.Z." (sub-sub-section)                     →  Heading 3
+  - Regex: "ĐIỀU X." / "Chương X" / "Phần X" / "Mục X"    →  Heading 1
+  - Regex: "X.Y." (sub-section)                           →  Heading 2
+  - ALL CAPS + < 100 characters                           →  Heading 1
+  - Bold + at most 15 words                               →  Heading 2 (fallback)
+"""
+
+import re
+import shutil
+from io import BytesIO
+from pathlib import Path
+
+from docx import Document
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+from lxml import etree
+
+
+# ───────────────────────────────────────────────
+# Regex patterns cho cấu trúc pháp lý VN
+# ───────────────────────────────────────────────
+
+# Heading 1: "ĐIỀU 1.", "Điều 1.", "CHƯƠNG I", "Phần 1", etc.
+# Keywords are matched case-insensitively, but a Roman numeral must be
+# upper-case and a whole token: under IGNORECASE a bare [IVXLCDM] class also
+# accepts "m", "l", "c", "d", which turned ordinary lines such as
+# "Phần mềm ..." or "Mục lục" into Heading 1.
+H1_PATTERNS = re.compile(
+    r"^(?:"
+    r"(?:ĐIỀU|Điều)\s+\d+[\.\:\s]"                               # Điều 1. / ĐIỀU 1:
+    r"|(?:CHƯƠNG|Chương)\s+(?:\d+|(?-i:[IVXLCDM]+)(?!\w))"       # Chương I / CHƯƠNG 2
+    r"|(?:PHẦN|Phần)\s+(?:\d+|(?-i:[IVXLCDM]+)(?!\w))"           # Phần I / PHẦN 2
+    r"|(?:MỤC|Mục)\s+(?:\d+|(?-i:[IVXLCDM]+)(?!\w))"             # Mục I / MỤC 2
+    r")",
+    re.IGNORECASE | re.UNICODE,
+)
+
+# Heading 2: "1.1.", "2.3.", "10.26." etc.
+H2_PATTERNS = re.compile(
+    r"^\d+\.\d+\.?\s",
+    re.UNICODE,
+)
+
+# Heading 3: "1.1.1.", "2.3.4." etc.
+H3_PATTERNS = re.compile(
+    r"^\d+\.\d+\.\d+\.?\s",
+    re.UNICODE,
+)
+
+
+# ───────────────────────────────────────────────
+# Helpers
+# ───────────────────────────────────────────────
+
+def _is_all_caps(text: str) -> bool:
+    """Return True when text has at least one letter and every letter equals its upper() form."""
+    found_letter = False
+    for ch in text:
+        if not ch.isalpha():
+            continue
+        if ch != ch.upper():
+            return False
+        found_letter = True
+    return found_letter
+
+
+def _para_is_bold(para) -> bool:
+    """Return True when at least one run with non-blank text is bold."""
+    for run in para.runs:
+        # Whitespace-only runs are ignored; a bold of None/False is not bold.
+        if run.text.strip() and run.bold:
+            return True
+    return False
+
+
+def _ensure_heading_style(doc, style_id: str, style_name: str, outline_level: int, font_size: int):
+    """
+    Append a bold paragraph style based on "Normal" to the document's styles
+    element, unless a w:style with the same w:styleId is already present.
+
+    The values of outline_level and font_size are written verbatim into the
+    w:outlineLvl and w:sz / w:szCs attributes of the generated XML.
+    """
+    styles_root = doc.styles.element
+    id_attr = qn("w:styleId")
+    already_defined = any(
+        node.get(id_attr) == style_id
+        for node in styles_root.findall(qn("w:style"))
+    )
+    if already_defined:
+        return
+
+    xml = f"""<w:style xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+        w:type="paragraph" w:styleId="{style_id}">
+      <w:name w:val="{style_name}"/>
+      <w:basedOn w:val="Normal"/>
+      <w:next w:val="Normal"/>
+      <w:pPr>
+        <w:outlineLvl w:val="{outline_level}"/>
+        <w:spacing w:before="240" w:after="120"/>
+      </w:pPr>
+      <w:rPr>
+        <w:b/>
+        <w:sz w:val="{font_size}"/>
+        <w:szCs w:val="{font_size}"/>
+      </w:rPr>
+    </w:style>"""
+    new_style = etree.fromstring(xml)
+    styles_root.append(new_style)
+
+
+def _set_outline_level(para, level: int):
+    """
+    Replace any existing w:outlineLvl on the paragraph with one set to level.
+
+    The new element is inserted before w:rPr when the paragraph properties
+    contain one, otherwise before w:sectPr, otherwise appended at the end.
+    """
+    pPr = para._p.get_or_add_pPr()
+    for stale in pPr.findall(qn("w:outlineLvl")):
+        pPr.remove(stale)
+
+    outline = OxmlElement("w:outlineLvl")
+    outline.set(qn("w:val"), str(level))
+
+    # w:outlineLvl has to precede w:rPr (and w:sectPr) inside w:pPr.
+    anchor = pPr.find(qn('w:rPr'))
+    if anchor is None:
+        anchor = pPr.find(qn('w:sectPr'))
+
+    if anchor is None:
+        pPr.append(outline)
+    else:
+        anchor.addprevious(outline)
+
+
+def _get_style(doc, primary: str, fallback: str):
+    """Look up doc.styles[primary], then doc.styles[fallback]; return None if both raise KeyError."""
+    for candidate in (primary, fallback):
+        try:
+            return doc.styles[candidate]
+        except KeyError:
+            continue
+    return None
+
+
+# ───────────────────────────────────────────────
+# Core detection logic
+# ───────────────────────────────────────────────
+
+def _detect_heading_level(text: str, is_bold: bool) -> int:
+    """
+    Classify a paragraph's text as a heading level.
+
+    Returns 0 for "not a heading" (also for empty text or text of 120+
+    characters), otherwise 1, 2 or 3.
+    """
+    if not text or len(text) >= 120:
+        return 0
+
+    # Legal-structure regexes come first. H3 is tested ahead of H2 because
+    # an "X.Y.Z." prefix could otherwise be taken for "X.Y.".
+    ordered_patterns = (
+        (H3_PATTERNS, 3),
+        (H1_PATTERNS, 1),
+        (H2_PATTERNS, 2),
+    )
+    for pattern, level in ordered_patterns:
+        if pattern.match(text):
+            return level
+
+    # ALL CAPS counts as H1 even without bold: Google Drive OCR often drops
+    # bold but keeps upper-case letters.
+    if len(text) < 100 and _is_all_caps(text):
+        return 1
+
+    # Fallback: a short bold paragraph (at most 15 words) is treated as H2.
+    if is_bold and len(text.split()) <= 15:
+        return 2
+    return 0
+
+
+# ───────────────────────────────────────────────
+# API: format từ bytes (dùng trong pipeline)
+# ───────────────────────────────────────────────
+
+def fix_headings_in_memory(docx_bytes: bytes) -> bytes:
+    """
+    Apply the heading fixes to a DOCX given as bytes.
+
+    Returns the input bytes untouched when no paragraph was changed;
+    otherwise returns the re-serialized document.
+    """
+    document = Document(BytesIO(docx_bytes))
+    if _apply_heading_fixes(document) == 0:
+        return docx_bytes
+
+    out = BytesIO()
+    document.save(out)
+    return out.getvalue()
+
+
+# ───────────────────────────────────────────────
+# API: format từ file path
+# ───────────────────────────────────────────────
+
+def format_contract(src_path: Path, dst_path: Path) -> int:
+    """
+    Format one DOCX file on disk and return the number of headings fixed.
+
+    The source is copied to dst_path and the copy is rewritten with the
+    heading fixes applied. Passing the same file as source and destination
+    formats it in place.
+    """
+    try:
+        shutil.copy2(src_path, dst_path)
+    except shutil.SameFileError:
+        # In-place formatting: the file is already where it needs to be.
+        pass
+    doc = Document(str(dst_path))
+    count = _apply_heading_fixes(doc)
+    doc.save(str(dst_path))
+    return count
+
+
+# ───────────────────────────────────────────────
+# Internal: áp dụng fix
+# ───────────────────────────────────────────────
+
+def _apply_heading_fixes(doc) -> int:
+    """
+    Scan the document's paragraphs and turn detected headings into styled ones.
+
+    For each paragraph classified as a heading, the paragraph style and
+    w:outlineLvl are overwritten and every run is made bold. Returns the number
+    of paragraphs changed, or 0 when the Heading 1 / Heading 2 styles cannot be
+    resolved.
+    """
+    for style_id, style_name, outline, size in (
+        ("Heading1", "heading 1", 0, 28),
+        ("Heading2", "heading 2", 1, 24),
+        ("Heading3", "heading 3", 2, 22),
+    ):
+        _ensure_heading_style(doc, style_id, style_name, outline, size)
+
+    h1_style = _get_style(doc, "Heading 1", "Heading1")
+    h2_style = _get_style(doc, "Heading 2", "Heading2")
+    h3_style = _get_style(doc, "Heading 3", "Heading3")
+
+    if not (h1_style and h2_style):
+        return 0
+
+    # level -> (style to apply, outlineLvl value); H3 falls back to the H2 style.
+    targets = {
+        1: (h1_style, 0),
+        2: (h2_style, 1),
+        3: (h3_style or h2_style, 2),
+    }
+
+    fixed = 0
+    for para in doc.paragraphs:
+        text = para.text.strip()
+        if not text:
+            continue
+
+        level = _detect_heading_level(text, _para_is_bold(para))
+        if level == 0:
+            continue
+
+        # Always override the existing style and outline level from the text
+        # content (Google Drive often assigns the wrong level, e.g. Heading 2
+        # for everything).
+        style, outline_val = targets[level]
+        para.style = style
+        _set_outline_level(para, outline_val)
+        for run in para.runs:
+            run.bold = True
+        fixed += 1
+
+    return fixed
--- a/out.txt
+++ b/out.txt
 B1 | CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ pymupdf4llm==0.0.17
 google-api-python-client
 google-auth-httplib2
 google-auth-oauthlib
+python-docx
+lxml
--- a/static/index.html
+++ b/static/index.html
--- a/test/67_formatted.docx
+++ b/test/67_formatted.docx
--- a/test_drive_ocr.txt
+++ b/test_drive_ocr.txt
 BTesting test/67_formatted.docx
--- a/test_e2e.py
+++ b/test_e2e.py
+"""Test E2E: kiểm tra cấu trúc ĐIỀU / 1.1 / 1.2.1 có ra đúng heading không."""
+import sys
+import io
+# Force UTF-8 output so Vietnamese text prints on consoles with another default encoding.
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+import httpx
+
+filepath = "test/67_formatted.docx"
+with open(filepath, "rb") as f:
+    resp = httpx.post(
+        "http://localhost:8000/v1/convert/file",
+        files={"files": (filepath, f, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")},
+        timeout=60,
+    )
+
+# Fail loudly on an HTTP error: an error payload has no "document" key, so the
+# loop below would otherwise print nothing and look like a pass.
+resp.raise_for_status()
+
+md = resp.json().get("document", {}).get("md_content", "")
+
+# Print only heading lines (#) or lines mentioning "Điều" in any letter case.
+for i, line in enumerate(md.split("\n"), start=1):
+    if line.startswith("#") or "điều" in line.lower():
+        print(f"{i:3d} | {line}")
--- a/test_ocr.txt
+++ b/test_ocr.txt
--- a/test_out.txt
+++ b/test_out.txt
 B# **CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM**
--- a/text_to_markdown.py
+++ b/text_to_markdown.py
+"""
+text_to_markdown.py
+-------------------
+Convert plain text (from Google Drive text export) to structured Markdown.
+Google Drive text export giữ đúng numbering (ĐIỀU 1., 1.1., 4.1., 5.2.1.)
+trong khi DOCX export bị nhả loạn số.
+
+Pipeline:
+  1. Detect tab-separated rows → convert thành markdown tables
+  2. Detect heading patterns (ĐIỀU, X.Y., X.Y.Z., ALL CAPS) → gán #/##/###
+  3. Pass list-like lines ((a), (b), -, • etc.) through unchanged
+  4. Clean up formatting
+"""
+
+import re
+from typing import List, Tuple
+
+
+# ───────────────────────────────────────────────
+# Heading patterns cho hợp đồng pháp lý VN
+# ───────────────────────────────────────────────
+
+# "ĐIỀU n" / "Điều n" article line; the whole line is captured.
+RE_DIEU = re.compile(
+    r"^((?:ĐIỀU|Điều)\s+\d+[\.\:]?\s*.*)$",
+    re.UNICODE,
+)
+
+# "Chương / Phần / Mục" followed by a number or a Roman numeral.
+# Keywords stay case-insensitive, but the Roman numeral must be upper-case and
+# a whole token: under IGNORECASE a bare [IVXLCDM] class also accepts "m", "l",
+# "c", "d", so ordinary lines such as "Phần mềm ..." or "Mục lục" used to match.
+RE_CHUONG = re.compile(
+    r"^((?:CHƯƠNG|Chương|PHẦN|Phần|MỤC|Mục)\s+(?:\d+|(?-i:[IVXLCDM]+)(?!\w))[\.\:]?\s*.*)$",
+    re.UNICODE | re.IGNORECASE,
+)
+
+# X.Y.Z. pattern (H3) - must be checked BEFORE X.Y. because the two prefixes overlap
+RE_H3 = re.compile(r"^(\d+\.\d+\.\d+\.?\s)", re.UNICODE)
+
+# X.Y. pattern (H2)
+RE_H2 = re.compile(r"^(\d+\.\d+\.?\s)", re.UNICODE)
+
+
+def _is_all_caps(text: str) -> bool:
+    """Return True when text has more than three letters and each equals its upper() form."""
+    letter_count = 0
+    for ch in text:
+        if not ch.isalpha():
+            continue
+        if ch != ch.upper():
+            return False
+        letter_count += 1
+    return letter_count > 3
+
+
+def _is_table_block(lines: List[str], start: int) -> Tuple[bool, int]:
+    """
+    Measure the run of consecutive lines, beginning at start, that each
+    contain at least two tab characters.
+
+    Returns (is_table, end_index): is_table is True when the run is at least
+    two lines long, and end_index is the index of the first line after it.
+    """
+    end = start
+    total = len(lines)
+    while end < total and lines[end].count("\t") >= 2:
+        end += 1
+
+    run_length = end - start
+    return run_length >= 2, end
+
+
+def _tab_block_to_markdown_table(lines: List[str]) -> str:
+    """
+    Convert tab-separated lines into a markdown table.
+
+    Cells are stripped, trailing empty cells are dropped, lines left with no
+    cells are skipped, and shorter rows are padded to the widest row. The first
+    row becomes the header. A literal "|" inside a cell is escaped as "\\|" so
+    it cannot be read as a column separator.
+
+    Returns the input lines joined unchanged when no row has at least two
+    columns.
+    """
+    rows = []
+    for line in lines:
+        cells = [cell.strip().replace("|", "\\|") for cell in line.split("\t")]
+        # Drop trailing empty cells.
+        while cells and not cells[-1]:
+            cells.pop()
+        if cells:
+            rows.append(cells)
+
+    max_cols = max((len(row) for row in rows), default=0)
+    if max_cols < 2:
+        return "\n".join(lines)
+
+    def render(cells):
+        padded = cells + [""] * (max_cols - len(cells))
+        return "| " + " | ".join(padded) + " |"
+
+    header, *body = rows
+    separator = "| " + " | ".join(["---"] * max_cols) + " |"
+    return "\n".join([render(header), separator, *(render(row) for row in body)])
+
+
+def _detect_heading(line: str) -> Tuple[int, str]:
+    """
+    Detect a heading level from the text of a line.
+
+    Rules, in order:
+      1. "ĐIỀU X." or "CHƯƠNG / PHẦN / MỤC ..."  -> level 1
+      2. "X.Y.Z." numbered clause               -> level 3
+      3. "X.Y." numbered clause                 -> level 2
+      4. ALL CAPS line shorter than 120 chars that is not a table row,
+         a bullet, or tab-separated             -> level 1
+
+    Numbered clauses are tested before the ALL CAPS rule so that an upper-case
+    sub-clause such as "4.1. GIÁ TRỊ" keeps its sub-heading level instead of
+    being promoted to level 1.
+
+    Returns:
+        (level, stripped_text); level 0 means "not a heading", which is also
+        the result for empty text or text longer than 200 characters.
+    """
+    text = line.strip()
+    if not text or len(text) > 200:
+        return 0, text
+
+    if RE_DIEU.match(text) or RE_CHUONG.match(text):
+        return 1, text
+
+    # H3 before H2: the "X.Y.Z." and "X.Y." prefixes overlap.
+    if RE_H3.match(text):
+        return 3, text
+    if RE_H2.match(text):
+        return 2, text
+
+    row_or_bullet = text.startswith(("|", "-", "•")) or "\t" in text
+    if _is_all_caps(text) and len(text) < 120 and not row_or_bullet:
+        return 1, text
+
+    return 0, text
+
+
+def _clean_line(line: str) -> str:
+    """
+    Normalize one line: a line with two or more tabs is returned untouched
+    (it is treated as a table row); otherwise tabs become spaces and the
+    result is stripped.
+    """
+    is_table_row = line.count("\t") >= 2
+    if not is_table_row:
+        return line.replace("\t", " ").strip()
+    return line
+
+
+def text_to_markdown(text: str) -> str:
+    """
+    Convert plain text (from Google Drive text export) into structured Markdown.
+
+    - Blocks of tab-separated lines become markdown tables, surrounded by
+      blank lines.
+    - Lines consisting only of a 1-3 digit number, or "Trang n" / "Page n"
+      (optionally "n/m"), are dropped as page numbers.
+    - Lines recognized by _detect_heading get a # / ## / ### prefix.
+    - Every other line, including list-like lines such as "(a) ...", is kept
+      as cleaned text.
+    - Runs of three or more newlines are collapsed to a single blank line.
+
+    Returns "" for empty or whitespace-only input.
+    """
+    if not text or not text.strip():
+        return ""
+
+    lines = text.split("\n")
+    result = []
+    i = 0
+
+    while i < len(lines):
+        # A run of tab-separated rows is consumed as one table.
+        is_table, table_end = _is_table_block(lines, i)
+        if is_table:
+            result.extend(["", _tab_block_to_markdown_table(lines[i:table_end]), ""])
+            i = table_end
+            continue
+
+        cleaned = _clean_line(lines[i])
+        i += 1
+
+        if not cleaned:
+            result.append("")
+            continue
+
+        # Filter page numbers / page headers.
+        if re.fullmatch(r"\d{1,3}", cleaned):
+            continue
+        if re.fullmatch(r"Trang\s+\d+(/\d+)?|Page\s+\d+(/\d+)?", cleaned, re.IGNORECASE):
+            continue
+
+        level, text_clean = _detect_heading(cleaned)
+        if level > 0:
+            result.append(f"{'#' * level} {text_clean}")
+        else:
+            # Plain text and list-like lines are emitted unchanged.
+            result.append(cleaned)
+
+    # Collapse excessive blank lines.
+    md = re.sub(r"\n{3,}", "\n\n", "\n".join(result))
+    return md.strip()