Commit 371783cc authored by Vũ Hoàng Anh

feat(ocr): prioritize direct DOCX parse to resolve Google Drive numbering...

feat(ocr): prioritize direct DOCX parse to resolve Google Drive numbering corruption, add text export + regex fallback for PDFs
parent d92639e2
"""
Document Converter API Wrapper.
- Any file type → convert via Google Drive OCR/export
Document Converter API.
- DOCX → direct parse (preserves original numbering)
- PDF/Image → Google Drive OCR → text export → regex heading
"""
import os
import logging
import inspect
import re
import tempfile
import httpx
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
import pymupdf4llm
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from ocr_drive import get_drive_service, ocr_file
from docx_converter import convert_docx_to_markdown
......@@ -25,6 +24,15 @@ app = FastAPI(
description="Wrapper API: Any file type -> Google Drive OCR/export",
)
# ── Serve static UI ──
# The bundled front-end is mounted only when a "static" folder sits next to
# this module; otherwise the mount is skipped.
_static_dir = os.path.join(os.path.dirname(__file__), "static")
if os.path.isdir(_static_dir):
    app.mount(
        "/static",
        StaticFiles(directory=_static_dir, html=True),
        name="static",
    )
@app.get("/")
async def root():
    """Redirect the site root to the static UI entry page."""
    landing_page = "/static/index.html"
    return RedirectResponse(url=landing_page)
def _looks_like_low_information_markdown(md_text: str) -> bool:
"""
......@@ -51,49 +59,6 @@ def _looks_like_low_information_markdown(md_text: str) -> bool:
return len(separator_lines) >= 3 and len(meaningful_lines) <= 1
def _convert_pdf_markdown_best_effort(doc):
    """
    Convert a PDF to markdown, retrying once with OCR if the result looks empty.

    Only keyword arguments that the installed ``pymupdf4llm.to_markdown``
    actually accepts are passed, so the call works across library versions.

    Returns:
        A ``(markdown, meta)`` tuple. ``meta`` carries the keys
        ``low_information``, ``ocr_attempted``, ``ocr_used`` and ``ocr_error``.
    """
    supported = inspect.signature(pymupdf4llm.to_markdown).parameters

    first_pass_kwargs = {"show_progress": False} if "show_progress" in supported else {}
    md_text = pymupdf4llm.to_markdown(doc, **first_pass_kwargs)

    meta = {
        "low_information": _looks_like_low_information_markdown(md_text),
        "ocr_attempted": False,
        "ocr_used": False,
        "ocr_error": None,
    }

    # The OCR retry only makes sense when the first pass is likely useless
    # and the library exposes the switch.
    if not (meta["low_information"] and "use_ocr" in supported):
        return md_text, meta

    meta["ocr_attempted"] = True
    retry_kwargs = {**first_pass_kwargs, "use_ocr": True}
    if "ocr_language" in supported:
        retry_kwargs["ocr_language"] = os.getenv("OCR_LANGUAGE", "vie+eng")

    try:
        ocr_markdown = pymupdf4llm.to_markdown(doc, **retry_kwargs)
        # Keep the OCR output only when it yields more text than the first pass.
        if len(ocr_markdown.strip()) > len(md_text.strip()):
            md_text = ocr_markdown
            meta["ocr_used"] = True
            meta["low_information"] = _looks_like_low_information_markdown(md_text)
    except Exception as exc:
        # Best effort: report the failure and fall back to the first-pass text.
        meta["ocr_error"] = str(exc)

    return md_text, meta
@app.get("/health")
async def health():
    """Liveness probe: always reports an ``ok`` status."""
    payload = {"status": "ok"}
    return payload
......@@ -104,8 +69,11 @@ async def convert_file(
files: UploadFile = File(...),
):
"""
Convert uploaded file to markdown-compatible text using Google Drive OCR/export.
This endpoint intentionally routes every file type through Drive processing.
Convert uploaded file to markdown.
Strategy:
1. DOCX/DOC → parse trực tiếp (giữ đúng numbering gốc, không qua Drive)
2. PDF/Image/Scan → Google Drive OCR → text export → regex heading
"""
filename = files.filename or "unknown"
ext = os.path.splitext(filename)[1].lower() or ".bin"
......@@ -120,46 +88,65 @@ async def convert_file(
)
try:
logger.info(f"Processing file via Google Drive OCR/export: {filename}")
md_text = ""
backend_used = "unknown"
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# STRATEGY 1: DOCX → Parse trực tiếp (không qua Drive)
# Giữ đúng numbering gốc (ĐIỀU 1., 4.1., 5.2.1.)
# Drive sẽ nhả loạn số + ăn chữ tiếng Việt
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
if ext in (".docx", ".doc"):
logger.info(f"DOCX detected — parsing directly (no Drive)")
try:
if ext == ".docx":
parsed = convert_docx_to_markdown(file_bytes, filename)
md_text = parsed.get("document", {}).get("md_content", "")
backend_used = "direct_docx"
logger.info(f"Direct DOCX parse: {len(md_text)} chars")
# Nếu DOCX parse ra ít quá (VD: file scan, ảnh) → cũng fallback Drive
if md_text and _looks_like_low_information_markdown(md_text):
logger.warning(f"DOCX parse returned low info ({len(md_text)} chars), will try Drive OCR")
md_text = ""
except Exception as e:
logger.warning(f"Direct DOCX parse failed, will try Drive OCR: {e}")
md_text = ""
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# STRATEGY 2: Nếu direct parse thất bại hoặc file không phải DOCX
# → Google Drive OCR (text export, KHÔNG phải DOCX export)
# Text export giữ đúng numbering, DOCX export nhả loạn số
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
if not md_text:
logger.info(f"Falling back to Google Drive OCR: {filename}")
# Save uploaded content to a temporary file before Drive upload.
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
tmp.write(file_bytes)
tmp_path = tmp.name
try:
drive = get_drive_service()
docx_bytes = ocr_file(
drive,
tmp_path,
source_mime_type=files.content_type,
filename=filename,
export_mode="docx",
)
md_text = ""
if docx_bytes:
try:
parsed = convert_docx_to_markdown(docx_bytes, filename)
md_text = parsed.get("document", {}).get("md_content", "")
except Exception as parse_exc:
logger.warning(
f"DOCX parse after Drive export failed, fallback to text export: {parse_exc}"
)
if not md_text:
md_text = ocr_file(
# Export TEXT (không phải DOCX!) — text export giữ đúng numbering
raw_text = ocr_file(
drive,
tmp_path,
source_mime_type=files.content_type,
filename=filename,
export_mode="text",
)
if raw_text:
from text_to_markdown import text_to_markdown
md_text = text_to_markdown(raw_text)
backend_used = "google_drive_text"
logger.info(f"Drive text export + regex: {len(md_text)} chars")
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
logger.info(f"Drive OCR/export conversion done: {len(md_text)} chars")
logger.info(f"Conversion done: {len(md_text)} chars via {backend_used}")
response = {
"document": {
......@@ -168,19 +155,17 @@ async def convert_file(
},
"meta": {
"low_information": _looks_like_low_information_markdown(md_text),
"ocr_attempted": True,
"ocr_used": True,
"backend": "google_drive",
"backend": backend_used,
"source_content_type": files.content_type,
},
}
if not md_text:
response["warning"] = "Google Drive did not extract text for this file."
response["warning"] = "Could not extract text from this file."
return JSONResponse(content=response)
except Exception as e:
logger.error(f"Drive OCR/export conversion failed: {str(e)}")
logger.error(f"Conversion failed: {str(e)}")
return JSONResponse(
status_code=500,
content={"error": f"Drive OCR/export conversion failed: {str(e)}"},
content={"error": f"Conversion failed: {str(e)}"},
)
......@@ -225,16 +225,36 @@ def _paragraph_to_markdown(
if re.fullmatch(r"Trang\s+\d+(/\d+)?|Page\s+\d+(/\d+)?|\d+\s*-.*-.*|^\d+$", line, re.IGNORECASE):
return ""
# Apply heading styles
# ── Detect heading level ──────────────────────────────
# Ưu tiên 1: Đọc outlineLvl trực tiếp từ XML (chính xác nhất, Word tự gắn)
# Ưu tiên 2: Đọc từ tên Style (Heading1, Heading2...)
# Ưu tiên 3: Regex nhận diện cấu trúc pháp lý VN (Phần, Chương, Điều, Mục...)
# CHỈ áp dụng khi style có chứa "heading" hoặc "title"
# → Tuyệt đối KHÔNG bịa heading từ text bôi đậm thông thường.
heading_level = 0
# ── P1: outlineLvl trong XML (ưu tiên cao nhất) ──
if ppr is not None:
outline_el = ppr.find("w:outlineLvl", NS)
if outline_el is not None:
try:
raw_lvl = int(outline_el.get(f"{{{WORD_NS}}}val", "9"))
if 0 <= raw_lvl <= 5:
heading_level = raw_lvl + 1 # outlineLvl 0 = H1, 1 = H2, ...
except ValueError:
pass
# ── P2: Style name (nếu outlineLvl không có) ──
if heading_level == 0:
style_lower = style_name.lower()
if "heading" in style_lower:
if "heading" in style_lower or "title" in style_lower:
for i in range(1, 7):
if str(i) in style_name:
heading_level = i
break
if heading_level == 0 and ("title" in style_lower or "heading" in style_lower):
heading_level = 1
if heading_level == 0:
heading_level = 1 # Generic "Heading" hoặc "Title" → H1
if heading_level > 0:
return f"{'#' * heading_level} {line}"
......@@ -260,6 +280,17 @@ def _paragraph_to_markdown(
numbering_map.get(num_id, {}).get(ilvl, {}).get("start", 1) - 1,
) + 1
# ── CRITICAL: Reset sub-level counters when parent level changes ──
# Khi ĐIỀU 4 (ilvl 0) tăng counter, phải reset ilvl 1,2,3...
# Nếu không: 4.1 thành 4.10 vì counter ilvl 1 vẫn tiếp tục từ ĐIỀU trước
current_ilvl_int = int(ilvl)
if num_id in numbering_map:
for sub_ilvl in list(numbering_map[num_id].keys()):
if int(sub_ilvl) > current_ilvl_int:
sub_key = f"{num_id}_{sub_ilvl}"
if sub_key in list_counters:
del list_counters[sub_key]
# Render the full numbering text
rendered = _render_lvl_text(
lvl_text, list_counters, num_id, ilvl, numbering_map
......@@ -370,6 +401,9 @@ def convert_docx_to_markdown(file_bytes: bytes, filename: str) -> dict:
md_content = md_content.replace("\n\n\n", "\n\n")
md_content = md_content.strip()
# ── Post-processing: fix heading levels dựa trên nội dung text ──
md_content = _postprocess_legal_headings(md_content)
return {
"document": {
"filename": filename,
......@@ -378,6 +412,79 @@ def convert_docx_to_markdown(file_bytes: bytes, filename: str) -> dict:
}
# ───────────────────────────────────────────────────────────────
# Post-processing: regex-based heading detection on the converted markdown
# ───────────────────────────────────────────────────────────────
# Every pattern below tolerates an optional leading "#" heading marker and up
# to three "*" emphasis characters on either side, and captures the inner text
# in group 1.
# NOTE(review): none of these constants is referenced by the visible body of
# _postprocess_legal_headings, which uses inline patterns — confirm whether
# they are still used elsewhere.

# Article line: "ĐIỀU N" / "Điều N", optionally followed by "." or ":".
_RE_DIEU = re.compile(
r"^(?:#+\s+)?(?:\*{0,3})((?:ĐIỀU|Điều)\s+\d+[\.\:]?\s*.+?)(?:\*{0,3})\s*$",
re.MULTILINE | re.UNICODE,
)
# Two-level numbering: "X.Y" / "X.Y." followed by whitespace and text.
_RE_H2_NUM = re.compile(
r"^(?:#+\s+)?(?:\*{0,3})(\d+\.\d+\.?\s+.+?)(?:\*{0,3})\s*$",
re.MULTILINE | re.UNICODE,
)
# Three-level numbering: "X.Y.Z" / "X.Y.Z." followed by whitespace and text.
_RE_H3_NUM = re.compile(
r"^(?:#+\s+)?(?:\*{0,3})(\d+\.\d+\.\d+\.?\s+.+?)(?:\*{0,3})\s*$",
re.MULTILINE | re.UNICODE,
)
# Line made only of letters from the class below plus spaces and light punctuation.
# NOTE(review): the "À-Ỹ" range (U+00C0..U+1EF8) also spans lower-case accented
# code points, so this does not strictly mean "upper-case only".
_RE_ALL_CAPS_LINE = re.compile(
r"^(?:#+\s+)?(?:\*{0,3})([A-ZÀ-Ỹ\s\-–:,\.]+)(?:\*{0,3})\s*$",
re.MULTILINE | re.UNICODE,
)
def _postprocess_legal_headings(md: str) -> str:
"""
Post-process markdown: phát hiện cấu trúc pháp lý VN và gán đúng heading.
Chạy regex trên markdown text đã convert, bất kể DOCX heading style.
"""
lines = md.split("\n")
result = []
for line in lines:
stripped = line.strip()
if not stripped:
result.append(line)
continue
# Bỏ heading markers hiện có để đánh giá lại
clean = re.sub(r"^#+\s+", "", stripped)
# Bỏ bold/italic markers để check text gốc
text_only = re.sub(r"\*{1,3}", "", clean).strip()
if not text_only or len(text_only) > 150:
result.append(line)
continue
# ── Priority 1: "ĐIỀU X." pattern → H1 ──
if re.match(r"^(?:ĐIỀU|Điều)\s+\d+[\.\:\s]", text_only, re.UNICODE | re.IGNORECASE):
result.append(f"# {clean}")
continue
# ── Priority 2: ALL CAPS short line → H1 ──
letters = [c for c in text_only if c.isalpha()]
if letters and all(c == c.upper() for c in letters) and len(text_only) < 100:
# Không phải dòng bảng hoặc list
if not text_only.startswith("|") and not text_only.startswith("-"):
result.append(f"# {clean}")
continue
# ── Priority 3: "X.Y.Z." pattern → H3 ──
if re.match(r"^\d+\.\d+\.\d+\.?\s", text_only):
result.append(f"### {clean}")
continue
# ── Priority 4: "X.Y." pattern → H2 ──
if re.match(r"^\d+\.\d+\.?\s", text_only):
result.append(f"## {clean}")
continue
result.append(line)
return "\n".join(result)
def _extract_headers_footers(z: zipfile.ZipFile) -> List[str]:
"""Extract text from document headers and footers."""
parts = []
......
BTesting test/67_formatted.docx
"""
format_contract.py
------------------
Restore heading styles in a DOCX file produced by Google Drive OCR export.
Google Drive OCR often strips both heading styles AND bold formatting.
Heading detection logic (bold is not required), in priority order:
- Regex: "X.Y.Z." (sub-sub-section) → Heading 3
- Regex: "ĐIỀU X." / "Chương X" / "Phần X" / "Mục X" → Heading 1
- Regex: "X.Y." (sub-section) → Heading 2
- ALL CAPS + < 100 characters → Heading 1
- Bold + at most 15 words → Heading 2 (fallback)
Paragraphs of 120 characters or more are never treated as headings.
"""
import re
import shutil
from io import BytesIO
from pathlib import Path
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from lxml import etree
# ───────────────────────────────────────────────
# Regex patterns for Vietnamese legal-document structure
# ───────────────────────────────────────────────
# Heading 1: "ĐIỀU 1.", "Điều 1:", "Điều 1", "CHƯƠNG I", "Phần 1", "Mục II", etc.
# Only the keyword is case-insensitive. The roman numeral must be upper-case
# and must end at a separator or end of line; otherwise ordinary words such as
# "Phần mềm" or "Phần di sản" would be read as "Phần M" / "Phần DI".
H1_PATTERNS = re.compile(
    r"^(?:"
    r"(?i:ĐIỀU)\s+\d+(?=[\.\:\s]|$)"  # Điều 1. / ĐIỀU 1: / Điều 1
    r"|(?i:CHƯƠNG|PHẦN|MỤC)\s+(?:[IVXLCDM]+|\d+)(?=[\s\.\:\-–,;\)]|$)"  # Chương I / PHẦN 2 / Mục 3.
    r")",
    re.UNICODE,
)
# Heading 2: "1.1.", "2.3.", "10.26." etc.
H2_PATTERNS = re.compile(
    r"^\d+\.\d+\.?\s",
    re.UNICODE,
)
# Heading 3: "1.1.1.", "2.3.4." etc.
H3_PATTERNS = re.compile(
    r"^\d+\.\d+\.\d+\.?\s",
    re.UNICODE,
)
# ───────────────────────────────────────────────
# Helpers
# ───────────────────────────────────────────────
def _is_all_caps(text: str) -> bool:
"""True nếu toàn bộ chữ cái trong text đều là hoa."""
letters = [c for c in text if c.isalpha()]
return len(letters) > 0 and all(c == c.upper() for c in letters)
def _para_is_bold(para) -> bool:
"""True nếu đoạn văn có ít nhất 1 run in đậm."""
return any(run.bold for run in para.runs if run.text.strip())
def _ensure_heading_style(doc, style_id: str, style_name: str, outline_level: int, font_size: int):
    """
    Add a bold paragraph heading style to the document unless it already exists.

    Args:
        doc: Document whose ``styles.element`` receives the new ``w:style``.
        style_id: Value for ``w:styleId``; also the key used for the existence check.
        style_name: Value for the style's ``w:name``.
        outline_level: Value written to ``w:outlineLvl``.
        font_size: Value written to ``w:sz`` and ``w:szCs``.
    """
    styles_root = doc.styles.element
    already_defined = any(
        node.get(qn("w:styleId")) == style_id
        for node in styles_root.findall(qn("w:style"))
    )
    if already_defined:
        return

    style_xml = f"""<w:style xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
w:type="paragraph" w:styleId="{style_id}">
<w:name w:val="{style_name}"/>
<w:basedOn w:val="Normal"/>
<w:next w:val="Normal"/>
<w:pPr>
<w:outlineLvl w:val="{outline_level}"/>
<w:spacing w:before="240" w:after="120"/>
</w:pPr>
<w:rPr>
<w:b/>
<w:sz w:val="{font_size}"/>
<w:szCs w:val="{font_size}"/>
</w:rPr>
</w:style>"""
    styles_root.append(etree.fromstring(style_xml))
def _set_outline_level(para, level: int):
    """
    Set the paragraph's ``w:outlineLvl``, replacing any existing one.

    The new element is inserted before ``w:rPr`` when present, otherwise
    before ``w:sectPr``, otherwise appended at the end of ``w:pPr``.
    """
    props = para._p.get_or_add_pPr()
    for stale in props.findall(qn("w:outlineLvl")):
        props.remove(stale)

    node = OxmlElement("w:outlineLvl")
    node.set(qn("w:val"), str(level))

    # w:outlineLvl MUST come before w:rPr, otherwise the text renders garbled.
    # Compare against None explicitly: element truthiness is not reliable here.
    anchor = props.find(qn("w:rPr"))
    if anchor is None:
        anchor = props.find(qn("w:sectPr"))

    if anchor is None:
        props.append(node)
    else:
        anchor.addprevious(node)
def _get_style(doc, primary: str, fallback: str):
"""Lấy style từ document, thử primary trước rồi fallback."""
try:
return doc.styles[primary]
except KeyError:
try:
return doc.styles[fallback]
except KeyError:
return None
# ───────────────────────────────────────────────
# Core detection logic
# ───────────────────────────────────────────────
def _detect_heading_level(text: str, is_bold: bool) -> int:
    """
    Decide the heading level of a paragraph from its text and bold state.

    Returns:
        0 when the paragraph is not a heading, otherwise 1, 2 or 3.
    """
    if not text or len(text) >= 120:
        return 0

    # Priority 1: legal-structure numbering. The three-level pattern is tried
    # first so that "1.1.1." is classified as H3 ahead of the other patterns.
    numbered_rules = (
        (H3_PATTERNS, 3),
        (H1_PATTERNS, 1),
        (H2_PATTERNS, 2),
    )
    for pattern, level in numbered_rules:
        if pattern.match(text):
            return level

    # Priority 2: short ALL CAPS line = H1, bold not required.
    # Google Drive OCR tends to drop bold but keeps the capitalisation.
    if len(text) < 100 and _is_all_caps(text):
        return 1

    # Priority 3 (fallback): bold paragraph of at most 15 words = H2.
    if is_bold and len(text.split()) <= 15:
        return 2

    return 0
# ───────────────────────────────────────────────
# API: format từ bytes (dùng trong pipeline)
# ───────────────────────────────────────────────
def fix_headings_in_memory(docx_bytes: bytes) -> bytes:
    """
    Restore heading styles in a DOCX given as bytes (e.g. a Google Drive OCR export).

    Returns:
        The re-serialised DOCX bytes, or the input bytes untouched when no
        paragraph was changed.
    """
    document = Document(BytesIO(docx_bytes))
    if _apply_heading_fixes(document) == 0:
        return docx_bytes

    out = BytesIO()
    document.save(out)
    return out.getvalue()
# ───────────────────────────────────────────────
# API: format từ file path
# ───────────────────────────────────────────────
def format_contract(src_path: Path, dst_path: Path) -> int:
    """
    Format one DOCX file on disk.

    The source is copied to ``dst_path`` first; the copy is then opened,
    fixed and saved in place.

    Returns:
        The number of paragraphs that were turned into headings.
    """
    target = str(dst_path)
    shutil.copy2(src_path, dst_path)
    document = Document(target)
    fixed = _apply_heading_fixes(document)
    document.save(target)
    return fixed
# ───────────────────────────────────────────────
# Internal: áp dụng fix
# ───────────────────────────────────────────────
def _apply_heading_fixes(doc) -> int:
    """
    Scan the document's paragraphs and assign heading styles.

    Returns:
        The number of paragraphs styled as headings; 0 when the Heading 1 or
        Heading 2 style cannot be resolved.
    """
    for style_id, name, outline, size in (
        ("Heading1", "heading 1", 0, 28),
        ("Heading2", "heading 2", 1, 24),
        ("Heading3", "heading 3", 2, 22),
    ):
        _ensure_heading_style(doc, style_id, name, outline, size)

    h1_style = _get_style(doc, "Heading 1", "Heading1")
    h2_style = _get_style(doc, "Heading 2", "Heading2")
    h3_style = _get_style(doc, "Heading 3", "Heading3")
    if not (h1_style and h2_style):
        return 0

    # Detected level -> (style to apply, outlineLvl value). H3 falls back to
    # the H2 style when no Heading 3 style is available.
    by_level = {
        1: (h1_style, 0),
        2: (h2_style, 1),
        3: (h3_style or h2_style, 2),
    }

    fixed = 0
    for para in doc.paragraphs:
        content = para.text.strip()
        if not content:
            continue

        bold = _para_is_bold(para)
        level = _detect_heading_level(content, bold)
        if level == 0:
            continue

        # Always override style and outlineLvl from the text content: Google
        # Drive often assigns wrong levels (e.g. everything as Heading 2).
        style, outline = by_level[level]
        para.style = style
        _set_outline_level(para, outline)
        for run in para.runs:
            run.bold = True
        fixed += 1

    return fixed
B1 | CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM
This diff is collapsed.
BTesting test/67_formatted.docx
"""E2E test: check that the ĐIỀU / 1.1 / 1.2.1 structure is emitted as headings."""
import sys
import io

# Force UTF-8 output so Vietnamese text prints correctly on any console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

import httpx

filepath = "test/67_formatted.docx"
with open(filepath, "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/v1/convert/file",
        files={"files": (filepath, f, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")},
        timeout=60,
    )

# Fail loudly on an error response instead of silently printing nothing.
resp.raise_for_status()
md = resp.json().get("document", {}).get("md_content", "")

# Print only heading lines (#) or lines mentioning "Điều" in any letter case.
for line_no, line in enumerate(md.split("\n"), start=1):
    if line.startswith("#") or "điều" in line.lower():
        print(f"{line_no:3d} | {line}")
B# **CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM**
"""
text_to_markdown.py
-------------------
Convert plain text (from Google Drive text export) to structured Markdown.
Google Drive text export giữ đúng numbering (ĐIỀU 1., 1.1., 4.1., 5.2.1.)
trong khi DOCX export bị nhả loạn số.
Pipeline:
1. Detect tab-separated rows → convert thành markdown tables
2. Detect heading patterns (ĐIỀU, X.Y., X.Y.Z., ALL CAPS) → gán #/##/###
3. Detect bullet lists (a), (b), -, • etc.
4. Clean up formatting
"""
import re
from typing import List, Tuple
# ───────────────────────────────────────────────
# Heading patterns for Vietnamese legal contracts
# ───────────────────────────────────────────────
RE_DIEU = re.compile(
    r"^((?:ĐIỀU|Điều)\s+\d+[\.\:]?\s*.*)$",
    re.UNICODE,
)
# Only the keyword is case-insensitive. The roman numeral must be upper-case
# and must end at a separator or end of line; otherwise ordinary words such as
# "Phần mềm" or "Mục lục" would be read as "Phần M" / "Mục L".
RE_CHUONG = re.compile(
    r"^((?i:CHƯƠNG|PHẦN|MỤC)\s+(?:[IVXLCDM]+|\d+)(?=[\s\.\:\-–,;\)]|$).*)$",
    re.UNICODE,
)
# X.Y.Z. pattern (H3) - checked BEFORE X.Y. because X.Y.Z also matches X.Y
RE_H3 = re.compile(r"^(\d+\.\d+\.\d+\.?\s)", re.UNICODE)
# X.Y. pattern (H2)
RE_H2 = re.compile(r"^(\d+\.\d+\.?\s)", re.UNICODE)
def _is_all_caps(text: str) -> bool:
"""True nếu toàn bộ chữ cái đều viết hoa."""
letters = [c for c in text if c.isalpha()]
return len(letters) > 3 and all(c == c.upper() for c in letters)
def _is_table_block(lines: List[str], start: int) -> Tuple[bool, int]:
"""
Detect một block tab-separated (bảng) bắt đầu từ dòng start.
Returns (is_table, end_index).
Bảng = ít nhất 2 dòng liên tiếp có >= 2 tabs.
"""
count = 0
i = start
while i < len(lines):
if lines[i].count("\t") >= 2:
count += 1
i += 1
else:
break
return count >= 2, i
def _tab_block_to_markdown_table(lines: List[str]) -> str:
"""Convert tab-separated lines thành markdown table."""
rows = []
max_cols = 0
for line in lines:
cells = [c.strip() for c in line.split("\t")]
# Loại bỏ trailing empty cells
while cells and not cells[-1]:
cells.pop()
if cells:
rows.append(cells)
max_cols = max(max_cols, len(cells))
if not rows or max_cols < 2:
return "\n".join(lines)
# Normalize row lengths
for row in rows:
while len(row) < max_cols:
row.append("")
md_lines = []
# Header row
md_lines.append("| " + " | ".join(rows[0]) + " |")
md_lines.append("| " + " | ".join(["---"] * max_cols) + " |")
# Data rows
for row in rows[1:]:
md_lines.append("| " + " | ".join(row) + " |")
return "\n".join(md_lines)
def _detect_heading(line: str) -> Tuple[int, str]:
    """
    Detect the heading level of a line from its text.

    Returns:
        ``(level, stripped_text)``; level 0 means the line is not a heading.
    """
    text = line.strip()
    if not text or len(text) > 200:
        return 0, text

    # Priority 1-2: "ĐIỀU X." and "CHƯƠNG/PHẦN/MỤC" lines → H1.
    if RE_DIEU.match(text) or RE_CHUONG.match(text):
        return 1, text

    # Priority 3: short ALL CAPS line → H1, excluding table rows, bullets and
    # anything that still contains a tab.
    row_or_bullet = text.startswith(("|", "-", "•")) or "\t" in text
    if len(text) < 120 and not row_or_bullet and _is_all_caps(text):
        return 1, text

    # Priority 4: "X.Y.Z." → H3 (must be tested before "X.Y.").
    if RE_H3.match(text):
        return 3, text

    # Priority 5: "X.Y." → H2.
    if RE_H2.match(text):
        return 2, text

    return 0, text
def _clean_line(line: str) -> str:
"""Clean up a single line: normalize tabs thành spaces cho non-table lines."""
# Giữ nguyên nếu là table (nhiều tabs)
if line.count("\t") >= 2:
return line
# Thay tabs đơn thành spaces cho readability
return line.replace("\t", " ").strip()
def text_to_markdown(text: str) -> str:
    """
    Convert plain text (from Google Drive text export) into structured Markdown.

    - Blocks of tab-separated lines become markdown tables.
    - Lines matching the heading patterns get a # / ## / ### prefix.
    - Lines holding only a 1-3 digit number, or "Trang N" / "Page N"
      (optionally "/M"), are dropped as page numbers.
    - Every other line, "(a) " style list items included, is emitted after
      tab/whitespace clean-up.

    Returns:
        The markdown with runs of blank lines collapsed and outer whitespace
        stripped; "" for empty or whitespace-only input.
    """
    if not text or not text.strip():
        return ""

    source_lines = text.split("\n")
    total = len(source_lines)
    out = []
    pos = 0

    while pos < total:
        # A table block is consumed as a whole and padded with blank lines.
        found_table, stop = _is_table_block(source_lines, pos)
        if found_table:
            out.extend(["", _tab_block_to_markdown_table(source_lines[pos:stop]), ""])
            pos = stop
            continue

        current = _clean_line(source_lines[pos])
        pos += 1

        if not current:
            out.append("")
            continue

        # Filter page numbers / page markers.
        if re.fullmatch(r"\d{1,3}", current):
            continue
        if re.fullmatch(r"Trang\s+\d+(/\d+)?|Page\s+\d+(/\d+)?", current, re.IGNORECASE):
            continue

        level, heading_text = _detect_heading(current)
        if level > 0:
            out.append(f"{'#' * level} {heading_text}")
        else:
            # "(a) ..." list items and normal text are both kept as-is.
            out.append(current)

    # Collapse any run of three or more newlines down to one blank line.
    md = re.sub(r"\n{3,}", "\n\n", "\n".join(out))
    return md.strip()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment