Commit b9f8d512 authored by Vũ Hoàng Anh's avatar Vũ Hoàng Anh

Initial commit

parents
# Python bytecode and caches
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
# Virtual environments
env/
venv/
.venv/
# Local test artifacts and secrets (never commit credentials)
*.pdf
ocr_service_account.json
.env
# ──────────────────────────────────────
# Stage 1: Build dependencies
# ──────────────────────────────────────
FROM python:3.12-alpine AS builder
# Build toolchain needed to compile wheels on musl (PyMuPDF, lxml, etc.)
RUN apk add --no-cache gcc musl-dev gcompat libstdc++
WORKDIR /build
COPY requirements.txt .
# Install into an isolated prefix so the runtime stage copies a clean tree
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
# ──────────────────────────────────────
# Stage 2: Runtime (minimal)
# ──────────────────────────────────────
FROM python:3.12-alpine AS runtime
LABEL maintainer="anhvh"
LABEL description="Document Converter API - PDF→Docling, DOCX→python-docx"
# Create non-root user
RUN addgroup -S appgroup && adduser -S appuser -G appgroup
# Install runtime dependencies for PyMuPDF
RUN apk add --no-cache libstdc++ gcompat
WORKDIR /app
# Copy installed packages from builder
COPY --from=builder /install /usr/local
# Copy source code.
# Fix: app.py does `from ocr_drive import ...`, but ocr_drive.py was never
# copied into the image, so the container only worked when docker-compose
# bind-mounted the sources. Copy it so the image is self-contained.
COPY app.py .
COPY docx_converter.py .
COPY ocr_drive.py .
# Switch to non-root user
USER appuser
EXPOSE 8000
# Probe /health with stdlib urllib (no curl/wget in the alpine base image)
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
"""
Document Converter API Wrapper.
- Any file type → convert via Google Drive OCR/export
"""
import os
import logging
import inspect
import re
import tempfile
import httpx
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
import pymupdf4llm
from ocr_drive import get_drive_service, ocr_file
from docx_converter import convert_docx_to_markdown
# Module-level logger used by all endpoints in this service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("doc-converter")
# FastAPI application instance; served by uvicorn (see Dockerfile CMD).
app = FastAPI(
    title="Document Converter API",
    version="1.0.0",
    description="Wrapper API: Any file type -> Google Drive OCR/export",
)
def _looks_like_low_information_markdown(md_text: str) -> bool:
"""
Heuristic to detect markdown output that has almost no meaningful text.
Typical examples are repeated page separators like '-----'.
"""
if not md_text:
return True
lines = [line.strip() for line in md_text.splitlines() if line.strip()]
if not lines:
return True
meaningful_lines = [line for line in lines if re.sub(r"[-=\s]", "", line)]
if not meaningful_lines:
return True
# Must contain at least one word-like character to be considered useful text.
has_word_content = any(re.search(r"\w", line, flags=re.UNICODE) for line in meaningful_lines)
if not has_word_content:
return True
separator_lines = [line for line in lines if re.fullmatch(r"[-=]{3,}", line)]
return len(separator_lines) >= 3 and len(meaningful_lines) <= 1
def _convert_pdf_markdown_best_effort(doc):
    """
    Convert a PDF to markdown via pymupdf4llm, retrying once with OCR
    when the first pass looks empty.

    Args:
        doc: Whatever pymupdf4llm.to_markdown accepts (path or document).

    Returns:
        (md_text, meta) where meta reports the low-information heuristic
        plus whether OCR was attempted/used and any OCR error string.
    """
    # Probe the installed pymupdf4llm version: older releases do not accept
    # show_progress / use_ocr / ocr_language, so pass only supported kwargs.
    sig = inspect.signature(pymupdf4llm.to_markdown)
    params = sig.parameters
    base_kwargs = {}
    if "show_progress" in params:
        base_kwargs["show_progress"] = False
    md_text = pymupdf4llm.to_markdown(doc, **base_kwargs)
    low_information = _looks_like_low_information_markdown(md_text)
    ocr_attempted = False
    ocr_used = False
    ocr_error = None
    # Retry once with OCR only when the first pass is likely useless.
    if low_information and "use_ocr" in params:
        ocr_attempted = True
        ocr_kwargs = dict(base_kwargs)
        ocr_kwargs["use_ocr"] = True
        if "ocr_language" in params:
            # Default favors Vietnamese + English; override via OCR_LANGUAGE.
            ocr_kwargs["ocr_language"] = os.getenv("OCR_LANGUAGE", "vie+eng")
        try:
            md_text_ocr = pymupdf4llm.to_markdown(doc, **ocr_kwargs)
            # Keep the OCR result only when it yields strictly more text.
            if len(md_text_ocr.strip()) > len(md_text.strip()):
                md_text = md_text_ocr
                ocr_used = True
                low_information = _looks_like_low_information_markdown(md_text)
        except Exception as exc:
            # Best-effort: keep the non-OCR text and report the failure.
            ocr_error = str(exc)
    return md_text, {
        "low_information": low_information,
        "ocr_attempted": ocr_attempted,
        "ocr_used": ocr_used,
        "ocr_error": ocr_error,
    }
@app.get("/health")
async def health():
    """Liveness probe used by the Docker HEALTHCHECK and compose healthcheck."""
    return {"status": "ok"}
@app.post("/v1/convert/file")
async def convert_file(
    files: UploadFile = File(...),
):
    """
    Convert an uploaded file to markdown text using Google Drive OCR/export.

    Every file type is deliberately routed through Drive processing:
    upload -> Google Docs conversion (OCR for scans) -> DOCX export ->
    markdown via docx_converter, with a plain-text export as fallback
    when the DOCX parse fails or yields nothing.
    """
    filename = files.filename or "unknown"
    ext = os.path.splitext(filename)[1].lower() or ".bin"
    file_bytes = await files.read()
    # Fix: the original log messages printed the literal string "(unknown)"
    # instead of the uploaded file's actual name.
    logger.info(
        f"Received file: {filename} ({len(file_bytes)} bytes, ext={ext}, "
        f"content_type={files.content_type})"
    )
    if not file_bytes:
        return JSONResponse(
            status_code=400,
            content={"error": "Empty file upload."},
        )
    try:
        logger.info(f"Processing file via Google Drive OCR/export: {filename}")
        # Save uploaded content to a temporary file before Drive upload.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp.write(file_bytes)
            tmp_path = tmp.name
        try:
            drive = get_drive_service()
            # First attempt: DOCX export so formatting survives conversion.
            docx_bytes = ocr_file(
                drive,
                tmp_path,
                source_mime_type=files.content_type,
                filename=filename,
                export_mode="docx",
            )
            md_text = ""
            if docx_bytes:
                try:
                    parsed = convert_docx_to_markdown(docx_bytes, filename)
                    md_text = parsed.get("document", {}).get("md_content", "")
                except Exception as parse_exc:
                    logger.warning(
                        f"DOCX parse after Drive export failed, fallback to text export: {parse_exc}"
                    )
            if not md_text:
                # Fallback: plain-text export straight from Google Docs.
                md_text = ocr_file(
                    drive,
                    tmp_path,
                    source_mime_type=files.content_type,
                    filename=filename,
                    export_mode="text",
                )
        finally:
            # Always remove the temp file, even when Drive processing fails.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        logger.info(f"Drive OCR/export conversion done: {len(md_text)} chars")
        response = {
            "document": {
                "filename": filename,
                "md_content": md_text,
            },
            "meta": {
                "low_information": _looks_like_low_information_markdown(md_text),
                # Drive's Google-Docs conversion always applies OCR to scans.
                "ocr_attempted": True,
                "ocr_used": True,
                "backend": "google_drive",
                "source_content_type": files.content_type,
            },
        }
        if not md_text:
            response["warning"] = "Google Drive did not extract text for this file."
        return JSONResponse(content=response)
    except Exception as e:
        logger.error(f"Drive OCR/export conversion failed: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Drive OCR/export conversion failed: {str(e)}"},
        )
# Development compose file: bind-mounts the sources and runs uvicorn --reload.
services:
  doc-converter:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: doc-converter
    restart: unless-stopped
    ports:
      - "5005:8000"
    dns:
      - 8.8.8.8 # Google public DNS
      - 1.1.1.1 # Cloudflare public DNS
    volumes:
      # Bind-mount sources so --reload picks up local edits without rebuilds.
      - ./app.py:/app/app.py
      - ./docx_converter.py:/app/docx_converter.py
      - ./ocr_drive.py:/app/ocr_drive.py
      # Service-account credentials are mounted, never baked into the image.
      - ./ocr_service_account.json:/app/ocr_service_account.json
    extra_hosts:
      - "host.docker.internal:host-gateway"
    command: ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
    networks:
      doc-net:
        ipv4_address: 10.10.1.2
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 5s
networks:
  doc-net:
    driver: bridge
    ipam:
      config:
        - subnet: 10.10.1.0/24
          gateway: 10.10.1.1
"""
DOCX to Markdown converter using raw XML parsing.
Extracts paragraphs, tables, headers/footers — preserves 100% content.
Supports lvlText templates (e.g. "Điều %1.") for correct numbering.
"""
import re
import zipfile
import xml.etree.ElementTree as ET
from io import BytesIO
from typing import Dict, List, Tuple, Optional
WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
NS = {"w": WORD_NS}
def _get_numbering_map(z: zipfile.ZipFile) -> dict:
    """
    Parse word/numbering.xml into a numId -> ilvl -> level-info lookup.

    Returns: {numId: {ilvl: {"fmt": ..., "lvlText": ..., "start": ...}}};
    an empty dict when the archive has no numbering part.
    """
    try:
        raw = z.read("word/numbering.xml")
    except KeyError:
        return {}
    root = ET.fromstring(raw)
    w = f"{{{WORD_NS}}}"
    # Pass 1: abstractNumId -> per-level definitions.
    abstract_defs = {}
    for abstract in root.findall(".//w:abstractNum", NS):
        per_level = {}
        for lvl in abstract.findall("w:lvl", NS):
            fmt_el = lvl.find("w:numFmt", NS)
            text_el = lvl.find("w:lvlText", NS)
            start_el = lvl.find("w:start", NS)
            per_level[lvl.get(w + "ilvl")] = {
                "fmt": fmt_el.get(w + "val") if fmt_el is not None else "bullet",
                "lvlText": text_el.get(w + "val") if text_el is not None else "%1.",
                "start": int(start_el.get(w + "val")) if start_el is not None else 1,
            }
        abstract_defs[abstract.get(w + "abstractNumId")] = per_level
    # Pass 2: resolve each concrete numId to its abstract definition.
    mapping = {}
    for num in root.findall(".//w:num", NS):
        ref = num.find("w:abstractNumId", NS)
        if ref is None:
            continue
        abstract_id = ref.get(w + "val")
        if abstract_id in abstract_defs:
            mapping[num.get(w + "numId")] = abstract_defs[abstract_id]
    return mapping
def _format_number(value: int, fmt: str) -> str:
"""Convert a number to the specified format."""
if fmt == "decimal":
return str(value)
elif fmt == "lowerLetter":
return chr(ord('a') + value - 1) if 1 <= value <= 26 else str(value)
elif fmt == "upperLetter":
return chr(ord('A') + value - 1) if 1 <= value <= 26 else str(value)
elif fmt == "lowerRoman":
romans = [(1000,'m'),(900,'cm'),(500,'d'),(400,'cd'),(100,'c'),(90,'xc'),
(50,'l'),(40,'xl'),(10,'x'),(9,'ix'),(5,'v'),(4,'iv'),(1,'i')]
result = ""
for val, numeral in romans:
while value >= val:
result += numeral
value -= val
return result
elif fmt == "upperRoman":
romans = [(1000,'M'),(900,'CM'),(500,'D'),(400,'CD'),(100,'C'),(90,'XC'),
(50,'L'),(40,'XL'),(10,'X'),(9,'IX'),(5,'V'),(4,'IV'),(1,'I')]
result = ""
for val, numeral in romans:
while value >= val:
result += numeral
value -= val
return result
else:
return str(value)
def _render_lvl_text(
    lvl_text: str,
    counters: Dict[str, int],
    num_id: str,
    current_ilvl: str,
    numbering_map: dict,
) -> str:
    """
    Expand a lvlText template such as 'Điều %1.' or '%1.%2.' by replacing
    each %N placeholder with the counter value of level N-1, rendered in
    that level's numbering format.
    """
    rendered = lvl_text
    for placeholder in re.finditer(r"%(\d+)", lvl_text):
        # %1 refers to ilvl 0, %2 to ilvl 1, and so on.
        level_key = str(int(placeholder.group(1)) - 1)
        count = counters.get(f"{num_id}_{level_key}", 1)
        # Look up the referenced level's format; default to decimal.
        level_info = numbering_map.get(num_id, {}).get(level_key)
        fmt = level_info["fmt"] if level_info else "decimal"
        rendered = rendered.replace(
            placeholder.group(0), _format_number(count, fmt), 1
        )
    return rendered
def _extract_run_text(run) -> Tuple[str, bool, bool]:
    """Return (text, bold, italic) for a w:r run element."""
    text = "".join(t.text for t in run.findall(".//w:t", NS) if t.text)

    def _flag_on(props, tag: str) -> bool:
        # A present w:b / w:i element means "on" unless w:val says false.
        el = props.find(tag, NS)
        if el is None:
            return False
        return el.get(f"{{{WORD_NS}}}val", "true").lower() != "false"

    rpr = run.find("w:rPr", NS)
    if rpr is None:
        return text, False, False
    return text, _flag_on(rpr, "w:b"), _flag_on(rpr, "w:i")
def _merge_runs(runs_data: List[Tuple[str, bool, bool]]) -> str:
    """
    Render runs as markdown, coalescing consecutive runs that share the
    same bold/italic flags so no ****artifacts**** appear.
    """
    # Coalesce adjacent runs with identical formatting into one chunk.
    chunks: List[Tuple[str, bool, bool]] = []
    for text, bold, italic in runs_data:
        if not text:
            continue
        if chunks and (bold, italic) == (chunks[-1][1], chunks[-1][2]):
            prev_text = chunks[-1][0]
            chunks[-1] = (prev_text + text, bold, italic)
        else:
            chunks.append((text, bold, italic))
    # Emit markdown emphasis per chunk.
    rendered = []
    for text, bold, italic in chunks:
        if bold and italic:
            rendered.append(f"***{text}***")
        elif bold:
            rendered.append(f"**{text}**")
        elif italic:
            rendered.append(f"*{text}*")
        else:
            rendered.append(text)
    return "".join(rendered)
def _paragraph_to_markdown(
    para,
    numbering_map: dict,
    list_counters: dict,
) -> str:
    """
    Convert a single w:p paragraph element to one markdown line.

    Args:
        para: The w:p element.
        numbering_map: Output of _get_numbering_map (numId -> ilvl -> info).
        list_counters: Mutable {"numId_ilvl": count} state; incremented here
            so numbered lists continue across successive paragraphs.

    Returns:
        The markdown line, or "" for an empty paragraph.
    """
    # Extract paragraph style and (optional) list-numbering properties.
    ppr = para.find("w:pPr", NS)
    style_name = ""
    num_id = None
    ilvl = "0"
    if ppr is not None:
        style_el = ppr.find("w:pStyle", NS)
        if style_el is not None:
            style_name = style_el.get(f"{{{WORD_NS}}}val", "")
        # Check for list numbering
        num_pr = ppr.find("w:numPr", NS)
        if num_pr is not None:
            num_id_el = num_pr.find("w:numId", NS)
            ilvl_el = num_pr.find("w:ilvl", NS)
            if num_id_el is not None:
                num_id = num_id_el.get(f"{{{WORD_NS}}}val")
            if ilvl_el is not None:
                ilvl = ilvl_el.get(f"{{{WORD_NS}}}val", "0")
    # Process all runs — collect then merge
    runs_data = []
    for run in para.findall(".//w:r", NS):
        text, bold, italic = _extract_run_text(run)
        if text:
            runs_data.append((text, bold, italic))
    line = _merge_runs(runs_data).strip()
    if not line:
        return ""
    # Apply heading styles: a style containing "heading" plus a digit 1-6
    # maps to that markdown level; "title" or digit-less heading maps to #.
    heading_level = 0
    style_lower = style_name.lower()
    if "heading" in style_lower:
        for i in range(1, 7):
            if str(i) in style_name:
                heading_level = i
                break
    if heading_level == 0 and ("title" in style_lower or "heading" in style_lower):
        heading_level = 1
    if heading_level > 0:
        return f"{'#' * heading_level} {line}"
    # Apply list formatting with lvlText support
    if num_id is not None and num_id != "0":
        indent = " " * int(ilvl)
        fmt = "bullet"
        lvl_text = "%1."
        if num_id in numbering_map and ilvl in numbering_map[num_id]:
            lvl_info = numbering_map[num_id][ilvl]
            fmt = lvl_info["fmt"]
            lvl_text = lvl_info["lvlText"]
        if fmt == "bullet":
            return f"{indent}- {line}"
        # Increment counter for current level; seeding with start-1 makes
        # the first item render as the level's declared start value.
        counter_key = f"{num_id}_{ilvl}"
        list_counters[counter_key] = list_counters.get(
            counter_key,
            numbering_map.get(num_id, {}).get(ilvl, {}).get("start", 1) - 1,
        ) + 1
        # Render the full numbering text
        rendered = _render_lvl_text(
            lvl_text, list_counters, num_id, ilvl, numbering_map
        )
        return f"{indent}{rendered} {line}"
    return line
def _table_to_markdown(table) -> str:
    """Render a w:tbl element as a markdown table (first row becomes the header)."""
    rows = table.findall(".//w:tr", NS)
    if not rows:
        return ""
    grid: List[List[str]] = []
    width = 0
    for tr in rows:
        row_cells = []
        for tc in tr.findall(".//w:tc", NS):
            # Join all of the cell's paragraph texts with single spaces.
            paragraphs = []
            for p in tc.findall(".//w:p", NS):
                runs = []
                for r in p.findall(".//w:r", NS):
                    text, bold, italic = _extract_run_text(r)
                    if text:
                        runs.append((text, bold, italic))
                rendered = _merge_runs(runs).strip()
                if rendered:
                    paragraphs.append(rendered)
            row_cells.append(" ".join(paragraphs))
        grid.append(row_cells)
        width = max(width, len(row_cells))
    if width == 0:
        return ""
    # Pad short rows so every markdown row has the same column count.
    padded = [cells + [""] * (width - len(cells)) for cells in grid]
    lines = ["| " + " | ".join(padded[0]) + " |",
             "| " + " | ".join(["---"] * width) + " |"]
    for cells in padded[1:]:
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines)
def convert_docx_to_markdown(file_bytes: bytes, filename: str) -> dict:
    """
    Convert DOCX file bytes to markdown content.

    Args:
        file_bytes: Raw .docx archive bytes.
        filename: Original filename, echoed back in the result.

    Returns:
        Dict matching the Docling response format:
        {"document": {"filename": ..., "md_content": ...}}.
    """
    # Fix: close the zip archive deterministically — the original leaked
    # the ZipFile handle.
    with zipfile.ZipFile(BytesIO(file_bytes)) as z:
        doc_xml = z.read("word/document.xml")
        root = ET.fromstring(doc_xml)
        numbering_map = _get_numbering_map(z)
        list_counters: Dict[str, int] = {}
        # Find the document body
        body = root.find(f"{{{WORD_NS}}}body")
        if body is None:
            body = root
        md_parts = []
        # Process all top-level elements in document order so paragraphs
        # and tables interleave exactly as in the source document.
        for element in body:
            tag = element.tag.replace(f"{{{WORD_NS}}}", "")
            if tag == "p":
                line = _paragraph_to_markdown(element, numbering_map, list_counters)
                if line:
                    md_parts.append(line)
                else:
                    md_parts.append("")  # preserve blank lines
            elif tag == "tbl":
                table_md = _table_to_markdown(element)
                if table_md:
                    md_parts.append("")
                    md_parts.append(table_md)
                    md_parts.append("")
            elif tag == "sectPr":
                pass  # skip section properties
        # Also extract headers and footers (prepended above a rule).
        header_footer_parts = _extract_headers_footers(z)
    if header_footer_parts:
        md_parts = header_footer_parts + ["", "---", ""] + md_parts
    # Collapse runs of blank lines down to a single blank line.
    md_content = "\n".join(md_parts)
    while "\n\n\n" in md_content:
        md_content = md_content.replace("\n\n\n", "\n\n")
    md_content = md_content.strip()
    return {
        "document": {
            "filename": filename,
            "md_content": md_content,
        }
    }
def _extract_headers_footers(z: zipfile.ZipFile) -> List[str]:
    """Collect unique header/footer paragraph texts from the archive, in order."""
    collected: List[str] = []
    for name in z.namelist():
        if not (name.startswith("word/header") or name.startswith("word/footer")):
            continue
        try:
            part_root = ET.fromstring(z.read(name))
            for p in part_root.findall(f".//{{{WORD_NS}}}p"):
                line = "".join(
                    t.text for t in p.findall(f".//{{{WORD_NS}}}t") if t.text
                ).strip()
                # De-duplicate: the same header often repeats per section.
                if line and line not in collected:
                    collected.append(line)
        except Exception:
            pass  # best-effort: skip unreadable header/footer parts
    return collected
"""
PDF Scan -> Google Drive OCR -> Text Extraction
==============================================
Upload scanned PDFs to Google Drive (auto-converts to Google Docs with OCR),
export the recognized text, save as .txt files, and clean up.
Usage:
python ocr_drive.py # Process all PDFs in current directory
python ocr_drive.py file1.pdf file2.pdf # Process specific files
"""
import os
import sys
import io
import json
import time
import glob
import mimetypes
from pathlib import Path
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
# --- CONFIG ---
SCRIPT_DIR = Path(__file__).parent
# Google service-account credentials; never committed (see .gitignore),
# mounted into the container by docker-compose.
SERVICE_ACCOUNT_FILE = SCRIPT_DIR / "ocr_service_account.json"
# drive.file scope: access only to files this app creates.
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
WAIT_SECONDS = 5  # Wait for Google to process OCR
GOOGLE_DOC_MIME = "application/vnd.google-apps.document"
DOCX_EXPORT_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
TEXT_EXPORT_MIME = "text/plain"
def get_drive_service():
    """
    Authenticate with the service-account credentials and return a
    Google Drive v3 service client.

    Raises:
        FileNotFoundError: when the credentials file is missing.

    Fix: the original called sys.exit(1) here. sys.exit raises SystemExit,
    which is NOT a subclass of Exception, so it escaped the endpoint's
    `except Exception` handler in app.py and killed the uvicorn worker.
    Raising a normal exception keeps the API returning a 500 while a CLI
    run still terminates with a non-zero status.
    """
    if not SERVICE_ACCOUNT_FILE.exists():
        raise FileNotFoundError(
            f"Khong tim thay file credentials: {SERVICE_ACCOUNT_FILE}"
        )
    creds = service_account.Credentials.from_service_account_file(
        str(SERVICE_ACCOUNT_FILE), scopes=SCOPES
    )
    return build("drive", "v3", credentials=creds)
def _guess_mime_type(file_path: str, source_mime_type: str = None) -> str:
"""Best-effort MIME detection for upload to Google Drive."""
if source_mime_type:
source_mime_type = source_mime_type.strip().lower()
if source_mime_type and source_mime_type != "application/octet-stream":
return source_mime_type
guessed, _ = mimetypes.guess_type(file_path)
if guessed:
return guessed
return "application/octet-stream"
def ocr_file(
    drive,
    file_path: str,
    source_mime_type: str = None,
    filename: str = None,
    export_mode: str = "text",
):
    """
    Upload a file to Google Drive -> convert to Google Docs (OCR/export)
    -> export data -> cleanup.

    Args:
        drive: Authenticated Drive v3 service (see get_drive_service).
        file_path: Local path of the file to upload.
        source_mime_type: Optional client-declared MIME type; falls back to
            extension-based guessing (see _guess_mime_type).
        filename: Display name to use on Drive; defaults to the local name.
        export_mode:
            - "text": return UTF-8 string
            - "docx": return DOCX bytes

    Returns:
        Exported text (str) or DOCX bytes; "" / b"" on any failure.
    """
    local_name = os.path.basename(file_path)
    display_name = filename or local_name
    upload_mime = _guess_mime_type(file_path, source_mime_type)
    print(f"\n{'='*60}")
    print(f"[FILE] Processing: {display_name}")
    print(f" Size: {os.path.getsize(file_path) / 1024:.0f} KB")
    print(f" MIME: {upload_mime}")
    doc_id = None
    try:
        # Step 1: Upload source file -> Google Docs conversion.
        # Setting the target mimeType to GOOGLE_DOC_MIME makes Drive convert
        # the upload into a Google Doc (which OCRs scanned content).
        print(f" [UPLOAD] Uploading to Google Drive...")
        file_metadata = {"name": display_name, "mimeType": GOOGLE_DOC_MIME}
        media = MediaFileUpload(file_path, mimetype=upload_mime)
        uploaded = (
            drive.files()
            .create(body=file_metadata, media_body=media, fields="id")
            .execute()
        )
        doc_id = uploaded.get("id")
        print(f" [OK] Uploaded! Doc ID: {doc_id}")
        # Step 2: Wait for Google OCR processing.
        # NOTE(review): a fixed sleep assumes OCR finishes within
        # WAIT_SECONDS — large scans may need longer; confirm.
        print(f" [WAIT] Waiting {WAIT_SECONDS}s for OCR processing...")
        time.sleep(WAIT_SECONDS)
        # Step 3: Export from Google Docs
        if export_mode == "docx":
            export_mime = DOCX_EXPORT_MIME
            print(f" [EXPORT] Exporting DOCX...")
        else:
            export_mime = TEXT_EXPORT_MIME
            print(f" [EXPORT] Exporting text...")
        request = drive.files().export_media(fileId=doc_id, mimeType=export_mime)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print(f" [DOWNLOAD] {int(status.progress() * 100)}%")
        payload = fh.getvalue()
        if export_mode == "docx":
            print(f" [OK] Exported {len(payload):,} bytes DOCX")
            return payload
        text_content = payload.decode("utf-8").strip()
        print(f" [OK] Extracted {len(text_content):,} characters")
        return text_content
    except Exception as e:
        # Best-effort contract: callers receive an empty payload on failure
        # rather than an exception.
        print(f" [ERROR] {e}")
        if export_mode == "docx":
            return b""
        return ""
    finally:
        # Step 4: Cleanup - always delete the converted doc from Drive so
        # the service account's storage does not accumulate files.
        if doc_id:
            try:
                drive.files().delete(fileId=doc_id).execute()
                print(f" [CLEANUP] Deleted Drive file")
            except Exception as e:
                print(f" [WARN] Cleanup failed: {e}")
def ocr_pdf(drive, pdf_path: str) -> str:
    """Backward-compatible wrapper for legacy callers that only handled PDFs."""
    return ocr_file(drive, pdf_path, source_mime_type="application/pdf")
def save_text(pdf_path: str, text: str) -> str:
    """Write *text* next to the source PDF as a UTF-8 .txt; return the new path."""
    base, _ = os.path.splitext(pdf_path)
    txt_path = base + ".txt"
    with open(txt_path, "w", encoding="utf-8") as out:
        out.write(text)
    print(f" [SAVED] {os.path.basename(txt_path)}")
    return txt_path
def main():
    """CLI entry point: OCR each given (or discovered) PDF into a sibling .txt."""
    # Determine which files to process: CLI args win; otherwise scan the
    # script's own directory for *.pdf.
    if len(sys.argv) > 1:
        pdf_files = [f for f in sys.argv[1:] if f.lower().endswith(".pdf")]
    else:
        pdf_files = glob.glob(str(SCRIPT_DIR / "*.pdf"))
    if not pdf_files:
        print("[ERROR] Khong tim thay file PDF nao!")
        print(f" Thu muc: {SCRIPT_DIR}")
        sys.exit(1)
    print(f"[START] Google Drive OCR - {len(pdf_files)} file(s)")
    print(f" Credentials: {SERVICE_ACCOUNT_FILE}")
    # Authenticate
    drive = get_drive_service()
    print("[OK] Ket noi Google Drive thanh cong!")
    # Process each PDF; collect (source, output, char-count) per success.
    results = []
    for pdf_path in pdf_files:
        text = ocr_file(drive, pdf_path, source_mime_type="application/pdf")
        if text:
            txt_path = save_text(pdf_path, text)
            results.append((pdf_path, txt_path, len(text)))
            # Preview first 500 chars
            print(f"\n [PREVIEW] 500 ky tu dau:")
            print(f" {'-'*50}")
            preview = text[:500].replace("\n", "\n ")
            print(f" {preview}")
            print(f" {'-'*50}")
        else:
            print(f" [FAIL] Khong trich xuat duoc text tu {os.path.basename(pdf_path)}")
    # Summary
    print(f"\n{'='*60}")
    print(f"[SUMMARY] KET QUA TONG HOP")
    print(f"{'='*60}")
    print(f" Tong file: {len(pdf_files)}")
    print(f" Thanh cong: {len(results)}")
    print(f" That bai: {len(pdf_files) - len(results)}")
    for pdf_path, txt_path, chars in results:
        print(f" [OK] {os.path.basename(pdf_path)} -> {chars:,} ky tu")
    print()


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment