Initial commit

b9f8d512 · Vũ Hoàng Anh · b9f8d512 · b9f8d512 · b9f8d512 · b9f8d512
Commit b9f8d512 authored Apr 08, 2026 by Vũ Hoàng Anh
8 changed files
--- a/.gitignore
+++ b/.gitignore
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+.venv/
+*.pdf
+ocr_service_account.json
+.env
--- a/Dockerfile
+++ b/Dockerfile
+# ──────────────────────────────────────
+# Stage 1: Build dependencies
+# ──────────────────────────────────────
+FROM python:3.12-alpine AS builder
+# Compiler toolchain and C/C++ runtime needed while pip builds/installs wheels.
+RUN apk add --no-cache gcc musl-dev gcompat libstdc++
+WORKDIR /build
+COPY requirements.txt .
+# Install into an isolated prefix so the runtime stage can copy it wholesale.
+RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
+# ──────────────────────────────────────
+# Stage 2: Runtime (minimal)
+# ──────────────────────────────────────
+FROM python:3.12-alpine AS runtime
+LABEL maintainer="anhvh"
+LABEL description="Document Converter API - PDF→Docling, DOCX→python-docx"
+# Create non-root user
+RUN addgroup -S appgroup && adduser -S appuser -G appgroup
+# Install runtime dependencies for PyMuPDF
+RUN apk add --no-cache libstdc++ gcompat
+WORKDIR /app
+# Copy installed packages from builder
+COPY --from=builder /install /usr/local
+# Copy source code
+COPY app.py .
+COPY docx_converter.py .
+# app.py imports ocr_drive at module load, so it must be part of the image;
+# without it the container only starts when the file is bind-mounted.
+COPY ocr_drive.py .
+# NOTE: ocr_service_account.json is deliberately not baked into the image.
+# ocr_drive.py looks for it next to itself (/app), so mount it at runtime.
+# Switch to non-root user
+USER appuser
+EXPOSE 8000
+HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
--- a/app.py
+++ b/app.py
+"""
+Document Converter API Wrapper.
+- Any file type → convert via Google Drive OCR/export
+"""
+import os
+import logging
+import inspect
+import re
+import tempfile
+import httpx
+from fastapi import FastAPI, File, Form, UploadFile
+from fastapi.responses import JSONResponse
+import pymupdf4llm
+from ocr_drive import get_drive_service, ocr_file
+from docx_converter import convert_docx_to_markdown
+# Configure root logging at INFO so the request/conversion messages below are emitted.
+logging.basicConfig(level=logging.INFO)
+# Logger shared by the endpoints in this module.
+logger = logging.getLogger("doc-converter")
+# FastAPI application object; uvicorn is started against it as "app:app".
+app = FastAPI(
+    title="Document Converter API",
+    version="1.0.0",
+    description="Wrapper API: Any file type -> Google Drive OCR/export",
+)
+def _looks_like_low_information_markdown(md_text: str) -> bool:
+    """
+    Return True when markdown output carries essentially no readable text.
+    Lines made only of '-', '=' and whitespace are ignored as layout noise; the
+    text is "low information" when nothing else remains, when what remains has
+    no word character at all, or when three or more separator rules ('---',
+    '===') surround at most one content line.
+    """
+    if not md_text:
+        return True
+    separator_count = 0
+    content_count = 0
+    saw_word_char = False
+    for raw in md_text.splitlines():
+        candidate = raw.strip()
+        if not candidate:
+            # Blank lines never count either way.
+            continue
+        if re.fullmatch(r"[-=]{3,}", candidate):
+            # A pure rule line is a separator and, by construction, not content.
+            separator_count += 1
+            continue
+        if re.sub(r"[-=\s]", "", candidate):
+            content_count += 1
+            if re.search(r"\w", candidate, flags=re.UNICODE):
+                saw_word_char = True
+    if content_count == 0 or not saw_word_char:
+        return True
+    return separator_count >= 3 and content_count <= 1
+def _convert_pdf_markdown_best_effort(doc):
+    """
+    Convert a PDF document to markdown, retrying once with OCR if the first
+    pass looks empty.
+    Keyword arguments are only passed when the installed pymupdf4llm.to_markdown
+    signature declares them. Returns a (markdown, meta) tuple where meta holds
+    the keys low_information, ocr_attempted, ocr_used and ocr_error.
+    """
+    supported = inspect.signature(pymupdf4llm.to_markdown).parameters
+    first_pass_kwargs = {"show_progress": False} if "show_progress" in supported else {}
+    md_text = pymupdf4llm.to_markdown(doc, **first_pass_kwargs)
+    meta = {
+        "low_information": _looks_like_low_information_markdown(md_text),
+        "ocr_attempted": False,
+        "ocr_used": False,
+        "ocr_error": None,
+    }
+    # Nothing more to do when the first pass is usable or OCR is not available.
+    if not (meta["low_information"] and "use_ocr" in supported):
+        return md_text, meta
+    meta["ocr_attempted"] = True
+    retry_kwargs = {**first_pass_kwargs, "use_ocr": True}
+    if "ocr_language" in supported:
+        retry_kwargs["ocr_language"] = os.getenv("OCR_LANGUAGE", "vie+eng")
+    try:
+        ocr_text = pymupdf4llm.to_markdown(doc, **retry_kwargs)
+        # Keep the OCR result only when it actually produced more text.
+        if len(ocr_text.strip()) > len(md_text.strip()):
+            md_text = ocr_text
+            meta["ocr_used"] = True
+            meta["low_information"] = _looks_like_low_information_markdown(md_text)
+    except Exception as exc:
+        # Best effort: report the OCR failure in meta and keep the first pass.
+        meta["ocr_error"] = str(exc)
+    return md_text, meta
+@app.get("/health")
+async def health():
+    """Liveness endpoint; always answers with a fixed OK payload."""
+    payload = {"status": "ok"}
+    return payload
+def _convert_via_drive(tmp_path, content_type, filename):
+    """
+    Blocking Drive round-trip for one temp file; returns the extracted text.
+    Tries the DOCX export first (parsed by convert_docx_to_markdown) and falls
+    back to Drive's plain-text export when that yields nothing.
+    """
+    drive = get_drive_service()
+    docx_bytes = ocr_file(
+        drive,
+        tmp_path,
+        source_mime_type=content_type,
+        filename=filename,
+        export_mode="docx",
+    )
+    md_text = ""
+    if docx_bytes:
+        try:
+            parsed = convert_docx_to_markdown(docx_bytes, filename)
+            md_text = parsed.get("document", {}).get("md_content", "")
+        except Exception as parse_exc:
+            # Best effort: a DOCX parse failure only triggers the text fallback.
+            logger.warning(
+                f"DOCX parse after Drive export failed, fallback to text export: {parse_exc}"
+            )
+    if not md_text:
+        md_text = ocr_file(
+            drive,
+            tmp_path,
+            source_mime_type=content_type,
+            filename=filename,
+            export_mode="text",
+        )
+    return md_text
+@app.post("/v1/convert/file")
+async def convert_file(
+    files: UploadFile = File(...),
+):
+    """
+    Convert uploaded file to markdown-compatible text using Google Drive OCR/export.
+    This endpoint intentionally routes every file type through Drive processing.
+    Responses:
+      - 400 with {"error": ...} when the upload is empty.
+      - 200 with {"document": {...}, "meta": {...}} (plus "warning" when Drive
+        extracted no text).
+      - 500 with {"error": ...} when the conversion raises.
+    """
+    import asyncio  # local import: only this handler needs it
+    filename = files.filename or "unknown"
+    ext = os.path.splitext(filename)[1].lower() or ".bin"
+    file_bytes = await files.read()
+    logger.info(f"Received file: {filename} ({len(file_bytes)} bytes, ext={ext}, content_type={files.content_type})")
+    if not file_bytes:
+        return JSONResponse(
+            status_code=400,
+            content={"error": "Empty file upload."},
+        )
+    try:
+        logger.info(f"Processing file via Google Drive OCR/export: {filename}")
+        tmp_path = None
+        try:
+            # Save uploaded content to a temporary file before Drive upload.
+            # The path is recorded before writing so a failed write is still cleaned up.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
+                tmp_path = tmp.name
+                tmp.write(file_bytes)
+            # The Drive calls block (network I/O plus a fixed sleep inside ocr_file);
+            # run them in a worker thread so the event loop keeps serving other
+            # requests, including /health.
+            md_text = await asyncio.to_thread(
+                _convert_via_drive, tmp_path, files.content_type, filename
+            )
+        finally:
+            if tmp_path and os.path.exists(tmp_path):
+                os.remove(tmp_path)
+        logger.info(f"Drive OCR/export conversion done: {len(md_text)} chars")
+        response = {
+            "document": {
+                "filename": filename,
+                "md_content": md_text,
+            },
+            "meta": {
+                "low_information": _looks_like_low_information_markdown(md_text),
+                "ocr_attempted": True,
+                "ocr_used": True,
+                "backend": "google_drive",
+                "source_content_type": files.content_type,
+            },
+        }
+        if not md_text:
+            response["warning"] = "Google Drive did not extract text for this file."
+        return JSONResponse(content=response)
+    except Exception as e:
+        # logger.exception keeps the traceback, which logger.error dropped.
+        logger.exception(f"Drive OCR/export conversion failed: {str(e)}")
+        return JSONResponse(
+            status_code=500,
+            content={"error": f"Drive OCR/export conversion failed: {str(e)}"},
+        )
--- a/docker-compose.yml
+++ b/docker-compose.yml
+services:
+  doc-converter:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: doc-converter
+    restart: unless-stopped
+    ports:
+      - "5005:8000"
+    dns:
+      - 8.8.8.8      # Google public DNS
+      - 1.1.1.1      # Cloudflare public DNS
+    volumes:
+      - ./app.py:/app/app.py
+      - ./docx_converter.py:/app/docx_converter.py
+      - ./ocr_drive.py:/app/ocr_drive.py
+      - ./ocr_service_account.json:/app/ocr_service_account.json
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    command: ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
+    networks:
+      doc-net:
+        ipv4_address: 10.10.1.2
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 5s
+networks:
+  doc-net:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: 10.10.1.0/24
+          gateway: 10.10.1.1
--- a/docx_converter.py
+++ b/docx_converter.py
--- a/ocr_drive.py
+++ b/ocr_drive.py
+"""
+PDF Scan -> Google Drive OCR -> Text Extraction
+==============================================
+Upload scanned PDFs to Google Drive (auto-converts to Google Docs with OCR),
+export the recognized text, save as .txt files, and clean up.
+Usage:
+    python ocr_drive.py                    # Process all PDFs in this script's directory
+    python ocr_drive.py file1.pdf file2.pdf # Process specific files
+"""
+import os
+import sys
+import io
+import json
+import time
+import glob
+import mimetypes
+from pathlib import Path
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
+# --- CONFIG ---
+# Directory containing this script; the credentials file and the default PDF
+# glob in main() are both resolved against it.
+SCRIPT_DIR = Path(__file__).parent
+# Service-account key file, expected next to this script.
+SERVICE_ACCOUNT_FILE = SCRIPT_DIR / "ocr_service_account.json"
+# OAuth scope requested when building the Drive credentials.
+SCOPES = ["https://www.googleapis.com/auth/drive.file"]
+WAIT_SECONDS = 5  # Wait for Google to process OCR
+# Target MIME type set on upload so Drive converts the file to a Google Doc.
+GOOGLE_DOC_MIME = "application/vnd.google-apps.document"
+# MIME types passed to files().export_media() for the "docx" and "text" export modes.
+DOCX_EXPORT_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+TEXT_EXPORT_MIME = "text/plain"
+def get_drive_service():
+    """
+    Authenticate with the service-account key file and return a Drive v3 client.
+    Raises:
+        FileNotFoundError: if SERVICE_ACCOUNT_FILE does not exist.
+    """
+    if not SERVICE_ACCOUNT_FILE.exists():
+        print(f"[ERROR] Khong tim thay file credentials: {SERVICE_ACCOUNT_FILE}")
+        # Raise instead of calling sys.exit(): SystemExit is not an Exception
+        # subclass, so callers that guard this call with `except Exception`
+        # could not turn it into an error response. An uncaught
+        # FileNotFoundError still ends the CLI with a non-zero exit status.
+        raise FileNotFoundError(
+            f"Service-account credentials file not found: {SERVICE_ACCOUNT_FILE}"
+        )
+    creds = service_account.Credentials.from_service_account_file(
+        str(SERVICE_ACCOUNT_FILE), scopes=SCOPES
+    )
+    return build("drive", "v3", credentials=creds)
+def _guess_mime_type(file_path: str, source_mime_type: str = None) -> str:
+    """
+    Pick the MIME type to declare when uploading a file to Google Drive.
+    A caller-supplied type wins (trimmed and lower-cased) unless it is empty or
+    the generic "application/octet-stream"; otherwise the type is guessed from
+    the file name, falling back to "application/octet-stream".
+    """
+    declared = source_mime_type.strip().lower() if source_mime_type else ""
+    if declared and declared != "application/octet-stream":
+        return declared
+    by_extension = mimetypes.guess_type(file_path)[0]
+    return by_extension or "application/octet-stream"
+def ocr_file(
+    drive,
+    file_path: str,
+    source_mime_type: str = None,
+    filename: str = None,
+    export_mode: str = "text",
+):
+    """
+    Upload a file to Google Drive -> convert to Google Docs (OCR/export) -> export data -> cleanup.
+    Args:
+        drive: Drive v3 service object, as returned by get_drive_service().
+        file_path: Local path of the file to upload.
+        source_mime_type: MIME type reported by the caller, if any; see _guess_mime_type.
+        filename: Name to give the Drive file; defaults to the basename of file_path.
+        export_mode:
+          - "text": return UTF-8 string
+          - "docx": return DOCX bytes
+    Returns:
+        The exported text (stripped) or DOCX bytes. Any failure during upload or
+        export is printed and reported as an empty value ("" or b"") instead of
+        being raised. The uploaded Drive file is deleted in all cases.
+    """
+    local_name = os.path.basename(file_path)
+    display_name = filename or local_name
+    upload_mime = _guess_mime_type(file_path, source_mime_type)
+    print(f"\n{'='*60}")
+    print(f"[FILE] Processing: {display_name}")
+    print(f"   Size: {os.path.getsize(file_path) / 1024:.0f} KB")
+    print(f"   MIME: {upload_mime}")
+    doc_id = None
+    try:
+        # Step 1: Upload source file -> Google Docs conversion
+        print("   [UPLOAD] Uploading to Google Drive...")
+        # Requesting the Google Docs MIME type as target triggers the conversion.
+        file_metadata = {"name": display_name, "mimeType": GOOGLE_DOC_MIME}
+        media = MediaFileUpload(file_path, mimetype=upload_mime)
+        uploaded = (
+            drive.files()
+            .create(body=file_metadata, media_body=media, fields="id")
+            .execute()
+        )
+        doc_id = uploaded.get("id")
+        print(f"   [OK] Uploaded! Doc ID: {doc_id}")
+        # Step 2: Wait for Google OCR processing
+        print(f"   [WAIT] Waiting {WAIT_SECONDS}s for OCR processing...")
+        time.sleep(WAIT_SECONDS)
+        # Step 3: Export from Google Docs
+        if export_mode == "docx":
+            export_mime = DOCX_EXPORT_MIME
+            print("   [EXPORT] Exporting DOCX...")
+        else:
+            export_mime = TEXT_EXPORT_MIME
+            print("   [EXPORT] Exporting text...")
+        request = drive.files().export_media(fileId=doc_id, mimeType=export_mime)
+        fh = io.BytesIO()
+        downloader = MediaIoBaseDownload(fh, request)
+        done = False
+        while not done:
+            status, done = downloader.next_chunk()
+            if status:
+                print(f"   [DOWNLOAD] {int(status.progress() * 100)}%")
+        payload = fh.getvalue()
+        if export_mode == "docx":
+            print(f"   [OK] Exported {len(payload):,} bytes DOCX")
+            return payload
+        # Decode with utf-8-sig so a leading byte-order mark is dropped. With
+        # plain utf-8 the BOM survives .strip() (U+FEFF is not whitespace), which
+        # makes an otherwise empty export look like non-empty text to callers.
+        text_content = payload.decode("utf-8-sig").strip()
+        print(f"   [OK] Extracted {len(text_content):,} characters")
+        return text_content
+    except Exception as e:
+        # Deliberate best effort: report the failure and hand back an empty result.
+        print(f"   [ERROR] {e}")
+        if export_mode == "docx":
+            return b""
+        return ""
+    finally:
+        # Step 4: Cleanup - delete from Google Drive
+        if doc_id:
+            try:
+                drive.files().delete(fileId=doc_id).execute()
+                print("   [CLEANUP] Deleted Drive file")
+            except Exception as e:
+                print(f"   [WARN] Cleanup failed: {e}")
+def ocr_pdf(drive, pdf_path: str) -> str:
+    """Legacy entry point: run ocr_file on a PDF with its default export mode."""
+    pdf_mime = "application/pdf"
+    return ocr_file(drive, pdf_path, source_mime_type=pdf_mime)
+def save_text(pdf_path: str, text: str) -> str:
+    """
+    Write text as UTF-8 to a .txt file that replaces the extension of pdf_path.
+    Returns the path of the file that was written.
+    """
+    stem, _ext = os.path.splitext(pdf_path)
+    txt_path = f"{stem}.txt"
+    with open(txt_path, mode="w", encoding="utf-8") as out:
+        out.write(text)
+    print(f"   [SAVED] {os.path.basename(txt_path)}")
+    return txt_path
+def main():
+    """
+    CLI entry point: OCR each selected PDF through Google Drive and save the text.
+    PDFs come from the command-line arguments (non-.pdf arguments are ignored)
+    or, when no arguments are given, from every *.pdf in SCRIPT_DIR. Exits with
+    status 1 when no PDF is selected. Prints a preview per file and a summary.
+    """
+    # Determine which files to process
+    cli_args = sys.argv[1:]
+    if cli_args:
+        pdf_files = [arg for arg in cli_args if arg.lower().endswith(".pdf")]
+    else:
+        pdf_files = glob.glob(str(SCRIPT_DIR / "*.pdf"))
+    if not pdf_files:
+        print("[ERROR] Khong tim thay file PDF nao!")
+        print(f"   Thu muc: {SCRIPT_DIR}")
+        sys.exit(1)
+    total = len(pdf_files)
+    print(f"[START] Google Drive OCR - {total} file(s)")
+    print(f"   Credentials: {SERVICE_ACCOUNT_FILE}")
+    # Authenticate
+    drive = get_drive_service()
+    print("[OK] Ket noi Google Drive thanh cong!")
+    # Process each PDF
+    thin_rule = "-" * 50
+    thick_rule = "=" * 60
+    results = []
+    for pdf_path in pdf_files:
+        text = ocr_file(drive, pdf_path, source_mime_type="application/pdf")
+        if not text:
+            print(f"   [FAIL] Khong trich xuat duoc text tu {os.path.basename(pdf_path)}")
+            continue
+        txt_path = save_text(pdf_path, text)
+        results.append((pdf_path, txt_path, len(text)))
+        # Preview first 500 chars, indented to line up with the other output
+        print("\n   [PREVIEW] 500 ky tu dau:")
+        print(f"   {thin_rule}")
+        preview = text[:500].replace("\n", "\n   ")
+        print(f"   {preview}")
+        print(f"   {thin_rule}")
+    # Summary
+    succeeded = len(results)
+    print(f"\n{thick_rule}")
+    print("[SUMMARY] KET QUA TONG HOP")
+    print(f"{thick_rule}")
+    print(f"   Tong file: {total}")
+    print(f"   Thanh cong: {succeeded}")
+    print(f"   That bai: {total - succeeded}")
+    for done_path, _txt_path, char_count in results:
+        print(f"   [OK] {os.path.basename(done_path)} -> {char_count:,} ky tu")
+    print()
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+fastapi==0.115.6
+uvicorn==0.34.0
+python-multipart==0.0.20
+httpx==0.28.1
+pymupdf4llm==0.0.17
+google-api-python-client
+google-auth-httplib2
+google-auth-oauthlib
--- a/test/2024_4.2. HD FOB +BH_(0,25-0,5)_Ap dung khi xin duoc phe duyet.doc
+++ b/test/2024_4.2. HD FOB +BH_(0,25-0,5)_Ap dung khi xin duoc phe duyet.doc