Commit b9f8d512 authored by Vũ Hoàng Anh

Initial commit

parents
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
env/
venv/
.venv/
*.pdf
ocr_service_account.json
.env
# ──────────────────────────────────────
# Stage 1: Build dependencies
# ──────────────────────────────────────
FROM python:3.12-alpine AS builder
# Compiler toolchain is only needed while pip builds/installs the requirements.
RUN apk add --no-cache gcc musl-dev gcompat libstdc++
WORKDIR /build
COPY requirements.txt .
# Install into an isolated prefix so the runtime stage can copy it in one step.
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt

# ──────────────────────────────────────
# Stage 2: Runtime (minimal)
# ──────────────────────────────────────
FROM python:3.12-alpine AS runtime
LABEL maintainer="anhvh"
LABEL description="Document Converter API - PDF→Docling, DOCX→python-docx"

# Create non-root user
RUN addgroup -S appgroup && adduser -S appuser -G appgroup

# Install runtime dependencies for PyMuPDF
RUN apk add --no-cache libstdc++ gcompat

WORKDIR /app

# Copy installed packages from builder
COPY --from=builder /install /usr/local

# Copy source code.
# app.py imports ocr_drive at module load, so it must be part of the image;
# without it the CMD below fails on import unless the file is bind-mounted.
COPY app.py .
COPY docx_converter.py .
COPY ocr_drive.py .
# NOTE: ocr_drive.py looks for ocr_service_account.json next to itself. The
# credentials file is deliberately not baked into the image and has to be
# mounted at /app/ocr_service_account.json at runtime.

# Switch to non-root user
USER appuser

EXPOSE 8000

HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
"""
Document Converter API Wrapper.
- Any file type → convert via Google Drive OCR/export
"""
import asyncio
import inspect
import logging
import os
import re
import tempfile

import httpx
import pymupdf4llm
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse

from docx_converter import convert_docx_to_markdown
from ocr_drive import get_drive_service, ocr_file
# Configure root logging at INFO and obtain the logger used by this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("doc-converter")

# Application object; the route decorators further down register on it.
app = FastAPI(
    description="Wrapper API: Any file type -> Google Drive OCR/export",
    title="Document Converter API",
    version="1.0.0",
)
def _looks_like_low_information_markdown(md_text: str) -> bool:
"""
Heuristic to detect markdown output that has almost no meaningful text.
Typical examples are repeated page separators like '-----'.
"""
if not md_text:
return True
lines = [line.strip() for line in md_text.splitlines() if line.strip()]
if not lines:
return True
meaningful_lines = [line for line in lines if re.sub(r"[-=\s]", "", line)]
if not meaningful_lines:
return True
# Must contain at least one word-like character to be considered useful text.
has_word_content = any(re.search(r"\w", line, flags=re.UNICODE) for line in meaningful_lines)
if not has_word_content:
return True
separator_lines = [line for line in lines if re.fullmatch(r"[-=]{3,}", line)]
return len(separator_lines) >= 3 and len(meaningful_lines) <= 1
def _convert_pdf_markdown_best_effort(doc):
    """
    Convert a PDF document to markdown, retrying once with OCR if the first
    pass looks empty.

    Keyword arguments are only passed to ``pymupdf4llm.to_markdown`` when its
    signature declares them, so the function adapts to whichever parameters
    the installed version exposes.

    Returns:
        A ``(md_text, meta)`` tuple where ``meta`` holds the keys
        ``low_information``, ``ocr_attempted``, ``ocr_used`` and ``ocr_error``.
    """
    supported = inspect.signature(pymupdf4llm.to_markdown).parameters
    common_kwargs = {"show_progress": False} if "show_progress" in supported else {}

    md_text = pymupdf4llm.to_markdown(doc, **common_kwargs)
    report = {
        "low_information": _looks_like_low_information_markdown(md_text),
        "ocr_attempted": False,
        "ocr_used": False,
        "ocr_error": None,
    }

    # Only retry when the first pass is likely useless and OCR is available.
    if not (report["low_information"] and "use_ocr" in supported):
        return md_text, report

    report["ocr_attempted"] = True
    retry_kwargs = {**common_kwargs, "use_ocr": True}
    if "ocr_language" in supported:
        retry_kwargs["ocr_language"] = os.getenv("OCR_LANGUAGE", "vie+eng")

    try:
        ocr_text = pymupdf4llm.to_markdown(doc, **retry_kwargs)
        # Keep the OCR result only if it yields more text than the first pass.
        if len(ocr_text.strip()) > len(md_text.strip()):
            md_text = ocr_text
            report["ocr_used"] = True
            report["low_information"] = _looks_like_low_information_markdown(md_text)
    except Exception as exc:
        # Best effort: remember why OCR failed and fall back to the first pass.
        report["ocr_error"] = str(exc)

    return md_text, report
@app.get("/health")
async def health():
    """Health probe endpoint; always answers with a static OK payload."""
    payload = {"status": "ok"}
    return payload
def _convert_upload_via_drive(file_bytes, ext, filename, content_type):
    """
    Run the blocking Drive OCR/export pipeline for one upload.

    The upload is written to a temporary file, sent through Drive with a DOCX
    export that is parsed to markdown, and, if that yields nothing, sent again
    with a plain-text export. The temporary file is always removed.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        ext: File extension (including the dot) used as temp-file suffix.
        filename: Original upload name, forwarded to Drive and the parser.
        content_type: Content type reported for the upload (may be None).

    Returns:
        The extracted markdown/text, possibly empty.

    Raises:
        RuntimeError: if the Drive client could not be created because the
            credential loader tried to terminate the process.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
    tmp_path = tmp.name
    try:
        # Save uploaded content to a temporary file before Drive upload.
        # Writing inside the try block ensures a failed write does not leak it.
        with tmp:
            tmp.write(file_bytes)

        try:
            drive = get_drive_service()
        except SystemExit as exit_exc:
            # SystemExit is not an Exception subclass; translate it so the
            # endpoint can answer with an error instead of the worker dying.
            raise RuntimeError(
                "Google Drive credentials are not available."
            ) from exit_exc

        docx_bytes = ocr_file(
            drive,
            tmp_path,
            source_mime_type=content_type,
            filename=filename,
            export_mode="docx",
        )
        md_text = ""
        if docx_bytes:
            try:
                parsed = convert_docx_to_markdown(docx_bytes, filename)
                md_text = parsed.get("document", {}).get("md_content", "")
            except Exception as parse_exc:
                logger.warning(
                    f"DOCX parse after Drive export failed, fallback to text export: {parse_exc}"
                )
        if not md_text:
            md_text = ocr_file(
                drive,
                tmp_path,
                source_mime_type=content_type,
                filename=filename,
                export_mode="text",
            )
        return md_text
    finally:
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass


@app.post("/v1/convert/file")
async def convert_file(
    files: UploadFile = File(...),
):
    """
    Convert uploaded file to markdown-compatible text using Google Drive OCR/export.
    This endpoint intentionally routes every file type through Drive processing.

    Returns:
        400 with an ``error`` key for an empty upload; 500 with an ``error``
        key when the conversion raises; otherwise a JSON document with
        ``document`` (filename, md_content) and ``meta`` fields, plus a
        ``warning`` when no text was extracted.
    """
    filename = files.filename or "unknown"
    ext = os.path.splitext(filename)[1].lower() or ".bin"
    file_bytes = await files.read()
    logger.info(f"Received file: {filename} ({len(file_bytes)} bytes, ext={ext}, content_type={files.content_type})")

    if not file_bytes:
        return JSONResponse(
            status_code=400,
            content={"error": "Empty file upload."},
        )

    try:
        logger.info(f"Processing file via Google Drive OCR/export: {filename}")
        # The Drive pipeline is synchronous (network calls and a fixed sleep).
        # Run it in a worker thread so the event loop keeps serving other
        # requests, including /health, while the conversion is in progress.
        md_text = await asyncio.to_thread(
            _convert_upload_via_drive,
            file_bytes,
            ext,
            filename,
            files.content_type,
        )
        logger.info(f"Drive OCR/export conversion done: {len(md_text)} chars")

        response = {
            "document": {
                "filename": filename,
                "md_content": md_text,
            },
            "meta": {
                "low_information": _looks_like_low_information_markdown(md_text),
                "ocr_attempted": True,
                "ocr_used": True,
                "backend": "google_drive",
                "source_content_type": files.content_type,
            },
        }
        if not md_text:
            response["warning"] = "Google Drive did not extract text for this file."
        return JSONResponse(content=response)
    except Exception as e:
        # logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Drive OCR/export conversion failed: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Drive OCR/export conversion failed: {str(e)}"},
        )
# Compose definition for the doc-converter service.
services:
  doc-converter:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: doc-converter
    restart: unless-stopped
    ports:
      # Host port 5005 -> container port 8000 (the port uvicorn listens on).
      - "5005:8000"
    dns:
      - 8.8.8.8 # Google DNS
      - 1.1.1.1 # Cloudflare DNS
    volumes:
      # Source files are bind-mounted over the copies in the image; together
      # with --reload below this presumably targets development — confirm.
      - ./app.py:/app/app.py
      - ./docx_converter.py:/app/docx_converter.py
      - ./ocr_drive.py:/app/ocr_drive.py
      # Service-account credentials that ocr_drive.py looks for next to itself.
      - ./ocr_service_account.json:/app/ocr_service_account.json
    extra_hosts:
      - "host.docker.internal:host-gateway"
    # Overrides the image CMD: single process with auto-reload instead of 2 workers.
    command: ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
    networks:
      doc-net:
        # Fixed address inside the doc-net subnet declared at the bottom.
        ipv4_address: 10.10.1.2
    healthcheck:
      # Same probe as the Dockerfile HEALTHCHECK: GET /health on port 8000.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 5s
networks:
  doc-net:
    driver: bridge
    ipam:
      config:
        - subnet: 10.10.1.0/24
          gateway: 10.10.1.1
This diff is collapsed.
"""
PDF Scan -> Google Drive OCR -> Text Extraction
==============================================
Upload scanned PDFs to Google Drive (auto-converts to Google Docs with OCR),
export the recognized text, save as .txt files, and clean up.
Usage:
python ocr_drive.py # Process all PDFs in current directory
python ocr_drive.py file1.pdf file2.pdf # Process specific files
"""
import os
import sys
import io
import json
import time
import glob
import mimetypes
from pathlib import Path
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
# --- CONFIG ---
# Directory containing this script; the credentials file is expected beside it.
SCRIPT_DIR = Path(__file__).parent
SERVICE_ACCOUNT_FILE = SCRIPT_DIR.joinpath("ocr_service_account.json")

# OAuth scopes handed to the service-account credentials.
SCOPES = ["https://www.googleapis.com/auth/drive.file"]

# Seconds to pause between upload and export (wait for Google to process OCR).
WAIT_SECONDS = 5

# MIME types: the upload's target type, then the two supported export formats.
GOOGLE_DOC_MIME = "application/vnd.google-apps.document"
DOCX_EXPORT_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
TEXT_EXPORT_MIME = "text/plain"
def get_drive_service():
    """
    Authenticate with the service-account file and return a Drive v3 client.

    Returns:
        The client object built by ``build("drive", "v3", ...)``.

    Raises:
        FileNotFoundError: if ``SERVICE_ACCOUNT_FILE`` does not exist. An
            exception is raised instead of calling ``sys.exit`` because this
            function is also imported by long-running callers; ``SystemExit``
            is not an ``Exception`` subclass and would bypass their error
            handling. An uncaught exception still ends a script run with a
            non-zero status.
    """
    if not SERVICE_ACCOUNT_FILE.exists():
        message = f"[ERROR] Khong tim thay file credentials: {SERVICE_ACCOUNT_FILE}"
        # Keep the console message for command-line use.
        print(message)
        raise FileNotFoundError(message)

    creds = service_account.Credentials.from_service_account_file(
        str(SERVICE_ACCOUNT_FILE), scopes=SCOPES
    )
    return build("drive", "v3", credentials=creds)
def _guess_mime_type(file_path: str, source_mime_type: str = None) -> str:
"""Best-effort MIME detection for upload to Google Drive."""
if source_mime_type:
source_mime_type = source_mime_type.strip().lower()
if source_mime_type and source_mime_type != "application/octet-stream":
return source_mime_type
guessed, _ = mimetypes.guess_type(file_path)
if guessed:
return guessed
return "application/octet-stream"
def ocr_file(
    drive,
    file_path: str,
    source_mime_type: str = None,
    filename: str = None,
    export_mode: str = "text",
):
    """
    Upload a file to Google Drive -> convert to Google Docs (OCR/export) -> export data -> cleanup.

    Args:
        drive: Drive client as returned by ``get_drive_service()``.
        file_path: Local path of the file to upload.
        source_mime_type: MIME type reported by the caller, if any; otherwise
            the type is guessed from ``file_path``.
        filename: Name to give the uploaded document; defaults to the base
            name of ``file_path``.
        export_mode:
            - "text": return UTF-8 string
            - "docx": return DOCX bytes
            Any other value is treated like "text".

    Returns:
        The exported text (stripped) or DOCX bytes. On any error during
        upload/export the error is printed and an empty value of the matching
        type ("" or b"") is returned. The uploaded Drive document is deleted
        in all cases once it has an id.
    """
    local_name = os.path.basename(file_path)
    display_name = filename or local_name
    upload_mime = _guess_mime_type(file_path, source_mime_type)

    print(f"\n{'='*60}")
    print(f"[FILE] Processing: {display_name}")
    print(f" Size: {os.path.getsize(file_path) / 1024:.0f} KB")
    print(f" MIME: {upload_mime}")

    doc_id = None
    try:
        # Step 1: Upload source file -> Google Docs conversion
        print(" [UPLOAD] Uploading to Google Drive...")
        file_metadata = {"name": display_name, "mimeType": GOOGLE_DOC_MIME}
        media = MediaFileUpload(file_path, mimetype=upload_mime)
        uploaded = (
            drive.files()
            .create(body=file_metadata, media_body=media, fields="id")
            .execute()
        )
        doc_id = uploaded.get("id")
        print(f" [OK] Uploaded! Doc ID: {doc_id}")

        # Step 2: Wait for Google OCR processing
        print(f" [WAIT] Waiting {WAIT_SECONDS}s for OCR processing...")
        time.sleep(WAIT_SECONDS)

        # Step 3: Export from Google Docs
        if export_mode == "docx":
            export_mime = DOCX_EXPORT_MIME
            print(" [EXPORT] Exporting DOCX...")
        else:
            export_mime = TEXT_EXPORT_MIME
            print(" [EXPORT] Exporting text...")

        request = drive.files().export_media(fileId=doc_id, mimeType=export_mime)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print(f" [DOWNLOAD] {int(status.progress() * 100)}%")
        payload = fh.getvalue()

        if export_mode == "docx":
            print(f" [OK] Exported {len(payload):,} bytes DOCX")
            return payload

        # "utf-8-sig" drops a leading byte-order mark if the export carries
        # one; str.strip() does not treat U+FEFF as whitespace, so plain
        # "utf-8" would leave it at the start of the text. Without a BOM the
        # two codecs decode identically.
        text_content = payload.decode("utf-8-sig").strip()
        print(f" [OK] Extracted {len(text_content):,} characters")
        return text_content
    except Exception as e:
        # Best effort by design: report and return an empty result of the
        # type the caller asked for.
        print(f" [ERROR] {e}")
        if export_mode == "docx":
            return b""
        return ""
    finally:
        # Step 4: Cleanup - delete from Google Drive
        if doc_id:
            try:
                drive.files().delete(fileId=doc_id).execute()
                print(" [CLEANUP] Deleted Drive file")
            except Exception as e:
                print(f" [WARN] Cleanup failed: {e}")
def ocr_pdf(drive, pdf_path: str) -> str:
    """Legacy entry point kept for older callers; delegates to ``ocr_file`` with a PDF MIME type."""
    pdf_mime = "application/pdf"
    return ocr_file(drive, pdf_path, source_mime_type=pdf_mime)
def save_text(pdf_path: str, text: str) -> str:
    """
    Write ``text`` as UTF-8 to a ``.txt`` file that sits next to ``pdf_path``
    and shares its base name; return the path of the written file.
    """
    stem, _ext = os.path.splitext(pdf_path)
    txt_path = stem + ".txt"
    with open(txt_path, mode="w", encoding="utf-8") as out:
        out.write(text)
    print(f" [SAVED] {os.path.basename(txt_path)}")
    return txt_path
def main():
    """
    Command-line entry point.

    Processes the ``.pdf`` paths given on the command line, or every PDF in
    the script directory when none are given; saves each extracted text next
    to its source and prints a preview and a summary. Exits with status 1 when
    there is nothing to process.
    """
    # Determine which files to process
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_files = [arg for arg in cli_args if arg.lower().endswith(".pdf")]
    else:
        pdf_files = glob.glob(str(SCRIPT_DIR / "*.pdf"))

    if not pdf_files:
        print("[ERROR] Khong tim thay file PDF nao!")
        print(f" Thu muc: {SCRIPT_DIR}")
        sys.exit(1)

    print(f"[START] Google Drive OCR - {len(pdf_files)} file(s)")
    print(f" Credentials: {SERVICE_ACCOUNT_FILE}")

    # Authenticate
    drive = get_drive_service()
    print("[OK] Ket noi Google Drive thanh cong!")

    # Process each PDF; only successful extractions are recorded.
    results = []
    for pdf_path in pdf_files:
        text = ocr_file(drive, pdf_path, source_mime_type="application/pdf")
        if not text:
            print(f" [FAIL] Khong trich xuat duoc text tu {os.path.basename(pdf_path)}")
            continue

        txt_path = save_text(pdf_path, text)
        results.append((pdf_path, txt_path, len(text)))

        # Preview first 500 chars
        print(f"\n [PREVIEW] 500 ky tu dau:")
        print(f" {'-'*50}")
        preview = text[:500].replace("\n", "\n ")
        print(f" {preview}")
        print(f" {'-'*50}")

    # Summary
    print(f"\n{'='*60}")
    print(f"[SUMMARY] KET QUA TONG HOP")
    print(f"{'='*60}")
    print(f" Tong file: {len(pdf_files)}")
    print(f" Thanh cong: {len(results)}")
    print(f" That bai: {len(pdf_files) - len(results)}")
    for pdf_path, _txt_path, chars in results:
        print(f" [OK] {os.path.basename(pdf_path)} -> {chars:,} ky tu")
    print()


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment