Commit b9f8d512 authored by Vũ Hoàng Anh's avatar Vũ Hoàng Anh

Initial commit

parents
# Python bytecode and caches
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
# Virtual environments
env/
venv/
.venv/
# Local test artifacts and secrets (never commit credentials)
*.pdf
ocr_service_account.json
.env
# ──────────────────────────────────────
# Stage 1: Build dependencies
# ──────────────────────────────────────
FROM python:3.12-alpine AS builder
# Build toolchain needed to compile wheels on musl (PyMuPDF, lxml, etc.)
RUN apk add --no-cache gcc musl-dev gcompat libstdc++
WORKDIR /build
COPY requirements.txt .
# Install into an isolated prefix so the runtime stage copies a clean tree
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
# ──────────────────────────────────────
# Stage 2: Runtime (minimal)
# ──────────────────────────────────────
FROM python:3.12-alpine AS runtime
LABEL maintainer="anhvh"
LABEL description="Document Converter API - PDF→Docling, DOCX→python-docx"
# Create non-root user
RUN addgroup -S appgroup && adduser -S appuser -G appgroup
# Install runtime dependencies for PyMuPDF
RUN apk add --no-cache libstdc++ gcompat
WORKDIR /app
# Copy installed packages from builder
COPY --from=builder /install /usr/local
# Copy source code.
# Fix: app.py does `from ocr_drive import ...`, but ocr_drive.py was never
# copied into the image, so the container only worked when docker-compose
# bind-mounted the sources. Copy it so the image is self-contained.
COPY app.py .
COPY docx_converter.py .
COPY ocr_drive.py .
# Switch to non-root user
USER appuser
EXPOSE 8000
# Probe /health with stdlib urllib (no curl/wget in the alpine base image)
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
"""
Document Converter API Wrapper.
- Any file type → convert via Google Drive OCR/export
"""
import os
import logging
import inspect
import re
import tempfile
import httpx
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
import pymupdf4llm
from ocr_drive import get_drive_service, ocr_file
from docx_converter import convert_docx_to_markdown
# Module-level logger used by all endpoints in this service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("doc-converter")
# FastAPI application instance; served by uvicorn (see Dockerfile CMD).
app = FastAPI(
    title="Document Converter API",
    version="1.0.0",
    description="Wrapper API: Any file type -> Google Drive OCR/export",
)
def _looks_like_low_information_markdown(md_text: str) -> bool:
"""
Heuristic to detect markdown output that has almost no meaningful text.
Typical examples are repeated page separators like '-----'.
"""
if not md_text:
return True
lines = [line.strip() for line in md_text.splitlines() if line.strip()]
if not lines:
return True
meaningful_lines = [line for line in lines if re.sub(r"[-=\s]", "", line)]
if not meaningful_lines:
return True
# Must contain at least one word-like character to be considered useful text.
has_word_content = any(re.search(r"\w", line, flags=re.UNICODE) for line in meaningful_lines)
if not has_word_content:
return True
separator_lines = [line for line in lines if re.fullmatch(r"[-=]{3,}", line)]
return len(separator_lines) >= 3 and len(meaningful_lines) <= 1
def _convert_pdf_markdown_best_effort(doc):
    """
    Convert a PDF to markdown via pymupdf4llm, retrying once with OCR
    when the first pass looks empty.

    Args:
        doc: Whatever pymupdf4llm.to_markdown accepts (path or document).

    Returns:
        (md_text, meta) where meta reports the low-information heuristic
        plus whether OCR was attempted/used and any OCR error string.
    """
    # Probe the installed pymupdf4llm version: older releases do not accept
    # show_progress / use_ocr / ocr_language, so pass only supported kwargs.
    sig = inspect.signature(pymupdf4llm.to_markdown)
    params = sig.parameters
    base_kwargs = {}
    if "show_progress" in params:
        base_kwargs["show_progress"] = False
    md_text = pymupdf4llm.to_markdown(doc, **base_kwargs)
    low_information = _looks_like_low_information_markdown(md_text)
    ocr_attempted = False
    ocr_used = False
    ocr_error = None
    # Retry once with OCR only when the first pass is likely useless.
    if low_information and "use_ocr" in params:
        ocr_attempted = True
        ocr_kwargs = dict(base_kwargs)
        ocr_kwargs["use_ocr"] = True
        if "ocr_language" in params:
            # Default favors Vietnamese + English; override via OCR_LANGUAGE.
            ocr_kwargs["ocr_language"] = os.getenv("OCR_LANGUAGE", "vie+eng")
        try:
            md_text_ocr = pymupdf4llm.to_markdown(doc, **ocr_kwargs)
            # Keep the OCR result only when it yields strictly more text.
            if len(md_text_ocr.strip()) > len(md_text.strip()):
                md_text = md_text_ocr
                ocr_used = True
                low_information = _looks_like_low_information_markdown(md_text)
        except Exception as exc:
            # Best-effort: keep the non-OCR text and report the failure.
            ocr_error = str(exc)
    return md_text, {
        "low_information": low_information,
        "ocr_attempted": ocr_attempted,
        "ocr_used": ocr_used,
        "ocr_error": ocr_error,
    }
@app.get("/health")
async def health():
    """Liveness probe used by the Docker HEALTHCHECK and compose healthcheck."""
    return {"status": "ok"}
@app.post("/v1/convert/file")
async def convert_file(
    files: UploadFile = File(...),
):
    """
    Convert an uploaded file to markdown text using Google Drive OCR/export.

    Every file type is deliberately routed through Drive processing:
    upload -> Google Docs conversion (OCR for scans) -> DOCX export ->
    markdown via docx_converter, with a plain-text export as fallback
    when the DOCX parse fails or yields nothing.
    """
    filename = files.filename or "unknown"
    ext = os.path.splitext(filename)[1].lower() or ".bin"
    file_bytes = await files.read()
    # Fix: the original log messages printed the literal string "(unknown)"
    # instead of the uploaded file's actual name.
    logger.info(
        f"Received file: {filename} ({len(file_bytes)} bytes, ext={ext}, "
        f"content_type={files.content_type})"
    )
    if not file_bytes:
        return JSONResponse(
            status_code=400,
            content={"error": "Empty file upload."},
        )
    try:
        logger.info(f"Processing file via Google Drive OCR/export: {filename}")
        # Save uploaded content to a temporary file before Drive upload.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp.write(file_bytes)
            tmp_path = tmp.name
        try:
            drive = get_drive_service()
            # First attempt: DOCX export so formatting survives conversion.
            docx_bytes = ocr_file(
                drive,
                tmp_path,
                source_mime_type=files.content_type,
                filename=filename,
                export_mode="docx",
            )
            md_text = ""
            if docx_bytes:
                try:
                    parsed = convert_docx_to_markdown(docx_bytes, filename)
                    md_text = parsed.get("document", {}).get("md_content", "")
                except Exception as parse_exc:
                    logger.warning(
                        f"DOCX parse after Drive export failed, fallback to text export: {parse_exc}"
                    )
            if not md_text:
                # Fallback: plain-text export straight from Google Docs.
                md_text = ocr_file(
                    drive,
                    tmp_path,
                    source_mime_type=files.content_type,
                    filename=filename,
                    export_mode="text",
                )
        finally:
            # Always remove the temp file, even when Drive processing fails.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        logger.info(f"Drive OCR/export conversion done: {len(md_text)} chars")
        response = {
            "document": {
                "filename": filename,
                "md_content": md_text,
            },
            "meta": {
                "low_information": _looks_like_low_information_markdown(md_text),
                # Drive's Google-Docs conversion always applies OCR to scans.
                "ocr_attempted": True,
                "ocr_used": True,
                "backend": "google_drive",
                "source_content_type": files.content_type,
            },
        }
        if not md_text:
            response["warning"] = "Google Drive did not extract text for this file."
        return JSONResponse(content=response)
    except Exception as e:
        logger.error(f"Drive OCR/export conversion failed: {str(e)}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Drive OCR/export conversion failed: {str(e)}"},
        )
# Development compose file: bind-mounts the sources and runs uvicorn --reload.
services:
  doc-converter:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: doc-converter
    restart: unless-stopped
    ports:
      - "5005:8000"
    dns:
      - 8.8.8.8 # Google public DNS
      - 1.1.1.1 # Cloudflare public DNS
    volumes:
      # Bind-mount sources so --reload picks up local edits without rebuilds.
      - ./app.py:/app/app.py
      - ./docx_converter.py:/app/docx_converter.py
      - ./ocr_drive.py:/app/ocr_drive.py
      # Service-account credentials are mounted, never baked into the image.
      - ./ocr_service_account.json:/app/ocr_service_account.json
    extra_hosts:
      - "host.docker.internal:host-gateway"
    command: ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
    networks:
      doc-net:
        ipv4_address: 10.10.1.2
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 5s
networks:
  doc-net:
    driver: bridge
    ipam:
      config:
        - subnet: 10.10.1.0/24
          gateway: 10.10.1.1
"""
DOCX to Markdown converter using raw XML parsing.
Extracts paragraphs, tables, headers/footers — preserves 100% content.
Supports lvlText templates (e.g. "Điều %1.") for correct numbering.
"""
import re
import zipfile
import xml.etree.ElementTree as ET
from io import BytesIO
from typing import Dict, List, Tuple, Optional
WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
NS = {"w": WORD_NS}
def _get_numbering_map(z: zipfile.ZipFile) -> dict:
    """
    Parse word/numbering.xml into a numId -> ilvl -> level-info lookup.

    Returns: {numId: {ilvl: {"fmt": ..., "lvlText": ..., "start": ...}}};
    an empty dict when the archive has no numbering part.
    """
    try:
        raw = z.read("word/numbering.xml")
    except KeyError:
        return {}
    root = ET.fromstring(raw)
    w = f"{{{WORD_NS}}}"
    # Pass 1: abstractNumId -> per-level definitions.
    abstract_defs = {}
    for abstract in root.findall(".//w:abstractNum", NS):
        per_level = {}
        for lvl in abstract.findall("w:lvl", NS):
            fmt_el = lvl.find("w:numFmt", NS)
            text_el = lvl.find("w:lvlText", NS)
            start_el = lvl.find("w:start", NS)
            per_level[lvl.get(w + "ilvl")] = {
                "fmt": fmt_el.get(w + "val") if fmt_el is not None else "bullet",
                "lvlText": text_el.get(w + "val") if text_el is not None else "%1.",
                "start": int(start_el.get(w + "val")) if start_el is not None else 1,
            }
        abstract_defs[abstract.get(w + "abstractNumId")] = per_level
    # Pass 2: resolve each concrete numId to its abstract definition.
    mapping = {}
    for num in root.findall(".//w:num", NS):
        ref = num.find("w:abstractNumId", NS)
        if ref is None:
            continue
        abstract_id = ref.get(w + "val")
        if abstract_id in abstract_defs:
            mapping[num.get(w + "numId")] = abstract_defs[abstract_id]
    return mapping
def _format_number(value: int, fmt: str) -> str:
"""Convert a number to the specified format."""
if fmt == "decimal":
return str(value)
elif fmt == "lowerLetter":
return chr(ord('a') + value - 1) if 1 <= value <= 26 else str(value)
elif fmt == "upperLetter":
return chr(ord('A') + value - 1) if 1 <= value <= 26 else str(value)
elif fmt == "lowerRoman":
romans = [(1000,'m'),(900,'cm'),(500,'d'),(400,'cd'),(100,'c'),(90,'xc'),
(50,'l'),(40,'xl'),(10,'x'),(9,'ix'),(5,'v'),(4,'iv'),(1,'i')]
result = ""
for val, numeral in romans:
while value >= val:
result += numeral
value -= val
return result
elif fmt == "upperRoman":
romans = [(1000,'M'),(900,'CM'),(500,'D'),(400,'CD'),(100,'C'),(90,'XC'),
(50,'L'),(40,'XL'),(10,'X'),(9,'IX'),(5,'V'),(4,'IV'),(1,'I')]
result = ""
for val, numeral in romans:
while value >= val:
result += numeral
value -= val
return result
else:
return str(value)
def _render_lvl_text(
    lvl_text: str,
    counters: Dict[str, int],
    num_id: str,
    current_ilvl: str,
    numbering_map: dict,
) -> str:
    """
    Expand a lvlText template such as 'Điều %1.' or '%1.%2.' by replacing
    each %N placeholder with the counter value of level N-1, rendered in
    that level's numbering format.
    """
    rendered = lvl_text
    for placeholder in re.finditer(r"%(\d+)", lvl_text):
        # %1 refers to ilvl 0, %2 to ilvl 1, and so on.
        level_key = str(int(placeholder.group(1)) - 1)
        count = counters.get(f"{num_id}_{level_key}", 1)
        # Look up the referenced level's format; default to decimal.
        level_info = numbering_map.get(num_id, {}).get(level_key)
        fmt = level_info["fmt"] if level_info else "decimal"
        rendered = rendered.replace(
            placeholder.group(0), _format_number(count, fmt), 1
        )
    return rendered
def _extract_run_text(run) -> Tuple[str, bool, bool]:
    """Return (text, bold, italic) for a w:r run element."""
    text = "".join(t.text for t in run.findall(".//w:t", NS) if t.text)

    def _flag_on(props, tag: str) -> bool:
        # A present w:b / w:i element means "on" unless w:val says false.
        el = props.find(tag, NS)
        if el is None:
            return False
        return el.get(f"{{{WORD_NS}}}val", "true").lower() != "false"

    rpr = run.find("w:rPr", NS)
    if rpr is None:
        return text, False, False
    return text, _flag_on(rpr, "w:b"), _flag_on(rpr, "w:i")
def _merge_runs(runs_data: List[Tuple[str, bool, bool]]) -> str:
    """
    Render runs as markdown, coalescing consecutive runs that share the
    same bold/italic flags so no ****artifacts**** appear.
    """
    # Coalesce adjacent runs with identical formatting into one chunk.
    chunks: List[Tuple[str, bool, bool]] = []
    for text, bold, italic in runs_data:
        if not text:
            continue
        if chunks and (bold, italic) == (chunks[-1][1], chunks[-1][2]):
            prev_text = chunks[-1][0]
            chunks[-1] = (prev_text + text, bold, italic)
        else:
            chunks.append((text, bold, italic))
    # Emit markdown emphasis per chunk.
    rendered = []
    for text, bold, italic in chunks:
        if bold and italic:
            rendered.append(f"***{text}***")
        elif bold:
            rendered.append(f"**{text}**")
        elif italic:
            rendered.append(f"*{text}*")
        else:
            rendered.append(text)
    return "".join(rendered)
def _paragraph_to_markdown(
    para,
    numbering_map: dict,
    list_counters: dict,
) -> str:
    """
    Convert a single w:p paragraph element to one markdown line.

    Args:
        para: The w:p element.
        numbering_map: Output of _get_numbering_map (numId -> ilvl -> info).
        list_counters: Mutable {"numId_ilvl": count} state; incremented here
            so numbered lists continue across successive paragraphs.

    Returns:
        The markdown line, or "" for an empty paragraph.
    """
    # Extract paragraph style and (optional) list-numbering properties.
    ppr = para.find("w:pPr", NS)
    style_name = ""
    num_id = None
    ilvl = "0"
    if ppr is not None:
        style_el = ppr.find("w:pStyle", NS)
        if style_el is not None:
            style_name = style_el.get(f"{{{WORD_NS}}}val", "")
        # Check for list numbering
        num_pr = ppr.find("w:numPr", NS)
        if num_pr is not None:
            num_id_el = num_pr.find("w:numId", NS)
            ilvl_el = num_pr.find("w:ilvl", NS)
            if num_id_el is not None:
                num_id = num_id_el.get(f"{{{WORD_NS}}}val")
            if ilvl_el is not None:
                ilvl = ilvl_el.get(f"{{{WORD_NS}}}val", "0")
    # Process all runs — collect then merge
    runs_data = []
    for run in para.findall(".//w:r", NS):
        text, bold, italic = _extract_run_text(run)
        if text:
            runs_data.append((text, bold, italic))
    line = _merge_runs(runs_data).strip()
    if not line:
        return ""
    # Apply heading styles: a style containing "heading" plus a digit 1-6
    # maps to that markdown level; "title" or digit-less heading maps to #.
    heading_level = 0
    style_lower = style_name.lower()
    if "heading" in style_lower:
        for i in range(1, 7):
            if str(i) in style_name:
                heading_level = i
                break
    if heading_level == 0 and ("title" in style_lower or "heading" in style_lower):
        heading_level = 1
    if heading_level > 0:
        return f"{'#' * heading_level} {line}"
    # Apply list formatting with lvlText support
    if num_id is not None and num_id != "0":
        indent = " " * int(ilvl)
        fmt = "bullet"
        lvl_text = "%1."
        if num_id in numbering_map and ilvl in numbering_map[num_id]:
            lvl_info = numbering_map[num_id][ilvl]
            fmt = lvl_info["fmt"]
            lvl_text = lvl_info["lvlText"]
        if fmt == "bullet":
            return f"{indent}- {line}"
        # Increment counter for current level; seeding with start-1 makes
        # the first item render as the level's declared start value.
        counter_key = f"{num_id}_{ilvl}"
        list_counters[counter_key] = list_counters.get(
            counter_key,
            numbering_map.get(num_id, {}).get(ilvl, {}).get("start", 1) - 1,
        ) + 1
        # Render the full numbering text
        rendered = _render_lvl_text(
            lvl_text, list_counters, num_id, ilvl, numbering_map
        )
        return f"{indent}{rendered} {line}"
    return line
def _table_to_markdown(table) -> str:
    """Render a w:tbl element as a markdown table (first row becomes the header)."""
    rows = table.findall(".//w:tr", NS)
    if not rows:
        return ""
    grid: List[List[str]] = []
    width = 0
    for tr in rows:
        row_cells = []
        for tc in tr.findall(".//w:tc", NS):
            # Join all of the cell's paragraph texts with single spaces.
            paragraphs = []
            for p in tc.findall(".//w:p", NS):
                runs = []
                for r in p.findall(".//w:r", NS):
                    text, bold, italic = _extract_run_text(r)
                    if text:
                        runs.append((text, bold, italic))
                rendered = _merge_runs(runs).strip()
                if rendered:
                    paragraphs.append(rendered)
            row_cells.append(" ".join(paragraphs))
        grid.append(row_cells)
        width = max(width, len(row_cells))
    if width == 0:
        return ""
    # Pad short rows so every markdown row has the same column count.
    padded = [cells + [""] * (width - len(cells)) for cells in grid]
    lines = ["| " + " | ".join(padded[0]) + " |",
             "| " + " | ".join(["---"] * width) + " |"]
    for cells in padded[1:]:
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines)
def convert_docx_to_markdown(file_bytes: bytes, filename: str) -> dict:
    """
    Convert DOCX file bytes to markdown content.

    Args:
        file_bytes: Raw .docx archive bytes.
        filename: Original filename, echoed back in the result.

    Returns:
        Dict matching the Docling response format:
        {"document": {"filename": ..., "md_content": ...}}.
    """
    # Fix: close the zip archive deterministically — the original leaked
    # the ZipFile handle.
    with zipfile.ZipFile(BytesIO(file_bytes)) as z:
        doc_xml = z.read("word/document.xml")
        root = ET.fromstring(doc_xml)
        numbering_map = _get_numbering_map(z)
        list_counters: Dict[str, int] = {}
        # Find the document body
        body = root.find(f"{{{WORD_NS}}}body")
        if body is None:
            body = root
        md_parts = []
        # Process all top-level elements in document order so paragraphs
        # and tables interleave exactly as in the source document.
        for element in body:
            tag = element.tag.replace(f"{{{WORD_NS}}}", "")
            if tag == "p":
                line = _paragraph_to_markdown(element, numbering_map, list_counters)
                if line:
                    md_parts.append(line)
                else:
                    md_parts.append("")  # preserve blank lines
            elif tag == "tbl":
                table_md = _table_to_markdown(element)
                if table_md:
                    md_parts.append("")
                    md_parts.append(table_md)
                    md_parts.append("")
            elif tag == "sectPr":
                pass  # skip section properties
        # Also extract headers and footers (prepended above a rule).
        header_footer_parts = _extract_headers_footers(z)
    if header_footer_parts:
        md_parts = header_footer_parts + ["", "---", ""] + md_parts
    # Collapse runs of blank lines down to a single blank line.
    md_content = "\n".join(md_parts)
    while "\n\n\n" in md_content:
        md_content = md_content.replace("\n\n\n", "\n\n")
    md_content = md_content.strip()
    return {
        "document": {
            "filename": filename,
            "md_content": md_content,
        }
    }
def _extract_headers_footers(z: zipfile.ZipFile) -> List[str]:
    """Collect unique header/footer paragraph texts from the archive, in order."""
    collected: List[str] = []
    for name in z.namelist():
        if not (name.startswith("word/header") or name.startswith("word/footer")):
            continue
        try:
            part_root = ET.fromstring(z.read(name))
            for p in part_root.findall(f".//{{{WORD_NS}}}p"):
                line = "".join(
                    t.text for t in p.findall(f".//{{{WORD_NS}}}t") if t.text
                ).strip()
                # De-duplicate: the same header often repeats per section.
                if line and line not in collected:
                    collected.append(line)
        except Exception:
            pass  # best-effort: skip unreadable header/footer parts
    return collected
"""
PDF Scan -> Google Drive OCR -> Text Extraction
==============================================
Upload scanned PDFs to Google Drive (auto-converts to Google Docs with OCR),
export the recognized text, save as .txt files, and clean up.
Usage:
python ocr_drive.py # Process all PDFs in current directory
python ocr_drive.py file1.pdf file2.pdf # Process specific files
"""
import os
import sys
import io
import json
import time
import glob
import mimetypes
from pathlib import Path
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
# --- CONFIG ---
SCRIPT_DIR = Path(__file__).parent
# Google service-account credentials; never committed (see .gitignore),
# mounted into the container by docker-compose.
SERVICE_ACCOUNT_FILE = SCRIPT_DIR / "ocr_service_account.json"
# drive.file scope: access only to files this app creates.
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
WAIT_SECONDS = 5  # Wait for Google to process OCR
GOOGLE_DOC_MIME = "application/vnd.google-apps.document"
DOCX_EXPORT_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
TEXT_EXPORT_MIME = "text/plain"
def get_drive_service():
    """
    Authenticate with the service-account credentials and return a
    Google Drive v3 service client.

    Raises:
        FileNotFoundError: when the credentials file is missing.

    Fix: the original called sys.exit(1) here. sys.exit raises SystemExit,
    which is NOT a subclass of Exception, so it escaped the endpoint's
    `except Exception` handler in app.py and killed the uvicorn worker.
    Raising a normal exception keeps the API returning a 500 while a CLI
    run still terminates with a non-zero status.
    """
    if not SERVICE_ACCOUNT_FILE.exists():
        raise FileNotFoundError(
            f"Khong tim thay file credentials: {SERVICE_ACCOUNT_FILE}"
        )
    creds = service_account.Credentials.from_service_account_file(
        str(SERVICE_ACCOUNT_FILE), scopes=SCOPES
    )
    return build("drive", "v3", credentials=creds)
def _guess_mime_type(file_path: str, source_mime_type: str = None) -> str:
"""Best-effort MIME detection for upload to Google Drive."""
if source_mime_type:
source_mime_type = source_mime_type.strip().lower()
if source_mime_type and source_mime_type != "application/octet-stream":
return source_mime_type
guessed, _ = mimetypes.guess_type(file_path)
if guessed:
return guessed
return "application/octet-stream"
def ocr_file(
    drive,
    file_path: str,
    source_mime_type: str = None,
    filename: str = None,
    export_mode: str = "text",
):
    """
    Upload a file to Google Drive -> convert to Google Docs (OCR/export)
    -> export data -> cleanup.

    Args:
        drive: Authenticated Drive v3 service (see get_drive_service).
        file_path: Local path of the file to upload.
        source_mime_type: Optional client-declared MIME type; falls back to
            extension-based guessing (see _guess_mime_type).
        filename: Display name to use on Drive; defaults to the local name.
        export_mode:
            - "text": return UTF-8 string
            - "docx": return DOCX bytes

    Returns:
        Exported text (str) or DOCX bytes; "" / b"" on any failure.
    """
    local_name = os.path.basename(file_path)
    display_name = filename or local_name
    upload_mime = _guess_mime_type(file_path, source_mime_type)
    print(f"\n{'='*60}")
    print(f"[FILE] Processing: {display_name}")
    print(f" Size: {os.path.getsize(file_path) / 1024:.0f} KB")
    print(f" MIME: {upload_mime}")
    doc_id = None
    try:
        # Step 1: Upload source file -> Google Docs conversion.
        # Setting the target mimeType to GOOGLE_DOC_MIME makes Drive convert
        # the upload into a Google Doc (which OCRs scanned content).
        print(f" [UPLOAD] Uploading to Google Drive...")
        file_metadata = {"name": display_name, "mimeType": GOOGLE_DOC_MIME}
        media = MediaFileUpload(file_path, mimetype=upload_mime)
        uploaded = (
            drive.files()
            .create(body=file_metadata, media_body=media, fields="id")
            .execute()
        )
        doc_id = uploaded.get("id")
        print(f" [OK] Uploaded! Doc ID: {doc_id}")
        # Step 2: Wait for Google OCR processing.
        # NOTE(review): a fixed sleep assumes OCR finishes within
        # WAIT_SECONDS — large scans may need longer; confirm.
        print(f" [WAIT] Waiting {WAIT_SECONDS}s for OCR processing...")
        time.sleep(WAIT_SECONDS)
        # Step 3: Export from Google Docs
        if export_mode == "docx":
            export_mime = DOCX_EXPORT_MIME
            print(f" [EXPORT] Exporting DOCX...")
        else:
            export_mime = TEXT_EXPORT_MIME
            print(f" [EXPORT] Exporting text...")
        request = drive.files().export_media(fileId=doc_id, mimeType=export_mime)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print(f" [DOWNLOAD] {int(status.progress() * 100)}%")
        payload = fh.getvalue()
        if export_mode == "docx":
            print(f" [OK] Exported {len(payload):,} bytes DOCX")
            return payload
        text_content = payload.decode("utf-8").strip()
        print(f" [OK] Extracted {len(text_content):,} characters")
        return text_content
    except Exception as e:
        # Best-effort contract: callers receive an empty payload on failure
        # rather than an exception.
        print(f" [ERROR] {e}")
        if export_mode == "docx":
            return b""
        return ""
    finally:
        # Step 4: Cleanup - always delete the converted doc from Drive so
        # the service account's storage does not accumulate files.
        if doc_id:
            try:
                drive.files().delete(fileId=doc_id).execute()
                print(f" [CLEANUP] Deleted Drive file")
            except Exception as e:
                print(f" [WARN] Cleanup failed: {e}")
def ocr_pdf(drive, pdf_path: str) -> str:
    """Backward-compatible wrapper for legacy callers that only handled PDFs."""
    return ocr_file(drive, pdf_path, source_mime_type="application/pdf")
def save_text(pdf_path: str, text: str) -> str:
    """Write *text* next to the source PDF as a UTF-8 .txt; return the new path."""
    base, _ = os.path.splitext(pdf_path)
    txt_path = base + ".txt"
    with open(txt_path, "w", encoding="utf-8") as out:
        out.write(text)
    print(f" [SAVED] {os.path.basename(txt_path)}")
    return txt_path
def main():
    """CLI entry point: OCR each given (or discovered) PDF into a sibling .txt."""
    # Determine which files to process: CLI args win; otherwise scan the
    # script's own directory for *.pdf.
    if len(sys.argv) > 1:
        pdf_files = [f for f in sys.argv[1:] if f.lower().endswith(".pdf")]
    else:
        pdf_files = glob.glob(str(SCRIPT_DIR / "*.pdf"))
    if not pdf_files:
        print("[ERROR] Khong tim thay file PDF nao!")
        print(f" Thu muc: {SCRIPT_DIR}")
        sys.exit(1)
    print(f"[START] Google Drive OCR - {len(pdf_files)} file(s)")
    print(f" Credentials: {SERVICE_ACCOUNT_FILE}")
    # Authenticate
    drive = get_drive_service()
    print("[OK] Ket noi Google Drive thanh cong!")
    # Process each PDF; collect (source, output, char-count) per success.
    results = []
    for pdf_path in pdf_files:
        text = ocr_file(drive, pdf_path, source_mime_type="application/pdf")
        if text:
            txt_path = save_text(pdf_path, text)
            results.append((pdf_path, txt_path, len(text)))
            # Preview first 500 chars
            print(f"\n [PREVIEW] 500 ky tu dau:")
            print(f" {'-'*50}")
            preview = text[:500].replace("\n", "\n ")
            print(f" {preview}")
            print(f" {'-'*50}")
        else:
            print(f" [FAIL] Khong trich xuat duoc text tu {os.path.basename(pdf_path)}")
    # Summary
    print(f"\n{'='*60}")
    print(f"[SUMMARY] KET QUA TONG HOP")
    print(f"{'='*60}")
    print(f" Tong file: {len(pdf_files)}")
    print(f" Thanh cong: {len(results)}")
    print(f" That bai: {len(pdf_files) - len(results)}")
    for pdf_path, txt_path, chars in results:
        print(f" [OK] {os.path.basename(pdf_path)} -> {chars:,} ky tu")
    print()


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment