Commit d92639e2 authored by Vũ Hoàng Anh

Fix OCR floating page numbers

parent b9f8d512
Pipeline #3414 failed with stages
......@@ -219,6 +219,12 @@ def _paragraph_to_markdown(
if not line:
return ""
# Lọc bỏ các dòng chỉ chứa số trang hoặc chữ "Trang / Page" do OCR chèn nhầm
if line.isdigit() and len(line) <= 3:
return ""
if re.fullmatch(r"Trang\s+\d+(/\d+)?|Page\s+\d+(/\d+)?|\d+\s*-.*-.*|^\d+$", line, re.IGNORECASE):
return ""
# Apply heading styles
heading_level = 0
style_lower = style_name.lower()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment