Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
take_data_contracts_from_file
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vũ Hoàng Anh
take_data_contracts_from_file
Commits
4a82a1ab
Commit
4a82a1ab
authored
Apr 09, 2026
by
Vũ Hoàng Anh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: detect ext from content_type when filename has no extension + detailed strategy logging
parent
333179a7
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
31 additions
and
6 deletions
+31
-6
app.py
app.py
+31
-6
No files found.
app.py
View file @
4a82a1ab
...
@@ -74,6 +74,9 @@ MIME_MAP = {
...
@@ -74,6 +74,9 @@ MIME_MAP = {
".jpeg"
:
"image/jpeg"
,
".jpeg"
:
"image/jpeg"
,
}
}
# Reverse lookup: content_type → ext (dùng khi filename không có extension)
MIME_TO_EXT
=
{
v
:
k
for
k
,
v
in
MIME_MAP
.
items
()}
def
_count_dieu
(
text
:
str
)
->
int
:
def
_count_dieu
(
text
:
str
)
->
int
:
"""Đếm số lần xuất hiện pattern 'Điều X' / 'ĐIỀU X'."""
"""Đếm số lần xuất hiện pattern 'Điều X' / 'ĐIỀU X'."""
...
@@ -371,43 +374,65 @@ def _convert_document(file_bytes: bytes, filename: str, content_type: str) -> di
...
@@ -371,43 +374,65 @@ def _convert_document(file_bytes: bytes, filename: str, content_type: str) -> di
"""
"""
Chạy tất cả strategies phù hợp, chọn output nhiều "Điều" nhất.
Chạy tất cả strategies phù hợp, chọn output nhiều "Điều" nhất.
DOCX → chạy
cả 3 strategy (A
+ B + C)
DOCX → chạy
4 strategy (A + A2
+ B + C)
DOC → chạy 2 strategy (B + C), vì python-docx không đọc .doc
DOC → chạy 2 strategy (B + C), vì python-docx không đọc .doc
"""
"""
ext
=
os
.
path
.
splitext
(
filename
)[
1
]
.
lower
()
ext
=
os
.
path
.
splitext
(
filename
)[
1
]
.
lower
()
# ── Fix: detect ext từ content_type nếu filename không có extension ──
if
not
ext
and
content_type
:
ext
=
MIME_TO_EXT
.
get
(
content_type
,
""
)
if
ext
:
logger
.
info
(
f
" [EXT] Filename '{filename}' không có extension, detect từ content_type → ext={ext}"
)
if
not
content_type
:
if
not
content_type
:
content_type
=
MIME_MAP
.
get
(
ext
,
"application/octet-stream"
)
content_type
=
MIME_MAP
.
get
(
ext
,
"application/octet-stream"
)
logger
.
info
(
f
"Converting: {filename} ({len(file_bytes)} bytes, ext={ext})"
)
logger
.
info
(
f
"Converting: {filename} ({len(file_bytes)} bytes, "
f
"ext={ext}, content_type={content_type})"
)
is_docx
=
ext
==
".docx"
is_doc
=
ext
==
".doc"
candidates
=
[]
candidates
=
[]
# ── Strategy A: docx_converter (chỉ .docx) ──
# ── Strategy A: docx_converter (chỉ .docx) ──
if
ext
==
".docx"
:
if
is_docx
:
logger
.
info
(
" [STRATEGY A: docx_native] Running — parse XML numbering.xml..."
)
result_a
=
_strategy_docx_native
(
file_bytes
,
filename
)
result_a
=
_strategy_docx_native
(
file_bytes
,
filename
)
candidates
.
append
(
result_a
)
candidates
.
append
(
result_a
)
# Nếu Strategy A đã có "Điều" → skip tất cả
# Nếu Strategy A đã có "Điều" → skip tất cả
if
_count_dieu
(
result_a
[
"md"
])
>
0
and
_meaningful_chars
(
result_a
[
"md"
])
>
200
:
if
_count_dieu
(
result_a
[
"md"
])
>
0
and
_meaningful_chars
(
result_a
[
"md"
])
>
200
:
logger
.
info
(
f
"
Strategy A đã có Điều, skip các strategy khác
"
)
logger
.
info
(
f
"
✅ Strategy A có {_count_dieu(result_a['md'])} Điều → DONE, skip A2/B/C
"
)
best
=
result_a
best
=
result_a
return
_build_response
(
filename
,
best
,
candidates
)
return
_build_response
(
filename
,
best
,
candidates
)
else
:
logger
.
info
(
f
" ❌ Strategy A: {_count_dieu(result_a['md'])} Điều → thử A2..."
)
# ── Strategy A2: mammoth DOCX → HTML → markdown (chỉ .docx) ──
# ── Strategy A2: mammoth DOCX → HTML → markdown ──
logger
.
info
(
" [STRATEGY A2: docx_html] Running — mammoth DOCX→HTML→MD..."
)
result_a2
=
_strategy_docx_html
(
file_bytes
,
filename
)
result_a2
=
_strategy_docx_html
(
file_bytes
,
filename
)
candidates
.
append
(
result_a2
)
candidates
.
append
(
result_a2
)
# Nếu A2 đã có "Điều" → skip Drive calls
# Nếu A2 đã có "Điều" → skip Drive calls
if
_count_dieu
(
result_a2
[
"md"
])
>
0
and
_meaningful_chars
(
result_a2
[
"md"
])
>
200
:
if
_count_dieu
(
result_a2
[
"md"
])
>
0
and
_meaningful_chars
(
result_a2
[
"md"
])
>
200
:
logger
.
info
(
f
"
Strategy A2 (HTML) đã có Điều, skip Drive calls
"
)
logger
.
info
(
f
"
✅ Strategy A2 có {_count_dieu(result_a2['md'])} Điều → DONE, skip B/C
"
)
best
=
_pick_best
(
*
candidates
)
best
=
_pick_best
(
*
candidates
)
return
_build_response
(
filename
,
best
,
candidates
)
return
_build_response
(
filename
,
best
,
candidates
)
else
:
logger
.
info
(
f
" ❌ Strategy A2: {_count_dieu(result_a2['md'])} Điều → thử B/C (Drive)..."
)
else
:
logger
.
info
(
f
" [SKIP A, A2] ext={ext} không phải .docx → skip offline strategies"
)
# ── Strategy B: Drive → DOCX → docx_converter ──
# ── Strategy B: Drive → DOCX → docx_converter ──
logger
.
info
(
" [STRATEGY B: drive_docx] Running — upload Drive → export DOCX → parse..."
)
result_b
=
_strategy_drive_docx
(
file_bytes
,
filename
,
content_type
)
result_b
=
_strategy_drive_docx
(
file_bytes
,
filename
,
content_type
)
candidates
.
append
(
result_b
)
candidates
.
append
(
result_b
)
# ── Strategy C: Drive → text → regex ──
# ── Strategy C: Drive → text → regex ──
logger
.
info
(
" [STRATEGY C: drive_text] Running — upload Drive → export text → regex..."
)
result_c
=
_strategy_drive_text
(
file_bytes
,
filename
,
content_type
)
result_c
=
_strategy_drive_text
(
file_bytes
,
filename
,
content_type
)
candidates
.
append
(
result_c
)
candidates
.
append
(
result_c
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment