Skip to content

Commit 995a948

Browse files
committed
[14.0][IMP] account_invoice_import_simple_pdf: use Tesseract-OCR if available
1 parent 629125c commit 995a948

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

account_invoice_import_simple_pdf/wizard/account_invoice_import.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,12 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
54 54
pages = []
55 55
doc = fitz.open(fileobj.name)
56 56
for page in doc:
57-
pages.append(page.get_text())
57+
# Check if Tessdata is available for OCR
58+
tessdata = fitz.get_tessdata()
59+
# Perform OCR if Tessdata is available, otherwise use regular text extraction
60+
textpage = page.get_textpage_ocr(full=False, tessdata=tessdata) if tessdata else page.get_textpage()
61+
# Append the extracted text to the pages list
62+
pages.append(page.get_text(textpage=textpage))
58 63
res = {
59 64
"all": "\n\n".join(pages),
60 65
"first": pages and pages[0] or "",

0 commit comments

Comments
 (0)