From 995a94853df0c03753a40e33e9c30fc3ffad3ebb Mon Sep 17 00:00:00 2001 From: Matteo Trubini <7964032+matteotrubini@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:54:23 +0100 Subject: [PATCH 1/2] [14.0][IMP] account_invoice_import_simple_pdf: use Tesseract-OCR if available --- .../wizard/account_invoice_import.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py index 27cf94a8c7..ba5c2ea266 100644 --- a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py +++ b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py @@ -54,7 +54,12 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info): pages = [] doc = fitz.open(fileobj.name) for page in doc: - pages.append(page.get_text()) + # Check if Tessdata is available for OCR + tessdata = fitz.get_tessdata() + # Perform OCR if Tessdata is available, otherwise use regular text extraction + textpage = page.get_textpage_ocr(full=False, tessdata=tessdata) if tessdata else page.get_textpage() + # Append the extracted text to the pages list + pages.append(page.get_text(textpage=textpage)) res = { "all": "\n\n".join(pages), "first": pages and pages[0] or "", From 5c3e52df214eb7ab31ad9c950cf7ffa8fd5d9d56 Mon Sep 17 00:00:00 2001 From: Matteo Trubini <7964032+matteotrubini@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:54:23 +0100 Subject: [PATCH 2/2] [14.0][IMP] account_invoice_import_simple_pdf: use Tesseract-OCR if available --- .../wizard/account_invoice_import.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py index 27cf94a8c7..8356e1a097 100644 --- a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py +++ b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py @@ -54,7 +54,16 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info): pages = [] doc = fitz.open(fileobj.name) for page in doc: - pages.append(page.get_text()) + # Check if Tessdata is available for OCR + tessdata = fitz.get_tessdata() + # Perform OCR if Tessdata is available, otherwise use regular text extraction + textpage = ( + page.get_textpage_ocr(full=False, tessdata=tessdata) + if tessdata + else page.get_textpage() + ) + # Append the extracted text to the pages list + pages.append(page.get_text(textpage=textpage)) res = { "all": "\n\n".join(pages), "first": pages and pages[0] or "",