Merge PR #976 into 14.0

OCA-git-bot · OCA-git-bot · commit 59519ab71f3c · 2024-05-23T10:41:20.000Z
Signed-off-by alexis-via
diff --git a/pdf_helper/__manifest__.py b/pdf_helper/__manifest__.py
@@ -15,4 +15,5 @@
     "depends": [
         "base",
     ],
+    "external_dependencies": {"python": ["pypdf"]},
 }
diff --git a/pdf_helper/models/helper.py b/pdf_helper/models/helper.py
@@ -3,7 +3,7 @@
 # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
 import logging
 
-from PyPDF2.utils import PdfReadError
+from pypdf.errors import PdfReadError
 
 from odoo import models
 
diff --git a/pdf_helper/static/description/index.html b/pdf_helper/static/description/index.html
@@ -1,4 +1,3 @@
-<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
diff --git a/pdf_helper/utils.py b/pdf_helper/utils.py
@@ -13,9 +13,9 @@
 _logger = logging.getLogger(__name__)
 
 try:
-    import PyPDF2
+    import pypdf
 except ImportError:
-    _logger.debug("Cannot import PyPDF2")
+    _logger.debug("Cannot import pypdf")
 
 
 class PDFParser:
@@ -30,41 +30,30 @@ def get_xml_files(self):
         """
         res = {}
         with BytesIO(self.pdf_file) as fd:
-            xmlfiles = self._extract_xml_files(fd)
-            for filename, xml_obj in xmlfiles.items():
-                root = self._extract_xml_root(xml_obj)
-                if root is None or not len(root):
-                    continue
-                res[filename] = root
+            res = self._extract_xml_files(fd)
         if res:
             _logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
         return res
 
     def _extract_xml_files(self, fd):
-        pdf = PyPDF2.PdfFileReader(fd)
-        _logger.debug("pdf.trailer=%s", pdf.trailer)
-        pdf_root = pdf.trailer["/Root"]
-        _logger.debug("pdf_root=%s", pdf_root)
-        # TODO add support for /Kids
-        embeddedfiles = pdf_root["/Names"]["/EmbeddedFiles"]["/Names"]
-        i = 0
-        xmlfiles = {}  # key = filename, value = PDF obj
-        for embeddedfile in embeddedfiles[:-1]:
-            mime_res = mimetypes.guess_type(embeddedfile)
+        reader = pypdf.PdfReader(fd)
+        # attachment parsing via pypdf doesn't support /Kids
+        # cf my bug report https://github.com/py-pdf/pypdf/issues/2087
+        xmlfiles = {}
+        for filename, content_list in reader.attachments.items():
+            _logger.debug("Attachment %s found in PDF", filename)
+            mime_res = mimetypes.guess_type(filename)
             if mime_res and mime_res[0] in ["application/xml", "text/xml"]:
-                xmlfiles[embeddedfile] = embeddedfiles[i + 1]
-            i += 1
-        _logger.debug("xmlfiles=%s", xmlfiles)
+                try:
+                    _logger.debug("Trying to parse XML attachment %s", filename)
+                    xml_root = etree.fromstring(content_list[0])
+                    if len(xml_root) > 0:
+                        _logger.info("Valid XML file %s found in attachments", filename)
+                        xmlfiles[filename] = xml_root
+                    else:
+                        _logger.warning("XML file %s is empty", filename)
+                except Exception as err:
+                    _logger.warning(
+                        "Failed to parse XML file %s. Error: %s", filename, str(err)
+                    )
         return xmlfiles
-
-    def _extract_xml_root(self, xml_obj):
-        xml_root = None
-        try:
-            xml_file_dict = xml_obj.getObject()
-            _logger.debug("xml_file_dict=%s", xml_file_dict)
-            xml_string = xml_file_dict["/EF"]["/F"].getData()
-            xml_root = etree.fromstring(xml_string)
-        except Exception as err:
-            # TODO: can't we catch specific exceptions?
-            _logger.debug("_pdf_extract_xml_root failed: %s", str(err))
-        return xml_root
diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,7 @@ factur-x
 invoice2data
 ovh
 phonenumbers
+pypdf
 pypdf>=3.1.0
 pyyaml
 regex

Original file line number	Diff line number	Diff line change
`@@ -15,4 +15,5 @@`
`15`	`15`	`"depends": [`
`16`	`16`	`"base",`
`17`	`17`	`],`
	`18`	`+ "external_dependencies": {"python": ["pypdf"]},`
`18`	`19`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-<?xml version="1.0" encoding="utf-8"?>`
`2`	`1`	`<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">`
`3`	`2`	`<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">`
`4`	`3`	`<head>`