Skip to content

Commit 59519ab

Browse files
committed
Merge PR #976 into 14.0
Signed-off-by alexis-via
2 parents 362be98 + 89423e0 commit 59519ab

File tree

5 files changed

+25
-35
lines changed

5 files changed

+25
-35
lines changed

pdf_helper/__manifest__.py

+1
Original file line number | Diff line number | Diff line change
@@ -15,4 +15,5 @@
1515
"depends": [
1616
"base",
1717
],
18+
"external_dependencies": {"python": ["pypdf"]},
1819
}

pdf_helper/models/helper.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
44
import logging
55

6-
from PyPDF2.utils import PdfReadError
6+
from pypdf.errors import PdfReadError
77

88
from odoo import models
99

pdf_helper/static/description/index.html

-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
<?xml version="1.0" encoding="utf-8"?>
21
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
32
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
43
<head>

pdf_helper/utils.py

+22 -33
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@
1313
_logger = logging.getLogger(__name__)
1414

1515
try:
16-
import PyPDF2
16+
import pypdf
1717
except ImportError:
18-
_logger.debug("Cannot import PyPDF2")
18+
_logger.debug("Cannot import pypdf")
1919

2020

2121
class PDFParser:
@@ -30,41 +30,30 @@ def get_xml_files(self):
3030
"""
3131
res = {}
3232
with BytesIO(self.pdf_file) as fd:
33-
xmlfiles = self._extract_xml_files(fd)
34-
for filename, xml_obj in xmlfiles.items():
35-
root = self._extract_xml_root(xml_obj)
36-
if root is None or not len(root):
37-
continue
38-
res[filename] = root
33+
res = self._extract_xml_files(fd)
3934
if res:
4035
_logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
4136
return res
4237

4338
def _extract_xml_files(self, fd):
44-
pdf = PyPDF2.PdfFileReader(fd)
45-
_logger.debug("pdf.trailer=%s", pdf.trailer)
46-
pdf_root = pdf.trailer["/Root"]
47-
_logger.debug("pdf_root=%s", pdf_root)
48-
# TODO add support for /Kids
49-
embeddedfiles = pdf_root["/Names"]["/EmbeddedFiles"]["/Names"]
50-
i = 0
51-
xmlfiles = {} # key = filename, value = PDF obj
52-
for embeddedfile in embeddedfiles[:-1]:
53-
mime_res = mimetypes.guess_type(embeddedfile)
39+
reader = pypdf.PdfReader(fd)
40+
# attachment parsing via pypdf doesn't support /Kids
41+
# cf my bug report https://github.com/py-pdf/pypdf/issues/2087
42+
xmlfiles = {}
43+
for filename, content_list in reader.attachments.items():
44+
_logger.debug("Attachment %s found in PDF", filename)
45+
mime_res = mimetypes.guess_type(filename)
5446
if mime_res and mime_res[0] in ["application/xml", "text/xml"]:
55-
xmlfiles[embeddedfile] = embeddedfiles[i + 1]
56-
i += 1
57-
_logger.debug("xmlfiles=%s", xmlfiles)
47+
try:
48+
_logger.debug("Trying to parse XML attachment %s", filename)
49+
xml_root = etree.fromstring(content_list[0])
50+
if len(xml_root) > 0:
51+
_logger.info("Valid XML file %s found in attachments", filename)
52+
xmlfiles[filename] = xml_root
53+
else:
54+
_logger.warning("XML file %s is empty", filename)
55+
except Exception as err:
56+
_logger.warning(
57+
"Failed to parse XML file %s. Error: %s", filename, str(err)
58+
)
5859
return xmlfiles
59-
60-
def _extract_xml_root(self, xml_obj):
61-
xml_root = None
62-
try:
63-
xml_file_dict = xml_obj.getObject()
64-
_logger.debug("xml_file_dict=%s", xml_file_dict)
65-
xml_string = xml_file_dict["/EF"]["/F"].getData()
66-
xml_root = etree.fromstring(xml_string)
67-
except Exception as err:
68-
# TODO: can't we catch specific exceptions?
69-
_logger.debug("_pdf_extract_xml_root failed: %s", str(err))
70-
return xml_root

requirements.txt

+1
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ factur-x
44
invoice2data
55
ovh
66
phonenumbers
7+
pypdf
78
pypdf>=3.1.0
89
pyyaml
910
regex

0 commit comments

Comments
 (0)