13
13
_logger = logging .getLogger (__name__ )
14
14
15
15
try :
16
- import PyPDF2
16
+ import pypdf
17
17
except ImportError :
18
- _logger .debug ("Cannot import PyPDF2 " )
18
+ _logger .debug ("Cannot import pypdf " )
19
19
20
20
21
21
class PDFParser :
@@ -30,41 +30,30 @@ def get_xml_files(self):
30
30
"""
31
31
res = {}
32
32
with BytesIO (self .pdf_file ) as fd :
33
- xmlfiles = self ._extract_xml_files (fd )
34
- for filename , xml_obj in xmlfiles .items ():
35
- root = self ._extract_xml_root (xml_obj )
36
- if root is None or not len (root ):
37
- continue
38
- res [filename ] = root
33
+ res = self ._extract_xml_files (fd )
39
34
if res :
40
35
_logger .debug ("Valid XML files found in PDF: %s" , list (res .keys ()))
41
36
return res
42
37
43
38
def _extract_xml_files(self, fd):
    """Collect the XML attachments embedded in a PDF.

    Every attachment exposed by ``pypdf.PdfReader(fd).attachments`` whose
    filename is guessed to be ``application/xml`` or ``text/xml`` is parsed
    with ``etree.fromstring``. Only the first content stored under a given
    filename is considered.

    :param fd: binary file-like object holding the PDF data
    :return: dict mapping attachment filename to its parsed XML root.
        Attachments that are not XML, carry no content, fail to parse, or
        whose root has no child element are left out (and logged).
    """
    reader = pypdf.PdfReader(fd)
    # attachment parsing via pypdf doesn't support /Kids
    # cf bug report https://github.com/py-pdf/pypdf/issues/2087
    xmlfiles = {}  # key = filename, value = parsed XML root
    for filename, content_list in reader.attachments.items():
        _logger.debug("Attachment %s found in PDF", filename)
        # guess_type() always returns a (type, encoding) tuple, so the
        # first item can be read without a truthiness check.
        mime_type = mimetypes.guess_type(filename)[0]
        if mime_type not in ("application/xml", "text/xml"):
            continue
        if not content_list:
            # Nothing stored under that name: report it as empty instead of
            # letting an IndexError surface as a bogus parse failure.
            _logger.warning("XML file %s is empty", filename)
            continue
        _logger.debug("Trying to parse XML attachment %s", filename)
        try:
            xml_root = etree.fromstring(content_list[0])
        except Exception as err:
            # Deliberately broad: this is a best-effort scan and one broken
            # attachment must not prevent the others from being read.
            _logger.warning("Failed to parse XML file %s. Error: %s", filename, err)
            continue
        if len(xml_root) > 0:
            _logger.info("Valid XML file %s found in attachments", filename)
            xmlfiles[filename] = xml_root
        else:
            _logger.warning("XML file %s is empty", filename)
    return xmlfiles
59
-
60
def _extract_xml_root(self, xml_obj):
    """Parse the XML payload of an embedded-file PDF object.

    The object is dereferenced with ``getObject()``, the stream found under
    ``/EF`` -> ``/F`` is read with ``getData()`` and the resulting data is
    parsed with ``etree.fromstring``.

    :param xml_obj: PDF object describing one embedded file
    :return: the parsed XML root, or None when any step fails
    """
    # NOTE(review): the surrounding diff marks this helper as removed in
    # favour of pypdf's attachment API -- confirm it is still needed.
    try:
        file_spec = xml_obj.getObject()
        _logger.debug("xml_file_dict=%s", file_spec)
        payload = file_spec["/EF"]["/F"].getData()
        return etree.fromstring(payload)
    except Exception as err:
        # TODO: can't we catch specific exceptions?
        _logger.debug("_pdf_extract_xml_root failed: %s", str(err))
        return None
0 commit comments