diff --git a/base_ubl/models/ubl.py b/base_ubl/models/ubl.py index f0231f074b..5f09b3de1c 100644 --- a/base_ubl/models/ubl.py +++ b/base_ubl/models/ubl.py @@ -14,12 +14,6 @@ logger = logging.getLogger(__name__) -try: - from PyPDF2 import PdfFileReader, PdfFileWriter - from PyPDF2.generic import NameObject -except ImportError: - logger.debug("Cannot import PyPDF2") - class BaseUbl(models.AbstractModel): _name = "base.ubl" @@ -592,23 +586,18 @@ def _ubl_check_xml_schema(self, xml_string, document, version="2.1"): ) return True - # TODO: move to pdf_helper @api.model def _ubl_add_xml_in_pdf_buffer(self, xml_string, xml_filename, buffer): - # Add attachment to PDF content. - reader = PdfFileReader(buffer) - writer = PdfFileWriter() - writer.appendPagesFromReader(reader) - writer.addAttachment(xml_filename, xml_string) - # show attachments when opening PDF - writer._root_object.update( - {NameObject("/PageMode"): NameObject("/UseAttachments")} + logger.warning( + "`_ubl_add_xml_in_pdf_buffer` deprecated: use `pdf.helper.pdf_embed_xml`" ) - new_buffer = BytesIO() - writer.write(new_buffer) + pdf_content = buffer.getvalue() + new_content = self.env["pdf.helper"].pdf_embed_xml( + pdf_content, xml_filename, xml_string + ) + new_buffer = BytesIO(new_content) return new_buffer - # TODO: move to pdf_helper @api.model def _embed_ubl_xml_in_pdf_content(self, xml_string, xml_filename, pdf_content): """Add the attachments to the PDF content. @@ -616,16 +605,14 @@ def _embed_ubl_xml_in_pdf_content(self, xml_string, xml_filename, pdf_content): -> it will return the new PDF binary with the embedded XML (used for qweb-pdf reports) """ + logger.warning( + "`_embed_ubl_xml_in_pdf_content` deprecated: use `pdf.helper.pdf_embed_xml`" + ) self.ensure_one() logger.debug("Starting to embed %s in PDF", xml_filename) - - with BytesIO(pdf_content) as reader_buffer: - buffer = self._ubl_add_xml_in_pdf_buffer( - xml_string, xml_filename, reader_buffer - ) - pdf_content = buffer.getvalue() - buffer.close() - + pdf_content = self.env["pdf.helper"].pdf_embed_xml( + pdf_content, xml_filename, xml_string + ) logger.info("%s file added to PDF content", xml_filename) return pdf_content @@ -648,8 +635,8 @@ def embed_xml_in_pdf( if pdf_file: with open(pdf_file, "rb") as f: pdf_content = f.read() - updated_pdf_content = self._embed_ubl_xml_in_pdf_content( - xml_string, xml_filename, pdf_content + updated_pdf_content = self.env["pdf.helper"].pdf_embed_xml( + pdf_content, xml_filename, xml_string ) if pdf_file: with open(pdf_file, "wb") as f: diff --git a/pdf_helper/models/helper.py b/pdf_helper/models/helper.py index dc6ffb9e9f..f07b9258fc 100644 --- a/pdf_helper/models/helper.py +++ b/pdf_helper/models/helper.py @@ -1,11 +1,13 @@ # Copyright 2022 Camptocamp SA # @author: Simone Orsi +# Copyright 2023 Jacques-Etienne Baudoux (BCIM) # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl). -import logging -from pypdf.errors import PdfReadError +import io +import logging -from odoo import models +from odoo import api, models +from odoo.tools.pdf import NameObject, OdooPdfFileReader, OdooPdfFileWriter from ..utils import PDFParser @@ -18,16 +20,31 @@ class PDFHelper(models.AbstractModel): _PDF_PARSER_KLASS = PDFParser + @api.model def pdf_get_xml_files(self, pdf_file): + """Extract XML attachments from pdf + + :param pdf_file: binary PDF file content + :returns: a dict like {$filename: $parsed_xml_file_obj}. + """ parser = self._PDF_PARSER_KLASS(pdf_file) try: return parser.get_xml_files() - except self._pdf_get_xml_files_swallable_exceptions() as err: - # TODO: can't we catch specific exceptions? - # This try/except block was added to reflect what done - # in base_business_document_import till now. + except parser.get_xml_files_swallable_exceptions() as err: _logger.error("PDF file parsing failed: %s", str(err)) return {} - def _pdf_get_xml_files_swallable_exceptions(self): - return (KeyError, PdfReadError) + @api.model + def pdf_embed_xml(self, pdf_content, xml_filename, xml_string): + """Add an XML attachment in a pdf""" + with io.BytesIO(pdf_content) as reader_buffer, io.BytesIO() as new_pdf_stream: + reader = OdooPdfFileReader(reader_buffer, strict=False) + writer = OdooPdfFileWriter() + writer.cloneReaderDocumentRoot(reader) + writer.addAttachment(xml_filename, xml_string, subtype="text/xml") + # show attachments when opening PDF + writer._root_object.update( + {NameObject("/PageMode"): NameObject("/UseAttachments")} + ) + writer.write(new_pdf_stream) + return new_pdf_stream.getvalue() diff --git a/pdf_helper/readme/CONTRIBUTORS.rst b/pdf_helper/readme/CONTRIBUTORS.rst index fe493ea973..ad925fe476 100644 --- a/pdf_helper/readme/CONTRIBUTORS.rst +++ b/pdf_helper/readme/CONTRIBUTORS.rst @@ -1,2 +1,3 @@ * Simone Orsi * Alexis de Lattre +* Jacques-Etienne Baudoux (BCIM) diff --git a/pdf_helper/readme/USAGE.rst b/pdf_helper/readme/USAGE.rst index 4cee3e9dbb..a90789e211 100644 --- a/pdf_helper/readme/USAGE.rst +++ b/pdf_helper/readme/USAGE.rst @@ -2,6 +2,8 @@ Inside Odoo env:: res = env["pdf.helper"].pdf_get_xml_files(pdf_filecontent) + new_pdf_filecontent = env["pdf.helper"].pdf_embed_xml(pdf_filecontent, filename, xml) + Outside Odoo env:: from odoo.addons.pdf_helper.utils import PDFParser diff --git a/pdf_helper/tests/test_helper.py b/pdf_helper/tests/test_helper.py index d0711a0b34..3219e3fc9f 100644 --- a/pdf_helper/tests/test_helper.py +++ b/pdf_helper/tests/test_helper.py @@ -28,14 +28,14 @@ def test_parse_xml(self): class TestPDFHelper(TransactionCase): - def test_parse_xml(self): + def test_get_xml(self): pdf_content = read_test_file("pdf_with_xml_test.pdf", mode="rb") res = self.env["pdf.helper"].pdf_get_xml_files(pdf_content) fname, xml_root = tuple(res.items())[0] self.assertEqual(fname, "factur-x.xml") self.assertTrue(isinstance(xml_root, etree._Element)) - def test_parse_xml_fail(self): + def test_get_xml_fail(self): with self.assertLogs( "odoo.addons.pdf_helper.models.helper", level="ERROR" ) as log_catcher: @@ -44,3 +44,15 @@ def test_parse_xml_fail(self): "PDF file parsing failed: Cannot read an empty file", log_catcher.output[0], ) + + def test_embed_xml(self): + pdf_content = read_test_file("pdf_with_xml_test.pdf", mode="rb") + filename = "test" + xml = b"test" + newpdf_content = self.env["pdf.helper"].pdf_embed_xml( + pdf_content, filename, xml + ) + attachments = self.env["pdf.helper"].pdf_get_xml_files(newpdf_content) + self.assertTrue(filename in attachments) + etree_content = attachments[filename] + self.assertEqual(xml, etree.tostring(etree_content)) diff --git a/pdf_helper/utils.py b/pdf_helper/utils.py index 2f764b1155..fd0c0ff629 100644 --- a/pdf_helper/utils.py +++ b/pdf_helper/utils.py @@ -5,17 +5,19 @@ # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl). import logging -import mimetypes from io import BytesIO +from struct import error as StructError from lxml import etree -_logger = logging.getLogger(__name__) - try: - import pypdf + from PyPDF2.errors import PdfReadError except ImportError: - _logger.debug("Cannot import pypdf") + from PyPDF2.utils import PdfReadError + +from odoo.tools.pdf import OdooPdfFileReader + +_logger = logging.getLogger(__name__) class PDFParser: @@ -35,25 +37,19 @@ def get_xml_files(self): _logger.debug("Valid XML files found in PDF: %s", list(res.keys())) return res - def _extract_xml_files(self, fd): - reader = pypdf.PdfReader(fd) - # attachment parsing via pypdf doesn't support /Kids - # cf my bug report https://github.com/py-pdf/pypdf/issues/2087 - xmlfiles = {} - for filename, content_list in reader.attachments.items(): - _logger.debug("Attachment %s found in PDF", filename) - mime_res = mimetypes.guess_type(filename) - if mime_res and mime_res[0] in ["application/xml", "text/xml"]: + with BytesIO(self.pdf_file) as buffer: + pdf_reader = OdooPdfFileReader(buffer, strict=False) + + # Process embedded files. + for xml_name, content in pdf_reader.getAttachments(): try: - _logger.debug("Trying to parse XML attachment %s", filename) - xml_root = etree.fromstring(content_list[0]) - if len(xml_root) > 0: - _logger.info("Valid XML file %s found in attachments", filename) - xmlfiles[filename] = xml_root - else: - _logger.warning("XML file %s is empty", filename) - except Exception as err: - _logger.warning( - "Failed to parse XML file %s. Error: %s", filename, str(err) - ) - return xmlfiles + res[xml_name] = etree.fromstring(content) + except Exception: + _logger.debug("Non XML file found in PDF") + if res: + _logger.debug("Valid XML files found in PDF: %s", list(res.keys())) + return res + + def get_xml_files_swallable_exceptions(self): + return (NotImplementedError, StructError, PdfReadError) +