From 516a6e72e796ed443c1a9c64bc59e202e0f6afe2 Mon Sep 17 00:00:00 2001 From: gal kahana Date: Sat, 22 Jun 2024 12:29:12 +0300 Subject: [PATCH] #261: recursion in page tree parsing. avoid with storing the current path and identifying cycles --- PDFWriter/CMakeLists.txt | 4 + PDFWriter/PDFParser.cpp | 26 +++++- PDFWriter/PDFParser.h | 3 +- PDFWriter/PDFParsingPath.cpp | 75 ++++++++++++++++++ PDFWriter/PDFParsingPath.h | 47 +++++++++++ PDFWriterTesting/CMakeLists.txt | 10 +++ .../Materials/fuzzing/BrokenPageIDs.pdf | Bin 0 -> 1703 bytes PDFWriterTesting/PDFParserFuzzSanity.cpp | 32 ++++++++ 8 files changed, 193 insertions(+), 4 deletions(-) create mode 100644 PDFWriter/PDFParsingPath.cpp create mode 100644 PDFWriter/PDFParsingPath.h create mode 100644 PDFWriterTesting/Materials/fuzzing/BrokenPageIDs.pdf create mode 100644 PDFWriterTesting/PDFParserFuzzSanity.cpp diff --git a/PDFWriter/CMakeLists.txt b/PDFWriter/CMakeLists.txt index 0c314d1a..e25ca990 100644 --- a/PDFWriter/CMakeLists.txt +++ b/PDFWriter/CMakeLists.txt @@ -113,6 +113,7 @@ PDFPageMergingHelper.cpp PDFParser.cpp PDFParserTokenizer.cpp PDFParsingOptions.cpp +PDFParsingPath.cpp PDFReal.cpp PDFRectangle.cpp PDFStream.cpp @@ -313,6 +314,7 @@ PDFPageMergingHelper.h PDFParser.h PDFParserTokenizer.h PDFParsingOptions.h +PDFParsingPath.h PDFReal.h PDFRectangle.h PDFStream.h @@ -720,6 +722,8 @@ PDFNull.h PDFObject.cpp PDFObject.h PDFObjectCast.h +PDFParsingPath.cpp +PDFParsingPath.h PDFReal.cpp PDFReal.h PDFStreamInput.cpp diff --git a/PDFWriter/PDFParser.cpp b/PDFWriter/PDFParser.cpp index daf9a251..98b23267 100644 --- a/PDFWriter/PDFParser.cpp +++ b/PDFWriter/PDFParser.cpp @@ -875,13 +875,19 @@ EStatusCode PDFParser::ParsePagesObjectIDs() EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID) { unsigned long currentPageIndex = 0; + PDFParsingPath parsingPath; - return ParsePagesIDs(inPageNode,inNodeObjectID,currentPageIndex); + return ParsePagesIDs(inPageNode, inNodeObjectID, currentPageIndex, parsingPath); } static const std::string scPage = "Page"; static const std::string scPages = "Pages"; -EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex) +EStatusCode PDFParser::ParsePagesIDs( + PDFDictionary* inPageNode, + ObjectIDType inNodeObjectID, + unsigned long& ioCurrentPageIndex, + PDFParsingPath& ioParsingPath +) { // recursion. // if this is a page, write it's node object ID in the current page index and +1 @@ -891,6 +897,12 @@ EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNo do { + // add object to parsing path, checking for cycles + if(ioParsingPath.EnterObject(inNodeObjectID) != eSuccess) { + status = PDFHummus::eFailure; + break; + } + PDFObjectCastPtr objectType(inPageNode->QueryDirectObject("Type")); if(!objectType) { @@ -952,7 +964,7 @@ EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNo break; } - status = ParsePagesIDs(pageNodeObject.GetPtr(),((PDFIndirectObjectReference*)it.GetItem())->mObjectID,ioCurrentPageIndex); + status = ParsePagesIDs(pageNodeObject.GetPtr(),((PDFIndirectObjectReference*)it.GetItem())->mObjectID, ioCurrentPageIndex, ioParsingPath); } } else @@ -961,6 +973,14 @@ EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNo status = PDFHummus::eFailure; break; } + + + // exit object + if(ioParsingPath.ExitObject(inNodeObjectID) != eSuccess) { + status = PDFHummus::eFailure; + break; + } + }while(false); return status; diff --git a/PDFWriter/PDFParser.h b/PDFWriter/PDFParser.h index e2a8ca9c..aff8fa9d 100644 --- a/PDFWriter/PDFParser.h +++ b/PDFWriter/PDFParser.h @@ -31,6 +31,7 @@ #include "DecryptionHelper.h" #include "PDFParsingOptions.h" #include "InputOffsetStream.h" +#include "PDFParsingPath.h" #include #include @@ -214,7 +215,7 @@ class PDFParser PDFHummus::EStatusCode SetupDecryptionHelper(const std::string& inPassword); PDFHummus::EStatusCode ParsePagesObjectIDs(); PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID); - PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex); + PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex, PDFParsingPath& ioParsingPath); PDFHummus::EStatusCode ParsePreviousXrefs(PDFDictionary* inTrailer); PDFHummus::EStatusCode MergeXrefWithMainXref(XrefEntryInputVector& inTableToMerge,ObjectIDType inMergedTableSize); PDFHummus::EStatusCode ParseFileDirectory(); diff --git a/PDFWriter/PDFParsingPath.cpp b/PDFWriter/PDFParsingPath.cpp new file mode 100644 index 00000000..5f0db363 --- /dev/null +++ b/PDFWriter/PDFParsingPath.cpp @@ -0,0 +1,75 @@ + +/* + Source File : PDFParsingPath.cpp + + + Copyright 2011 Gal Kahana PDFWriter + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +*/ + +#include "PDFParsingPath.h" +#include "Trace.h" + +#include + + +using namespace std; +using namespace PDFHummus; + +PDFParsingPath::PDFParsingPath() { + +} + + +EStatusCode PDFParsingPath::EnterObject(ObjectIDType inObjectId) { + ObjectIDTypeList::iterator it = find(mObjectsPath.begin(), mObjectsPath.end(), inObjectId); + if(it != mObjectsPath.end()) { + TRACE_LOG2("PDFParsingPath::EnterObject, attempting to enter object %ld, where the object already exists in the current path: %s",inObjectId,PrintPath().c_str()); + return eFailure; + } + + mObjectsPath.push_back(inObjectId); + return eSuccess; +} + +EStatusCode PDFParsingPath::ExitObject(ObjectIDType inObjectId) { + if(mObjectsPath.size() == 0 || mObjectsPath.back() != inObjectId) { + TRACE_LOG2("PDFParsingPath::ExitObject, attempting to exit object %ld, where the object is NOT the last entered: %s",inObjectId,PrintPath().c_str()); + return eFailure; + } + + mObjectsPath.pop_back(); + return eSuccess; + +} + +void PDFParsingPath::Reset() { + mObjectsPath.clear(); +} + +std::string PDFParsingPath::PrintPath() { + std::stringstream pathWriter; + ObjectIDTypeList::iterator it = mObjectsPath.begin(); + + if(it != mObjectsPath.end()) { + pathWriter<<*it; + for(; it != mObjectsPath.end(); ++it) { + pathWriter<<", "<<*it; + } + } + + return pathWriter.str(); +} \ No newline at end of file diff --git a/PDFWriter/PDFParsingPath.h b/PDFWriter/PDFParsingPath.h new file mode 100644 index 00000000..95f80880 --- /dev/null +++ b/PDFWriter/PDFParsingPath.h @@ -0,0 +1,47 @@ +/* + Source File : PDFParsingPath.h + + + Copyright 2011 Gal Kahana PDFWriter + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +*/ +#pragma once + +#include "EStatusCode.h" +#include "ObjectsBasicTypes.h" + +#include +#include + + +typedef std::list ObjectIDTypeList; + + +class PDFParsingPath { + +public: + PDFParsingPath(); + + PDFHummus::EStatusCode EnterObject(ObjectIDType inObjectId); + PDFHummus::EStatusCode ExitObject(ObjectIDType inObjectId); + + void Reset(); +private: + + ObjectIDTypeList mObjectsPath; + + std::string PrintPath(); +}; diff --git a/PDFWriterTesting/CMakeLists.txt b/PDFWriterTesting/CMakeLists.txt index 5a233c32..976401d0 100644 --- a/PDFWriterTesting/CMakeLists.txt +++ b/PDFWriterTesting/CMakeLists.txt @@ -52,6 +52,7 @@ create_test_sourcelist (Tests PDFEmbedTest.cpp PDFObjectCastTest.cpp PDFObjectParserTest.cpp + PDFParserFuzzSanity.cpp PDFParserTest.cpp PDFWithPassword.cpp PFBStreamTest.cpp @@ -102,12 +103,21 @@ endif(APPLE) # Add all the ADD_TEST for each test (reusing the create_test_sourcelist list minus the generated executable) set (TestsToRun ${Tests}) list(REMOVE_AT TestsToRun 0) # removing first item which is PDFWriterTesting. started getting a full path for it, so moved to REMOVE_AT instead of REMOVE_ITEM with the file name +list(REMOVE_ITEM TestsToRun PDFParserFuzzSanity.cpp) # will add tests for this specifically foreach (test ${TestsToRun}) get_filename_component (TName ${test} NAME_WE) add_test (NAME ${TName} COMMAND PDFWriterTesting ${TName} ${CMAKE_CURRENT_SOURCE_DIR}/Materials ${CMAKE_BINARY_DIR}/Testing/Output) endforeach () +# fuzz test specific +file(GLOB fuzztestfiles ${CMAKE_CURRENT_SOURCE_DIR}/Materials/fuzzing/*) +foreach (fuzztestfile ${fuzztestfiles}) + get_filename_component (TName ${fuzztestfile} NAME) + add_test (NAME FuzzTest_${TName} COMMAND PDFWriterTesting PDFParserFuzzSanity ${CMAKE_CURRENT_SOURCE_DIR}/Materials ${CMAKE_BINARY_DIR}/Testing/Output ${fuzztestfile}) +endforeach () + + # create temp dir for output files set (TmpOutputDir ${CMAKE_BINARY_DIR}/Testing/Output) diff --git a/PDFWriterTesting/Materials/fuzzing/BrokenPageIDs.pdf b/PDFWriterTesting/Materials/fuzzing/BrokenPageIDs.pdf new file mode 100644 index 0000000000000000000000000000000000000000..785597719436165aed6ec22d8d685778a61bbc7a GIT binary patch literal 1703 zcmeHG!A^rf5IsY}Kg@-r=`L;2CWMRHi!l=P)_6d>H5gM2tcm*dz8RQ`Nv|dx3SpR; zotd3|Z#FKj)@R0;2;b>BA@Dy4T#%vW=`{+oEDD3!!`Hi`Sy8<>whGykR){Itz8!j? z4Lo?w5??}(yjE&|B$a~MZ!D`4q)tD?h)wgMe30iTBtnx7cCx%osYvX^>wW2oUJRR{ z)hR=2nLF?~Mv^H$+H=R;UN#?lPYJ6x^|k{kG7=<5Et%SJEz5+Ci!uGmDA(Ze>U65c znV+rOfnBl-<(QeqjES72FSU!IzrFTn?HmY5&j(+Bf?jIeQQvBb@sX87u1yT!R-bRH z`pvabe_!wGZzpi!0yd2&!CvkvaNAHOX%IVA`2Wo2wsKH&gg9qhr6F)SSbstWemh^a QzNg40>3I~!@%7#M2W+H2m;e9( literal 0 HcmV?d00001 diff --git a/PDFWriterTesting/PDFParserFuzzSanity.cpp b/PDFWriterTesting/PDFParserFuzzSanity.cpp new file mode 100644 index 00000000..5987313a --- /dev/null +++ b/PDFWriterTesting/PDFParserFuzzSanity.cpp @@ -0,0 +1,32 @@ +#include "PDFParser.h" +#include "InputFile.h" +#include "Trace.h" + +#include "testing/TestIO.h" + +#include + +using namespace std; +using namespace PDFHummus; + +int PDFParserFuzzSanity(int argc, char* argv[]) { + PDFParser parser; + InputFile pdfFile; + + std::string path = argv[3]; + + EStatusCode status = pdfFile.OpenFile(path); + if(status != PDFHummus::eSuccess) + { + cout<<"unable to open file for reading, Path:"<