From 516a6e72e796ed443c1a9c64bc59e202e0f6afe2 Mon Sep 17 00:00:00 2001
From: gal kahana <gal.kahana@hotmail.com>
Date: Sat, 22 Jun 2024 12:29:12 +0300
Subject: [PATCH] #261: recursion in page tree parsing. avoid with storing the
 current path and identifying cycles

---
 PDFWriter/CMakeLists.txt                      |   4 +
 PDFWriter/PDFParser.cpp                       |  26 +++++-
 PDFWriter/PDFParser.h                         |   3 +-
 PDFWriter/PDFParsingPath.cpp                  |  75 ++++++++++++++++++
 PDFWriter/PDFParsingPath.h                    |  47 +++++++++++
 PDFWriterTesting/CMakeLists.txt               |  10 +++
 .../Materials/fuzzing/BrokenPageIDs.pdf       | Bin 0 -> 1703 bytes
 PDFWriterTesting/PDFParserFuzzSanity.cpp      |  32 ++++++++
 8 files changed, 193 insertions(+), 4 deletions(-)
 create mode 100644 PDFWriter/PDFParsingPath.cpp
 create mode 100644 PDFWriter/PDFParsingPath.h
 create mode 100644 PDFWriterTesting/Materials/fuzzing/BrokenPageIDs.pdf
 create mode 100644 PDFWriterTesting/PDFParserFuzzSanity.cpp
diff --git a/PDFWriter/CMakeLists.txt b/PDFWriter/CMakeLists.txt
index 0c314d1a..e25ca990 100644
--- a/PDFWriter/CMakeLists.txt
+++ b/PDFWriter/CMakeLists.txt
@@ -113,6 +113,7 @@ PDFPageMergingHelper.cpp
 PDFParser.cpp
 PDFParserTokenizer.cpp
 PDFParsingOptions.cpp
+PDFParsingPath.cpp
 PDFReal.cpp
 PDFRectangle.cpp
 PDFStream.cpp
@@ -313,6 +314,7 @@ PDFPageMergingHelper.h
 PDFParser.h
 PDFParserTokenizer.h
 PDFParsingOptions.h
+PDFParsingPath.h
 PDFReal.h
 PDFRectangle.h
 PDFStream.h
@@ -720,6 +722,8 @@ PDFNull.h
 PDFObject.cpp
 PDFObject.h
 PDFObjectCast.h
+PDFParsingPath.cpp
+PDFParsingPath.h
 PDFReal.cpp
 PDFReal.h
 PDFStreamInput.cpp
diff --git a/PDFWriter/PDFParser.cpp b/PDFWriter/PDFParser.cpp
index daf9a251..98b23267 100644
--- a/PDFWriter/PDFParser.cpp
+++ b/PDFWriter/PDFParser.cpp
@@ -875,13 +875,19 @@ EStatusCode PDFParser::ParsePagesObjectIDs()
 EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID)
 {
 	unsigned long currentPageIndex = 0;
+	PDFParsingPath parsingPath;
 
-	return ParsePagesIDs(inPageNode,inNodeObjectID,currentPageIndex);
+	return ParsePagesIDs(inPageNode, inNodeObjectID, currentPageIndex, parsingPath);
 }
 
 static const std::string scPage = "Page";
 static const std::string scPages = "Pages";
-EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex)
+EStatusCode PDFParser::ParsePagesIDs(
+	PDFDictionary* inPageNode,
+	ObjectIDType inNodeObjectID,
+	unsigned long& ioCurrentPageIndex,
+	PDFParsingPath& ioParsingPath
+)
 {
 	// recursion.
 	// if this is a page, write it's node object ID in the current page index and +1
@@ -891,6 +897,12 @@ EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNo
 
 	do
 	{
+		// add object to parsing path, checking for cycles
+		if(ioParsingPath.EnterObject(inNodeObjectID) != eSuccess) {
+			status = PDFHummus::eFailure;
+			break;
+		}
+
 		PDFObjectCastPtr<PDFName> objectType(inPageNode->QueryDirectObject("Type"));
 		if(!objectType)
 		{
@@ -952,7 +964,7 @@ EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNo
 					break;
 				}
 
-				status = ParsePagesIDs(pageNodeObject.GetPtr(),((PDFIndirectObjectReference*)it.GetItem())->mObjectID,ioCurrentPageIndex);
+				status = ParsePagesIDs(pageNodeObject.GetPtr(),((PDFIndirectObjectReference*)it.GetItem())->mObjectID, ioCurrentPageIndex, ioParsingPath);
 			}
 		}
 		else
@@ -961,6 +973,14 @@ EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNo
 			status = PDFHummus::eFailure;
 			break;
 		}
+
+
+		// exit object
+		if(ioParsingPath.ExitObject(inNodeObjectID) != eSuccess) {
+			status = PDFHummus::eFailure;
+			break;
+		}
+
 	}while(false);
 
 	return status;
diff --git a/PDFWriter/PDFParser.h b/PDFWriter/PDFParser.h
index e2a8ca9c..aff8fa9d 100644
--- a/PDFWriter/PDFParser.h
+++ b/PDFWriter/PDFParser.h
@@ -31,6 +31,7 @@
 #include "DecryptionHelper.h"
 #include "PDFParsingOptions.h"
 #include "InputOffsetStream.h"
+#include "PDFParsingPath.h"
 
 #include <map>
 #include <set>
@@ -214,7 +215,7 @@ class PDFParser
 	PDFHummus::EStatusCode SetupDecryptionHelper(const std::string& inPassword);
 	PDFHummus::EStatusCode ParsePagesObjectIDs();
 	PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID);
-	PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex);
+	PDFHummus::EStatusCode ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex, PDFParsingPath& ioParsingPath);
 	PDFHummus::EStatusCode ParsePreviousXrefs(PDFDictionary* inTrailer);
 	PDFHummus::EStatusCode MergeXrefWithMainXref(XrefEntryInputVector& inTableToMerge,ObjectIDType inMergedTableSize);
 	PDFHummus::EStatusCode ParseFileDirectory();
diff --git a/PDFWriter/PDFParsingPath.cpp b/PDFWriter/PDFParsingPath.cpp
new file mode 100644
index 00000000..5f0db363
--- /dev/null
+++ b/PDFWriter/PDFParsingPath.cpp
@@ -0,0 +1,75 @@
+
+/*
+   Source File : PDFParsingPath.cpp
+
+
+   Copyright 2011 Gal Kahana PDFWriter
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+*/
+
+#include "PDFParsingPath.h"
+#include "Trace.h"
+
+#include <sstream>
+
+
+using namespace std;
+using namespace PDFHummus;
+
+PDFParsingPath::PDFParsingPath() {
+
+}
+
+
+EStatusCode PDFParsingPath::EnterObject(ObjectIDType inObjectId) {
+    ObjectIDTypeList::iterator it = find(mObjectsPath.begin(), mObjectsPath.end(), inObjectId);
+    if(it != mObjectsPath.end()) {
+        TRACE_LOG2("PDFParsingPath::EnterObject, attempting to enter object %ld, where the object already exists in the current path: %s",inObjectId,PrintPath().c_str());
+        return eFailure;
+    }
+
+    mObjectsPath.push_back(inObjectId);
+    return eSuccess;
+}
+
+EStatusCode PDFParsingPath::ExitObject(ObjectIDType inObjectId) {
+    if(mObjectsPath.size() == 0 || mObjectsPath.back() != inObjectId) {
+        TRACE_LOG2("PDFParsingPath::ExitObject, attempting to exit object %ld, where the object is NOT the last entered: %s",inObjectId,PrintPath().c_str());
+        return eFailure;
+    }
+
+    mObjectsPath.pop_back();
+    return eSuccess;
+
+}
+
+void PDFParsingPath::Reset() {
+    mObjectsPath.clear();
+}
+
+std::string PDFParsingPath::PrintPath() {
+	std::stringstream pathWriter;
+    ObjectIDTypeList::iterator it = mObjectsPath.begin();
+	
+    if(it != mObjectsPath.end()) {
+        pathWriter<<*it;
+        for(; it != mObjectsPath.end(); ++it) {
+            pathWriter<<", "<<*it;
+        }
+    }
+
+    return pathWriter.str();
+}
\ No newline at end of file
diff --git a/PDFWriter/PDFParsingPath.h b/PDFWriter/PDFParsingPath.h
new file mode 100644
index 00000000..95f80880
--- /dev/null
+++ b/PDFWriter/PDFParsingPath.h
@@ -0,0 +1,47 @@
+/*
+   Source File : PDFParsingPath.h
+
+
+   Copyright 2011 Gal Kahana PDFWriter
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   
+*/
+#pragma once
+
+#include "EStatusCode.h"
+#include "ObjectsBasicTypes.h"
+
+#include <list>
+#include <string>
+
+
+typedef std::list<ObjectIDType> ObjectIDTypeList;
+
+
+class PDFParsingPath {
+
+public:
+    PDFParsingPath();
+
+    PDFHummus::EStatusCode EnterObject(ObjectIDType inObjectId);
+    PDFHummus::EStatusCode ExitObject(ObjectIDType inObjectId);
+
+    void Reset();
+private:
+
+    ObjectIDTypeList mObjectsPath;
+
+    std::string PrintPath();
+};
diff --git a/PDFWriterTesting/CMakeLists.txt b/PDFWriterTesting/CMakeLists.txt
index 5a233c32..976401d0 100644
--- a/PDFWriterTesting/CMakeLists.txt
+++ b/PDFWriterTesting/CMakeLists.txt
@@ -52,6 +52,7 @@ create_test_sourcelist (Tests
   PDFEmbedTest.cpp
   PDFObjectCastTest.cpp
   PDFObjectParserTest.cpp
+  PDFParserFuzzSanity.cpp
   PDFParserTest.cpp
   PDFWithPassword.cpp
   PFBStreamTest.cpp
@@ -102,12 +103,21 @@ endif(APPLE)
 # Add all the ADD_TEST for each test (reusing the create_test_sourcelist list minus the generated executable)
 set (TestsToRun ${Tests})
 list(REMOVE_AT TestsToRun 0) # removing first item which is PDFWriterTesting. started getting a full path for it, so moved to REMOVE_AT instead of REMOVE_ITEM with the file name
+list(REMOVE_ITEM TestsToRun PDFParserFuzzSanity.cpp) # will add tests for this specifically
 
 foreach (test ${TestsToRun})
   get_filename_component (TName ${test} NAME_WE)
   add_test (NAME ${TName} COMMAND PDFWriterTesting ${TName} ${CMAKE_CURRENT_SOURCE_DIR}/Materials ${CMAKE_BINARY_DIR}/Testing/Output)
 endforeach ()
 
+# fuzz test specific
+file(GLOB fuzztestfiles ${CMAKE_CURRENT_SOURCE_DIR}/Materials/fuzzing/*)
+foreach (fuzztestfile ${fuzztestfiles})
+    get_filename_component (TName ${fuzztestfile} NAME)
+    add_test (NAME FuzzTest_${TName} COMMAND PDFWriterTesting PDFParserFuzzSanity ${CMAKE_CURRENT_SOURCE_DIR}/Materials ${CMAKE_BINARY_DIR}/Testing/Output ${fuzztestfile})
+endforeach ()
+
+
 # create temp dir for output files
 set (TmpOutputDir ${CMAKE_BINARY_DIR}/Testing/Output)
 
diff --git a/PDFWriterTesting/Materials/fuzzing/BrokenPageIDs.pdf b/PDFWriterTesting/Materials/fuzzing/BrokenPageIDs.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..785597719436165aed6ec22d8d685778a61bbc7a
GIT binary patch
literal 1703
zcmeHG!A^rf5IsY}Kg@-r=`L;2CWMRHi!l=P)_6d>H5gM2tcm*dz8RQ`Nv|dx3SpR;
zotd3|Z#FKj)@R0;2;b>BA@Dy4T#%vW=`{+oEDD3!!`Hi`Sy8<>whGykR){Itz8!j?
z4Lo?w5??}(yjE&|B$a~MZ!D`4q)tD?h)wgMe30iTBtnx7cCx%osYvX^>wW2oUJRR{
z)hR=2nLF?~Mv^H$+H=R;UN#?lPYJ6x^|k{kG7=<5Et%SJEz5+Ci!uGmDA(Ze>U65c
znV+rOfnBl-<(QeqjES72FSU!IzrFTn?HmY5&j(+Bf?jIeQQvBb@sX87u1yT!R-bRH
z`pvabe_!wGZzpi!0yd2&!CvkvaNAHOX%IVA`2Wo2wsKH&gg9qhr6F)SSbstWemh^a
QzNg40>3I~!@%7#M2W+H2m;e9(

literal 0
HcmV?d00001

diff --git a/PDFWriterTesting/PDFParserFuzzSanity.cpp b/PDFWriterTesting/PDFParserFuzzSanity.cpp
new file mode 100644
index 00000000..5987313a
--- /dev/null
+++ b/PDFWriterTesting/PDFParserFuzzSanity.cpp
@@ -0,0 +1,32 @@
+#include "PDFParser.h"
+#include "InputFile.h"
+#include "Trace.h"
+
+#include "testing/TestIO.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace PDFHummus;
+
+int PDFParserFuzzSanity(int argc, char* argv[]) {
+    PDFParser parser;
+    InputFile pdfFile;
+
+    std::string path = argv[3];
+
+    EStatusCode status = pdfFile.OpenFile(path);
+    if(status != PDFHummus::eSuccess)
+    {
+        cout<<"unable to open file for reading, Path:"<<path.c_str()<<"\n";
+        return 1;
+    }
+
+    // traces on
+    Trace::DefaultTrace().SetLogSettings(BuildRelativeOutputPath(argv,  "PDFParserFuzzSanity.txt"), true, true);
+
+    // checks that returns, status doesn't matter
+    parser.StartPDFParsing(pdfFile.GetInputStream());
+
+    return status;
+}
\ No newline at end of file