fix: harsher hex string parsing to avoid more dire parsing errors (#195)

running fuzztests on my mac was failing erraticaly for one of the files. also recreating a decompressed version didnt work. traced the first problem to a too permissive hex string parsing which caused overreads. and the second problem to a too restrictive zip decoding issue where end of stream error canceled the whole stream reading. which is a shame cause it could parse something worthwhile till then, like in this case.
galkahana · Feb 18, 2023 · 1cd3842 · 1cd3842
1 parent c0de725
commit 1cd3842
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 9 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.15)
 # Soname
 # MAJOR is incremented when symbols are removed or changed in an incompatible way
 # MINOR is incremented when new symbols are added
-project(PDFHummus VERSION 4.5.1)
+project(PDFHummus VERSION 4.5.2)
 
 
 option(USE_BUNDLED  "Whether to use bundled libraries" TRUE)

diff --git a/PDFWriter/InputFlateDecodeStream.cpp b/PDFWriter/InputFlateDecodeStream.cpp
@@ -106,8 +106,8 @@ IOBasicTypes::LongBufferSizeType InputFlateDecodeStream::Read(IOBasicTypes::Byte
 
 IOBasicTypes::LongBufferSizeType InputFlateDecodeStream::DecodeBufferAndRead(const IOBasicTypes::Byte* inBuffer,IOBasicTypes::LongBufferSizeType inSize)
 {
-	if(0 == inSize)
-		return 0; // inflate kinda touchy about getting 0 lengths
+	if(0 == inSize || mEndOfCompressionEoncountered)
+		return 0; // inflate kinda touchy about getting 0 lengths, also stop if already decided to finish
 
 	int inflateResult = Z_OK;
 
@@ -168,10 +168,8 @@ IOBasicTypes::LongBufferSizeType InputFlateDecodeStream::DecodeBufferAndRead(con
 
 	// should be that at the last buffer we'll get here a nice Z_STREAM_END
 	mEndOfCompressionEoncountered = (Z_STREAM_END == inflateResult) || isError(inflateResult);
-	if(Z_OK == inflateResult || Z_STREAM_END == inflateResult)
-		return inSize - mZLibState->avail_out;
-	else
-		return 0;
+	// allows returning what's read even if finished with error. a forgiving approach for end of stream errors
+	return inSize - mZLibState->avail_out;
 }
 
 bool InputFlateDecodeStream::NotEnded()

diff --git a/PDFWriter/PDFParserTokenizer.cpp b/PDFWriter/PDFParserTokenizer.cpp
@@ -21,6 +21,7 @@
 #include "PDFParserTokenizer.h"
 #include "IByteReader.h"
 #include "OutputStringBufferStream.h"
+#include "Trace.h"
 
 using namespace PDFHummus;
 using namespace IOBasicTypes;
@@ -212,8 +213,20 @@ BoolAndString PDFParserTokenizer::GetNextToken()
 							break;
 						}
 
-						if(!IsPDFWhiteSpace(buffer))
-							tokenBuffer.Write(&buffer,1);
+						if(IsPDFWhiteSpace(buffer))
+							continue;
+
+						// verify that getting a hex char or ending char, so that we are sure there's no orcish mischief
+						if(!(buffer>='0' && buffer<='9') &&
+						   !(buffer>='A' && buffer<='F') &&
+						   !(buffer>='a' && buffer<='f') &&
+						   buffer != '>') {
+							TRACE_LOG1("PDFParserTokenizer::GetNextToken, encountered non ascii char in hex string. probably a corruption. byte value = %d", buffer);
+							result.first = false;
+							break;
+						}
+
+						tokenBuffer.Write(&buffer,1);
 					}
 				}
 				result.second = tokenBuffer.ToString();