From a604278f3b6700b841328b60240b4faf637b4980 Mon Sep 17 00:00:00 2001 From: Krzysztof Pyrkosz Date: Mon, 27 May 2024 19:45:48 +0200 Subject: [PATCH 1/6] Fix hexadecimal decoding (issue #715) --- samples/bugs/Issue715.pdf | Bin 0 -> 350 bytes src/Smalot/PdfParser/Element/ElementHexa.php | 7 +++---- src/Smalot/PdfParser/Parser.php | 2 +- .../Integration/Element/ElementHexaTest.php | 4 ++++ tests/PHPUnit/Integration/ParserTest.php | 14 ++++++++++++++ 5 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 samples/bugs/Issue715.pdf diff --git a/samples/bugs/Issue715.pdf b/samples/bugs/Issue715.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7dc919f8f7a73eaf268b56749eeb22e76474ae40 GIT binary patch literal 350 zcmZ9I!HU8#5Qgvb6m#iOlQhkuP;#*9MHWT9iH9uBmWU+MM$mnFC$a=}Zu#b$Wd0PJ z`En%296%t)@N=qbw!dF2YfLof(6No_?0_XTw_01@k^<@UqCOS2a_w-C$vd(pTzy25 z2NM1mVBfZNW)^M-PECpOnC[A-F0-9]+)\>/is', $content, $match)) { $name = $match['name']; $offset += strpos($content, '<'.$name) + \strlen($name) + 2; // 1 for '>' - // repackage string as standard - $name = '('.self::decode($name).')'; - $element = ElementDate::parse($name, $document); + $name = self::decode($name); + $element = ElementDate::parse('('.$name.')', $document); if (!$element) { - $element = ElementString::parse($name, $document); + $element = new self($name); } return $element; diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index b051f114..f808ab2b 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -296,7 +296,7 @@ protected function parseHeaderElement(?string $type, $value, ?Document $document return ElementString::parse('('.$value.')', $document); case '<': - return $this->parseHeaderElement('(', ElementHexa::decode($value), $document); + return ElementHexa::parse('<'.$value.'>', $document); case '/': return ElementName::parse('/'.$value, $document); diff --git a/tests/PHPUnit/Integration/Element/ElementHexaTest.php b/tests/PHPUnit/Integration/Element/ElementHexaTest.php index 1c22a5c3..e2b7aae5 100644 --- a/tests/PHPUnit/Integration/Element/ElementHexaTest.php +++ b/tests/PHPUnit/Integration/Element/ElementHexaTest.php @@ -130,5 +130,9 @@ public function testParse(): void 007000610 073007100750061002c0020> '); $this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element); + + $testString = '()\\'; + $element = ElementHexa::parse('<'.bin2hex($testString).'>', null, $offset); + $this->assertEquals($testString, (string) $element); } } diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index fa0d3f42..2e1f3aa8 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -438,6 +438,20 @@ public function testIgnoreEncryption(): void // without the configuration option set, an exception would be thrown. } + + /** + * Tests special chars encoded as hex. + */ + public function testSpecialCharsEncodedAsHex(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue715.pdf'; + + $this->fixture = new Parser(); + $document = $this->fixture->parseFile($filename); + $result = (string) ($document->getObjectsByType('Sig')['4_0'] ?? null)?->getHeader()->get('Contents'); + + $this->assertEquals('()\\', $result); + } } class ParserSub extends Parser From edaa7f4bc0b3b73d1c41511243b9315f6cb28e74 Mon Sep 17 00:00:00 2001 From: Krzysztof Pyrkosz Date: Mon, 27 May 2024 21:03:59 +0200 Subject: [PATCH 2/6] More round bracket related tests --- samples/bugs/Issue715.pdf | Bin 350 -> 430 bytes .../Integration/Element/ElementStringTest.php | 3 +++ tests/PHPUnit/Integration/ParserTest.php | 4 +++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/bugs/Issue715.pdf b/samples/bugs/Issue715.pdf index 7dc919f8f7a73eaf268b56749eeb22e76474ae40..b5b1fc7b9a416c3982b2691f0bc866b0ed619421 100644 GIT binary patch delta 101 zcmcb|w2pbg0S!|H1BLvgEE^mBfTH}A(&W@41&s=gN=;3D=c3fal6)XHQ6ouHbK>?Q xE&~WKv^1Kmz^JaQpzoQNmahOZA;`{-tGFbwsHCDOHI2*2#E45()z#mP3ji@P9Pa=C delta 37 tcmZ3-e2;0ufr%e8C+9NCiQCz66_+Fyl~fd^rg0fs8gr?ty863u0RR#23`+n2 diff --git a/tests/PHPUnit/Integration/Element/ElementStringTest.php b/tests/PHPUnit/Integration/Element/ElementStringTest.php index 61d3ad6b..a6e21fb4 100644 --- a/tests/PHPUnit/Integration/Element/ElementStringTest.php +++ b/tests/PHPUnit/Integration/Element/ElementStringTest.php @@ -131,6 +131,9 @@ public function testParse(): void $element = ElementString::parse('(Gutter\\ console\\ assembly)', null, $offset); $this->assertEquals('Gutter console assembly', $element->getContent()); $this->assertEquals(27, $offset); + + $element = ElementString::parse('(())'); + $this->assertEquals('()', $element->getContent()); } public function testGetContent(): void diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 2e1f3aa8..b91f0aae 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -449,8 +449,10 @@ public function testSpecialCharsEncodedAsHex(): void $this->fixture = new Parser(); $document = $this->fixture->parseFile($filename); $result = (string) ($document->getObjectsByType('Sig')['4_0'] ?? null)?->getHeader()->get('Contents'); - $this->assertEquals('()\\', $result); + $details = $document->getDetails(); + $this->assertEquals('x(y)', $details['Producer'] ?? null); + $this->assertEquals('a(b)', $details['Creator'] ?? null); } } From 8e6d3b7663ccf670e200b6e2577d6fabdaf642d6 Mon Sep 17 00:00:00 2001 From: Krzysztof Pyrkosz Date: Tue, 28 May 2024 10:07:28 +0200 Subject: [PATCH 3/6] Fix PHP 7.x compatibility --- tests/PHPUnit/Integration/ParserTest.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index b91f0aae..83a7ff4a 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -448,7 +448,11 @@ public function testSpecialCharsEncodedAsHex(): void $this->fixture = new Parser(); $document = $this->fixture->parseFile($filename); - $result = (string) ($document->getObjectsByType('Sig')['4_0'] ?? null)?->getHeader()->get('Contents'); + $sigObject = $document->getObjectsByType('Sig'); + $result = null; + if (isset($sigObject['4_0'])) { + $result = (string) $sigObject['4_0']->getHeader()->get('Contents'); + } $this->assertEquals('()\\', $result); $details = $document->getDetails(); $this->assertEquals('x(y)', $details['Producer'] ?? null); From 4ba5b8d69c85ce6027b9d96d4ad1cd1e5816fc21 Mon Sep 17 00:00:00 2001 From: Krzysztof Pyrkosz Date: Wed, 29 May 2024 13:16:29 +0200 Subject: [PATCH 4/6] Rewrite ElementString parser --- .../PdfParser/Element/ElementString.php | 80 +++++++++++++++---- 1 file changed, 63 insertions(+), 17 deletions(-) diff --git a/src/Smalot/PdfParser/Element/ElementString.php b/src/Smalot/PdfParser/Element/ElementString.php index 011bcf46..9571ce88 100644 --- a/src/Smalot/PdfParser/Element/ElementString.php +++ b/src/Smalot/PdfParser/Element/ElementString.php @@ -59,25 +59,71 @@ public static function parse(string $content, ?Document $document = null, int &$ if (preg_match('/^\s*\((?P.*)/s', $content, $match)) { $name = $match['name']; - // Find next ')' not escaped. - $cur_start_text = $start_search_end = 0; - while (false !== ($cur_start_pos = strpos($name, ')', $start_search_end))) { - $cur_extract = substr($name, $cur_start_text, $cur_start_pos - $cur_start_text); - preg_match('/(?P[\\\]*)$/s', $cur_extract, $match); - if (!(\strlen($match['escape']) % 2)) { - break; + $delimiterCount = 0; + $position = 0; + $processedName = ''; + do { + $char = substr($name, 0, 1); + $name = substr($name, 1); + ++$position; + switch ($char) { + // matched delimiters should be treated as part of string + case '(': + $processedName .= $char; + ++$delimiterCount; + break; + case ')': + if (0 === $delimiterCount) { + $name = substr($name, 1); + break 2; + } + $processedName .= $char; + --$delimiterCount; + break; + // escaped chars + case '\\': + $nextChar = substr($name, 0, 1); + switch ($nextChar) { + // end-of-line markers (CR, LF, CRLF) should be ignored + case "\r": + case "\n": + preg_match('/^\\r?\\n?/', $name, $matches); + $name = substr($name, \strlen($matches[0])); + $position += \strlen($matches[0]); + break; + // process LF, CR, HT, BS, FF + case 'n': + case 't': + case 'r': + case 'b': + case 'f': + $processedName .= stripcslashes('\\'.$nextChar); + $name = substr($name, 1); + ++$position; + break; + // decode escaped parentheses and backslash + case '(': + case ')': + case '\\': + case ' ': // TODO: this should probably be removed - kept for compatibility + $processedName .= $nextChar; + $name = substr($name, 1); + ++$position; + break; + // TODO: process octal encoding (but it is also processed later) + // keep backslash in other cases + default: + $processedName .= $char; + } + break; + default: + $processedName .= $char; } - $start_search_end = $cur_start_pos + 1; - } + } while (\strlen($name)); - // Extract string. - $name = substr($name, 0, (int) $cur_start_pos); - $offset += strpos($content, '(') + $cur_start_pos + 2; // 2 for '(' and ')' - $name = str_replace( - ['\\\\', '\\ ', '\\/', '\(', '\)', '\n', '\r', '\t'], - ['\\', ' ', '/', '(', ')', "\n", "\r", "\t"], - $name - ); + $offset += strpos($content, '(') + 1 + $position; + + $name = $processedName; // Decode string. $name = Font::decodeOctal($name); From e738b8edd41e5be7b2c74917cb658a6a1469da31 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Wed, 5 Jun 2024 09:43:35 +0200 Subject: [PATCH 5/6] mostly refinements and some code refactoring (see below) - Moved some test-related code to separate function to improve code readability - renamed and refined testSpecialCharsEncodedAsHex: you don't need an if-clause in a test if you insist on certain values along the way, just use assertX to check for expected values ("fail early") - ElementString: moved the part which handles escaped characters to a separate function to improve code readability - added references / comments --- .../PdfParser/Element/ElementString.php | 82 +++++++++++-------- .../Integration/Element/ElementHexaTest.php | 9 ++ .../Integration/Element/ElementStringTest.php | 6 ++ tests/PHPUnit/Integration/ParserTest.php | 13 +-- 4 files changed, 69 insertions(+), 41 deletions(-) diff --git a/src/Smalot/PdfParser/Element/ElementString.php b/src/Smalot/PdfParser/Element/ElementString.php index 9571ce88..7bdcc38e 100644 --- a/src/Smalot/PdfParser/Element/ElementString.php +++ b/src/Smalot/PdfParser/Element/ElementString.php @@ -51,6 +51,51 @@ public function equals($value): bool return $value == $this->value; } + /** + * Part of parsing process to handle escaped characters. + * Note, most parameters are passed by reference. + * + * Further information in PDF specification (page 53): + * https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + */ + private static function handleEscapedCharacters(string &$name, int &$position, string &$processedName, string $char): void + { + // escaped chars + $nextChar = substr($name, 0, 1); + switch ($nextChar) { + // end-of-line markers (CR, LF, CRLF) should be ignored + case "\r": + case "\n": + preg_match('/^\\r?\\n?/', $name, $matches); + $name = substr($name, \strlen($matches[0])); + $position += \strlen($matches[0]); + break; + // process LF, CR, HT, BS, FF + case 'n': + case 't': + case 'r': + case 'b': + case 'f': + $processedName .= stripcslashes('\\'.$nextChar); + $name = substr($name, 1); + ++$position; + break; + // decode escaped parentheses and backslash + case '(': + case ')': + case '\\': + case ' ': // TODO: this should probably be removed - kept for compatibility + $processedName .= $nextChar; + $name = substr($name, 1); + ++$position; + break; + // TODO: process octal encoding (but it is also processed later) + // keep backslash in other cases + default: + $processedName .= $char; + } + } + /** * @return bool|ElementString */ @@ -80,46 +125,13 @@ public static function parse(string $content, ?Document $document = null, int &$ $processedName .= $char; --$delimiterCount; break; - // escaped chars case '\\': - $nextChar = substr($name, 0, 1); - switch ($nextChar) { - // end-of-line markers (CR, LF, CRLF) should be ignored - case "\r": - case "\n": - preg_match('/^\\r?\\n?/', $name, $matches); - $name = substr($name, \strlen($matches[0])); - $position += \strlen($matches[0]); - break; - // process LF, CR, HT, BS, FF - case 'n': - case 't': - case 'r': - case 'b': - case 'f': - $processedName .= stripcslashes('\\'.$nextChar); - $name = substr($name, 1); - ++$position; - break; - // decode escaped parentheses and backslash - case '(': - case ')': - case '\\': - case ' ': // TODO: this should probably be removed - kept for compatibility - $processedName .= $nextChar; - $name = substr($name, 1); - ++$position; - break; - // TODO: process octal encoding (but it is also processed later) - // keep backslash in other cases - default: - $processedName .= $char; - } + self::handleEscapedCharacters($name, $position, $processedName, $char); break; default: $processedName .= $char; } - } while (\strlen($name)); + } while ('' !== $name); $offset += strpos($content, '(') + 1 + $position; diff --git a/tests/PHPUnit/Integration/Element/ElementHexaTest.php b/tests/PHPUnit/Integration/Element/ElementHexaTest.php index e2b7aae5..d106918e 100644 --- a/tests/PHPUnit/Integration/Element/ElementHexaTest.php +++ b/tests/PHPUnit/Integration/Element/ElementHexaTest.php @@ -130,7 +130,16 @@ public function testParse(): void 007000610 073007100750061002c0020> '); $this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element); + } + /** + * Closing round bracket encoded in hexadecimal format breaks parsing - string is truncated. + * + * @see https://github.com/smalot/pdfparser/issues/715 + */ + public function testIssue715(): void + { + $offset = 0; $testString = '()\\'; $element = ElementHexa::parse('<'.bin2hex($testString).'>', null, $offset); $this->assertEquals($testString, (string) $element); diff --git a/tests/PHPUnit/Integration/Element/ElementStringTest.php b/tests/PHPUnit/Integration/Element/ElementStringTest.php index a6e21fb4..032214b8 100644 --- a/tests/PHPUnit/Integration/Element/ElementStringTest.php +++ b/tests/PHPUnit/Integration/Element/ElementStringTest.php @@ -131,7 +131,13 @@ public function testParse(): void $element = ElementString::parse('(Gutter\\ console\\ assembly)', null, $offset); $this->assertEquals('Gutter console assembly', $element->getContent()); $this->assertEquals(27, $offset); + } + /** + * @see https://github.com/smalot/pdfparser/issues/715 + */ + public function testParseIssue715(): void + { $element = ElementString::parse('(())'); $this->assertEquals('()', $element->getContent()); } diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 83a7ff4a..b26384b0 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -441,19 +441,20 @@ public function testIgnoreEncryption(): void /** * Tests special chars encoded as hex. + * + * @see https://github.com/smalot/pdfparser/issues/715 */ - public function testSpecialCharsEncodedAsHex(): void + public function testIssue715SpecialCharsEncodedAsHex(): void { $filename = $this->rootDir.'/samples/bugs/Issue715.pdf'; $this->fixture = new Parser(); $document = $this->fixture->parseFile($filename); $sigObject = $document->getObjectsByType('Sig'); - $result = null; - if (isset($sigObject['4_0'])) { - $result = (string) $sigObject['4_0']->getHeader()->get('Contents'); - } - $this->assertEquals('()\\', $result); + + $this->assertTrue(isset($sigObject['4_0'])); + $this->assertEquals('()\\', (string) $sigObject['4_0']->getHeader()->get('Contents')); + $details = $document->getDetails(); $this->assertEquals('x(y)', $details['Producer'] ?? null); $this->assertEquals('a(b)', $details['Creator'] ?? null); From 53d5ccd9af9db842fbe09a24a48f10458b7f0711 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Wed, 5 Jun 2024 10:03:50 +0200 Subject: [PATCH 6/6] ElementStringTest: added further tests, ... ... which fail (mostly) without the fixes in this branch. --- .../Integration/Element/ElementStringTest.php | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/PHPUnit/Integration/Element/ElementStringTest.php b/tests/PHPUnit/Integration/Element/ElementStringTest.php index 032214b8..8207426f 100644 --- a/tests/PHPUnit/Integration/Element/ElementStringTest.php +++ b/tests/PHPUnit/Integration/Element/ElementStringTest.php @@ -140,6 +140,29 @@ public function testParseIssue715(): void { $element = ElementString::parse('(())'); $this->assertEquals('()', $element->getContent()); + + // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + // page 54 + $string = '(Strings may contain balanced parentheses ( ) and +special characters (*!&}^% and so on).)'; + $element = ElementString::parse($string); + $this->assertEquals('Strings may contain balanced parentheses ( ) and +special characters (*!&}^% and so on).', $element->getContent()); + + // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + // page 55 + $string = '( This string has an end−of−line at the end of it. +)'; + $element = ElementString::parse($string); + $this->assertEquals(' This string has an end−of−line at the end of it.'.\PHP_EOL, $element->getContent()); + + // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + // page 55 + $string = '( These \ +two strings \ +are the same.)'; + $element = ElementString::parse($string); + $this->assertEquals(' These two strings are the same.', $element->getContent()); } public function testGetContent(): void