Handle sloppy cross references more and less generically

michaelweiser · michaelweiser · commit 5a5887ae4ca5 · 2020-07-01T11:40:15.000+02:00
A previous commit adjusted readSymbol() to skip leading whitespace in
order to avoid errors with sloppy cross references. This did not fix
handling of literals such as numbers and booleans in readObject()
because they're not accessed using readSymbol(). Also, adjusting the
very low-level readSymbol() function might generate fallout.

So instead, this change moves the skipping of leading whitespace into
readObject() so that it affects all types of referenced objects equally
but not all symbol lookups altogether.

Signed-off-by: Michael Weiser &lt;michael.weiser@gmx.de&gt;
diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
@@ -7857,6 +7857,11 @@ def readObject(self, content, objectType=None, forceMode=False, looseMode=False)
         pdfObject = None
         oldCounter = self.charCounter
         self.charCounter = 0
+        # skip leading whitespace in case of sloppy reference offsets
+        self.readSpaces(content)
+        if self.charCounter > 0:
+            content = content[self.charCounter:]
+            self.charCounter = 0
         if objectType is not None:
             objectsTypeArray = [self.delimiters[i][2] for i in range(len(self.delimiters))]
             index = objectsTypeArray.index(objectType)
@@ -8011,7 +8016,6 @@ def readSymbol(self, string, symbol, deleteSpaces=True):
             errorMessage = 'EOF while looking for symbol "'+symbol+'"'
             pdfFile.addError(errorMessage)
             return (-1, errorMessage)
-        self.readSpaces(string)
         while string[self.charCounter] == '%':
             ret = self.readUntilEndOfLine(string)
             if ret[0] == -1: