[Pkg-javascript-commits] [pdf.js] 57/115: [api-minor] Add a parameter to `PDFPageProxy_getTextContent` that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)

David Prévot taffit at moszumanska.debian.org
Wed Dec 16 20:03:15 UTC 2015


This is an automated email from the git hooks/post-receive script.

taffit pushed a commit to branch master
in repository pdf.js.

commit 6dfe53b976c10e957f6f5e611a14aa7ebf4a1aed
Author: Jonas Jenwald <jonas.jenwald at gmail.com>
Date:   Mon Nov 23 16:57:43 2015 +0100

    [api-minor] Add a parameter to `PDFPageProxy_getTextContent` that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
    
    This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces.
    
    When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`).
    
    Fixes 6612.
---
 src/core/core.js           |   7 +++++--
 src/core/evaluator.js      |  27 +++++++++++++++++++++------
 src/core/worker.js         |   4 +++-
 src/display/api.js         |  16 ++++++++++++++--
 test/driver.js             |  10 ++++++----
 test/pdfs/.gitignore       |   1 +
 test/pdfs/issue6612.pdf    | Bin 0 -> 7067 bytes
 test/test_manifest.json    |   7 +++++++
 test/unit/api_spec.js      |  20 +++++++++++++++-----
 web/pdf_find_controller.js |   1 -
 web/pdf_page_view.js       |   2 +-
 web/pdf_viewer.js          |   4 ++--
 12 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/src/core/core.js b/src/core/core.js
index 984c5e9..52ac6d5 100644
--- a/src/core/core.js
+++ b/src/core/core.js
@@ -218,7 +218,8 @@ var Page = (function PageClosure() {
       });
     },
 
-    extractTextContent: function Page_extractTextContent(task) {
+    extractTextContent: function Page_extractTextContent(task,
+                                                         normalizeWhitespace) {
       var handler = {
         on: function nullHandlerOn() {},
         send: function nullHandlerSend() {}
@@ -248,7 +249,9 @@ var Page = (function PageClosure() {
 
         return partialEvaluator.getTextContent(contentStream,
                                                task,
-                                               self.resources);
+                                               self.resources,
+                                               /* stateManager = */ null,
+                                               normalizeWhitespace);
       });
     },
 
diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 7e80ecf..2008775 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
       });
     },
 
-    getTextContent: function PartialEvaluator_getTextContent(stream, task,
-                                                             resources,
-                                                             stateManager) {
+    getTextContent:
+        function PartialEvaluator_getTextContent(stream, task, resources,
+                                                 stateManager,
+                                                 normalizeWhitespace) {
 
       stateManager = (stateManager || new StateManager(new TextState()));
 
+      var WhitespaceRegexp = /\s/g;
+
       var textContent = {
         items: [],
         styles: Object.create(null)
@@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
         return textContentItem;
       }
 
+      function replaceWhitespace(str) {
+        // Replaces all whitespaces with standard spaces (0x20), to avoid
+        // alignment issues between the textLayer and the canvas if the text
+        // contains e.g. tabs (fixes issue6612.pdf).
+        var i = 0, ii = str.length, code;
+        while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) {
+          i++;
+        }
+        return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str);
+      }
+
       function runBidiTransform(textChunk) {
         var str = textChunk.str.join('');
         var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
         return {
-          str: bidiResult.str,
+          str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
+                                      bidiResult.str),
           dir: bidiResult.dir,
           width: textChunk.width,
           height: textChunk.height,
@@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
               }
 
               return self.getTextContent(xobj, task,
-                xobj.dict.get('Resources') || resources, stateManager).
-                then(function (formTextContent) {
+                xobj.dict.get('Resources') || resources, stateManager,
+                normalizeWhitespace).then(function (formTextContent) {
                   Util.appendToArray(textContent.items, formTextContent.items);
                   Util.extendObj(textContent.styles, formTextContent.styles);
                   stateManager.restore();
diff --git a/src/core/worker.js b/src/core/worker.js
index 08fa189..c456340 100644
--- a/src/core/worker.js
+++ b/src/core/worker.js
@@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
 
     handler.on('GetTextContent', function wphExtractText(data) {
       var pageIndex = data.pageIndex;
+      var normalizeWhitespace = data.normalizeWhitespace;
       return pdfManager.getPage(pageIndex).then(function(page) {
         var task = new WorkerTask('GetTextContent: page ' + pageIndex);
         startWorkerTask(task);
         var pageNum = pageIndex + 1;
         var start = Date.now();
-        return page.extractTextContent(task).then(function(textContent) {
+        return page.extractTextContent(task, normalizeWhitespace).then(
+            function(textContent) {
           finishWorkerTask(task);
           info('text indexing: page=' + pageNum + ' - time=' +
                (Date.now() - start) + 'ms');
diff --git a/src/display/api.js b/src/display/api.js
index e3aafa0..1b8dce1 100644
--- a/src/display/api.js
+++ b/src/display/api.js
@@ -709,6 +709,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
 })();
 
 /**
+ * Page getTextContent parameters.
+ *
+ * @typedef {Object} getTextContentParameters
+ * @property {boolean} normalizeWhitespace - replaces all occurrences of
+ *   whitespace with standard spaces (0x20). The default value is `false`.
+ */
+
+/**
  * Page text content.
  *
  * @typedef {Object} TextContent
@@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
     },
 
     /**
+     * @param {getTextContentParameters} params - getTextContent parameters.
      * @return {Promise} That is resolved a {@link TextContent}
      * object that represent the page text content.
      */
-    getTextContent: function PDFPageProxy_getTextContent() {
+    getTextContent: function PDFPageProxy_getTextContent(params) {
+      var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
+
       return this.transport.messageHandler.sendWithPromise('GetTextContent', {
-        pageIndex: this.pageNumber - 1
+        pageIndex: this.pageNumber - 1,
+        normalizeWhitespace: normalizeWhitespace,
       });
     },
 
diff --git a/test/driver.js b/test/driver.js
index c41ec70..a61084e 100644
--- a/test/driver.js
+++ b/test/driver.js
@@ -334,10 +334,12 @@ var Driver = (function DriverClosure() {
               textLayerContext.clearRect(0, 0,
                 textLayerCanvas.width, textLayerCanvas.height);
               // The text builder will draw its content on the test canvas
-              initPromise = page.getTextContent().then(function(textContent) {
-                return rasterizeTextLayer(textLayerContext, viewport,
-                                          textContent);
-              });
+              initPromise =
+                page.getTextContent({ normalizeWhitespace: true }).then(
+                  function(textContent) {
+                    return rasterizeTextLayer(textLayerContext, viewport,
+                                              textContent);
+                });
             } else {
               textLayerCanvas = null;
               initPromise = Promise.resolve();
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 62a7a80..38a33eb 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -49,6 +49,7 @@
 !issue5280.pdf
 !issue5677.pdf
 !issue5954.pdf
+!issue6612.pdf
 !alphatrans.pdf
 !devicen.pdf
 !cmykjpeg.pdf
diff --git a/test/pdfs/issue6612.pdf b/test/pdfs/issue6612.pdf
new file mode 100644
index 0000000..c9543f1
Binary files /dev/null and b/test/pdfs/issue6612.pdf differ
diff --git a/test/test_manifest.json b/test/test_manifest.json
index 1bb299c..178e2f0 100644
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -1271,6 +1271,13 @@
        "link": false,
        "type": "eq"
     },
+    {  "id": "issue6612-text",
+       "file": "pdfs/issue6612.pdf",
+       "md5": "657f33236496916597cd70ef1222509a",
+       "rounds": 1,
+       "link": false,
+       "type": "text"
+    },
     {  "id": "zerowidthline",
       "file": "pdfs/zerowidthline.pdf",
       "md5": "295d26e61a85635433f8e4b768953f60",
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 9749942..714166a 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -482,11 +482,21 @@ describe('api', function() {
       });
     });
     it('gets text content', function () {
-      var promise = page.getTextContent();
-      waitsForPromiseResolved(promise, function (data) {
-        expect(!!data.items).toEqual(true);
-        expect(data.items.length).toEqual(7);
-        expect(!!data.styles).toEqual(true);
+      var defaultPromise = page.getTextContent();
+      var normalizeWhitespacePromise = page.getTextContent({
+        normalizeWhitespace: true });
+
+      var promises = [
+        defaultPromise,
+        normalizeWhitespacePromise
+      ];
+      waitsForPromiseResolved(Promise.all(promises), function (data) {
+        expect(!!data[0].items).toEqual(true);
+        expect(data[0].items.length).toEqual(7);
+        expect(!!data[0].styles).toEqual(true);
+
+        // A simple check that ensures the two `textContent` objects match.
+        expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1]));
       });
     });
     it('gets operator list', function() {
diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js
index 6f264a8..183db2d 100644
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@@ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() {
       '\u00BC': '1/4', // Vulgar fraction one quarter
       '\u00BD': '1/2', // Vulgar fraction one half
       '\u00BE': '3/4', // Vulgar fraction three quarters
-      '\u00A0': ' ' // No-break space
     };
     this.findBar = options.findBar || null;
 
diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js
index bfa5875..440b31f 100644
--- a/web/pdf_page_view.js
+++ b/web/pdf_page_view.js
@@ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() {
         function pdfPageRenderCallback() {
           pageViewDrawCallback(null);
           if (textLayer) {
-            self.pdfPage.getTextContent().then(
+            self.pdfPage.getTextContent({ normalizeWhitespace: true }).then(
               function textContentResolved(textContent) {
                 textLayer.setTextContent(textContent);
                 textLayer.render(TEXT_LAYER_RENDER_DELAY);
diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js
index e32d9c6..a9c3d3d 100644
--- a/web/pdf_viewer.js
+++ b/web/pdf_viewer.js
@@ -471,7 +471,7 @@ var PDFViewer = (function pdfViewer() {
       if (!this.pdfDocument) {
         return;
       }
-      
+
       var pageView = this._pages[pageNumber - 1];
 
       if (this.isInPresentationMode) {
@@ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() {
 
     getPageTextContent: function (pageIndex) {
       return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
-        return page.getTextContent();
+        return page.getTextContent({ normalizeWhitespace: true });
       });
     },
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-javascript/pdf.js.git



More information about the Pkg-javascript-commits mailing list