[Pkg-javascript-commits] [pdf.js] 57/115: [api-minor] Add a parameter to `PDFPageProxy_getTextContent` that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
David Prévot
taffit at moszumanska.debian.org
Wed Dec 16 20:03:15 UTC 2015
This is an automated email from the git hooks/post-receive script.
taffit pushed a commit to branch master
in repository pdf.js.
commit 6dfe53b976c10e957f6f5e611a14aa7ebf4a1aed
Author: Jonas Jenwald <jonas.jenwald at gmail.com>
Date: Mon Nov 23 16:57:43 2015 +0100
[api-minor] Add a parameter to `PDFPageProxy_getTextContent` that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces.
When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`).
Fixes 6612.
---
src/core/core.js | 7 +++++--
src/core/evaluator.js | 27 +++++++++++++++++++++------
src/core/worker.js | 4 +++-
src/display/api.js | 16 ++++++++++++++--
test/driver.js | 10 ++++++----
test/pdfs/.gitignore | 1 +
test/pdfs/issue6612.pdf | Bin 0 -> 7067 bytes
test/test_manifest.json | 7 +++++++
test/unit/api_spec.js | 20 +++++++++++++++-----
web/pdf_find_controller.js | 1 -
web/pdf_page_view.js | 2 +-
web/pdf_viewer.js | 4 ++--
12 files changed, 75 insertions(+), 24 deletions(-)
diff --git a/src/core/core.js b/src/core/core.js
index 984c5e9..52ac6d5 100644
--- a/src/core/core.js
+++ b/src/core/core.js
@@ -218,7 +218,8 @@ var Page = (function PageClosure() {
});
},
- extractTextContent: function Page_extractTextContent(task) {
+ extractTextContent: function Page_extractTextContent(task,
+ normalizeWhitespace) {
var handler = {
on: function nullHandlerOn() {},
send: function nullHandlerSend() {}
@@ -248,7 +249,9 @@ var Page = (function PageClosure() {
return partialEvaluator.getTextContent(contentStream,
task,
- self.resources);
+ self.resources,
+ /* stateManager = */ null,
+ normalizeWhitespace);
});
},
diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 7e80ecf..2008775 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
});
},
- getTextContent: function PartialEvaluator_getTextContent(stream, task,
- resources,
- stateManager) {
+ getTextContent:
+ function PartialEvaluator_getTextContent(stream, task, resources,
+ stateManager,
+ normalizeWhitespace) {
stateManager = (stateManager || new StateManager(new TextState()));
+ var WhitespaceRegexp = /\s/g;
+
var textContent = {
items: [],
styles: Object.create(null)
@@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return textContentItem;
}
+ function replaceWhitespace(str) {
+ // Replaces all whitespace with standard spaces (0x20), to avoid
+ // alignment issues between the textLayer and the canvas if the text
+ // contains e.g. tabs (fixes issue6612.pdf).
+ var i = 0, ii = str.length, code;
+ while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) {
+ i++;
+ }
+ return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str);
+ }
+
function runBidiTransform(textChunk) {
var str = textChunk.str.join('');
var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
return {
- str: bidiResult.str,
+ str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
+ bidiResult.str),
dir: bidiResult.dir,
width: textChunk.width,
height: textChunk.height,
@@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
return self.getTextContent(xobj, task,
- xobj.dict.get('Resources') || resources, stateManager).
- then(function (formTextContent) {
+ xobj.dict.get('Resources') || resources, stateManager,
+ normalizeWhitespace).then(function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles);
stateManager.restore();
diff --git a/src/core/worker.js b/src/core/worker.js
index 08fa189..c456340 100644
--- a/src/core/worker.js
+++ b/src/core/worker.js
@@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
handler.on('GetTextContent', function wphExtractText(data) {
var pageIndex = data.pageIndex;
+ var normalizeWhitespace = data.normalizeWhitespace;
return pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task);
var pageNum = pageIndex + 1;
var start = Date.now();
- return page.extractTextContent(task).then(function(textContent) {
+ return page.extractTextContent(task, normalizeWhitespace).then(
+ function(textContent) {
finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' +
(Date.now() - start) + 'ms');
diff --git a/src/display/api.js b/src/display/api.js
index e3aafa0..1b8dce1 100644
--- a/src/display/api.js
+++ b/src/display/api.js
@@ -709,6 +709,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
})();
/**
+ * Page getTextContent parameters.
+ *
+ * @typedef {Object} getTextContentParameters
+ * @property {boolean} normalizeWhitespace - replaces all occurrences of
+ * whitespace with standard spaces (0x20). The default value is `false`.
+ */
+
+/**
* Page text content.
*
* @typedef {Object} TextContent
@@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
},
/**
+ * @param {getTextContentParameters} params - getTextContent parameters.
* @return {Promise} That is resolved with a {@link TextContent}
* object that represents the page text content.
*/
- getTextContent: function PDFPageProxy_getTextContent() {
+ getTextContent: function PDFPageProxy_getTextContent(params) {
+ var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
+
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
- pageIndex: this.pageNumber - 1
+ pageIndex: this.pageNumber - 1,
+ normalizeWhitespace: normalizeWhitespace,
});
},
diff --git a/test/driver.js b/test/driver.js
index c41ec70..a61084e 100644
--- a/test/driver.js
+++ b/test/driver.js
@@ -334,10 +334,12 @@ var Driver = (function DriverClosure() {
textLayerContext.clearRect(0, 0,
textLayerCanvas.width, textLayerCanvas.height);
// The text builder will draw its content on the test canvas
- initPromise = page.getTextContent().then(function(textContent) {
- return rasterizeTextLayer(textLayerContext, viewport,
- textContent);
- });
+ initPromise =
+ page.getTextContent({ normalizeWhitespace: true }).then(
+ function(textContent) {
+ return rasterizeTextLayer(textLayerContext, viewport,
+ textContent);
+ });
} else {
textLayerCanvas = null;
initPromise = Promise.resolve();
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 62a7a80..38a33eb 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -49,6 +49,7 @@
!issue5280.pdf
!issue5677.pdf
!issue5954.pdf
+!issue6612.pdf
!alphatrans.pdf
!devicen.pdf
!cmykjpeg.pdf
diff --git a/test/pdfs/issue6612.pdf b/test/pdfs/issue6612.pdf
new file mode 100644
index 0000000..c9543f1
Binary files /dev/null and b/test/pdfs/issue6612.pdf differ
diff --git a/test/test_manifest.json b/test/test_manifest.json
index 1bb299c..178e2f0 100644
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -1271,6 +1271,13 @@
"link": false,
"type": "eq"
},
+ { "id": "issue6612-text",
+ "file": "pdfs/issue6612.pdf",
+ "md5": "657f33236496916597cd70ef1222509a",
+ "rounds": 1,
+ "link": false,
+ "type": "text"
+ },
{ "id": "zerowidthline",
"file": "pdfs/zerowidthline.pdf",
"md5": "295d26e61a85635433f8e4b768953f60",
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 9749942..714166a 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -482,11 +482,21 @@ describe('api', function() {
});
});
it('gets text content', function () {
- var promise = page.getTextContent();
- waitsForPromiseResolved(promise, function (data) {
- expect(!!data.items).toEqual(true);
- expect(data.items.length).toEqual(7);
- expect(!!data.styles).toEqual(true);
+ var defaultPromise = page.getTextContent();
+ var normalizeWhitespacePromise = page.getTextContent({
+ normalizeWhitespace: true });
+
+ var promises = [
+ defaultPromise,
+ normalizeWhitespacePromise
+ ];
+ waitsForPromiseResolved(Promise.all(promises), function (data) {
+ expect(!!data[0].items).toEqual(true);
+ expect(data[0].items.length).toEqual(7);
+ expect(!!data[0].styles).toEqual(true);
+
+ // A simple check that ensures the two `textContent` objects match.
+ expect(JSON.stringify(data[0])).toEqual(JSON.stringify(data[1]));
});
});
it('gets operator list', function() {
diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js
index 6f264a8..183db2d 100644
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@@ -66,7 +66,6 @@ var PDFFindController = (function PDFFindControllerClosure() {
'\u00BC': '1/4', // Vulgar fraction one quarter
'\u00BD': '1/2', // Vulgar fraction one half
'\u00BE': '3/4', // Vulgar fraction three quarters
- '\u00A0': ' ' // No-break space
};
this.findBar = options.findBar || null;
diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js
index bfa5875..440b31f 100644
--- a/web/pdf_page_view.js
+++ b/web/pdf_page_view.js
@@ -489,7 +489,7 @@ var PDFPageView = (function PDFPageViewClosure() {
function pdfPageRenderCallback() {
pageViewDrawCallback(null);
if (textLayer) {
- self.pdfPage.getTextContent().then(
+ self.pdfPage.getTextContent({ normalizeWhitespace: true }).then(
function textContentResolved(textContent) {
textLayer.setTextContent(textContent);
textLayer.render(TEXT_LAYER_RENDER_DELAY);
diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js
index e32d9c6..a9c3d3d 100644
--- a/web/pdf_viewer.js
+++ b/web/pdf_viewer.js
@@ -471,7 +471,7 @@ var PDFViewer = (function pdfViewer() {
if (!this.pdfDocument) {
return;
}
-
+
var pageView = this._pages[pageNumber - 1];
if (this.isInPresentationMode) {
@@ -729,7 +729,7 @@ var PDFViewer = (function pdfViewer() {
getPageTextContent: function (pageIndex) {
return this.pdfDocument.getPage(pageIndex + 1).then(function (page) {
- return page.getTextContent();
+ return page.getTextContent({ normalizeWhitespace: true });
});
},
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-javascript/pdf.js.git
More information about the Pkg-javascript-commits
mailing list