I am trying to search the text of a PDF using https://www.npmjs.com/package/pdfjs-dist-for-node.
My code looks like this:
gettext: function(){
var data = '../static/example.pdf';
return pdfjs.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n")
});
}).then(function(pages){
console.log(pages)
});
}
This seems to work, but it skips parts of the text. Specifically, it skips whatever I can't highlight with the mouse in the original pdf doc. Is there a way to get pdf.js to pick up on this data?
via David J.
No comments:
Post a Comment