import Tesseract from 'tesseract.js'; import pdfimage from 'pdf-image';
var PDFImage = pdfimage.PDFImage; var pdfImage = new PDFImage(pdfFilePath, { convertOptions: { "-density": "196" }}); function recursiveFunction(pageIndex, numberOfPages) { if (pageIndex >= numberOfPages) { return; } pdfImage.convertPage(pageIndex).then(function (pageImage) { Tesseract.recognize(pageImage, 'eng').then(({ data: { text } }) => { console.log(text); /* perform other synchronous processing with the text before moving to the next page... */ recursiveFunction(pageIndex + 1, numberOfPages); }); }); } recursiveFunction(0, numberOfPages); Additionally, if the processing of each page does not rely on the results of previous pages, I would not go the synchronous route. If you need the results ordered as if they had been produced synchronously, you can simulate this with an array in which each page's result is stored at that page's index. See below: import Tesseract from 'tesseract.js'; import pdfimage from 'pdf-image'; var PDFImage = pdfimage.PDFImage; var pdfImage = new PDFImage(pdfFilePath, { convertOptions: { "-density": "196" }}); var acc = []; for(var pageIndex = 0; pageIndex < numberOfPages; pageIndex++) { let currentPageIndex = pageIndex; /* Make sure you do this!! Node closures will reference the wrong pageIndex otherwise! */ pdfImage.convertPage(pageIndex).then(function (pageImage) { Tesseract.recognize(pageImage, 'eng').then(({ data: { text } }) => { console.log(text); /* perform other processing with the text before moving to the next page... */ /* Save the result as a variable. */ acc[currentPageIndex] = text; }); }); } The end result of the above code is that all pages are processed asynchronously, but when everything has finished your results will be saved in an array, ordered from the first page to the last. Keep in mind that this version starts the conversion of every page at once, so for large documents it can exhaust memory just like the original loop; the sequential version above avoids that. Btw I'm not a node expert, just a generalist programmer, honestly not even that into node... so someone else may have a better idea how to do this idiomatically. 
On Monday, June 8, 2020 at 2:25:05 AM UTC-7, Matthew Hamilton wrote: > > I'm relatively new to JavaScript programming for node.js, and I've been > reading about this for 5 hours and cannot wrap my head around it, so here I > am... I am trying to get the text from a PDF for searching. I need page > numbers, line numbers, and character positions of the results. It appears > that pdf.js cannot keep line breaks at the very least. I wonder if it will > keep multiple sequential spaces, but the line breaks are a dealbreaker, so > I've moved on. Now I'm using pdf-image to convert the pdf document to a png > for each page. Then I want to use tesseract.js to run OCR on the png files > to get the text as it appears in the pdf including line breaks and extra > spaces. The problem is if the pdf document is more than 5-10 pages, then > execution kills my laptop. The process of converting the pdf to png's > consumes over 12GB of RAM and never finishes to even move on to the OCR > which has to be worse. The average number of pages in the pdf documents I > am processing is 300-500, so I have to batch process. The problem I have is > that pdf-image and tesseract.js both use promises for async processing. > It's really the async that's killing my laptop. I just want to get the > number of pages, loop over each page one at a time, convert it to png, then > perform the OCR, then finish some other synchronous processing before > moving on to the next page. 
The code I have right now that doesn't work is: > > import Tesseract from 'tesseract.js'; > import pdfimage from 'pdf-image'; > > var PDFImage = pdfimage.PDFImage; > var pdfImage = new PDFImage(pdfFilePath, { convertOptions: { "-density": > "196" }}); > > for(var pageIndex = 0; pageIndex < numberOfPages; pageIndex++) > { > pdfImage.convertPage(pageIndex).then(function (pageImage) { > Tesseract.recognize(pageImage, 'eng').then(({ data: { text } }) => { > console.log(text); > //perform other synchronous processing with the text before moving to the > next page... > }); > }); > } > > I can process one page without the for loop in about 200ms, but when I try > to loop everything gets messed up. I'm not sure how to proceed with > processing these promises synchronously. I know promises are supposed to be > more efficient, but sometimes order is important and resources are limited > for unchecked parallel processing... Like file type conversion and OCR for > 300-500 page documents. > > As a nice-to-have, I would also like to figure out how to load and > initialize tesseract.js once and then just call the recognize method. I > have tried the following code to achieve that, but I think it loads and > initializes then reloads and initializes when it calls the recognize > method. Controlling that behavior may not be possible, but I figured I'd > throw it out there. > > (async () => > { > await Tesseract.load(); > await Tesseract.loadLanguage('eng'); > await Tesseract.initialize('eng'); > }); > > //then perform the convertPage then recognize as shown in the first code > block above... > > Thank you! > -- Job board: http://jobs.nodejs.org/ New group rules: https://gist.github.com/othiym23/9886289#file-moderation-policy-md Old group rules: https://github.com/joyent/node/wiki/Mailing-List-Posting-Guidelines --- You received this message because you are subscribed to the Google Groups "nodejs" group. 
To unsubscribe from this group and stop receiving emails from it, send an email to nodejs+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/nodejs/d973eb4b-e3e8-4394-bf82-f49e73efca70o%40googlegroups.com.