Thursday 1 June 2017

async.mapLimit only doing first batch

Context

I'm using Tabula to parse pdf tables. I have a lot of pdfs and those pdfs have varying numbers of pages, anywhere from a couple to hundreds. I'm trying to point my node.js program to an input folder and have it give me csvs of all the tables defined by regions I've determined.

Because of memory limitations, I can't just loop through the files, loop through the pages, and have tabula parse each table asynchronously. So I looked around for how to do batches of tasks and async.mapLimit kept coming up. It seems like it is supposed to do what I require, but for whatever reason, I'm only getting it to finish the first batch of size limit and then it "finishes" without going to the subsequent batches and performing the same task.

Code

const tabula = require('tabula-js');
require('pdfjs-dist');
const fs = require('fs');
const path = require('path');
const async = require('async');

const region1 = "5.94,121.275,35.64,788.535";
const region2 = "38.61,159.885,85.14,788.535";
const region3 = "64.35,9.405,149.49,157.905";
const region4 = "87.12,159.885,146.52,789.525";
const region5 = "148.5,186.615,314.82,791.505";
const region6 = "151.47,7.425,313.83,181.665";
const region7 = "318.78,6.435,383.13,788.535";
const region8 = "386.1,10.395,479.16,216.315";
const region9 = "385.11,218.295,595.98,374.715";
const region10 = "386.1,377.685,481.14,791.505";
const region11 = "481.14,10.395,595.98,214.335";
const region12 = "483.12,376.695,596.97,778.635";

const handleTableParse = (err, data, title) => {
    if (err) {
        console.log(err);
        return;
    }

    if (!fs.existsSync('./output/')) {
        fs.mkdirSync('./output/');
    }

    if (fs.existsSync(`./output/${title}.csv`)) {
        fs.unlinkSync(`./output/${title}.csv`);
    }

    data.map(line => {
        fs.appendFileSync(`./output/${title}.csv`, `${line}\n`);
    });

}

const parseTablesInPDF = (pdf, numPages) => {

    const pageNumbers = Array.from(new Array(numPages), (x, i) => i + 1);

    const ext = path.extname(pdf);
    const filename = path.basename(pdf, ext)
        .split(' ')
        .join('_');

    async.mapLimit(pageNumbers, 5, pageNumber => {
    // pageNumbers.forEach(pageNumber => {
        const region1Data = tabula(pdf, { pages: `${pageNumber}`, area: region1 });
        region1Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region1_page${pageNumber}`));

        const region2Data = tabula(pdf, { pages: `${pageNumber}`, area: region2 });
        region2Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region2_page${pageNumber}`));

        const region3Data = tabula(pdf, { pages: `${pageNumber}`, area: region3 });
        region3Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region3_page${pageNumber}`));

        const regino4Data = tabula(pdf, { pages: `${pageNumber}`, area: region4 });
        region4Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region4_page${pageNumber}`));

        const region5Data = tabula(pdf, { pages: `${pageNumber}`, area: region5 });
        region5Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region5_page${pageNumber}`));

        const region6Data = tabula(pdf, { pages: `${pageNumber}`, area: region6 });
        region6Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region6_page${pageNumber}`));

        const region7Data = tabula(pdf, { pages: `${pageNumber}`, area: region7 });
        region7Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region7_page${pageNumber}`));

        const region8Data = tabula(pdf, { pages: `${pageNumber}`, area: region8 });
        region8Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region8_page${pageNumber}`));

        const region9Data = tabula(pdf, { pages: `${pageNumber}`, area: region9 });
        region9Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region9_page${pageNumber}`));

        const region10Data = tabula(pdf, { pages: `${pageNumber}`, area: region10 });
        region10Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region10_page${pageNumber}`));

        const regino11Data = tabula(pdf, { pages: `${pageNumber}`, area: region11 });
        region11Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region11_page${pageNumber}`));

        const region12Data = tabula(pdf, { pages: `${pageNumber}`, area: region12 });
        region12Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region12_page${pageNumber}`));
    })
}

async.mapLimit(fs.readdirSync('./input/2016'), 5, file => {
    const data = new Uint8Array(fs.readFileSync(`./input/2016/${file}`));
    PDFJS.getDocument(data).then(document => {
        parseTablesInPDF(`./input/2016/${file}`, document.numPages);
    })
}, () => console.log('DONE!'));

Problem

At the end of the run, I only get 5 pages of each pdf when I know that several have over 100 pages in them. It seems that every pdf file is being processed, but for whatever reason, I only get 5 pages for each.

This is my first time using some of these libraries. What am I doing wrong? Thanks in advance!



via Jake Smith

No comments:

Post a Comment