Monday, 17 April 2017

reading/writing large files with PapaParse/BabyParse

I have a large CSV file (~500 MB) that I want to convert to JSON using BabyParse (the Node version of PapaParse). With smaller files I can read the CSV into a string and then pass the string to parse. However, a 500 MB file is too big to be read into a string in this way.
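
For reference, the small-file version looks roughly like this (a minimal sketch; the filenames are just placeholders):

var Baby = require('babyparse');
var fs = require('fs');

// Works for small files: read the whole CSV into memory, parse it, write JSON out
var content = fs.readFileSync('smalltest.csv', { encoding: 'utf8' });
var parsed = Baby.parse(content, { fastMode: false });
fs.writeFileSync('smalltest.json', JSON.stringify(parsed.data));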

I have a workaround that reads the CSV file as a stream line by line, but it's horrendously slow (see below). Can someone tell me a faster way to work with large CSV files in PapaParse/BabyParse?

var Baby = require('babyparse');
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var file = '500mbbigtest.csv';
//var content = fs.readFileSync(file, { encoding: 'binary' }); DOESN'T WORK

var instream = fs.createReadStream(file);
var outstream = new stream();
var rl = readline.createInterface(instream, outstream);

rl.on('line', function(line) {
  // parse each line as its own one-row CSV, then append the JSON to the output file
  var parsed = Baby.parse(line, { fastMode: false });
  var rows = JSON.stringify(parsed.data);
  fs.appendFileSync('blahblahblah.json', rows);
});
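
For comparison, here is a rough sketch of how this might look using PapaParse's own stream support instead of readline. It assumes PapaParse 5+ (which replaced BabyParse and can parse a Node readable stream directly); the step callback and the write-stream output are my assumptions, not something I've benchmarked:

var Papa = require('papaparse');
var fs = require('fs');

var input = fs.createReadStream('500mbbigtest.csv');
var output = fs.createWriteStream('blahblahblah.json');

Papa.parse(input, {
  fastMode: false,
  // step is called for each parsed row, so the whole file is never held in memory
  step: function(results) {
    // write each row as one JSON line instead of reopening the file
    // with appendFileSync on every row
    output.write(JSON.stringify(results.data) + '\n');
  },
  complete: function() {
    output.end();
  },
  error: function(err) {
    console.error(err);
  }
});

Writing through a single write stream (rather than appendFileSync per line) avoids reopening the output file for every row, which is likely a big part of the slowness.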



via Adam
