Thursday, 20 April 2017

Is nesting async calls inside async calls desirable? (Node.js)

I am playing with Node.js and I have created a simple script that uploads files from a directory to a server:

var request = require('request');
var file = require('file');
var fs = require('fs');
var path = require('path');


// Script version. Declared with `var`: a bare assignment would create an
// implicit global and throws a ReferenceError under strict mode.
var VERSION = '0.1';
// Path of the JSON configuration file, resolved relative to this script.
var CONFIG_FILE = path.join(__dirname, 'etc', 'sender.conf.json');


// Load and parse the configuration synchronously at start-up. A missing or
// malformed file makes readFileSync / JSON.parse throw right here.
var config = JSON.parse(
  fs.readFileSync(CONFIG_FILE, 'utf8')
);

// Build the data root: start at this script's directory and append each
// entry of config['data_dir'] in order.
var DATA_DIR = config['data_dir'].reduce(
  function(root, segment) {
    return path.join(root, segment);
  },
  __dirname
);


// Announce the root directory, then start walking it. Each callback
// invocation either logs the reported error or forwards the directory path
// and its file list to sendFiles.
console.log('sending data from root directory: ' + DATA_DIR);
file.walk(DATA_DIR, function onDirectory(err, dir_path, dirs, files) {
  if(!err) {
    sendFiles(dir_path, files);
    return;
  }
  return console.error(err);
});

/**
 * Starts an upload for every entry of `files` that ends in '.meta'.
 *
 * The suffix is cut off and only the base name of the remainder is passed
 * to sendFile, so entries may carry leading directory components.
 *
 * @param {string} dir_path  directory handed through to sendFile unchanged
 * @param {string[]} files   file names or paths to inspect
 */
function sendFiles(dir_path, files)
{
  var suffix = '.meta';

  // True for entries whose last characters are exactly the suffix.
  function isMetaFile(entry) {
    return entry.substr(-suffix.length) === suffix;
  }

  // Strip the suffix, reduce to the base name, and hand over to sendFile.
  function upload(entry) {
    var stem = entry.slice(0, -suffix.length);
    sendFile(dir_path, path.basename(stem));
  }

  files.filter(isMetaFile).forEach(upload);
}

/**
 * Reads `<name>.meta` from `dir_path` asynchronously and, once it has been
 * read and parsed as JSON, passes the metadata together with a read stream
 * over `<name>.data` to sendData.
 *
 * Read errors and JSON parse errors are logged with console.error and the
 * file is skipped; neither aborts the processing of other files.
 *
 * @param {string} dir_path  directory containing the .meta/.data pair
 * @param {string} name      base name shared by both files (no extension)
 */
function sendFile(dir_path, name)
{
  console.log("reading file start: " + dir_path + "/" + name);
  fs.readFile(
    path.join(dir_path, name + '.meta'),
    function(err, raw_meta) {
      if(err) {
        return console.error(err);
      }
      console.log("reading file done: " + dir_path + "/" + name);

      var meta;
      try {
        meta = JSON.parse(raw_meta);
      }
      catch(parse_err) {
        // An exception thrown from this callback would be uncaught and
        // would take the whole process down because of one bad file.
        return console.error(parse_err);
      }

      sendData(
        name,
        meta,
        fs.createReadStream(path.join(dir_path, name + '.data'))
      );
    }
  );
  // Logged right after the read was requested, i.e. before the callback runs.
  console.log("reading file async: " + dir_path + "/" + name);
}

/**
 * Posts one metadata/data pair to config['sink_url'] as a form with two
 * fields: 'meta' (the JSON-serialised metadata) and 'data' (the stream).
 *
 * `meta` is modified in place: its 'source' key is set to
 * config['data_source'] before it is serialised.
 *
 * On completion the name, the metadata and the response body are logged.
 * Request errors go to console.error, like every other error in this file.
 *
 * @param {string} name         identifier of the item, used for logging only
 * @param {Object} meta         parsed metadata; receives a 'source' key
 * @param {*}      data_stream  value appended as the 'data' form field
 */
function sendData(name, meta, data_stream)
{
  meta['source'] = config['data_source'];

  var req = request.post(
    config['sink_url'],
    function(err, res, body) {
      if(err) {
        console.error(err);
        return;
      }
      console.log(name);
      console.log(meta);
      console.log(body);
    }
  );
  var form = req.form();

  form.append(
    'meta',
    JSON.stringify(meta),
    {
      contentType: 'application/x-www-form-urlencoded'
    }
  );

  form.append(
    'data',
    data_stream
  );
}

It works fine when run with only a few files. But when I run it on a directory with lots of files, it chokes. This is because it keeps creating a huge number of file-reading tasks, but never gets to actually doing the reading (because there are too many files). This can be observed in the output:

sending data from root directory: .../data
reading file start: .../data/ac/ad/acigisu-adruire-sabeveab-ozaniaru-fugeef-wemathu-lubesoraf-lojoepe
reading file async: .../data/ac/ad/acigisu-adruire-sabeveab-ozaniaru-fugeef-wemathu-lubesoraf-lojoepe
reading file start: .../data/ac/ab/acodug-abueba-alizacod-ugvut-nucom
reading file async: .../data/ac/ab/acodug-abueba-alizacod-ugvut-nucom
reading file start: .../data/ac/as/acigisu-asetufvub-liwi-ru-mitdawej-vekof
reading file async: .../data/ac/as/acigisu-asetufvub-liwi-ru-mitdawej-vekof
reading file start: .../data/ac/av/ace-avhad-bop-rujan-pehwopa
reading file async: .../data/ac/av/ace-avhad-bop-rujan-pehwopa
...

For each file, the console output "reading file start" is produced immediately before the call to fs.readFile, and "reading file async" is produced immediately after the asynchronous read has been requested. But there is no "reading file done" message even when I let it run for a long time, which means that the reading of any file has probably never actually been started (those files are on the order of hundreds of bytes, so once started, those reads would probably finish in a single go).

This leads me to the following thought process. Async calls in Node.js are used because the event loop itself is single-threaded and we do not want to block it. However, once this requirement is satisfied, does it make any sense to nest further async calls inside async calls that are themselves nested in async calls, etc.? Would it serve any particular purpose? Moreover, would it not be an actual pessimisation of the code, due to scheduling overhead that is not really needed and could be completely avoided if the complete handling of a single file consisted of synchronous calls only?

Given the thought process above, my course of action would be to use the solution from this question:

  • asynchronously push the names of all files to async.queue
  • limit the number of parallel tasks by setting queue.concurrency
  • provide a file-upload handler that is completely synchronous, i.e. it synchronously reads the contents of the file and, after that is finished, synchronously sends a POST request to the server

This is my very first attempt at using Node.js and/or JavaScript, so it is quite possible that I am completely wrong (note that e.g. the sync-request package makes it very clear that synchronous calls are not desirable, which contradicts my thought process above - the question is why). Any comments on the validity of the above thought process, as well as on the viability of the proposed solution and possible alternatives to it, would be very much appreciated.



via mareq

No comments:

Post a Comment