Monday, 17 April 2017

Memory leaks nodejs scraper

How i can do for make my scraper running more longer ... After approximately 20.000 request there is a problem of out of memory :(

I use a vps of 8GB RAM ( node --expose-gc --max-old-space-size=7468 myscript.js )

Logs

 2224509 ms: Mark-sweep 6017.8 (7502.1) -> 6017.8 (7502.1) MB, 267.3 / 0.0 ms [a                                                                                                                llocation failure] [scavenge might not succeed].
 2224770 ms: Mark-sweep 6017.8 (7502.1) -> 6017.7 (7502.1) MB, 261.7 / 0.0 ms [a                                                                                                                llocation failure] [scavenge might not succeed].
 2225014 ms: Mark-sweep 6017.7 (7502.1) -> 6026.3 (7486.1) MB, 243.2 / 0.0 ms [l                                                                                                                ast resort gc].
 2225295 ms: Mark-sweep 6026.3 (7486.1) -> 6034.7 (7486.1) MB, 280.2 / 0.0 ms [l                                                                                                                ast resort gc].


<--- JS stacktrace --->

==== JS stack trace =========================================

Security context: 0x360d13dcfb39 <JS Object>
    1: slowToString [buffer.js:~421] [pc=0x313b461fda3] (this=0xb64408e3f61 <an                                                                                                                 Uint8Array with map 0x2246ccf06989>,encoding=0x360d13d04381 <undefined>,start=0x                                                                                                                360d13d04381 <undefined>,end=0x360d13d04381 <undefined>)
    2: arguments adaptor frame: 1->3
    3: toString [buffer.js:~488] [pc=0x313ae183641] (this=0xb64408e3f61 <an Uint                                                                                                                8Array with map 0x2246ccf06989>)
    4: arguments adaptor frame...

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memo                                                                                                                ry
 1: node::Abort() [node]
 2: 0xf8cf13 [node]
 3: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [node]
 4: v8::internal::Factory::NewRawTwoByteString(int, v8::internal::PretenureFlag)                                                                                                                 [node]
 5: v8::internal::Factory::NewStringFromUtf8(v8::internal::Vector<char const>, v                                                                                                                8::internal::PretenureFlag) [node]
 6: v8::String::NewFromUtf8(v8::Isolate*, char const*, v8::String::NewStringType                                                                                                                , int) [node]
 7: node::StringBytes::Encode(v8::Isolate*, char const*, unsigned long, node::en                                                                                                                coding) [node]
 8: void node::Buffer::StringSlice<(node::encoding)1>(v8::FunctionCallbackInfo<v                                                                                                                8::Value> const&) [node]
 9: 0x313ae17ecb8
Aborted

My code

    var request = require('request');
var async = require('async');
var mysql = require('mysql');
var winston = require('winston');
var fs = require('fs');
var htmlparser = require('htmlparser2');

var logger = new winston.Logger({
    level: 'info',
    transports: [
        new(winston.transports.Console)(),
        new(winston.transports.File)({
            filename: 'errors.log'
        })
    ]
});

process.on('uncaughtException', function(err) {
    logger.log('info', err);
});

// On utilise socket.io uniquement coté client 
var socket = require('socket.io-client')('http://127.0.0.1:3000', {
    reconnection: true,
    reconnectionDelay: 5000,
    reconnectionAttempts: 10
});

var pages = [];
var _count = 0;
var source = 'extratorrent';

var elasticsearch = require('elasticsearch');

var client = new elasticsearch.Client({
    host: '99.115.16.67:9200'
   // ,log: 'trace'
});

socket.on('connect', function() {

    var q = async.queue(function(task, done) {
        request.get(task.url, function(error, response, body) {

            console.log('Chargement de la page  [' + task.page_id + '] , récupération des liens... ');

            var parser = new htmlparser.Parser({
                onopentag: function(name, attribs) {
                    if (name === 'a') {
                        if (attribs.href.indexOf('torrent/') !== -1) {
                            var link = 'http://extratorrent.cc' + attribs.href;
                            pages.push(link);
                        }
                    }
                }
            }, {
                decodeEntities: true
            });
            parser.write(body.substr(body.indexOf('<table class="tl">'),body.length - body.indexOf('Recent Searches')));
            parser.end();
            console.log(pages.length + ' Liens chargés');
            done();
        });
    }, 12);

    q.drain = function() {

        // On créer une nouvelle queue mais cette fois pour explorer les URL
        var call = 0;

        q = async.queue(function(task, done) {

            request.get(task.url, {
                timeout: 4000
            }, function(error, response, body) {

                try {
                    var indexof = body.indexOf('Info hash:');
                    var torrent_hash = body.substr((indexof + 38),40);

                    indexof = body.indexOf('class="leech">');
                    var leech = body.substr((indexof + 14),20);
                    leech = parseInt(leech.replace ( /[^\d.]/g, '' ),10);
                    if(leech == '' || leech == null) leech = 0;

                    indexof = body.indexOf('class="seed">');
                    var seed = body.substr((indexof + 14),20);
                    seed = parseInt(seed.replace ( /[^\d.]/g, '' ),10);
                    if(seed == '' || seed == null) seed = 0;

                    console.log('seed => ' + seed + ' leech => ' + leech);
                } catch(err) {
                    pages.remove(task.url);
                    done();
                }
                    // index data here

                    client.index({
                        index: 'torrents',
                        type: 'items',
                        id: torrent_hash,
                        body: {
                            'name': torrent_hash,
                            'seed': seed,
                            'leech': leech,
                            'source': source
                        }
                    }, function(err, resp, status) {
                        call += 1;
                        console.log('---------------------------------------------------------------------------------------------> TORRENT : '  + call + ' | PAGE : ' + (1 + Math.round(call / 50)) );
                        socket.emit('new_torrent', torrent_hash, function() {
                            console.log('torrent ' + torrent_hash + ' envoyé');
                        });

                    });

                pages.remove(task.url);
                console.log('---------------------------------------------------------------------------------------------> LIENS RESTANT : ' + pages.length);                 
                done();
            });

            if (!global.gc) {
                console.error('Use "node --expose-gc script.js" to test with gc.')
                return;
            } else {
                global.gc();
            }

        }, 10);

        q.drain = function() {
            //connection.end();
            setTimeout(function(){
                console.log('Tous les torrents ont été soumis ! ')
            },6000);
        }


        pages.forEach(function(page) {
            q.push({
                url: page
            });
        });

    }


    for (var i = 1; i <= 500; i++) {
        q.push({
            page_id: i,
            url: 'http://extratorrentlive.com/category/4/Movies+Torrents.html?page=' + i + '&srt=added&order=desc&pp=50'
        });
    }

});

Array.prototype.remove = function() {
    var what, a = arguments, L = a.length, ax;
    while (L && this.length) {
        what = a[--L];
        while ((ax = this.indexOf(what)) !== -1) {
            this.splice(ax, 1);
        }
    }
    return this;
};



via LeSpotted44

No comments:

Post a Comment