How can I make my scraper run for longer? After approximately 20,000 requests it dies with an out-of-memory error :(
I run it on a VPS with 8 GB of RAM ( node --expose-gc --max-old-space-size=7468 myscript.js )
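Before it crashes I would like to watch where the heap goes. A rough sketch of what I could add to the script to log heap growth (process.memoryUsage() is built into Node; the 30-second interval is an arbitrary choice of mine):

// rough sketch: print heap usage periodically to see how fast it grows
setInterval(function() {
    var mem = process.memoryUsage();
    console.log('heapUsed: ' + Math.round(mem.heapUsed / 1048576) + ' MB'
        + ' | heapTotal: ' + Math.round(mem.heapTotal / 1048576) + ' MB'
        + ' | rss: ' + Math.round(mem.rss / 1048576) + ' MB');
}, 30000);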
Logs
2224509 ms: Mark-sweep 6017.8 (7502.1) -> 6017.8 (7502.1) MB, 267.3 / 0.0 ms [allocation failure] [scavenge might not succeed].
2224770 ms: Mark-sweep 6017.8 (7502.1) -> 6017.7 (7502.1) MB, 261.7 / 0.0 ms [allocation failure] [scavenge might not succeed].
2225014 ms: Mark-sweep 6017.7 (7502.1) -> 6026.3 (7486.1) MB, 243.2 / 0.0 ms [last resort gc].
2225295 ms: Mark-sweep 6026.3 (7486.1) -> 6034.7 (7486.1) MB, 280.2 / 0.0 ms [last resort gc].

<--- JS stacktrace --->

==== JS stack trace =========================================

Security context: 0x360d13dcfb39 <JS Object>
    1: slowToString [buffer.js:~421] [pc=0x313b461fda3] (this=0xb64408e3f61 <an Uint8Array with map 0x2246ccf06989>,encoding=0x360d13d04381 <undefined>,start=0x360d13d04381 <undefined>,end=0x360d13d04381 <undefined>)
    2: arguments adaptor frame: 1->3
    3: toString [buffer.js:~488] [pc=0x313ae183641] (this=0xb64408e3f61 <an Uint8Array with map 0x2246ccf06989>)
    4: arguments adaptor frame...

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
 1: node::Abort() [node]
 2: 0xf8cf13 [node]
 3: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [node]
 4: v8::internal::Factory::NewRawTwoByteString(int, v8::internal::PretenureFlag) [node]
 5: v8::internal::Factory::NewStringFromUtf8(v8::internal::Vector<char const>, v8::internal::PretenureFlag) [node]
 6: v8::String::NewFromUtf8(v8::Isolate*, char const*, v8::String::NewStringType, int) [node]
 7: node::StringBytes::Encode(v8::Isolate*, char const*, unsigned long, node::encoding) [node]
 8: void node::Buffer::StringSlice<(node::encoding)1>(v8::FunctionCallbackInfo<v8::Value> const&) [node]
 9: 0x313ae17ecb8
Aborted
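The slowToString frame in buffer.js suggests each whole response body gets converted to one big string. Would streaming the response into the parser instead of buffering it help? A rough sketch (I am assuming the htmlparser2 version I use exports WritableStream; parserStream is my own name):

// rough sketch: stream the response into the parser instead of calling
// toString() on the whole page (assumes htmlparser2 exports WritableStream)
var request = require('request');
var htmlparser = require('htmlparser2');

var parserStream = new htmlparser.WritableStream({
    onopentag: function(name, attribs) {
        if (name === 'a' && attribs.href && attribs.href.indexOf('torrent/') !== -1) {
            console.log('found link: http://extratorrent.cc' + attribs.href);
        }
    }
}, {
    decodeEntities: true
});

request.get('http://extratorrentlive.com/category/4/Movies+Torrents.html?page=1&srt=added&order=desc&pp=50')
    .pipe(parserStream);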
My code
var request = require('request');
var async = require('async');
var mysql = require('mysql');
var winston = require('winston');
var fs = require('fs');
var htmlparser = require('htmlparser2');

var logger = new winston.Logger({
    level: 'info',
    transports: [
        new (winston.transports.Console)(),
        new (winston.transports.File)({
            filename: 'errors.log'
        })
    ]
});

process.on('uncaughtException', function(err) {
    // log uncaught exceptions (as errors, not info) instead of crashing
    logger.error(err);
});
// socket.io is only used client-side here
var socket = require('socket.io-client')('http://127.0.0.1:3000', {
    reconnection: true,
    reconnectionDelay: 5000,
    reconnectionAttempts: 10
});

var pages = [];
var _count = 0;
var source = 'extratorrent';

var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
    host: '99.115.16.67:9200'
    // ,log: 'trace'
});
socket.on('connect', function() {
    // first queue: fetch the listing pages and collect the torrent links
    var q = async.queue(function(task, done) {
        request.get(task.url, function(error, response, body) {
            if (error || !body) {
                // a failed request would otherwise throw on body.substring below
                // and done() would never run, stalling the queue
                logger.error(error);
                return done();
            }
            console.log('Loading page [' + task.page_id + '], collecting links...');
            var parser = new htmlparser.Parser({
                onopentag: function(name, attribs) {
                    // keep only <a> tags whose href points to a torrent detail page
                    if (name === 'a' && attribs.href && attribs.href.indexOf('torrent/') !== -1) {
                        pages.push('http://extratorrent.cc' + attribs.href);
                    }
                }
            }, {
                decodeEntities: true
            });
            // substr() takes (start, length); to cut from the results table up to
            // "Recent Searches", substring(start, end) is what was intended
            var start = body.indexOf('<table class="tl">');
            var end = body.indexOf('Recent Searches');
            parser.write(body.substring(start, end));
            parser.end();
            console.log(pages.length + ' links collected');
            done();
        });
    }, 12);
    q.drain = function() {
        // second queue: crawl each collected URL and extract hash/seed/leech
        var call = 0;
        q = async.queue(function(task, done) {
            request.get(task.url, {
                timeout: 4000
            }, function(error, response, body) {
                var torrent_hash, seed, leech;
                try {
                    var indexof = body.indexOf('Info hash:');
                    if (indexof === -1) throw new Error('no info hash on page'); // reuse the catch below
                    torrent_hash = body.substr(indexof + 38, 40);
                    indexof = body.indexOf('class="leech">');
                    leech = parseInt(body.substr(indexof + 14, 20).replace(/[^\d.]/g, ''), 10);
                    if (isNaN(leech)) leech = 0; // parseInt() yields NaN on failure, not '' or null
                    indexof = body.indexOf('class="seed">');
                    seed = parseInt(body.substr(indexof + 13, 20).replace(/[^\d.]/g, ''), 10); // 'class="seed">' is 13 characters long, not 14
                    if (isNaN(seed)) seed = 0;
                    console.log('seed => ' + seed + ' leech => ' + leech);
                } catch (err) {
                    // failed or timed-out request: body is undefined, so drop the
                    // URL and return here, otherwise done() would run a second time
                    pages.remove(task.url);
                    return done();
                }
                // index the torrent in Elasticsearch
                client.index({
                    index: 'torrents',
                    type: 'items',
                    id: torrent_hash,
                    body: {
                        'name': torrent_hash,
                        'seed': seed,
                        'leech': leech,
                        'source': source
                    }
                }, function(err, resp, status) {
                    call += 1;
                    console.log('--------> TORRENT : ' + call + ' | PAGE : ' + (1 + Math.round(call / 50)));
                    socket.emit('new_torrent', torrent_hash, function() {
                        console.log('torrent ' + torrent_hash + ' sent');
                    });
                });
                pages.remove(task.url);
                console.log('--------> REMAINING LINKS : ' + pages.length);
                done();
            });
            // a forced GC pass per task cannot fix a leak; keep it optional so
            // the script also runs without --expose-gc
            if (global.gc) global.gc();
        }, 10);
        q.drain = function() {
            // connection.end();
            setTimeout(function() {
                console.log('All torrents have been submitted!');
            }, 6000);
        };
        // refill the queue with the links collected in the first pass
        pages.forEach(function(page) {
            q.push({
                url: page
            });
        });
    };
    // seed the first queue with the 500 listing pages
    for (var i = 1; i <= 500; i++) {
        q.push({
            page_id: i,
            url: 'http://extratorrentlive.com/category/4/Movies+Torrents.html?page=' + i + '&srt=added&order=desc&pp=50'
        });
    }
});
// helper: remove every occurrence of the given value(s) from the array in place
Array.prototype.remove = function() {
    var what, a = arguments, L = a.length, ax;
    while (L && this.length) {
        what = a[--L];
        while ((ax = this.indexOf(what)) !== -1) {
            this.splice(ax, 1);
        }
    }
    return this;
};
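I also wonder whether this remove() helper is part of the problem: every call rescans the whole pages array with indexOf and splice. A rough sketch of the same bookkeeping with a Set instead (hypothetical refactor, the names are mine):

// hypothetical refactor: track pending links in a Set for O(1) add/delete
// instead of indexOf + splice over a growing array
var pending = new Set();

var link = 'http://extratorrent.cc/torrent/example';
pending.add(link);                              // instead of pages.push(link)
pending.delete(link);                           // instead of pages.remove(task.url)
console.log(pending.size + ' links remaining'); // instead of pages.length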