Wednesday, 12 April 2017

Node.js webcrawler: parse HTML after authentication (returning only sessionToken)

I am building a web crawler in Node.js. I am using modules request, cheerio and js-crawler:

var email = "xxx";
var pwd = "xxx";
var auth = "Basic " + new Buffer(email + ":" + pwd).toString("base64");
var url = "www.z.com/login";

request.post({
    url,
    formData : {
      email,
      pwd 
  },
    headers : {
      'User-Agent': 'request',
      "Authorization" : auth,
    }
  },
  function(error, response, body) {

      console.log('(Logged) Status Code: ' + response.statusCode)
      console.log('(Logged) Error: ' + error)
      //it does return 200

      //the following URL is the dashboard, it's shown only user is logged
      new Crawler().configure({depth: 1})
      .crawl("https://www.z.com/dashboard", function onSuccess(page) {

      //testing the responses
      console.log(page.content);
      console.log(page.status);
      console.log(page.error);
      console.log(page.response);
      console.log(page.body);

      //trying to get the username
      var $ = cheerio.load(body);
      var header = $('strong').find('.user-name').text();
      console.log('(Logged) Título: ' + header);
      console.log('(Logged) body: ' + body);
  });

  } ); 

However, everything that I see in the bodyresponse is (which was supposed to print the HTML of the page):

"sessionToken":"eyJhbGciOiJSUzI1NiJ9.eyJpc3MiOiJHdWlhYm9sc28iLCJleHAiOjE0OTIwMDg1MDIsImlhdCI6MTQ5MjAwNzkwMiwiYXV0aCI6Ik1ETmpZVFEyT1dJeU9UWTJaV1l6Wm1VeFlqQTRaR1psWWpWaE56ZGlOR0l1TXpnME1qQXhNek0yTlRReU1qTTNNRGsyTURNME1qUXhOekV5TURRME16azBOVEF4T1RNMk16QTROelkzTmpVNU56YzVNakUwTVRrek5qTTJNRFV3TXpBek1UUXdNVEUwTURNd016Zz0iLCJwbGF0Zm9ybSI6IndlYiIsImh0dHBzOi8vd3d3Lmd1aWFib2xzby5jb20uYnIiOnRydWUsInNlc3Npb25Ub2tlbiI6IjM4NDIwMTMzNjU0MjIzNzA5NjAzNDI0MTcxMjA0NDM5NDUwMTkzNjMwODc2NzY1OTc3OTIxNDE5MzYzNjA1MDMwMzE0MDExNDAzMDM4IiwiZGV2aWNlVG9rZW4iOiIwM2NhNDY5YjI5NjZlZjNmZTFiMDhkZmViNWE3N2I0YiJ9.NuF5UEz0zSRVhBQ29q07OPsJYvCCJSm6UhgnUg8_gpSBdOd9EjOF3beLQz_b4295PTsmgpE8i6lQALkozs-aSkhR2_njjBPTTbXVUiDRIe3XI1qXbTfpWFbeiXoUhocIOa3fopyzqCn6WpMDvkFviDDlRLmc-gNLxK-E2UTsZ68dQ2Zg--XPTL-3GOjZAQZa-jn57MLJdMUIY8tmvTluRpeCaUc5vCEdjFMAqUOX5_YbnhCu0d4v4gbvDFQMrQQwcxA4jFg0-s7v3ZL7rMoWmF571x3zKoZxNV23sq685hL-Ma871HfiFGZUwcBdJ6Hc6o7L97kqPEjRvNSYxiIo02gUeFivHQ-B3rqhgcUUZX83xNNpbqa_9M1CwMyw8aEXCRGEDh6PijUFsaHz3qUtnWQDrrFc9E0B5wzBKO6KTJx4e3wwzxTUNUPjHQEdWzpB7pgNzDhRetVlAbDLNe0Yf0lbVtn5rfaJ-moNhY9A142qXuVOIhRM79hqg6h11L9IPyn0OoFvh1MTPOEcv0uoOvHmrF0gAQ3gJYkTXE4Kc45a_15L8-vPD9FZPQ-axZGw8lMdPEqhf2RY-Xa7XICM2cVGomsT2-GSB5bUp1BaVQ9dpzOa3VUDIhfR685cgTrAcqIIozTqrmUNYb_kaQEo1oUnK2iOOg1olqWajo7mZKo", "documentStatus":true

Any help is very much appreciated. Thank you.



via Vinicius Martinson

No comments:

Post a Comment