Wednesday, 19 April 2017

Scrape repetative HTML into JSON array with Node

I'm practicing scraping and I am trying to scrape a list of agents into a JSON array. My code currently only scrapes the last person 4 times. I'm wondering how to iterate through each class that repeats.

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

app.get('/scrape', function(req, res){

char = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x',
'y','z']

url = 'http://www.nhlpa.com/inside-nhlpa/certified-player-agents/find-an-agent?ln=A';

request(url, function(error, response, html){
    if(!error){
        var $ = cheerio.load(html);

        var agent, agency, address, street, city, state, country, zip, deskphone, fax, email, cell;
        var json = { agent : "", agency : "", street : "", city : "", state : "", country : "", zip : "", deskphone : "", fax : "", email : "", cell : ""};
        var jsonarry = []

    $('.inBox').each(function(i, elem) {

        $('.inBodyText').filter(function(){
            var data = $(this);
            agent = data.children().first().text();
            //agency = data.children().last().children().text();

            json.agent = agent;

        })



        $('.inCaption').filter(function(){
            var data = $(this);
            agency =     data.children().children().first().next().text();
            json.agency = agency;
            street =     data.children().children().first().next().next().text();
            json.street = street;
            address =       data.children().children().first().next().next().next().text().replace(/ /g,'');
            address = address.split(",");
            json.city = address[0];
            json.state = address[1]
            json.country = address[2]
            zip =        data.children().children().first().next().next().next().next().text();
            json.zip = zip

            deskphone =  data.children().children().last().prev().prev().prev().text();
            json.deskphone = deskphone
            fax =        data.children().children().last().prev().prev().text();
            json.fax = fax
            email =      data.children().children().last().prev().text();
            json.email = email
            cell =       data.children().children().last().text();
            json.cell = cell
        })
        jsonarry.push(json)
      });
    }



    fs.writeFile('output.json', JSON.stringify(jsonarry, null, 4), function(err){

    console.log('File successfully written! - Check your project directory for the output.json file');

})

// Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
res.send(html)

    }) ;
})



app.listen('8081')

console.log('Magic happens on port 8081');

exports = module.exports = app;



via user1093111

No comments:

Post a Comment