Tuesday, 25 April 2017

Webscraping using Nodejs - tags

I am trying to scrape a stock market web page for the titles of its href links, but I don't get any output. I tried selecting by class, by ID, etc. I see the tag has a $0 assigned. I typed this in the console command line, but it just showed the same thing.

I am trying to get the text "Bulk Deals" from the HTML. If I can figure out how that's done, I can proceed.

The HTML I am trying to parse:

<ul>
        <li xmlns:dc="http://purl.org/dc/elements/1.1/" class="regularitem">
                <h4 class="itemtitle">
                        <a href="http://feedproxy.google.com/~r/nseindia/CMDailyReport/~3/P9Aw3__Tm9M/bulk.csv">Bulk Deals</a></h4>
                <h5 class="itemposttime"></h5>
                <div class="itemcontent" name="decodeable"><div class="feedflare">
        <a href="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?a=P9Aw3__Tm9M:Clpwmq7B-_I:yIl2AUoC8zA">
                <img src="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?d=yIl2AUoC8zA" border="0">
        </a> 
        <a href="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?a=P9Aw3__Tm9M:Clpwmq7B-_I:F7zBnMyn0Lo">
                <img src="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?i=P9Aw3__Tm9M:Clpwmq7B-_I:F7zBnMyn0Lo" border="0">
        </a> 
        <a href="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?a=P9Aw3__Tm9M:Clpwmq7B-_I:qj6IDK7rITs">
                <img src="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?d=qj6IDK7rITs" border="0">
        </a> 
        <a href="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?a=P9Aw3__Tm9M:Clpwmq7B-_I:gIN9vFwOqvQ">
                <img src="http://feeds.feedburner.com/~ff/nseindia/CMDailyReport?i=P9Aw3__Tm9M:Clpwmq7B-_I:gIN9vFwOqvQ" border="0">
        </a>
        </div>
                <img src="http://feeds.feedburner.com/~r/nseindia/CMDailyReport/~4/P9Aw3__Tm9M" height="1" width="1" alt="">
        </div>
</li>

The js code:

var cheerio = require('cheerio');
var request = require('request');

// Feed page to scrape for item titles and their link targets.
var url = "http://feeds.feedburner.com/nseindia/CMDailyReport";

// Fetch the page, then print the text and href of every item-title link.
request(url, function(err, resp, body){
        // Bail out on a request error instead of trying to parse a missing body.
        if (err) {
                console.error(err);
                return;
        }

        var $ = cheerio.load(body);

        // The href attribute lives on the <a> nested inside
        // <h4 class="itemtitle">, not on the <h4> itself, so select the
        // anchor; reading attr("href") from the heading yields undefined.
        // NOTE(review): these class selectors assume `body` is the rendered
        // HTML shown in the browser. If the server returns a raw RSS/XML feed
        // to non-browser clients, nothing will match — confirm by logging
        // `body`.
        $(".regularitem .itemtitle a").each(function(){
                var link = $(this);
                var text = link.text();
                var href = link.attr("href");

                console.log(text + href);
        });
});


via sujith john

No comments:

Post a Comment