I am trying to produce the following JSON structure:
"events": [
{
"start_date": {
"year": "602"
},
"end_date": {
"year":"629"
},
"media": {
"url": "https://en.wikipedia.org/wiki/Roman-Persian_Wars"
},
"background": {
"opacity":"50",
"url": "https://upload.wikimedia.org/wikipedia/commons/a/a2/HumiliationValerianusHolbein.jpg"
},
"text": {
"headline": "Last great Roman-Persian War.",
"text": "Long conflict leaves both empires exhausted and unable to cope with the newly united Arab armies under Islam in the 630s"
}
},
{
"start_date": {
"year": "604"
},
"end_date": {
"year": "609"
},
"media": {
"url": "https://en.wikipedia.org/wiki/Grand_Canal_(China)"
},
"background": {
"opacity":"50",
"url": "https://upload.wikimedia.org/wikipedia/commons/a/ad/Sui_Wendi_Tang.jpg"
},
"text": {
"headline": "Grand Canal in China is fully completed",
"text": "Its main role throughout its history was the transport of grain to the capital."
}
}
There are about 25 objects within the events array, but to make this shorter I've only included two here.
For the moment I'm only trying to create the "sub-objects" of background and text within the master object.
I am scraping this Wikipedia page with Node and the request and cheerio libraries: https://en.wikipedia.org/wiki/Timeline_of_the_Middle_Ages
The first part of my code (below) uses the request library to gather together all the links to other pages from the main Wikipedia "landing page":
request(landingPage, function (err, response, body) {
var $ = cheerio.load(body);
var absoluteLinks = [];
// GET REMOTE PAGE LINKS FOR IMAGES:
// eq(1) = 7th Century Table
$('.wikitable').eq(1).find('tr').each(function() {
var $link = $(this).find('td').eq(2).find('a').eq(0).attr('href');
if ( $link != undefined || $link != null ) {
absoluteLinks.push("https://en.wikipedia.org" + $link);
}
});
getRemoteImages(absoluteLinks);
});
The second part uses Promise.all in order to ensure that the array of image urls is constructed in the same order as the array of scraped page urls:
function getRemoteImages(absoluteLinks) {
Promise.all(absoluteLinks.map (function (a) {
return new Promise(function(resolve, reject) {
request(a, function(err, response, body) {
if (err) { return reject(err); }
$ = cheerio.load(body);
var $thumbImg = $('.infobox').find('img').eq(0).attr('src');
// To do: make full size image
$thumbImg = "https:" + $thumbImg;
resolve({ thumbImg: $thumbImg });
});
});
})).then(function (result) {
cleanImages(result);
}).catch(function(err) {
console.log(err);
});
}
The third part of the code is the bit I'm struggling with:
function buildTextSection(result) {
request(landingPage, function (err, response, body) {
var data = { "events": [] };
$ = cheerio.load(body);
$('.wikitable').eq(1).find('tr').each(function() {
var evObj = {};
var $headline = $(this).find('td').eq(2).html();
var $text = $(this).find('td').eq(3).text();
evObj.text = {"headline": $headline, "text": $text };
data.events.push(evObj);
}); // end each
console.log(data.events.length);
buildImageSection(data, result);
});
}
function buildImageSection(data, result) {
result.forEach(function(obj) {
data.events.background = {"opacity": "50", "url": obj.thumbImg };
console.log(data);
// console.log(data.events);
}); // end forEach
}
I can't find a way integrate the two different each iterators (one gathering the text data from the "local" landing page, and the other gathering the image urls from each "remote" Wikipedia destination page).
My last attempt (of many) generates this output when I run the file in the terminal with console.log(data);
{ events:
[ { text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
{ text: [Object] },
background: { opacity: '50',
url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/10/History_of_Korea-Inter-country_Age-830_CE.gif/220px-History_of_Korea-Inter-country_Age-830_CE.gif' } ] }
How can I solve this problem? I've completely run out of ideas now! Thanks!
via daneasterman
No comments:
Post a Comment