I tell you what I want to do. I am doing a scraping of tematika.com site to be able to create a base of books to me. The problem is that I need to go from page to page but I can not, so I always get the same books. I think with the code they will understand it better.
const phantom = require("phantom");
const cheerio = require("cheerio");
let _instance, _page;
let _books = [];
phantom.create().then(instance => {
_instance = instance;
return _instance.createPage();
}).then(page => {
_page = page;
return _page.open('http://www.tematika.com/catalogo/libros/arte__arquitectura_y_diseno--9.htm');
}).then(status => {
console.log(status);
return _page.property('content')
}).then(html => {
// I transform the html into an object to manipulate it with cheerio
let $ = cheerio.load(html);
//I get all the books and pass them to an array to go through them and extract the info I'm looking for
let books = $('.Gcentro').find('.moduleproductob').toArray();
//As always there are 10 pages I make a for it it itere 10 times
for(let i=0;i<10;i++){
books.forEach(book => {
let _book = {};
_book.imageMin = $(book).find('.Gimagesproductos').attr('src');
_book.linkBook = $(book).find('.FProductos').attr('href');
_books.push(_book);
});
//In this part what I look for is to click on the next button on the page and it is what is not working for me
//What ends up happening is that it never goes back to the page. So always bring me the info of the same 10 books.
_page.evaluate(function() {
var evObj = document.createEvent('Events');
evObj.initEvent('click', true, false);
document.getElementById('catalogNext').dispatchEvent(evObj);
});
}
console.log(_books);
_page.close();
_instance.exit();
}).catch(e => console.log(e));
What I need is to be able to click on next and be able to bring me the books that are showing. If it is not with phantom-node and cheerio can be with another tool, all I need is that it works on node.
Thank you! :)
via Exequiel Demaio
No comments:
Post a Comment