// Scrape the title and url of every post on the current page
// and download the result as pretty-printed JSON
artoo.scrape('td.title:nth-child(3)', {
title: {sel: 'a'},
url: {sel: 'a', attr: 'href'}
}, artoo.savePrettyJson);
// A richer version: iterate over each story row and also extract
// the domain, score, user and number of comments from the subtext row
artoo.scrape('tr tr:has(td.title:has(a)):not(:last)', {
title: {sel: '.title a'},
url: {sel: '.title a', attr: 'href'},
domain: {
sel: '.comhead',
method: function($) {
return $(this).text().trim().replace(/[\(\)]/g, '');
}
},
score: {
sel: '+ tr [id^=score]',
method: function($) {
return +$(this).text().replace(/ points/, '');
}
},
user: {
sel: '+ tr a[href^=user]',
method: function($) {
return $(this).length ? $(this).text() : null;
}
},
nb_comments: {
sel: '+ tr a[href^=item]',
method: function($) {
var nb = +$(this).text().replace(/ comments/, '');
return isNaN(nb) ? 0 : nb;
}
}
}, artoo.savePrettyJson);
One could easily scrape several pages by using the ajaxSpider utility.
Example - Downloading the first three pages
// Reusable scraper definition: iterator selects each story row,
// data describes what to extract from every matched row
var scraper = {
iterator: 'tr tr:has(td.title:has(a)):not(:last)',
data: {
title: {sel: '.title a'},
url: {sel: '.title a', attr: 'href'},
domain: {
sel: '.comhead',
method: function($) {
return $(this).text().trim().replace(/[\(\)]/g, '');
}
},
score: {
sel: '+ tr [id^=score]',
method: function($) {
return +$(this).text().replace(' points', '');
}
},
user: {
sel: '+ tr a[href^=user]',
method: function($) {
return $(this).length ? $(this).text() : null;
}
},
nb_comments: {
sel: '+ tr a[href^=item]',
method: function($) {
var nb = +$(this).text().replace(' comments', '');
return isNaN(nb) ? 0 : nb;
}
}
}
};
// Retrieve the url of the "More" link leading to the next page
function nextUrl($page) {
return $page.find('td.title:last > a').attr('href');
}
artoo.log.debug('Starting the scraper...');
// Scrape the page we are currently on, then spider the following ones
var frontpage = artoo.scrape(scraper);

artoo.ajaxSpider(
function(i, $data) {
return nextUrl(!i ? artoo.$(document) : $data);
},
{
limit: 2,
scrape: scraper,
concat: true,
done: function(data) {
artoo.log.debug('Finished retrieving data. Downloading...');
artoo.savePrettyJson(
frontpage.concat(data),
{filename: 'hacker_news.json'}
);
}
}
);
Just change the limit to retrieve more pages, and add a throttle parameter so you are not too hard on their servers.
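For instance, a variant of the call above fetching ten extra pages while pausing between requests might look like the sketch below. It reuses the scraper, nextUrl and frontpage defined earlier, and assumes the throttle value is a delay in milliseconds; check the ajaxSpider documentation for the exact semantics in your version.

// Sketch: spider ten more pages, waiting two seconds between requests
artoo.ajaxSpider(
  function(i, $data) {
    return nextUrl(!i ? artoo.$(document) : $data);
  },
  {
    limit: 10,       // number of additional pages to retrieve
    throttle: 2000,  // assumed: delay in milliseconds between two requests
    scrape: scraper,
    concat: true,
    done: function(data) {
      artoo.savePrettyJson(
        frontpage.concat(data),
        {filename: 'hacker_news.json'}
      );
    }
  }
);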
« Ok, I see your point, but it is quite tiresome to copy and paste code into the JavaScript console. There should be a more practical way. »
Fortunately, there is: you can create a custom artoo.js bookmarklet that wraps everything up.
In fact, the icon above is just such a bookmarklet. Drop it onto your bookmarks bar and click it while visiting Hacker News: it should download the first three pages’ data. This is not instantaneous, so please wait for the data to be retrieved and downloaded; if you click the bookmarklet twice, you will, logically, download the data twice.
Custom bookmarklets such as this one can easily be generated on this site through a dedicated gulp plugin whose documentation can be found here.
Finally, a fully explained example of how to create such a bookmarklet can be found in this gist.
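For reference, the core of such a bookmarklet is simply a javascript: URL that injects artoo into the current page and runs your macro once the script has loaded. The sketch below is only illustrative: the script URL is the publicly documented artoo build and may need to be adapted, and the gulp plugin mentioned above takes care of this wiring (and of minifying the result) for you.

// Hypothetical, unminified body of a bookmarklet (the real bookmarklet is
// this code minified and prefixed with "javascript:")
(function() {
  var script = document.createElement('script');

  // Assumed public build of artoo.js; adapt to the build you actually use
  script.src = '//medialab.github.io/artoo/public/dist/artoo-latest.min.js';

  script.onload = function() {
    // Run your macro here, e.g. the ajaxSpider call shown above.
    // Note that artoo may still be injecting its own dependencies at this
    // point; the gulp plugin handles waiting for it properly.
  };

  document.body.appendChild(script);
})();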