A couple weeks ago I posted a web-crawler I'd built for parsing Wikipedia articles (http://news.ycombinator.com/item?id=4436349). jDistiller grew out of this project. As a test I rebuilt the crawler in thirty lines of code using the jDistiller API:<p>new jDistiller()
.set('title', '#firstHeading')
.set('links', '#bodyContent p a', function(element, prev) {
var href = element.attr('href'),
key = href.replace('/wiki/', '');<p><pre><code> if ( key === href) return;
element.replaceWith('[[' + key + ']]');
return [key, {
title: element.attr('title'),
text: prev[key] ? prev[key].text : element.text(),
occurrences: prev[key] ? prev[key].occurrences + 1 : 1
}]
})
.set('sections', '#bodyContent p,#firstHeading,h2,h3,img', function(element, prev) {
var images = [];
if ( element.is('h1') || element.is('h2') ) {
this.heading = element.text().trim();
return
}
return [this.heading, {
text: prev[this.heading] ? prev[this.heading].text + element.text() : element.text(),
images: element.is('img') && element.attr('width') > 50 ? [element.attr('src').replace('//', 'http://')] : []
}];
})
.distill('http://en.wikipedia.org/wiki/Dog', function(err, distilledPage) {
console.log(JSON.stringify(distilledPage.sections));
});</code></pre>