Posted on 24 Jun 2013 in Speaking BurlingtonJS JavaScript Scraping Node.io Casper.js Phantom.js Node.js
June 19, 2013 was the third meeting of the BurlingtonJS group, the second I attended, and the first at which I was lucky enough to present. I volunteered to speak on web scraping. Web scraping has been around for about as long as browsers and widespread Internet access. It is an interesting topic in the world of JS because of such amazing developments as Node.io and Casper.js (on Phantom.js).
I gave the presentation via PowerPoint. Since this was my first tech talk, I felt it was best to have everything in one nicely packaged document so I would have just one button to press. Not having to worry about switching between different apps/windows/tabs ended up really helping with my nerves.
The idea for the main project I feature in this talk came about months ago. I never thought I would present on it, but you can view an intro and the final map.
# Fetch the page with curl, keep only the lines mentioning UTC, and pipe them through html2text.
$ curl 'http://tycho.usno.navy.mil/cgi-bin/timer.pl' | grep UTC | html2text
# Same pipeline, but the page is fetched with `node.io query` instead of curl.
$ node.io query 'http://tycho.usno.navy.mil/cgi-bin/timer.pl' | grep UTC | html2text
# `node.io query` with a second argument: here the selector `a.title` is passed along with the URL.
$ node.io query 'reddit.com/r/javascript' a.title
var casper = require('casper').create();
/**
 * Collects the `innerText` of every <li> element in the current document.
 *
 * This function is handed to `casper.evaluate`, so it reads the global
 * `document` directly and sticks to plain ES5 syntax.
 *
 * @returns {string[]} One entry per <li>, in document order.
 */
function getLi() {
    var items = document.querySelectorAll('li');
    var texts = [];
    for (var i = 0; i < items.length; i++) {
        texts.push(items[i].innerText);
    }
    return texts;
}
// Texts of the scraped list items. Declared here so the step below assigns to
// a real variable instead of creating an implicit global.
var li = [];

// Open the page that contains the list.
casper.start('http://localhost:8000/list-items.html');

// Run getLi against the loaded page and keep its result.
casper.then(function() {
    // Fall back to an empty list if evaluate() hands back nothing, so the
    // join() in the final step cannot throw on null/undefined.
    li = this.evaluate(getLi) || [];
});

// Print one item per line, then quit.
casper.run(function() {
    this.echo(li.join('\n')).exit();
});
var nodeio = require('node.io');
// Search term sent to Google; it is URL-encoded when the request URL is built.
var query = "burlingtonjs";

/**
 * node.io job definition that collects the text of the `h3.r` headings from
 * Google result pages for `query`.
 *
 * `input` holds the values passed to `run` as `gStart` (used as Google's
 * `start` query parameter). Presumably node.io calls `run` once per input
 * element - confirm against the node.io documentation.
 */
var scraper = {
    input: [0, 10, 20, 30],

    /**
     * Requests one result page and emits the heading texts found on it.
     *
     * @param {number} gStart Value for the `start` query parameter.
     */
    run: function(gStart) {
        // Encode the term so spaces, '&', '#', etc. cannot corrupt the query string.
        var googleurl = 'https://www.google.com/search?q=' + encodeURIComponent(query) + '&start=' + gStart;
        this.getHtml(googleurl, function(err, $) {
            if (err) {
                this.exit(err);
                // Stop here: `$` is not usable when the request failed.
                return;
            }
            var titles = [];
            $('h3.r').each(function(result) {
                titles.push(result.striptags);
            });
            this.emit(titles);
        });
    }
};
// Build the job from the scraper definition and run it. The trailing `true`
// is passed through to nodeio.start unchanged; presumably it asks node.io to
// collect the emitted results and hand them to the callback - confirm against
// the node.io documentation.
var job = new nodeio.Job({timeout:10, max: 1, wait: 1}, scraper);
nodeio.start(job, function(err, output) {
    if (err) {
        // Report the failure instead of dereferencing `output`, and signal it
        // to the shell through a non-zero exit code.
        console.error(err);
        process.exit(1);
    } else {
        console.log(output);
        console.log(output.length);
        process.exit();
    }
}, true);
var nodeio = require('node.io');
var geo = require('geo');
// Dealer domains to scrape; each one is requested as http://www.<domain>/index.htm.
var dealerships = ['houstondodgedealer.com','gillmanchevroletharlingen.com', 'actontoyota.com'];

// For each output field, the selectors to try in order; the first match wins.
var DEALER_TEXT_SELECTORS = {
    name: ['#hDealerName', '.org'],
    street: ['.address.street-address', '.street-address'],
    locality: ['#fContact .locality', '.locality'],
    region: ['.state.region', '.adr .region'],
    zip: ['.zip.postal-code', '.adr .postal-code']
};

/**
 * Returns the first element matched by any of `selectors`, or null.
 *
 * @param {Function} $ Selector function handed to the getHtml callback.
 * @param {string[]} selectors Selectors to try, in priority order.
 * @returns {Object|null} The matched element, or null when none matched.
 */
function firstMatch($, selectors) {
    for (var i = 0; i < selectors.length; i++) {
        // The `null, true` arguments are what the original checks passed to
        // probe for an element without failing when it is absent.
        var el = $(selectors[i], null, true);
        if (el) {
            return el;
        }
    }
    return null;
}

/**
 * Joins the address parts that are present as "street locality, region zip".
 *
 * @param {Object} info Scraped fields (street, locality, region, zip; all optional).
 * @returns {string} The address, or '' when no part is available.
 */
function buildAddress(info) {
    var regionZip = [info.region, info.zip].filter(Boolean).join(' ');
    var cityLine = [info.locality, regionZip].filter(Boolean).join(', ');
    return [info.street, cityLine].filter(Boolean).join(' ');
}

/**
 * node.io job definition that scrapes name, address and coordinates from each
 * dealer home page and emits one `info` object per dealer.
 */
var scraper = {
    input: dealerships,

    /**
     * Scrapes a single dealer site.
     *
     * @param {string} dealer Bare domain, without scheme or "www.".
     */
    run: function(dealer) {
        this.setUserAgent("Node.io scraper. Hodor!");
        this.getHtml("http://www." + dealer + "/index.htm", function(err, $) {
            if (err) {
                console.log(err);
                // `$` is not usable after a failed request: drop this input
                // and let the remaining dealers be processed.
                this.skip();
                return;
            }

            var info = {};
            Object.keys(DEALER_TEXT_SELECTORS).forEach(function(field) {
                var el = firstMatch($, DEALER_TEXT_SELECTORS[field]);
                if (el) {
                    info[field] = el.striptags;
                }
            });

            // Coordinates are read from the `title` attribute of the matched elements.
            var latEl = $('#hMovieWrap .geo .latitude .value-title', null, true);
            if (latEl) {
                info.lat = latEl.attribs.title;
            }
            var lonEl = $('#hMovieWrap .geo .longitude .value-title', null, true);
            if (lonEl) {
                info.lon = lonEl.attribs.title;
            }

            // The geocoder callback runs later, so keep a handle on the job.
            var me = this;
            if (info.lat && info.lon) {
                me.emit(info);
                return;
            }

            // At least one coordinate is missing: geocode the postal address,
            // provided the page gave us anything to geocode.
            var fullAddress = buildAddress(info);
            if (!fullAddress) {
                me.emit(info);
                return;
            }
            var sensor = false;
            geo.geocoder(geo.google, fullAddress, sensor,
                function(formattedAddress, latitude, longitude) {
                    info.lat = latitude;
                    info.lon = longitude;
                    me.emit(info);
                }
            );
        });
    }
};
// Build the job from the scraper definition and run it. The trailing `true`
// is passed through to nodeio.start unchanged; presumably it asks node.io to
// collect the emitted results and hand them to the callback - confirm against
// the node.io documentation.
var job = new nodeio.Job({timeout:10, max: 1, wait: 2, redirects: 3}, scraper);
nodeio.start(job, function(err, output) {
    if (err) {
        // Surface the failure instead of silently printing whatever `output`
        // holds, and signal it to the shell through a non-zero exit code.
        console.error(err);
        process.exit(1);
    } else {
        console.log(output);
        process.exit();
    }
}, true);
var AlchemyAPI = require('alchemy-api');
// Client for the Alchemy API; replace the placeholder with a real key.
var alchemy = new AlchemyAPI('<-your-api-key->');

// Request the microformats for the dealer page and print the
// `microformats` field of the response.
alchemy.microformats('http://www.hallchryslerdodgejeepram.com/', {}, function(err, response) {
    if (err) {
        throw err;
    }
    console.log(response.microformats);
});
2024. In dreams we're free, awake we strive.