Save and render a webpage with PhantomJS and node.js
From your comments, I'd guess you have 2 options
- Try to find a phantomjs node module - https://github.com/amir20/phantomjs-node
- Run phantomjs as a child process inside node - http://nodejs.org/api/child_process.html
Edit:
It seems the child process is suggested by phantomjs as a way of interacting with node, see faq - http://code.google.com/p/phantomjs/wiki/FAQ
Edit:
Example Phantomjs script for getting the pages HTML markup:
var page = require('webpage').create();
page.open('http://www.google.com', function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
var p = page.evaluate(function () {
return document.getElementsByTagName('html')[0].innerHTML
});
console.log(p);
}
phantom.exit();
});
I've used two different ways in the past, including the page.evaluate() method that queries the DOM that Declan mentioned. The other way I've passed info from the web page is to spit it out to console.log() from there, and in the phantomjs script use:
page.onConsoleMessage = function (msg, line, source) {
console.log('console [' +source +':' +line +']> ' +msg);
}
I might also trap the variable msg in the onConsoleMessage and search for some encapsulate data. Depends on how you want to use the output.
Then in the Nodejs script, you would have to scan the output of the Phantomjs script:
var yourfunc = function(...params...) {
var phantom = spawn('phantomjs', [...args]);
phantom.stdout.setEncoding('utf8');
phantom.stdout.on('data', function(data) {
//parse or echo data
var str_phantom_output = data.toString();
// The above will get triggered one or more times, so you'll need to
// add code to parse for whatever info you're expecting from the browser
});
phantom.stderr.on('data', function(data) {
// do something with error data
});
phantom.on('exit', function(code) {
if (code !== 0) {
// console.log('phantomjs exited with code ' +code);
} else {
// clean exit: do something else such as a passed-in callback
}
});
}
Hope that helps some.
With v2 of phantomjs-node
it's pretty easy to print the HTML after it has been processed.
var phantom = require('phantom');
phantom.create().then(function(ph) {
ph.createPage().then(function(page) {
page.open('https://stackoverflow.com/').then(function(status) {
console.log(status);
page.property('content').then(function(content) {
console.log(content);
page.close();
ph.exit();
});
});
});
});
This will show the output as it would have been rendered with the browser.
Edit 2019:
You can use async/await
:
const phantom = require('phantom');
(async function() {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
const status = await page.open('https://stackoverflow.com/');
const content = await page.property('content');
console.log(content);
await instance.exit();
})();
Or if you just want to test, you can use npx
npx phantom@latest https://stackoverflow.com/