How can I catch and process the data from the XHR responses using casperjs?
I may be late to the party, but this answer may help someone like me who runs into this problem in the future.
I had to start with PhantomJS, then moved to CasperJS, but finally settled on SlimerJS. Slimer is based on Phantom, is compatible with Casper, and can send you back the response body using the same onResourceReceived method, in the "response.body" part.
Reference: https://docs.slimerjs.org/current/api/webpage.html#webpage-onresourcereceived
@Artjom's answer doesn't work for me in the recent Chrome and CasperJS versions.
Based on @Artjom's answer and based on gilly3's answer on how to replace XMLHttpRequest, I have composed a new solution that should work in most/all versions of the different browsers. Works for me.
SlimerJS does not work with newer versions of Firefox, so it was not an option for me.
Here is the generic code to add a listener for the load of an XHR (not dependent on CasperJS):
/**
 * Hooks XMLHttpRequest.prototype so that a callback runs for every request
 * that reaches readyState 4 (request finished).
 *
 * The hook records the requested URL on the request object as `requestUrl`
 * when `open` is called, and attaches the completion check when `send` is
 * called.
 *
 * @param {function(XMLHttpRequest)} XHROnStateChange - invoked with the
 *     finished request object.
 */
var addXHRListener = function (XHROnStateChange) {
    // Runs with `this` set to the request; reports it only once it is finished.
    var notifyWhenDone = function () {
        if (this.readyState != 4) {
            return;
        }
        XHROnStateChange(this);
    };

    var nativeOpen = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function (method, url, async, unk1, unk2) {
        // Remember the URL so the callback can tell requests apart.
        this.requestUrl = url;
        nativeOpen.apply(this, arguments);
    };

    var nativeSend = XMLHttpRequest.prototype.send;
    XMLHttpRequest.prototype.send = function () {
        var request = this;

        if (request.addEventListener) {
            // Remove first so that re-sending the same object never registers twice.
            request.removeEventListener("readystatechange", notifyWhenDone);
            request.addEventListener("readystatechange", notifyWhenDone, false);
        } else {
            // No addEventListener: chain onto the onreadystatechange property instead.
            var pageHandler;

            // Calls the page's own handler (function or handleEvent object),
            // then the completion check, then re-arms the hook.
            var chainedHandler = function () {
                if (pageHandler) {
                    if (pageHandler.handleEvent) {
                        pageHandler.handleEvent.apply(request, arguments);
                    } else {
                        pageHandler.apply(request, arguments);
                    }
                }
                notifyWhenDone.apply(request, arguments);
                armChainedHandler();
            };

            // Deferred by 1 ms; if the page has assigned a different handler
            // by then, keep it as the page handler and put ours back in front.
            var armChainedHandler = function () {
                setTimeout(function () {
                    if (request.onreadystatechange != chainedHandler) {
                        pageHandler = request.onreadystatechange;
                        request.onreadystatechange = chainedHandler;
                    }
                }, 1);
            };

            armChainedHandler();
        }

        nativeSend.apply(request, arguments);
    };
};
Here is CasperJS code to emit a custom event on load of XHR:
// Install the XHR hook whenever "page.initialized" fires, so requests are
// reported from the start of the page's life.
casper.on("page.initialized", function (page) {
    // Executed in the page context: hands the finished request to the
    // CasperJS side through window.callPhantom, tagged with an event name.
    var reportToCasper = function (xhr) {
        window.callPhantom({eventName: 'xhr.load', eventData: xhr});
    };
    this.evaluate(addXHRListener, reportToCasper);
});

// Re-emit each received payload as a CasperJS event named after its eventName.
// NOTE(review): presumably "remote.callback" is what delivers window.callPhantom
// payloads -- confirm against the CasperJS event list.
casper.on('remote.callback', function (payload) {
    casper.emit(payload.eventName, payload.eventData);
});
Here is the code to listen to the "xhr.load" event and get the XHR response body:
// Log the URL and the response body of every request announced via "xhr.load".
casper.on('xhr.load', function (request) {
    var label = 'xhr load';
    console.log(label, request.requestUrl);
    console.log(label, request.responseText);
});
This is not easily possible, because the resource.received event handler only provides metadata like url, headers or status, but not the actual data. The underlying phantomjs event handler acts the same way.
Stateless AJAX Request
If the ajax call is stateless, you may repeat the request
// Repeat every finished ".json" request from inside the page to obtain its body.
casper.on("resource.received", function(resource){
    // Identify the request somehow; here: its URL contains ".json".
    var looksLikeJson = resource.url.indexOf(".json") != -1;
    // Only act on the "end" stage, otherwise this would run twice per request.
    if (!looksLikeJson || resource.stage != "end") {
        return;
    }
    var data = casper.evaluate(function(url){
        // synchronous GET request
        return __utils__.sendAJAX(url, "GET");
    }, resource.url);
    // do something with data, you might need to JSON.parse(data)
});
casper.start(url); // your script
You may want to add the event listener to resource.requested instead. That way you don't need to wait for the call to complete.
You can also do this right inside of the control flow like this (source: A: CasperJS waitForResource: how to get the resource i've waited for):
casper.start(url);
var res, resData;
casper.waitForResource(function check(resource){
    // Remember the resource being inspected; when this check returns true,
    // `res` is the resource that matched.
    res = resource;
    var markerAt = resource.url.indexOf(".json");
    return markerAt != -1;
}, function then(){
    var matchedUrl = res.url;
    resData = casper.evaluate(function(url){
        // synchronous GET request
        return __utils__.sendAJAX(url, "GET");
    }, matchedUrl);
    // do something with the data here or in a later step
});
casper.run();
Stateful AJAX Request
If it is not stateless, you would need to replace the implementation of XMLHttpRequest. You will need to inject your own implementation of the onreadystatechange handler, collect the information in the page's window object and later retrieve it in another evaluate call.
You may want to look at the XHR faker in sinon.js or use the following complete proxy for XMLHttpRequest
(I modeled it after method 3 from How can I create a XMLHttpRequest wrapper/proxy?):
/**
 * Replaces window.XMLHttpRequest with a proxy around the native implementation.
 *
 * Every proxy instance owns a native request in `this.xhr` and forwards the
 * listed methods and properties to it. When the page assigns an
 * `onreadystatechange` handler, the proxy installs its own wrapper on the
 * native request: once readyState is 4 and the request URL contains
 * "whatever", the response text is stored in `window.myAwesomeResponse`, and
 * afterwards the page's handler is invoked.
 *
 * Must run in the page context (it reads and writes the global `window`).
 * Takes no arguments and returns nothing.
 */
function replaceXHR(){
    (function(window, debug){
        // Renders an arguments object as "[index] => value" lines for the debug log.
        function args(a){
            var s = "";
            for(var i = 0; i < a.length; i++) {
                s += "\t\n[" + i + "] => " + a[i];
            }
            return s;
        }

        // Returns the response as text, or null when the request uses a
        // non-text responseType (reading responseText would throw there).
        function readText(xhr){
            var type = xhr.responseType;
            if (type && type !== "text") {
                return null;
            }
            return xhr.responseText;
        }

        var _XMLHttpRequest = window.XMLHttpRequest;
        window.XMLHttpRequest = function() {
            this.xhr = new _XMLHttpRequest();
        };

        // proxy ALL methods
        var methods = [
            "open",
            "abort",
            "setRequestHeader",
            "send",
            "addEventListener",
            "removeEventListener",
            "getResponseHeader",
            "getAllResponseHeaders",
            "dispatchEvent",
            "overrideMimeType"
        ];
        methods.forEach(function(method){
            window.XMLHttpRequest.prototype[method] = function() {
                if (debug) console.log("ARGUMENTS", method, args(arguments));
                if (method == "open") {
                    // remembered so the change handler can filter by URL
                    this._url = arguments[1];
                }
                return this.xhr[method].apply(this.xhr, arguments);
            };
        });

        // proxy change event handler
        Object.defineProperty(window.XMLHttpRequest.prototype, "onreadystatechange", {
            get: function(){
                // Hand back what the page assigned, not the internal wrapper;
                // before any assignment fall through to the native value.
                return this._pageHandler === undefined ? this.xhr.onreadystatechange : this._pageHandler;
            },
            set: function(onreadystatechange){
                var that = this.xhr;
                var realThis = this;
                realThis._pageHandler = onreadystatechange;
                that.onreadystatechange = function(){
                    // request is fully loaded
                    if (that.readyState == 4) {
                        var text = readText(that);
                        if (debug) console.log("RESPONSE RECEIVED:", typeof text == "string" ? text.length : "none");
                        // there is a response and filter execution based on url;
                        // String() keeps the filter from throwing on a non-string URL
                        if (text && String(realThis._url).indexOf("whatever") != -1) {
                            window.myAwesomeResponse = text;
                        }
                    }
                    // The page may clear its handler with null/undefined; only
                    // call real functions. Invoke with the proxy as `this` and
                    // pass the event through, as a native request would.
                    if (typeof onreadystatechange == "function") {
                        return onreadystatechange.apply(realThis, arguments);
                    }
                };
            }
        });

        // proxy the remaining handlers, state properties and constants
        var otherscalars = [
            "onabort",
            "onerror",
            "onload",
            "onloadstart",
            "onloadend",
            "onprogress",
            "ontimeout",
            "readyState",
            "response",
            "responseText",
            "responseType",
            "responseURL",
            "responseXML",
            "status",
            "statusText",
            "timeout",
            "upload",
            "withCredentials",
            "DONE",
            "UNSENT",
            "HEADERS_RECEIVED",
            "LOADING",
            "OPENED"
        ];
        otherscalars.forEach(function(scalar){
            Object.defineProperty(window.XMLHttpRequest.prototype, scalar, {
                get: function(){
                    return this.xhr[scalar];
                },
                set: function(obj){
                    this.xhr[scalar] = obj;
                }
            });
        });
    })(window, false);
}
If you want to capture the AJAX calls from the very beginning, you need to add this to one of the first event handlers
// Swap in the XMLHttpRequest proxy each time "page.initialized" fires.
casper.on("page.initialized", function installXhrProxy(page){
    this.evaluate(replaceXHR);
});
or evaluate(replaceXHR)
when you need it.
The control flow would look like this:
function replaceXHR(){ /* from above*/ }
casper.start(yourUrl, function(){
    this.evaluate(replaceXHR);
});
/**
 * Reads window.myAwesomeResponse from the page context.
 *
 * Works both as a waitFor test function and as a plain call: when `this`
 * does not provide `evaluate` (bare invocation), the global `casper`
 * instance is used instead.
 *
 * @returns {*} whatever the page stored in window.myAwesomeResponse.
 */
function getAwesomeResponse(){
    var self = (this && typeof this.evaluate == "function") ? this : casper;
    return self.evaluate(function(){
        return window.myAwesomeResponse;
    });
}
// stops waiting if window.myAwesomeResponse is something that evaluates to true
casper.waitFor(getAwesomeResponse, function then(){
    var data = JSON.parse(getAwesomeResponse());
    // Do something with data
});
casper.run();
As described above, I create a proxy for XMLHttpRequest so that every time it is used on the page, I can do something with it. The page that you scrape uses the xhr.onreadystatechange callback to receive data. The proxying is done by defining a specific setter function which writes the received data to window.myAwesomeResponse in the page context. The only thing you need to do is retrieve this text.
JSONP Request
Writing a proxy for JSONP is even easier, if you know the prefix (the function to call with the loaded JSON, e.g. insert({"data":["Some", "JSON", "here"],"id":"asdasda"})). You can overwrite insert in the page context after the page is loaded
casper.start(url).then(function(){
    // In the page: wrap the JSONP callback so its payload is kept on window
    // before the original callback runs.
    this.evaluate(function(){
        var oldInsert = insert;
        insert = function(json){
            window.myAwesomeResponse = json;
            oldInsert.apply(window, arguments);
        };
    });
}).waitFor(getAwesomeResponse, function then(){
    // Call with an explicit `this` so the helper can reach evaluate().
    var raw = getAwesomeResponse.call(this);
    // A JSONP callback receives an already-parsed value; only parse strings.
    var data = typeof raw == "string" ? JSON.parse(raw) : raw;
    // Do something with data
}).run();
or before the request is received (if the function is registered just before the request is invoked)
casper.on("resource.requested", function(resource){
    // filter on the correct call
    if (resource.url.indexOf(".jsonp") != -1) {
        // In the page: wrap the JSONP callback so its payload is kept on
        // window before the original callback runs.
        this.evaluate(function(){
            var oldInsert = insert;
            insert = function(json){
                window.myAwesomeResponse = json;
                oldInsert.apply(window, arguments);
            };
        });
    }
});
// run() is called once, after start(); the listener registration above must
// not call it.
casper.start(url).waitFor(getAwesomeResponse, function then(){
    // Call with an explicit `this` so the helper can reach evaluate().
    var raw = getAwesomeResponse.call(this);
    // A JSONP callback receives an already-parsed value; only parse strings.
    var data = typeof raw == "string" ? JSON.parse(raw) : raw;
    // Do something with data
}).run();