How can I scrape sites that require authentication using node.js?
You can use superagent:

var superagent = require('superagent');
var agent = superagent.agent();

agent is then a persistent browser, which will handle getting and setting cookies, referers, etc. Just call agent.get(...), agent.post(...) as normal.
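A minimal sketch of a login flow with such an agent (the URL and the username/password field names are placeholders you would adapt to the real form):

var superagent = require('superagent');
var agent = superagent.agent();

// Log in once; the agent stores the session cookie it receives.
agent
  .post('https://example.com/login')
  .type('form')
  .send({ username: 'user', password: 'secret' })
  .then(function () {
    // Subsequent requests reuse the stored cookie automatically.
    return agent.get('https://example.com/members-only');
  })
  .then(function (res) {
    console.log(res.text); // HTML of the authenticated page
  })
  .catch(function (err) {
    console.error(err);
  });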
If you use Mikeal's Request library, you need to enable cookie support like this:

var request = require('request').defaults({ jar: true });
First create a username on that site (manually), then pass the username and password as params when making the POST request to that site. The server will respond with a cookie, which Request will remember, so you will be able to access the pages that require you to be logged in.

Note: this approach doesn't work if something like reCAPTCHA is used on the login page.
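A minimal sketch of that flow, assuming a form login at a hypothetical /login endpoint whose field names are username and password:

var request = require('request').defaults({ jar: true }); // enable the cookie jar

// Log in; the URL and field names are placeholders for the real form.
request.post({
  url: 'https://example.com/login',
  form: { username: 'user', password: 'secret' }
}, function (err, res, body) {
  if (err) return console.error(err);
  // The session cookie is now in the jar, so this request is authenticated.
  request.get('https://example.com/profile', function (err, res, body) {
    if (err) return console.error(err);
    console.log(body); // HTML of the logged-in page
  });
});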
You can also scrape data from sites whose authentication involves a CSRF token. Use a cookie jar for every request, like this:

var j = request.jar(); // create a cookie jar for session and cookie persistence
request = request.defaults({ jar: j }); // make request use this jar by default

Here is a small example to elaborate further:
var express = require('express');
var bodyParser = require('body-parser');
var querystring = require('querystring');
var request = require('request'); // npm request package, used to send GET and POST requests
const cheerio = require('cheerio'); // npm package used for scraping content from third-party sites
var cookieParser = require('cookie-parser');
var http = require('http');

var app = express();
app.use(cookieParser());

var _csrf; // variable to store the _csrf value to be used later
app.use(bodyParser.json());

var html = '';
var j = request.jar(); // create a cookie jar for session and cookie persistence
request = request.defaults({ jar: j }); // make request use this jar by default

//___________________API CALL TO VERIFY THE GMS NUMBER_______________________
app.get('/check', function(req, response) {
  var schemeId = null;
  if (req.query.schemeId) {
    schemeId = req.query.schemeId;
    console.log(schemeId);
  } else {
    response.send('false');
    response.end();
    return; // stop here, otherwise we would try to send a second response below
  }
  getCsrfValue(function(err, res) {
    if (!err) {
      _csrf = res;
      console.log(_csrf);
      request.post({
        headers: {
          'Authorization': '',
          'Content-Type': 'application/x-www-form-urlencoded',
        },
        uri: 'https://www.xyz.site',
        body: 'schemeId=' + schemeId + '&_csrf=' + _csrf
      }, function(err, res, body) {
        if (err) {
          console.log(err);
        } else {
          console.log('body of post: ' + res.body);
          const $ = cheerio.load(body.toString());
          var txt = $('.schemeCheckResult').text();
          console.log(txt);
          if (txt) {
            response.send('true');
          } else {
            response.send('false');
          }
          html += body;
        }
      });
    } else {
      response.send(err);
    }
  });
});

//______________FUNCTION TO SCRAPE THE CSRF TOKEN FROM THE SITE____________
function getCsrfValue(callback) {
  request.get({
    headers: {
      'Authorization': '',
      'Content-Type': 'application/x-www-form-urlencoded',
    },
    uri: 'https://www.xyz.site'
  }, function(err, res, body) {
    if (err) {
      return callback(err);
    } else {
      const $ = cheerio.load(body.toString());
      var txt = $('input[name=_csrf]').val(); // read the hidden CSRF field from the login form
      _csrf = txt;
      return callback(null, _csrf);
    }
  });
}

module.exports = app;
I've been working with Node.js scrapers for more than 2 years now, and I can tell you that the best choice when dealing with logins and authentication is NOT to craft raw requests directly. You just waste time building the requests by hand, and it is much slower to develop. Instead, use a high-level browser that you control via an API, like Puppeteer or NightmareJs. I have a good starter and in-depth guide on how to start scraping with Puppeteer; I'm sure it will help!
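As an illustration, a minimal Puppeteer login sketch (the URL and the #username / #password / button[type=submit] selectors are placeholders you would adapt to the real page):

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Fill in and submit the login form; selectors are placeholders.
  await page.goto('https://example.com/login');
  await page.type('#username', 'user');
  await page.type('#password', 'secret');
  await Promise.all([
    page.waitForNavigation(), // wait for the post-login redirect
    page.click('button[type=submit]'),
  ]);

  // The browser session is now authenticated, so protected pages load normally.
  await page.goto('https://example.com/members-only');
  console.log(await page.content()); // HTML of the authenticated page

  await browser.close();
})();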