aws-sdk S3: best way to list all keys with listObjectsV2
Building on previous answers, here is an approach that takes advantage of the Prefix
parameter to make multiple calls to s3.listObjectsV2() in parallel.
This has given me a 2-15x speedup, depending on how evenly the keys are distributed across the prefixes and on whether the code runs locally or on AWS.
Make sure the prefixes cover every character your keys can start with: any key whose first character is not in the prefix list is silently skipped. The code below covers all "safe" characters, but S3 supports a wider range of UTF-8 characters.
Note that this example uses async/await, so ES2017/Node 8 or later is required. The example is a Node 8.10 Lambda function.
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
exports.handler = async (event) => {
// Prefixes are used to fetch data in parallel.
const numbers = '0123456789'.split('');
const letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'.split('');
const special = "!-_'.*()".split(''); // "Safe" S3 special chars
const prefixes = [...numbers, ...letters, ...special];
// array of params used to call listObjectsV2 in parallel for each prefix above
const arrayOfParams = prefixes.map((prefix) => {
return { Bucket: 'YOUR-BUCKET-NAME', Prefix: prefix }
});
const allKeys = [];
await Promise.all(arrayOfParams.map(params => getAllKeys(params, allKeys)));
return allKeys.length;
};
/**
 * Lists every key matching `params`, following continuation tokens until the
 * listing is exhausted.
 *
 * @param {object} params - listObjectsV2 request parameters (Bucket, optional
 *   Prefix, ...). The object is left unmodified.
 * @param {string[]} [allKeys=[]] - Array the keys are appended to; pass a
 *   shared array to merge several listings into one result.
 * @returns {Promise<string[]>} The same `allKeys` array, once every page has
 *   been read. Rejects if any listObjectsV2 call fails.
 */
async function getAllKeys(params, allKeys = []) {
  // Page through a private copy so the caller's object never ends up carrying
  // a stale ContinuationToken that would corrupt a later listing.
  const request = Object.assign({}, params);
  let response;
  do {
    response = await s3.listObjectsV2(request).promise();
    // Tolerate a page that carries no Contents list.
    for (const obj of response.Contents || []) {
      allKeys.push(obj.Key);
    }
    request.ContinuationToken = response.NextContinuationToken;
  } while (response.NextContinuationToken);
  return allKeys;
}
Also, for completeness, here is a simpler, non-prefixed async/await version:
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
exports.handler = async (event) => {
const allKeys = await getAllKeys({ Bucket: 'YOUR-BUCKET-NAME' });
return allKeys.length;
};
/**
 * Lists every key matching `params`, following continuation tokens until the
 * listing is exhausted.
 *
 * @param {object} params - listObjectsV2 request parameters (Bucket, optional
 *   Prefix, ...). The object is left unmodified.
 * @param {string[]} [allKeys=[]] - Array the keys are appended to.
 * @returns {Promise<string[]>} The same `allKeys` array, once every page has
 *   been read. Rejects if any listObjectsV2 call fails.
 */
async function getAllKeys(params, allKeys = []) {
  // Page through a private copy so the caller's object never ends up carrying
  // a stale ContinuationToken that would corrupt a later listing.
  const request = Object.assign({}, params);
  let response;
  do {
    response = await s3.listObjectsV2(request).promise();
    // Tolerate a page that carries no Contents list.
    for (const obj of response.Contents || []) {
      allKeys.push(obj.Key);
    }
    request.ContinuationToken = response.NextContinuationToken;
  } while (response.NextContinuationToken);
  return allKeys;
}
Here is the code to get the list of keys from a bucket.
// Request parameters shared with listAllKeys(), which stores the
// continuation token on this object between pages.
const params = { Bucket: 'bucket-name' };

// Filled in asynchronously as each page of results arrives.
const allKeys = [];

listAllKeys();
/**
 * Fetches one page of keys into the outer `allKeys` array and, while the
 * listing is reported as truncated, calls itself again for the next page.
 *
 * Reads the outer `params` object and writes the continuation token onto it.
 * An error is logged and ends the paging; nothing is returned to the caller.
 */
function listAllKeys() {
  s3.listObjectsV2(params, (err, data) => {
    if (err) {
      console.log(err, err.stack); // an error occurred
      return;
    }

    for (const { Key } of data.Contents) {
      allKeys.push(Key);
    }

    if (!data.IsTruncated) {
      return;
    }

    // More pages remain: remember where to resume and go again.
    params.ContinuationToken = data.NextContinuationToken;
    console.log("get further list...");
    listAllKeys();
  });
}
In my opinion, this is the best way to do it:
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
/**
 * Lists every object entry in a bucket, following continuation tokens across
 * pages until the listing is no longer truncated.
 *
 * @param {object} params - listObjectsV2 request parameters (Bucket, ...).
 *   The object is left unmodified.
 * @param {object[]} [out=[]] - Array the `Contents` entries are appended to.
 * @returns {Promise<object[]>} Resolves with `out` once every page has been
 *   read; rejects with the error of the first failing listObjectsV2 call.
 */
const listAllKeys = async (params, out = []) => {
  // Work on a copy: the continuation token is paging state, not something the
  // caller's object should be left holding.
  const request = Object.assign({}, params);
  for (;;) {
    const { Contents, IsTruncated, NextContinuationToken } =
      await s3.listObjectsV2(request).promise();
    // Tolerate a page that carries no Contents list.
    out.push(...(Contents || []));
    if (!IsTruncated) {
      return out;
    }
    request.ContinuationToken = NextContinuationToken;
  }
};
// Run the listing and print the collected entries, or the error if it fails.
(async () => {
  try {
    console.log(await listAllKeys({Bucket: 'bucket-name'}));
  } catch (err) {
    console.log(err);
  }
})();