I did something similar to this using got.
I had to adapt the code, so it's untested, but this general approach worked for me. I haven't used etl, so I'm not sure how it handles chunks of files rather than whole files (one way around that is sketched after the code).
const aws = require("aws-sdk");
const s3 = new aws.S3();
const got = require('got');
const etl = require("etl");
const getAllFilesAsUrls = async (bucket_name, folder_name) => {
    const listParams = {
        Bucket: bucket_name,
        // Prefix, rather than Delimiter + StartAfter, so keys inside the folder land in Contents
        Prefix: `${folder_name}/`
    };
    const data = await s3.listObjectsV2(listParams).promise();
    const keys = data.Contents.map(object => object.Key);
    const urlsArray = [];
    for (let key of keys) {
        const params = { Bucket: bucket_name, Key: key };
        // getSignedUrl has no .promise(); getSignedUrlPromise is the awaitable variant
        let url = await s3.getSignedUrlPromise('getObject', params);
        urlsArray.push(url);
    }
    return urlsArray;
}
const workEachFileUrl = (url) => {
    return new Promise((resolve, reject) => {
        // if you're looking to limit the amount of data transferred, asking for gzip is a good way of doing it
        const gotstream = got.stream(url, { headers: { "accept-encoding": "gzip" } })
            .on('data', (chunk) => {
                // pause the stream as soon as we get the first chunk
                // (or call gotstream.destroy() if you want to drop the connection entirely)
                gotstream.pause();
                // do your work with the chunk here; as long as etl can handle partial files,
                // resolve with the first few parsed lines, otherwise fall back to the
                // 'response' event and process the whole body as before
                const parsedChunk = chunk.toString(); // placeholder: swap in your real parsing
                resolve(parsedChunk);
            })
            .on("error", (err) => {
                console.log(err);
                reject(err);
            });
    });
}
const runOperation = async (bucket_name, folder_name) => {
    const records = [];
    const urls = await getAllFilesAsUrls(bucket_name, folder_name);
    for (let url of urls) {
        let record = await workEachFileUrl(url);
        records.push(record);
    }
    return records;
}
// top-level await isn't available in a plain CommonJS script, so consume the promise instead
runOperation(bucket_name, folder_name).then(completedRecords => console.log(completedRecords));
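
On the etl point above: one way to avoid relying on etl coping with a truncated file is to trim the first chunk down to complete lines before parsing it. The sketch below assumes the objects are CSV with a header row and that losing the trailing, possibly incomplete line is acceptable; parseFirstChunk is a hypothetical helper name, not something the original code defines.

const { Readable } = require("stream");
const etl = require("etl");

const parseFirstChunk = (chunk) => {
    return new Promise((resolve, reject) => {
        const rows = [];
        // keep only whole lines so the parser never sees a half-written record
        const text = chunk.toString().split("\n").slice(0, -1).join("\n");
        Readable.from([Buffer.from(text)])
            .pipe(etl.csv()) // etl's CSV parser emits one object per row
            .on("data", (row) => rows.push(row))
            .on("end", () => resolve(rows))
            .on("error", reject);
    });
};

Inside the 'data' handler of workEachFileUrl you could then resolve(parseFirstChunk(chunk)) instead of resolving with the raw text; resolving with a promise is fine, the caller's await unwraps it.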