I have a nightly update process that pulls documents out of an Amazon S3 bucket, matches them against a list of contracts, and saves the byte data. I would like it to fetch and process one file at a time instead of building up a list. The problem is memory: if there were a lot of documents that night, the process fails because the list grows too large.
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.*;
import com.amazonaws.util.IOUtils;
import com.duke.contractsystembackend.models.ContractDocument;
import com.duke.contractsystembackend.models.S3ContractDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class S3Reader {
@Autowired
S3Service s3Service;
private String bucketName;
private Regions region = Regions.US_EAST_1;
private final AmazonS3 s3;
private static final Logger log = LoggerFactory.getLogger(S3Reader.class);
/**
* @param bucketName the S3 bucket to read from
*/
public S3Reader(String bucketName) {
this.bucketName = bucketName;
this.s3 = AmazonS3ClientBuilder.standard().withRegion(this.region).build();
}
/**
* @param bucketName the S3 bucket to read from
* @param s3 a pre-built S3 client
*/
public S3Reader(String bucketName, AmazonS3 s3) {
this.bucketName = bucketName;
this.s3 = s3;
}
/**
* Looks up each contract document under the given folder prefixes and collects the matches.
* @param buckets folder prefixes inside the bucket to search
* @param contractDocumentList contract documents that still need file data
* @param s3 S3 client used to list and fetch objects
* @return the matched documents with their byte data loaded
*/
public List<S3ContractDocument> traverseBucket(String[] buckets, List<ContractDocument> contractDocumentList, AmazonS3 s3) {
List<S3ContractDocument> list = new ArrayList<>();
contractDocumentList.forEach(cd -> {
for (String bucket : buckets) {
String key = String.format("%s/%s", bucket, cd.getFileName());
log.info("- Looking for prefix: " + key);
ListObjectsV2Request req = new ListObjectsV2Request()
.withBucketName(bucketName)
.withPrefix(key)
.withMaxKeys(buckets.length);
ListObjectsV2Result result = s3.listObjectsV2(req);
List<S3ObjectSummary> objects = result.getObjectSummaries();
for (S3ObjectSummary os : objects) {
log.info("* " + os.getKey());
try {
S3Object object = s3.getObject(new GetObjectRequest(bucketName, key));
S3ContractDocument di = new S3ContractDocument();
cd.setFileData(getObjectContentAsByte(object));
di.setKey(os.getKey());
di.setBucketName(bucketName);
di.setContractDocument(cd);
list.add(di);
} catch (Exception e) {
log.error("Error trying to get bytes from s3's inputStream: " + e.getMessage());
e.printStackTrace();
continue;
}
}
}
});
return list;
}
/**
* Convert the object's content to a byte array
* @param s3Object the S3 object whose content is read
* @return the content as bytes
* @throws IOException if reading the stream fails
*/
protected byte[] getObjectContentAsByte(S3Object s3Object) throws IOException {
return IOUtils.toByteArray(s3Object.getObjectContent());
}
}
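This is roughly the direction I have in mind: instead of accumulating everything into a list, the reader could take a callback and hand each document off as soon as its bytes are read, so only one file's data is held in memory at a time. A rough sketch of what I mean (the Consumer parameter and the processBucket name are my own invention, and it would also need an import for java.util.function.Consumer):

// Sketch only: same lookup as traverseBucket, but each matched document is handed
// to the consumer right away instead of being collected into a list.
public void processBucket(String[] folders, List<ContractDocument> contractDocumentList,
                          AmazonS3 s3, Consumer<S3ContractDocument> consumer) {
    for (ContractDocument cd : contractDocumentList) {
        for (String folder : folders) {
            String key = String.format("%s/%s", folder, cd.getFileName());
            log.info("- Looking for prefix: {}", key);
            ListObjectsV2Result result = s3.listObjectsV2(new ListObjectsV2Request()
                    .withBucketName(bucketName)
                    .withPrefix(key));
            for (S3ObjectSummary os : result.getObjectSummaries()) {
                try (S3Object object = s3.getObject(new GetObjectRequest(bucketName, os.getKey()))) {
                    S3ContractDocument di = new S3ContractDocument();
                    cd.setFileData(getObjectContentAsByte(object));
                    di.setKey(os.getKey());
                    di.setBucketName(bucketName);
                    di.setContractDocument(cd);
                    consumer.accept(di);  // process this one document now
                    cd.setFileData(null); // drop the bytes before moving on to the next file
                } catch (Exception e) {
                    log.error("Error trying to get bytes from s3's inputStream", e);
                }
            }
        }
    }
}

Here is the scheduled job that currently builds the whole list: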
import com.amazonaws.services.s3.AmazonS3;
import com.duke.contractsystembackend.exceptions.FileStorageException;
import com.duke.contractsystembackend.models.ContractDocument;
import com.duke.contractsystembackend.models.S3ContractDocument;
import com.duke.contractsystembackend.services.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
@Slf4j
@Component
public class NightlyMaximoUpdate {
@Autowired
S3Service s3Service;
@Autowired
ContractDocumentStorageService contractDocumentStorageService;
@Autowired
OCRSubmissionService ocrSubmissionService;
@Autowired
ContractService contractService;
@Value("${S3_BUCKET_NAME}")
protected String BUCKET_NAME;
@Value("${SEND_BATCH:TRUE}")
protected String shouldSend;
@Transactional
@Scheduled(cron = "${batch.job.cron}")
public void nightlyMaximoTransfer(){
// TODO: detect which environment we're in - e.g. trigger the CDC job at 11 and the ADC job at 3 (or something along those lines).
if (System.getenv("application_env_json").contains("cdc.duke-energy.com")) {
// not implemented yet
}
if ("TRUE".equals(this.shouldSend)) {
AmazonS3 s3Client = s3Service.getS3Client();
List<ContractDocument> contractDocumentList = contractService.getAllByFileDataIsNull();
System.out.println("returned " + contractDocumentList.size() + " elements");
contractDocumentList.forEach(doc -> System.out.println(doc.getId()));
// Folder prefixes inside the bucket to read from
String[] folders = new String[]{
"sc_contracts",
"attachments",
"diagrams",
"sc_purchasing",
};
// Initialize S3 reader
S3Reader reader = new S3Reader(BUCKET_NAME, s3Service.getS3Client());
List<S3ContractDocument> s3ContractDocuments = reader.traverseBucket(folders, contractDocumentList, s3Client);
System.out.println("Found " + s3ContractDocuments.size() + " elements");
// Diffs existing between SSMS and S3
s3Service.diffList(contractDocumentList, s3ContractDocuments);
for (S3ContractDocument s3cd : s3ContractDocuments) {
ContractDocument newDocument;
try {
newDocument = contractDocumentStorageService.storeS3FIle(s3cd.getContractDocument().getFileData(),
s3cd.getContractDocument().getId());
} catch (FileStorageException e) {
// Skip this document; otherwise newDocument would be null and the code below would throw an NPE.
log.error("Failed to store file for contract document {}", s3cd.getContractDocument().getId(), e);
continue;
}
boolean success;
if (newDocument.getFileType().equals("application/pdf")) {
success = this.ocrSubmissionService.SubmitContractForOCR(s3cd.getContractDocument().getFileData(), newDocument.getId());
} else {
success = true;
}
if (!success) {
log.info("OCR not successful {}", newDocument.getId());
} else {
log.info("OCR successful {}", newDocument.getId());
}
}
}
}
}
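On the job side, the body of the for loop above could become the callback, so each document is stored and sent to OCR before the next one is fetched. Again this is just a sketch of what I am after, using the hypothetical processBucket from above, and I have not figured out what to do about the diffList call, which seems to need the complete set of matches:

S3Reader reader = new S3Reader(BUCKET_NAME, s3Service.getS3Client());
reader.processBucket(folders, contractDocumentList, s3Client, s3cd -> {
    try {
        // Store this one document and submit it for OCR, then let it go out of scope
        // before the reader fetches the next file.
        ContractDocument newDocument = contractDocumentStorageService.storeS3FIle(
                s3cd.getContractDocument().getFileData(), s3cd.getContractDocument().getId());
        if (newDocument.getFileType().equals("application/pdf")) {
            ocrSubmissionService.SubmitContractForOCR(s3cd.getContractDocument().getFileData(), newDocument.getId());
        }
    } catch (FileStorageException e) {
        log.error("Failed to store file for contract document {}", s3cd.getContractDocument().getId(), e);
    }
});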