In one of my use cases, we developed ML code in Python that takes a 16 MB input, runs a training algorithm, and generates a 1.6 GB ML model. The code works as expected on my laptop. But when we move it to our lab infrastructure, it fails. The infrastructure stores the input data in HDFS (actually in S3), and when the ML step runs, the data is moved from S3 to the machine where the execution happens.
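For context, the write path looks roughly like the sketch below. This is a simplified, hypothetical reconstruction rather than our exact code: the dataset paths, the file names, and the train_model() helper are placeholders, and it assumes the standard transforms.api decorator together with the filesystem() file-access API.

```python
from transforms.api import transform, Input, Output


def train_model(raw_bytes):
    # Placeholder for the actual training step, which turns the
    # ~16 MB input into a ~1.6 GB serialized model.
    return b"..."


@transform(
    model_output=Output("/Lab/models/trained_model"),   # hypothetical path
    training_input=Input("/Lab/data/training_input"),   # hypothetical path
)
def compute(model_output, training_input):
    # Read the input file from the dataset's backing filesystem.
    with training_input.filesystem().open("input.csv", "rb") as f:
        raw_bytes = f.read()

    model_bytes = train_model(raw_bytes)

    # Write the serialized model back out in chunks. This is the write
    # that ends up on S3 via the S3A filesystem shown in the trace below.
    chunk = 8 * 1024 * 1024
    with model_output.filesystem().open("model.bin", "wb") as out:
        for start in range(0, len(model_bytes), chunk):
            out.write(model_bytes[start:start + chunk])
```

I see this error at that stage:

```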
INFO [2018-11-29T10:38:47.487Z] transforms.foundry._fsreader: Creating filesystem for rid ri.foundry.main.dataset.b9f3d045-1ae5-4ec9-9e35-41ab6d137e5a, None to ri.foundry.main.transaction.0000001a-86bf-c1ef-8563-2811067ebd2a
ERROR [2018-11-29T10:38:48.14Z] org.apache.hadoop.fs.s3a.S3AFileSystem: Partial failure of delete, {} errors {"0":3,"throwable0_message":"One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 23761947DAADD329; S3 Extended Request ID: 8uFmJygkfRdj/EBixq+TJ+AtUsDRFvotMKYT1aCv+ZG4o26RpQOfzlxnB60K/XYKdKCQAsAhW/k=)"}
com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 23761947DAADD329; S3 Extended Request ID: 8uFmJygkfRdj/EBixq+TJ+AtUsDRFvotMKYT1aCv+ZG4o26RpQOfzlxnB60K/XYKdKCQAsAhW/k=)
at com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:2126)
at org.apache.hadoop.fs.s3a.S3AFileSystem.deleteObjects(S3AFileSystem.java:1122)
at org.apache.hadoop.fs.s3a.S3AFileSystem.removeKeys(S3AFileSystem.java:1373)
at org.apache.hadoop.fs.s3a.S3AFileSystem.deleteUnnecessaryFakeDirectories(S3AFileSystem.java:2272)
at org.apache.hadoop.fs.s3a.S3AFileSystem.finishedWrite(S3AFileSystem.java:2238)
at org.apache.hadoop.fs.s3a.S3AFileSystem$WriteOperationHelper.writeSuccessful(S3AFileSystem.java:2736)
at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.close(S3ABlockOutputStream.java:371)
at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72)
at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)
at com.palantir.crypto2.hadoop.FileKeyStorageStrategy.put(FileKeyStorageStrategy.java:65)
at com.palantir.crypto2.keys.ChainedKeyStorageStrategy.put(ChainedKeyStorageStrategy.java:51)
at com.palantir.crypto2.hadoop.EncryptedFileSystem.encrypt(EncryptedFileSystem.java:114)
at com.palantir.crypto2.hadoop.EncryptedFileSystem.create(EncryptedFileSystem.java:94)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1067)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1048)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:937)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:925)
at com.palantir.foundry.fs.FileSystemManager.write(FileSystemManager.java:382)
at com.palantir.foundry.fs.Routers.openFile(Routers.java:358)
at com.palantir.foundry.fs.Routers.lambda$fileWritingRouter$16(Routers.java:341)
at com.palantir.foundry.fs.PathRouter.handle(PathRouter.java:74)
at com.palantir.foundry.fs.FoundryFileSystem.create(FoundryFileSystem.java:183)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1067)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1048)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:937)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:925)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at com.palantir.foundry.spark.api.SparkAuthorization.runAsUserInternal(SparkAuthorization.java:152)
at com.palantir.foundry.spark.api.SparkAuthorization.runAsUser(SparkAuthorization.java:94)
at com.palantir.transforms.lang.python.module.runner.AuthProvidingGatewayServer.lambda$createConnection$0(AuthProvidingGatewayServer.java:44)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at com.palantir.tracing.DeferredTracer.withTrace(DeferredTracer.java:53)
at com.palantir.tracing.Tracers$TracingAwareCallable.call(Tracers.java:219)
at com.palantir.tracing.WrappingExecutorService.lambda$wrapTask$0(WrappingExecutorService.java:68)
at com.codahale.metrics.InstrumentedExecutorService$InstrumentedRunnable.run(InstrumentedExecutorService.java:176)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
ERROR [2018-11-29T10:38:48.14Z] org.apache.hadoop.fs.s3a.S3AFileSystem: {}: "{}" - {} {"0":"foundry-catalog/b9f3d045-1ae5-4ec9-9e35-41ab6d137e5a/0000001a-86bf-c1ef-8563-2811067ebd2a/","1":"AccessDenied","2":"Access Denied"}
ERROR [2018-11-29T10:38:48.14Z] org.apache.hadoop.fs.s3a.S3AFileSystem: {}: "{}" - {} {"0":"foundry-catalog/b9f3d045-1ae5-4ec9-9e35-41ab6d137e5a/","1":"AccessDenied","2":"Access Denied"}
ERROR [2018-11-29T10:38:48.14Z] org.apache.hadoop.fs.s3a.S3AFileSystem: {}: "{}" - {} {"0":"foundry-catalog/","1":"AccessDenied","2":"Access Denied"}
INFO [2018-11-29T10:38:48.16Z] transforms.foundry._foundryfs: Copied 240438 bytes
ERROR [2018-11-29T10:38:48.263Z] org.apache.hadoop.fs.s3a.S3AFileSystem: Partial failure of delete, {} errors {"0":3,"throwable0_message":"One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 73EBD93785D770B5; S3 Extended Request ID: NGz0gGaRWA/YgYRkHji5+dYGX1yXZhWEq38UUHy/SpAIRSfzDcRKEiKdSGdLDuPCJOLgCZnzCG0=)"}
com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 73EBD93785D770B5; S3 Extended Request ID: NGz0gGaRWA/YgYRkHji5+dYGX1yXZhWEq38UUHy/SpAIRSfzDcRKEiKdSGdLDuPCJOLgCZnzCG0=)
at com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:2126)
```
As a result, the written ML model is only a few hundred KB (around 200 KB) instead of the expected 1.6 GB. Can anyone help me figure out what the problem might be and how to diagnose it further?
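One way I can think of to narrow this down is to check the model size at each hop: on the executor's local disk right after training, and in the output dataset right after the write. The sketch below is illustrative only: check_sizes() and the local path are hypothetical names, and it assumes the transforms FileSystem exposes an ls() listing whose entries carry path and size.

```python
import logging
import os

log = logging.getLogger(__name__)


def check_sizes(model_output, local_model_path="/tmp/model.bin"):
    """Log the model size locally and in the output dataset.

    local_model_path is a placeholder for wherever training
    serialized the model before the upload step.
    """
    # 1. Size on local disk: should be ~1.6 GB if training succeeded.
    log.info("local model: %d bytes", os.path.getsize(local_model_path))

    # 2. Size as seen by the output dataset's filesystem after the write;
    #    assumes ls() yields entries with .path and .size attributes.
    for status in model_output.filesystem().ls():
        log.info("dataset file %s: %d bytes", status.path, status.size)
```

If the file is already ~200 KB on local disk, the truncation happens during training or serialization; if it is 1.6 GB locally but tiny in the dataset, the S3A upload is the place to dig. From the trace itself, the MultiObjectDeleteException is raised inside deleteUnnecessaryFakeDirectories after finishedWrite, and the AccessDenied errors are for the parent prefixes (foundry-catalog/...), so I am not sure whether the failed delete actually truncates the write or is just noise from missing delete permissions on those prefixes.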