Внутренняя ошибка сервера при запуске кластера emr (выпуск-5.11.0), предоставленная в конфигурации hadoop - PullRequest
0 голосов
/ 05 февраля 2019

Я не могу запустить кластер для выпуска EMR 5.11.0 (версия AWS Java SDK — 1.11.221), если указываю конфигурацию Hadoop.

Однако всякий раз, когда внешняя конфигурация Hadoop не указана (то есть вызов метода .withConfigurations удалён), кластер запускается успешно. Мне же нужно запустить кластер именно с предоставленными конфигурациями.

Код для запуска кластера:

// Build the EMR 5.11.0 cluster launch request with the external Hadoop
// configuration (parsed from the JSON document shown below).
//
// Fixes versus the original snippet:
//  - Removed the invalid `.withSteps("mySteps")` call: RunJobFlowRequest.withSteps
//    expects StepConfig varargs (or a Collection<StepConfig>), not a String, so
//    that call cannot compile.
//  - Even if it had compiled, calling withSteps(...) twice would not append —
//    the second call replaces the first. All steps now go through one call.
RunJobFlowRequest request = new RunJobFlowRequest()
        // Classification/properties list built from the JSON configuration below.
        .withConfigurations(prepareConfigurations(element.getAsJsonObject()))
        .withName("EMR_PROCESSING__20190201")
        .withReleaseLabel("emr-5.11.0")
        .withApplications(new Application().withName("Hadoop"),
                new Application().withName("Ganglia"),
                new Application().withName("Spark"))
        .withLogUri("s3://myS3P")
        .withServiceRole("myDefaultRole")
        .withJobFlowRole("myDefaultRole")
        .withVisibleToAllUsers(true)
        .withSecurityConfiguration("myConfigs")
        // Single withSteps call — repeated calls overwrite, they do not append.
        .withSteps(new StepConfig().withName("Enable debugging")
                .withActionOnFailure(ActionOnFailure.TERMINATE_CLUSTER)
                .withHadoopJarStep(new StepFactory().newEnableDebuggingStep()))
        .withInstances(new JobFlowInstancesConfig()
                .withEc2KeyName("myKeyName")
                .withEc2SubnetId("subnet-**")
                .withInstanceCount(2)
                // Keep the cluster alive after steps finish so it can be reused.
                .withKeepJobFlowAliveWhenNoSteps(true)
                .withMasterInstanceType("r3.xlarge")
                .withSlaveInstanceType("c4.8xlarge"));

RunJobFlowResult result = emr.runJobFlow(request);

System.out.println("Cluster launch ::: " + result.getJobFlowId());

Я использую следующий JSON в качестве конфигурации Hadoop.

[{
    "classification": "core-site",
    "properties": {
        "fs.s3a.access.key": "********",
        "fs.s3.awsAccessKeyId": "********",
        "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "hadoop.proxyuser.mapred.hosts": "*",
        "hadoop.proxyuser.mapred.groups": "*",
        "io.compression.codec.lzo.class": "com.hadoop.compression.lzo.LzoCodec",
        "fs.s3.awsSecretAccessKey": "********",
        "io.compression.codecs": "com.hadoop.compression.lzo.LzoCodec",
        "fs.s3a.buffer.dir": "${hadoop.tmp.dir}/s3a",
        "fs.s3a.secret.key": "********"
    },
    "configurations": []
}, {
    "classification": "mapred-site",
    "properties": {
        "mapreduce.reduce.shuffle.parallelcopies": "20",
        "mapreduce.task.io.sort.mb": "512",
        "mapreduce.tasktracker.reduce.tasks.maximum": "10",
        "mapreduce.map.speculative": "false",
        "mapreduce.output.fileoutputformat.compress": "true",
        "mapreduce.output.fileoutputformat.compress.codec": "com.hadoop.compression.lzo.LzoCodec",
        "mapred.child.java.opts": "-Xmx3500m",
        "mapreduce.job.reduce.slowstart.completedmaps": "0.99",
        "mapreduce.tasktracker.map.tasks.maximum": "13",
        "mapreduce.task.io.sort.factor": "48",
        "mapreduce.reduce.java.opts": "-Xmx4500m",
        "mapreduce.map.memory.mb": "4096",
        "mapreduce.map.output.compress.codec": "com.hadoop.compression.lzo.LzoCodec",
        "mapreduce.job.reduces": "80",
        "yarn.app.mapreduce.am.command-opts": "-Xmx2000m",
        "mapreduce.reduce.memory.mb": "5120",
        "mapreduce.map.java.opts": "-Xmx3800m",
        "mapreduce.reduce.speculative": "false",
        "yarn.app.mapreduce.am.resource.mb": "2048"
    },
    "configurations": []
}, {
    "classification": "yarn-site",
    "properties": {
        "yarn.nodemanager.aux-services": "mapreduce_shuffle,spark_shuffle",
        "yarn.nodemanager.resource.cpu-vcores": "36",
        "yarn.nodemanager.resource.memory-mb": "57344",
        "yarn.application.classpath": "$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*,/data/cascading/lib/*,/usr/lib/hadoop-lzo/lib/*,/usr/share/aws/emr/emrfs/conf,/usr/share/aws/emr/emrfs/lib/*,/usr/share/aws/emr/emrfs/auxlib/*,/usr/share/aws/emr/lib/*,/usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,/usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,/usr/share/aws/emr/kinesis/lib/emr-kinesis-hadoop.jar,/usr/share/aws/emr/cloudwatch-sink/lib/*",
        "yarn.scheduler.maximum-allocation-vcores": "36",
        "yarn.scheduler.maximum-allocation-mb": "57344",
        "yarn.scheduler.minimum-allocation-mb": "512",
        "yarn.nodemanager.aux-services.spark_shuffle.class": "org.apache.spark.network.yarn.YarnShuffleService"
    },
    "configurations": []
}, {
    "classification": "hdfs-site",
    "properties": {
        "dfs.blocksize": "134217728"
    },
    "configurations": []
}, {
    "classification": "capacity-scheduler",
    "properties": {
        "yarn.scheduler.capacity.root.acl_submit_applications": "hadoop,yarn,mapred,hdfs",
        "yarn.scheduler.capacity.root.queues": "default",
        "yarn.scheduler.capacity.root.default.acl_submit_applications": "hadoop,yarn,mapred,hdfs",
        "yarn.scheduler.capacity.root.default.capacity": "100",
        "yarn.scheduler.capacity.root.default.state": "RUNNING"
    },
    "configurations": []
}, {
    "classification": "hadoop-env",
    "properties": {},
    "configurations": [{
        "classification": "export",
        "properties": {
            "HADOOP_CLASSPATH": "\"${HADOOP_CLASSPATH}:/home/hadoop/.driven-plugin/:/data/cascading/lib/*\""
        },
        "configurations": []
    }]
}, {
    "classification": "yarn-env",
    "properties": {},
    "configurations": [{
        "classification": "export",
        "properties": {
            "YARN_USER_CLASSPATH": "\"${YARN_USER_CLASSPATH}:/home/hadoop/.driven-plugin/\""
        },
        "configurations": []
    }]
}, {
    "classification": "spark-defaults",
    "properties": {
        "spark.executor.memory": "8G",
        "spark.driver.memory": "10G",
        "spark.executor.cores": "5",
        "spark.executor.instances": "49"
    },
    "configurations": []
}]

Я использую ту же конфигурацию и код для запуска кластеров EMR ( release-5.0.0 & AWS-JAVA-SDK-1.11.39 ). При обновлении возникают проблемы.

Предоставляю ли я неверную конфигурацию или версию для запуска кластеров 5.11.0, или я пропустил что-то здесь?

...