Я пытаюсь использовать S3 в качестве глубокого хранилища (deep storage) Druid и получаю следующее исключение:
2020-03-02T11:47:29,971 INFO [Thread-18] org.apache.hadoop.mapred.LocalJobRunner - reduce task executor complete.
2020-03-02T11:47:29,982 WARN [Thread-18] org.apache.hadoop.mapred.LocalJobRunner - job_local404421415_0001
java.lang.Exception: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:489) ~[hadoop-mapreduce-client-common-2.8.5.jar:?]
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:556) [hadoop-mapreduce-client-common-2.8.5.jar:?]
Caused by: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2369) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2793) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2810) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:100) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2849) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2831) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:389) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:356) ~[hadoop-common-2.8.5.jar:?]
at org.apache.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:788) ~[druid-indexing-hadoop-0.17.0.jar:0.17.0]
at org.apache.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:572) ~[druid-indexing-hadoop-0.17.0.jar:0.17.0]
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171) ~[hadoop-mapreduce-client-core-2.8.5.jar:?]
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:627) ~[hadoop-mapreduce-client-core-2.8.5.jar:?]
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389) ~[hadoop-mapreduce-client-core-2.8.5.jar:?]
at org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:346) ~[hadoop-mapreduce-client-common-2.8.5.jar:?]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_131]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_131]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[?:1.8.0_131]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[?:1.8.0_131]
at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_131]
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2273) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2367) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2793) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2810) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:100) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2849) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2831) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:389) ~[hadoop-common-2.8.5.jar:?]
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:356) ~[hadoop-common-2.8.5.jar:?]
at org.apache.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:788) ~[druid-indexing-hadoop-0.17.0.jar:0.17.0]
at org.apache.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:572) ~[druid-indexing-hadoop-0.17.0.jar:0.17.0]
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171) ~[hadoop-mapreduce-client-core-2.8.5.jar:?]
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:627) ~[hadoop-mapreduce-client-core-2.8.5.jar:?]
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389) ~[hadoop-mapreduce-client-core-2.8.5.jar:?]
at org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:346) ~[hadoop-mapreduce-client-common-2.8.5.jar:?]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_131]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_131]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[?:1.8.0_131]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[?:1.8.0_131]
at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_131]
2020-03-02T11:47:30,599 INFO [main] org.apache.hadoop.mapreduce.Job - map 100% reduce 38%
2020-03-02T11:47:30,599 INFO [main] org.apache.hadoop.mapreduce.Job - Job job_local404421415_0001 failed with state FAILED due to: NA
2020-03-02T11:47:30,619 INFO [main] org.apache.hadoop.mapreduce.Job - Counters: 39
Я использую индексирование Hadoop из командной строки, и вот мой JSON:
{
"type":"index_hadoop",
"spec":{
"dataSchema":{
"dataSource":"some_datasource_v1",
"parser":{
"type":"parquet",
"parseSpec":{
"format":"timeAndDims",
"timestampSpec":{
"column":"date_hour",
"format":"yyyy-MM-dd HH:mm:ss"
},
"dimensionsSpec":{
"dimensions":[
"date_hour",
"A",
"B",
"C",
"D"
]
}
}
},
"metricsSpec":[
{
"type":"longSum",
"name":"count_rec",
"fieldName":"count_rec"
}
],
"granularitySpec":{
"type":"uniform",
"segmentGranularity":"hour",
"queryGranularity":"hour",
"intervals":[
"2020-02-10T00:00:00/2020-02-10T01:00:00"
]
}
},
"ioConfig":{
"inputSpec":{
"type":"static",
"paths":"hdfs://SOME_IP/PATH_TO_FILE/part-00000-0a91d78b-109a-4852-9b9e-f2dbe9f147b3-c000.snappy.parquet,SOME OTHE PATHS",
"filter":"part-",
"inputFormat":"org.apache.druid.data.input.parquet.DruidParquetInputFormat"
},
"type":"hadoop",
"metadataUpdateSpec":{
"type":"mysql",
"connectURI":"jdbc:mysql://127.0.0.1:3306/druid?characterEncoding=UTF-8",
"user":"druid",
"password":"diurd",
"segmentTable":"druid_segments"
},
"segmentOutputPath":"s3a://test-bucket/druid_indexed_data/"
},
"hadoopDependencyCoordinates":[
"org.apache.hadoop:hadoop-client:2.7.3",
"org.apache.hadoop:hadoop-aws:2.7.3"
],
"tuningConfig":{
"type":"hadoop",
"workingPath":"/var/druid/hadoop-tmp",
"jobProperties":{
"fs.s3a.connection.ssl.enabled":"true",
"fs.s3a.impl":"org.apache.hadoop.fs.s3a.S3AFileSystem",
"mapreduce.map.memory.mb":"12288",
"mapreduce.reduce.memory.mb":"16384",
"mapreduce.map.java.opts":"-Xmx6144m -XX:+UseG1GC -Dfile.encoding=UTF-8",
"mapreduce.reduce.java.opts":"-Xmx12288m -XX:+UseG1GC -Dfile.encoding=UTF-8",
"mapreduce.job.classloader":"true"
},
"partitionsSpec":{
"type":"hashed",
"numShards":8
},
"indexSpec":{
"bitmap":{
"type":"roaring"
}
},
"maxRowsInMemory":500000,
"leaveIntermediate":false,
"cleanupOnFailure":true,
"overwriteFiles":true,
"ignoreInvalidRows":false,
"combineText":false,
"useCombiner":true
}
}
}
Команда bash, которую я использую для запуска:
cd /opt/apache-druid-0.17.0; java -Xmx512m \
-Daws.region=us-east-1 \
-Ddruid.storage.bucket=test-bucket \
-Ddruid.storage.baseKey=druid_ \
-Ddruid.storage.useS3aSchema=True \
-Ddruid.s3.accessKey=ACCESS_KEY \
-Ddruid.s3.secretKey=SECRET_KEY \
-Ddruid.storage.storageDirectory=s3a://test-bucket/druid_indexed_data/ \
-Ddruid.storage.type=s3 \
-Dfile.encoding=UTF-8 \
-classpath extensions/druid-parquet-extensions/*:extensions/druid-avro-extensions:extensions/druid-hdfs-storage:lib/*:/opt/apache-druid-0.17.0/conf/druid/single-server/micro-quickstart/_common:/usr/local/Cellar/hadoop/3.1.2/bin/hadoop \
org.apache.druid.cli.Main index hadoop /var/log/myusername/druid_index_spec_test_datasource_v1_2020-02-10T000000_1583167637.414025.log
Я использую Druid 0.17. Я читал об этом в разных источниках и думаю, что проблема в том, что при запуске Druid загружает неправильные зависимости Hadoop. Он загружает hadoop-common
и hadoop-client
версии 2.8.5, хотя я указал в hadoopDependencyCoordinates
версию 2.7.3, которая должна решить эту проблему. Есть идеи, что я упускаю?