Сбой mapreduce цепочки с «Не удалось инициализировать все коллекторы» - PullRequest
0 голосов
/ 01 сентября 2018

У меня есть очень компактная программа для объединения задач grep и сортировки, как показано ниже:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;

import java.util.Random;

public class grep {
    public static void main(String[] args) throws Exception {
        if (args.length < 3 || args.length > 4) {
            System.out.println("Grep <in dir><out dir><regex>[<group>]");
            return;
        }
        Configuration conf = new Configuration();
        conf.set(RegexMapper.PATTERN, args[2]);
        if (args.length == 4) {
            conf.set(RegexMapper.GROUP, args[3]);
        }
        Job grepJob = new Job(conf);
        grepJob.setJobName("grep");
        Job sortJob = new Job(conf);
        sortJob.setJobName("sort");
        Path tmp = new Path("grep-temp" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        try {
            FileInputFormat.setInputPaths(grepJob, args[0]);
            grepJob.setMapOutputKeyClass(RegexMapper.class);
            grepJob.setCombinerClass(LongSumReducer.class);
            grepJob.setReducerClass(LongSumReducer.class);

            FileOutputFormat.setOutputPath(grepJob, tmp);
            grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            grepJob.setOutputKeyClass(Text.class);
            grepJob.setOutputValueClass(LongWritable.class);
            grepJob.waitForCompletion(true);

            FileInputFormat.setInputPaths(sortJob, tmp);
            sortJob.setInputFormatClass(SequenceFileInputFormat.class);
            sortJob.setMapperClass(InverseMapper.class);
            sortJob.setNumReduceTasks(1);

            FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
            sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);
            sortJob.waitForCompletion(true);
        } finally {
            FileSystem.get(conf).delete(tmp, true);
        }
    }
}

Скомпилируйте и упакуйте его, запустите, и я получил эту ошибку:

18/09/01 01:58:50 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/09/01 01:58:50 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/09/01 01:58:50 WARN mapreduce.JobResourceUploader: No job jar file set.  User classes may not be found. See Job or Job#setJar(String).
18/09/01 01:58:50 INFO input.FileInputFormat: Total input files to process : 1
18/09/01 01:58:50 INFO mapreduce.JobSubmitter: number of splits:1
18/09/01 01:58:51 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1535418462954_0005
18/09/01 01:58:51 INFO mapred.YARNRunner: Job jar is not present. Not adding any jar to the list of resources.
18/09/01 01:58:51 INFO impl.YarnClientImpl: Submitted application application_1535418462954_0005
18/09/01 01:58:51 INFO mapreduce.Job: The url to track the job: http://ubuntu:8088/proxy/application_1535418462954_0005/
18/09/01 01:58:51 INFO mapreduce.Job: Running job: job_1535418462954_0005
18/09/01 01:58:57 INFO mapreduce.Job: Job job_1535418462954_0005 running in uber mode : false
18/09/01 01:58:57 INFO mapreduce.Job:  map 0% reduce 0%
18/09/01 01:59:00 INFO mapreduce.Job: Task Id : attempt_1535418462954_0005_m_000000_0, Status : FAILED
Error: java.io.IOException: Initialization of all the collectors failed. Error in last collector was :class org.apache.hadoop.mapreduce.lib.map.RegexMapper
    at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:414)
    at org.apache.hadoop.mapred.MapTask.access$100(MapTask.java:81)
    at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:698)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:175)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1840)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:169)
Caused by: java.lang.ClassCastException: class org.apache.hadoop.mapreduce.lib.map.RegexMapper
    at java.lang.Class.asSubclass(Class.java:3218)
    at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:887)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.init(MapTask.java:1004)
    at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:402)
    ... 9 more

Так, где я ошибся в своей программе? Как это исправить?

...