Я пишу Map Reduce Program для следующей задачи:
Поскольку в файле есть пустые строки и заголовок, код завершается с Nullexecptionerror
. Можете ли вы проверить мой код и сообщить мне, что является ошибкой
Извлеките запись, имеющую VendorID
как '2' И tpep_pickup_datetime
как '2017-10-01 00:15:30' И tpep_dropoff_datetime
как '2017-10-01 00:25:11' AND
passenger_count
как '1' AND trip_distance as
'2.17'
Отфильтровать все записи, имеющие RatecodeID
как 4.
Группировать по всем записям, основанным на типе платежа и найти количество для каждой группы. Сортировать типы платежей в порядке возрастания их количества.
Пример данных:
endorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
1,2017-07-01 00:06:25,2017-07-01 00:10:50,1,1.20,1,N,249,90,1,5.5,0.5,0.5,1.35,0,0.3,8.15
1,2017-07-01 00:20:04,2017-07-01 00:21:38,2,.20,1,N,249,158,2,3,0.5,0.5,0,0,0.3,4.3
1,2017-07-01 00:44:10,2017-07-01 00:59:29,1,4.30,1,N,100,45,1,15.5,0.5,0.5,3.35,0,0.3,20.15
1,2017-07-01 00:07:33,2017-07-01 00:31:30,1,8.30,1,N,138,162,1,27,0.5,0.5,6.8,5.76,0.3,40.86
1,2017-07-01 00:01:17,2017-07-01 00:16:18,1,1.90,1,N,107,158,1,10.5,0.5,0.5,0,0,0.3,11.8
1,2017-07-01 00:23:32,2017-07-01 00:43:23,2,4.70,1,N,158,263,1,17,0.5,0.5,3,0,0.3,21.3
1,2017-07-01 00:30:49,2017-07-01 00:46:56,3,2.30,1,N,246,162,1,12,0.5,0.5,2.65,0,0.3,15.95
1,2017-07-01 00:04:21,2017-07-01 00:25:37,1,5.40,1,N,17,90,1,19,0.5,0.5,2,0,0.3,22.3
1,2017-07-01 00:12:29,2017-07-01 00:21:54,2,1.60,1,N,170,237,2,8,0.5,0.5,0,0,0.3,9.3
1,2017-07-01 00:40:17,2017-07-01 00:56:21,3,2.60,1,N,163,90,1,12,0.5,0.5,3.3,0,0.3,16.6
1,2017-07-01 00:24:07,2017-07-01 00:28:31,1,1.10,1,N,142,143,1,5.5,0.5,0.5,1,0,0.3,7.8
1,2017-07-01 00:48:44,2017-07-01 01:13:48,1,5.90,1,N,137,17,1,21.5,0.5,0.5,4.55,0,0.3,27.3
public class SparkMapper extends Mapper<LongWritable, Text, Text, Text> {
public static List <String> VendorID = Arrays.asList( "1","2");
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// String data = value.toString();
int len = value.getLength();
if (value != null && len > 0)
{ String[] fields = value.toString().split(",");
String fields2 = null;
if (fields[0] == "2" && fields[1] == "2017-10-01 00:15:30" && fields[2] == "2017-10-01 00:25:11"
&& fields[3] == "1" && fields[4] == "2.17")
{
fields2= fields[0] + fields[1] +fields[2] + fields[3] + fields[4];
}
context.write(new Text(fields2), null);//, new Text(value));
}
}
}
public class SparkDriver extends Configured implements Tool {
public static void main(String[] args) throws Exception {
int returnStatus = ToolRunner.run(new Configuration(), new SparkDriver(), args);
System.exit(returnStatus);
}
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
Job job = new Job(getConf());
job.setJobName("filter");
job.setJarByClass(SparkDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(SparkMapper.class);
// job.setReducerClass(SparkReducer.class);
// job.setPartitionerClass(SparkPartitioner.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
try {
return job.waitForCompletion(true) ? 0 : 1;
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return 0;
}
}
Поскольку файл имеет пустые строки и заголовок, код завершается с помощью Nullexecptionerror
Я хочу удалить заголовок и пробелы при успешном запуске.