I am using Nutch 1.13, Hadoop 2.7.2 and Solr 5.5.0.
My application crawls to a depth of 4.
Most of the time, Tika fails to read the PDF content during the parse phase, and a few seconds later the task dies with a Java heap space error:
2019-03-07 11:19:06,806 WARN [main] FocusedParseUtil: Error parsing
http://irpages2.equitystory.com/download/companies/douglasgmbh/Reports/FY_2014-15_Financial_Report.pdf with org.apache.nutch.parse.tika.TikaParser@4c4d362a
java.util.concurrent.TimeoutException
at java.util.concurrent.FutureTask.get(FutureTask.java:205)
at FocusedParseUtil.runParser(FocusedParseUtil.java:135)
at FocusedParseUtil.parse(FocusedParseUtil.java:109)
at FocusedParseSegment.map(FocusedParseSegment.java:154)
at FocusedParseSegment.map(FocusedParseSegment.java:68)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:453)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
2019-03-07 11:19:09,330 WARN [main] FocusedParseUtil: Unable to successfully parse content http://irpages2.equitystory.com/download/companies/douglasgmbh/Reports/FY_2014-15_Financial_Report.pdf of type application/pdf
2019-03-07 11:19:10,281 WARN [main] FocusedParseSegment: Error parsing: http://irpages2.equitystory.com/download/companies/douglasgmbh/Reports/FY_2014-15_Financial_Report.pdf: failed(2,200): org.apache.nutch.parse.ParseException: Unable to successfully parse content
2019-03-07 11:19:17,669 INFO [main] FocusedParseSegment: Parsed (75513ms):http://irpages2.equitystory.com/download/companies/douglasgmbh/Reports/FY_2014-15_Financial_Report.pdf
2019-03-07 11:19:18,205 INFO [main] FocusedParseSegment: map end
2019-03-07 11:20:16,906 FATAL [main] org.apache.hadoop.mapred.YarnChild: Error running child : java.lang.OutOfMemoryError: Java heap space
at org.apache.nutch.protocol.Content.readFields(Content.java:139)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:71)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:42)
at org.apache.hadoop.io.SequenceFile$Reader.deserializeValue(SequenceFile.java:2322)
at org.apache.hadoop.io.SequenceFile$Reader.getCurrentValue(SequenceFile.java:2295)
at org.apache.hadoop.mapred.SequenceFileRecordReader.getCurrentValue(SequenceFileRecordReader.java:109)
at org.apache.hadoop.mapred.SequenceFileRecordReader.next(SequenceFileRecordReader.java:84)
at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.moveToNext(MapTask.java:199)
at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.next(MapTask.java:185)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:52)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:453)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
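The fatal error is thrown inside org.apache.nutch.protocol.Content.readFields, i.e. while the mapper is deserializing a fetched document from the segment, so a single oversized PDF can exhaust the task heap before Tika even runs. A possible mitigation, sketched here with illustrative values and assuming the Nutch 1.x defaults are otherwise in place, is to cap the fetched content size and give the parser a bounded timeout in nutch-site.xml (http.content.limit and parser.timeout are standard Nutch properties):

<!-- nutch-site.xml: illustrative values, not a verified fix -->
<property>
  <name>http.content.limit</name>
  <!-- truncate fetched documents at ~10 MB; -1 (unlimited) invites heap exhaustion -->
  <value>10485760</value>
</property>
<property>
  <name>parser.timeout</name>
  <!-- allow the parser up to 60 seconds per document instead of the 30-second default -->
  <value>60</value>
</property>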
I launch my jar from Hadoop as shown below:
HADOOP_CLASSPATH=/opt/app/hadoop-2.7.2/share/hadoop/mapreduce/lib/javax.ws.rs-api-2.1.jar \
bin/hadoop jar /opt/app/nutch/Server1/CrawlerRestService-0.1.jar true \
-Dmapreduce.map.java.opts="-Xmx2g" -Dmapreduce.reduce.java.opts="-Xmx2g"
When I run the same jar in standalone mode it works fine, but when I run it on the Hadoop cluster (pseudo-distributed mode) it fails 95% of the time with the error above.
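That difference is consistent with how the heap options are applied: in standalone (local) mode everything runs in the client JVM, while in pseudo-distributed mode each map task gets its own YARN container with the default task heap. With hadoop jar, generic -D options are only honored if the main class goes through ToolRunner, and GenericOptionsParser stops at the first non-option argument, so placed after "true" the two -Xmx2g settings may never reach the tasks. A hedged variant, assuming the main class of CrawlerRestService-0.1.jar does use ToolRunner, moves them before the application argument:

HADOOP_CLASSPATH=/opt/app/hadoop-2.7.2/share/hadoop/mapreduce/lib/javax.ws.rs-api-2.1.jar \
bin/hadoop jar /opt/app/nutch/Server1/CrawlerRestService-0.1.jar \
-Dmapreduce.map.java.opts=-Xmx2g -Dmapreduce.reduce.java.opts=-Xmx2g true

Alternatively, setting the task heap cluster-wide in mapred-site.xml sidesteps the ToolRunner question entirely; mapreduce.map.memory.mb (the container size) must stay comfortably above the -Xmx value:

<property>
  <name>mapreduce.map.java.opts</name>
  <value>-Xmx2048m</value>
</property>
<property>
  <name>mapreduce.map.memory.mb</name>
  <!-- container limit: heap plus room for JVM overhead -->
  <value>2560</value>
</property>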