Не умеет читать файлы .xlsx с использованием библиотеки spark-excel - PullRequest
0 голосов
/ 25 января 2019

Я пытаюсь прочитать файл .xlsx и преобразовать его в фрейм данных, используя spark-excel. но когда я пытаюсь прочитать файл, он выдает java.lang.IllegalArgumentException: InputStream класса класса org.apache.commons.compress.archivers.zip.ZipArchiveInputStream не реализует InputStreamStatistics.

Я создал приложение Scala со следующими зависимостями:

build.sbt:

scalaVersion := "2.11.8"
mainClass in Compile := Some("com.spark.ReadExcel")
val sparkVersion = "2.2.0"

libraryDependencies ++= Seq(
  "org.apache.spark" % "spark-core_2.11" % sparkVersion % "provided",
  "org.apache.spark" % "spark-sql_2.11" % sparkVersion % "provided",
  "org.apache.spark" % "spark-hive_2.11" % sparkVersion % "provided",

  "com.crealytics" %% "spark-excel" % "0.11.0"
)

assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
  case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
  case PathList("org", "apache", xs @ _*) => MergeStrategy.last
  case PathList("com", "google", xs @ _*) => MergeStrategy.last
  case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
  case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
  case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
  case "about.html" => MergeStrategy.rename
  case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
  case "META-INF/mailcap" => MergeStrategy.last
  case "META-INF/mimetypes.default" => MergeStrategy.last
  case "plugin.properties" => MergeStrategy.last
  case "log4j.properties" => MergeStrategy.last
  case "overview.html" => MergeStrategy.rename
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}

ReadExcel.scala

val spark = SparkSession.builder()
   .appName("ExcelProcessing")
   .getOrCreate()

val df = spark.read
  .format("com.crealytics.spark.excel")
  .option("dataAddress", "'Sheet1'!A1")
  .option("useHeader", "true")
  .option("treatEmptyValuesAsNulls", "true")
  .option("inferSchema", "true")
  .option("maxRowsInMemory", 20)
  .option("excerptSize", 10)
  .load("test.xlsx")

println("Printing the values: ")
df.show()

, когда я пытаюсь запустить вышеуказанную программу, она завершается неудачей со следующим исключением:

Exception in thread "main" java.lang.IllegalArgumentException: InputStream of class class org.apache.commons.compress.archivers.zip.ZipArchiveInputStream is not implementing InputStreamStatistics.
        at org.apache.poi.openxml4j.util.ZipArchiveThresholdInputStream.<init>(ZipArchiveThresholdInputStream.java:63)
        at org.apache.poi.openxml4j.opc.internal.ZipHelper.openZipStream(ZipHelper.java:180)
        at org.apache.poi.openxml4j.opc.ZipPackage.<init>(ZipPackage.java:104)
        at org.apache.poi.openxml4j.opc.OPCPackage.open(OPCPackage.java:298)
        at com.monitorjbl.xlsx.impl.StreamingWorkbookReader.init(StreamingWorkbookReader.java:109)
        at com.monitorjbl.xlsx.impl.StreamingWorkbookReader.init(StreamingWorkbookReader.java:91)
        at com.monitorjbl.xlsx.StreamingReader$Builder.open(StreamingReader.java:251)
        at com.crealytics.spark.excel.StreamingWorkbookReader.openWorkbook(WorkbookReader.scala:56)
        at com.crealytics.spark.excel.WorkbookReader$class.withWorkbook(WorkbookReader.scala:14)
        at com.crealytics.spark.excel.StreamingWorkbookReader.withWorkbook(WorkbookReader.scala:47)
        at com.crealytics.spark.excel.ExcelRelation.excerpt$lzycompute(ExcelRelation.scala:31)
        at com.crealytics.spark.excel.ExcelRelation.excerpt(ExcelRelation.scala:31)
        at com.crealytics.spark.excel.ExcelRelation.headerCells$lzycompute(ExcelRelation.scala:33)
        at com.crealytics.spark.excel.ExcelRelation.headerCells(ExcelRelation.scala:33)
        at com.crealytics.spark.excel.ExcelRelation$$anonfun$inferSchema$1.apply(ExcelRelation.scala:148)
        at com.crealytics.spark.excel.ExcelRelation$$anonfun$inferSchema$1.apply(ExcelRelation.scala:147)
        at scala.Option.getOrElse(Option.scala:121)
        at com.crealytics.spark.excel.ExcelRelation.inferSchema(ExcelRelation.scala:147)
        at com.crealytics.spark.excel.ExcelRelation.<init>(ExcelRelation.scala:40)
        at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:40)
        at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:18)
        at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:12)
        at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:341)
        at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
        at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
        at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:174)
        at com.spark.ReadExcel$.main(ReadExcel.scala:36)
        at com.spark.ReadExcel.main(ReadExcel.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
        at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:894)
        at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
        at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
...