Я пытаюсь прочитать файл .xlsx и преобразовать его в DataFrame с помощью spark-excel. Но при чтении файла возникает исключение java.lang.IllegalArgumentException: InputStream of class class org.apache.commons.compress.archivers.zip.ZipArchiveInputStream is not implementing InputStreamStatistics.
Я создал приложение Scala со следующими зависимостями:
build.sbt:
// Build definition for the spark-excel reproduction: Scala 2.11.8 compiled against Spark 2.2.0.
scalaVersion := "2.11.8"
// Entry point of the packaged application.
mainClass in Compile := Some("com.spark.ReadExcel")
val sparkVersion = "2.2.0"
libraryDependencies ++= Seq(
// The Spark modules are scoped "provided": they are compiled against but expected to be
// supplied by the runtime environment rather than bundled into the assembly jar.
"org.apache.spark" % "spark-core_2.11" % sparkVersion % "provided",
"org.apache.spark" % "spark-sql_2.11" % sparkVersion % "provided",
"org.apache.spark" % "spark-hive_2.11" % sparkVersion % "provided",
// spark-excel is the only dependency that is not "provided", so it (and whatever it pulls in
// transitively) ends up in the assembly jar.
// NOTE(review): the reported exception is thrown from POI's ZipArchiveThresholdInputStream about
// a commons-compress class; presumably an older commons-compress from the Spark runtime wins over
// the version POI expects — confirm. Shading org.apache.commons.compress in the assembly is a
// commonly used workaround.
"com.crealytics" %% "spark-excel" % "0.11.0"
)
// Conflict resolution for files that occur in more than one jar when building the assembly.
// Cases are tried top to bottom; the first match decides.
assemblyMergeStrategy in assembly := {
// Drops everything under META-INF.
// NOTE(review): because this case comes first, the specific "META-INF/..." cases further down
// (ECLIPSEF.RSA, mailcap, mimetypes.default) look unreachable. It presumably also drops
// META-INF/services registrations — verify that nothing relies on them.
case PathList("META-INF", xs @ _*) => MergeStrategy.discard
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
// Applies to every path under org/apache, which includes org/apache/commons/compress and
// org/apache/poi: on a duplicate class the last copy encountered is kept.
case PathList("org", "apache", xs @ _*) => MergeStrategy.last
case PathList("com", "google", xs @ _*) => MergeStrategy.last
case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
case "about.html" => MergeStrategy.rename
case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
case "META-INF/mailcap" => MergeStrategy.last
case "META-INF/mimetypes.default" => MergeStrategy.last
case "plugin.properties" => MergeStrategy.last
case "log4j.properties" => MergeStrategy.last
case "overview.html" => MergeStrategy.rename
// Anything not matched above falls back to the previously configured strategy.
case x =>
val oldStrategy = (assemblyMergeStrategy in assembly).value
oldStrategy(x)
}
ReadExcel.scala
// Create (or reuse) the SparkSession. No master is configured here; presumably it is supplied by
// spark-submit, which appears at the bottom of the reported stack trace.
val spark = SparkSession.builder()
.appName("ExcelProcessing")
.getOrCreate()
// Read test.xlsx through the spark-excel data source, addressed by its full format name.
// Per the reported stack trace, the exception is raised inside load(): building the relation
// runs schema inference, which opens the workbook through the streaming reader.
val df = spark.read
.format("com.crealytics.spark.excel")
// Presumably: start reading at cell A1 of the sheet named Sheet1 — confirm against spark-excel docs.
.option("dataAddress", "'Sheet1'!A1")
.option("useHeader", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("inferSchema", "true")
// NOTE(review): setting maxRowsInMemory presumably selects the streaming reader seen in the
// stack trace (StreamingWorkbookReader) — confirm against spark-excel docs.
.option("maxRowsInMemory", 20)
.option("excerptSize", 10)
.load("test.xlsx")
println("Printing the values: ")
// Prints the leading rows of the DataFrame to stdout.
df.show()
Когда я пытаюсь запустить эту программу, она завершается со следующим исключением:
Exception in thread "main" java.lang.IllegalArgumentException: InputStream of class class org.apache.commons.compress.archivers.zip.ZipArchiveInputStream is not implementing InputStreamStatistics.
at org.apache.poi.openxml4j.util.ZipArchiveThresholdInputStream.<init>(ZipArchiveThresholdInputStream.java:63)
at org.apache.poi.openxml4j.opc.internal.ZipHelper.openZipStream(ZipHelper.java:180)
at org.apache.poi.openxml4j.opc.ZipPackage.<init>(ZipPackage.java:104)
at org.apache.poi.openxml4j.opc.OPCPackage.open(OPCPackage.java:298)
at com.monitorjbl.xlsx.impl.StreamingWorkbookReader.init(StreamingWorkbookReader.java:109)
at com.monitorjbl.xlsx.impl.StreamingWorkbookReader.init(StreamingWorkbookReader.java:91)
at com.monitorjbl.xlsx.StreamingReader$Builder.open(StreamingReader.java:251)
at com.crealytics.spark.excel.StreamingWorkbookReader.openWorkbook(WorkbookReader.scala:56)
at com.crealytics.spark.excel.WorkbookReader$class.withWorkbook(WorkbookReader.scala:14)
at com.crealytics.spark.excel.StreamingWorkbookReader.withWorkbook(WorkbookReader.scala:47)
at com.crealytics.spark.excel.ExcelRelation.excerpt$lzycompute(ExcelRelation.scala:31)
at com.crealytics.spark.excel.ExcelRelation.excerpt(ExcelRelation.scala:31)
at com.crealytics.spark.excel.ExcelRelation.headerCells$lzycompute(ExcelRelation.scala:33)
at com.crealytics.spark.excel.ExcelRelation.headerCells(ExcelRelation.scala:33)
at com.crealytics.spark.excel.ExcelRelation$$anonfun$inferSchema$1.apply(ExcelRelation.scala:148)
at com.crealytics.spark.excel.ExcelRelation$$anonfun$inferSchema$1.apply(ExcelRelation.scala:147)
at scala.Option.getOrElse(Option.scala:121)
at com.crealytics.spark.excel.ExcelRelation.inferSchema(ExcelRelation.scala:147)
at com.crealytics.spark.excel.ExcelRelation.<init>(ExcelRelation.scala:40)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:40)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:18)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:12)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:341)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:174)
at com.spark.ReadExcel$.main(ReadExcel.scala:36)
at com.spark.ReadExcel.main(ReadExcel.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:894)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)