Получение ошибки при использовании UDF для добавления нового столбца в DF - PullRequest
0 голосов
/ 28 мая 2020

Я пытаюсь получить первую букву из всех значений в столбце «word», но получаю ошибку

> import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import spark.implicits._


// Row types for the two input files; their derived schemas are passed to the CSV readers.

/** One row of docword.txt: three integer fields, in file column order. */
case class Docword(
  docId: Int,
  vocabId: Int,
  count: Int
)

/** One row of vocab.txt: an integer vocabulary id followed by the word text. */
case class VocabWord(
  vocabId: Int,
  word: String
)

// Load both space-delimited text files as typed Datasets.
// Each schema is derived from the matching case class instead of being inferred from the data.
// Method chains keep the trailing-dot form so the snippet can be pasted into spark-shell.
val docwords = {
  val docwordSchema = Encoders.product[Docword].schema
  spark.read.
    schema(docwordSchema).
    option("delimiter", " ").
    csv("hdfs:///user/ashhall1616/bdc_data/t3/docword.txt").
    as[Docword]
}
val vocab = {
  val vocabSchema = Encoders.product[VocabWord].schema
  spark.read.
    schema(vocabSchema).
    option("delimiter", " ").
    csv("hdfs:///user/ashhall1616/bdc_data/t3/vocab.txt").
    as[VocabWord]
}



/**
 * Returns the first character of `x` as a one-character string.
 *
 * Safe for the inputs that made `x.substring(0, 1)` throw:
 *  - an empty string yields `""` (no StringIndexOutOfBoundsException);
 *  - `null` yields `null` (no NullPointerException).
 *
 * @param x the word to take the first letter from; may be null or empty
 * @return the first character of `x`, `""` if `x` is empty, or `null` if `x` is null
 */
def firstletter(x: String): String = {
  // Option(x) maps null to None; take(1) never throws on a short string.
  // orNull is deliberate: this function is used as a Spark UDF, where a null result
  // is how a missing value is passed back to the engine.
  Option(x).map(_.take(1)).orNull
}

// Register `firstletter` as a UDF under the name "firstletter".
// The method is `register` (the original `regster` typo caused the compile error);
// its return value is kept so the UDF can be applied to a Column below.
val firstletterUdf = spark.udf.register[String, String]("firstletter", firstletter(_))

// Join the two datasets on vocabId, keep word/docId/count, and add a "firstletter"
// column computed from "word" by the UDF.
val joinfile = docwords.
  join(vocab, "vocabId").
  select($"word", $"docId", $"count").
  withColumn("firstletter", firstletterUdf($"word"))

// Write the result as Parquet, partitioned by the "firstletter" column,
// replacing any existing output at this path.
joinfile.write.
  mode("overwrite").
  partitionBy("firstletter").
  parquet("file:///home/user204943816622/t3_docword_index_part.parquet")

// Print the first 10 rows for a quick check.
joinfile.show(10)

ОШИБКА:

 val firstletterUdf =spark.udf.regster[String,String]("firstletter", firstletter(_))
<console>:100: error: value regster is not a member of org.apache.spark.sql.UDFRegistration
       val firstletterUdf =spark.udf.regster[String,String]("firstletter", firstletter(_))
                                     ^
scala> val joinfile = docwords.join(vocab, "vocabId").select($"word", $"docId", $"count").withColumn("firstletter", firstletterUdf($"word"))
<console>:106: error: not found: value firstletterUdf
       val joinfile = docwords.join(vocab, "vocabId").select($"word", $"docId", $"count").withColumn("firstletter", firstletterUdf($"word"))

Хочу получить вывод такого вида:

| word | docId | count | firstletter |

| plane | 1 | 1000 | p |

...