In the example below I put together sample data based on the information you provided, implemented a UDF, registered it with the SparkSession, and used that UDF in Spark SQL. You can also post your code so we can work out what is missing.
import java.sql.Timestamp

import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
  val spark: SparkSession = SparkSession.builder()
    .master("local[4]")
    .appName("Remove-String-From-One-DF-From-Another")
    .getOrCreate()

  import spark.implicits._
  // sample data modeled on the columns described in the question
  val tripDF1 = Seq(
    ("8", Timestamp.valueOf("2017-02-12 03:04:00"), Timestamp.valueOf("2017-02-12 03:12:00"), "washington", "roslyn", "01010101", "Annual Member"),
    ("60", Timestamp.valueOf("2017-02-13 11:04:00"), Timestamp.valueOf("2017-02-13 12:04:00"), "reston", "ashburn", "01010102", "casual"),
    ("20", Timestamp.valueOf("2017-02-14 19:04:00"), Timestamp.valueOf("2017-02-14 19:24:00"), "Boston ", "roslyn", "01010103", "30 Day Member"),
    ("30", Timestamp.valueOf("2017-02-12 03:14:00"), Timestamp.valueOf("2017-02-12 03:44:00"), "Philadelphia ", "Washington", "01010104", "Annual Member"),
    ("17", Timestamp.valueOf("2017-02-11 12:04:00"), Timestamp.valueOf("2017-02-11 12:21:00"), "Baltimore", "Washington", "01010105", "casual"),
    ("30", Timestamp.valueOf("2017-02-15 05:00:00"), Timestamp.valueOf("2017-02-15 05:30:00"), "washington", "Miami ", "01010106", "30 Day Member"),
    ("20", Timestamp.valueOf("2017-02-16 07:10:00"), Timestamp.valueOf("2017-02-16 07:30:00"), "Cincinnati", "Chicago", "01010107", "casual"),
    ("10", Timestamp.valueOf("2017-02-17 17:10:00"), Timestamp.valueOf("2017-02-17 17:20:00"), "Raleigh", "Charlotte", "01010108", "30 Day Member"),
    ("30", Timestamp.valueOf("2017-02-15 05:00:00"), Timestamp.valueOf("2017-02-15 05:30:00"), "washington", "Miami ", "01010106", "30 Day Member"),
    ("20", Timestamp.valueOf("2017-02-16 07:10:00"), Timestamp.valueOf("2017-02-16 07:30:00"), "Cincinnati", "Chicago", "01010107", "casual"),
    ("10", Timestamp.valueOf("2017-02-17 17:10:00"), Timestamp.valueOf("2017-02-17 17:20:00"), "Raleigh", "Charlotte", "01010108", "30 Day Member"),
    ("30", Timestamp.valueOf("2017-02-15 05:00:00"), Timestamp.valueOf("2017-02-15 05:30:00"), "washington", "Miami ", "01010106", "30 Day Member"),
    ("20", Timestamp.valueOf("2017-02-16 07:10:00"), Timestamp.valueOf("2017-02-16 07:30:00"), "Cincinnati", "Chicago", "01010107", "casual"),
    ("10", Timestamp.valueOf("2017-02-17 17:10:00"), Timestamp.valueOf("2017-02-17 17:20:00"), "Raleigh", "Charlotte", "01010108", "30 Day Member")
  ).toDF("Duration", "StartDate", "EndDate", "StartStation", "EndStation", "BikeNumber", "MemberType")
  // register a plain Scala function as a SQL UDF so it can be called by name from spark.sql
  spark.udf.register("upperCase", (inputString: String) => inputString.toUpperCase())
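  // Note: the UDF above throws a NullPointerException if a station value is null.
  // If your real data can contain nulls, a null-safe variant could be registered
  // alongside it (the name "upperCaseSafe" is just an illustration, not part of
  // the original answer):
  spark.udf.register("upperCaseSafe", (s: String) => Option(s).map(_.toUpperCase).orNull)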
  tripDF1.show()
  // expose the DataFrame to Spark SQL under the name TRIPS
  tripDF1.createOrReplaceTempView("TRIPS")
spark.sql(" Select " +
"Duration, " +
"StartDate, " +
"to_date(cast(unix_timestamp(StartDate, 'MM/dd/yyyy') as TIMESTAMP)) as converted_StartDate, " +
"EndDate, " +
"to_date(CAST(unix_timestamp(EndDate, 'MM/dd/yyyy') as TIMESTAMP)) as converted_EndDate, " +
"date_format(EndDate, 'E') as day_of_week, " +
"case date_format(EndDate, 'E') " +
"when 'Sat' then 'Yes' " +
"when 'Sun' then 'Yes' " +
"else 'No' end as Derived_WeekDayOrNot, " +
"upperCase(StartStation), " +
"upperCase(EndStation), " +
"BikeNumber, " +
"MemberType " +
"from TRIPS").show()
}
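
If you prefer to stay in the DataFrame API instead of SQL, the same transformation can be sketched with udf() and the built-in date functions. This is an equivalent rewrite of the query above (reusing tripDF1 from main), not part of the original question:

import org.apache.spark.sql.functions.{col, date_format, to_date, udf, when}

// null-safe uppercase UDF for the DataFrame API
val upperCaseUdf = udf((s: String) => Option(s).map(_.toUpperCase).orNull)

tripDF1
  .withColumn("converted_StartDate", to_date(col("StartDate")))
  .withColumn("converted_EndDate", to_date(col("EndDate")))
  .withColumn("day_of_week", date_format(col("EndDate"), "E"))
  .withColumn("Derived_WeekDayOrNot",
    when(date_format(col("EndDate"), "E").isin("Sat", "Sun"), "Yes").otherwise("No"))
  .withColumn("StartStation", upperCaseUdf(col("StartStation")))
  .withColumn("EndStation", upperCaseUdf(col("EndStation")))
  .show()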