Here is an example without a window, just using groupBy / agg:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import spark.implicits._

val data = Seq(
  Row("ServiceCent4", "AP-1-IOO-PPP", "241.206.155.172", "06-12-18:17:42:34", 162),
  Row("ServiceCent1", "AP-1-SPG-QQQ", "178.182.57.167", "06-12-18:17:42:34", 110)
)
val schema = List(
  StructField("COL1", StringType, true),
  StructField("COL2", StringType, true),
  StructField("COL3", StringType, true),
  StructField("EventTimeString", StringType, true),
  StructField("COL4", IntegerType, true)
)

val df = spark.createDataFrame(
  spark.sparkContext.parallelize(data),
  StructType(schema)
)
/* convert string to timestamp
* get month and year from timestamp
* drop timestamp string
*/
val monthDF = df
  .withColumn("EventTime", to_timestamp($"EventTimeString", "MM-dd-yy:HH:mm:ss"))
  .withColumn("EventYear", year($"EventTime"))
  .withColumn("EventMonth", month($"EventTime"))
  .drop("EventTimeString")
monthDF.groupBy("COL1", "COL2", "EventYear", "EventMonth").agg(avg("COL4")).show()
+------------+------------+---------+----------+---------+
| COL1| COL2|EventYear|EventMonth|avg(COL4)|
+------------+------------+---------+----------+---------+
|ServiceCent4|AP-1-IOO-PPP| 2018| 6| 162.0|
|ServiceCent1|AP-1-SPG-QQQ| 2018| 6| 110.0|
+------------+------------+---------+----------+---------+
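If you want nicer column names or several aggregates at once, you can alias the expressions in the same agg call. A minimal sketch of that variation (the alias names AvgCOL4 / MaxCOL4 are my own, not from the original question):

// same groupBy/agg as above, just with aliased aggregates and an ordering
monthDF
  .groupBy("COL1", "COL2", "EventYear", "EventMonth")
  .agg(
    avg("COL4").alias("AvgCOL4"),  // average per COL1/COL2/year/month group
    max("COL4").alias("MaxCOL4")   // a second aggregate, to show combining several
  )
  .orderBy("EventYear", "EventMonth")
  .show()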