The solution is shown below. The idea: collect each family's rows into parallel arrays, join in the per-family limit, flag each element with a running-sum UDF (1 if it still fits under the limit, 0 otherwise), then zip the arrays back together and explode into rows:
scala> initDF.show
+------+-------+--------+---+
|family|element|priority|qty|
+------+-------+--------+---+
| f1| elmt 1| 1| 20|
| f1| elmt 2| 2| 40|
| f1| elmt 3| 3| 10|
| f1| elmt 4| 4| 50|
| f1| elmt 5| 5| 40|
| f1| elmt 6| 6| 10|
| f1| elmt 7| 7| 20|
| f1| elmt 8| 8| 10|
+------+-------+--------+---+
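For reference, the two input DataFrames can be reproduced like this (a minimal sketch; limitQtyDF's single row is inferred from the df2 output further down):

import spark.implicits._  // pre-imported in spark-shell, needed in a standalone app

val initDF = Seq(
  ("f1", "elmt 1", 1, 20), ("f1", "elmt 2", 2, 40),
  ("f1", "elmt 3", 3, 10), ("f1", "elmt 4", 4, 50),
  ("f1", "elmt 5", 5, 40), ("f1", "elmt 6", 6, 10),
  ("f1", "elmt 7", 7, 20), ("f1", "elmt 8", 8, 10)
).toDF("family", "element", "priority", "qty")

// Per-family limit: ("f1", 100) matches the limitQty column shown below.
val limitQtyDF = Seq(("f1", 100)).toDF("family", "limitQty")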
scala> val df1 = initDF.groupBy("family").agg(
     |   collect_list("qty").as("comb_qty"),
     |   collect_list("priority").as("comb_prior"),
     |   collect_list("element").as("comb_elem"))
df1: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 2 more fields]
scala> df1.show
+------+--------------------+--------------------+--------------------+
|family| comb_qty| comb_prior| comb_elem|
+------+--------------------+--------------------+--------------------+
| f1|[20, 40, 10, 50, ...|[1, 2, 3, 4, 5, 6...|[elmt 1, elmt 2, ...|
+------+--------------------+--------------------+--------------------+
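One caveat: collect_list after a groupBy gives no ordering guarantee across partitions, so comb_qty is not guaranteed to come out in priority order. A safer variant (a sketch, not what the transcript above ran) collects structs and sorts them first:

import org.apache.spark.sql.functions._

// sort_array on an array<struct> orders by the first struct field (priority),
// so the running-sum check later sees quantities in priority order.
val df1Sorted = initDF
  .groupBy("family")
  .agg(sort_array(collect_list(struct("priority", "qty", "element"))).as("elems"))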
scala> val df2 = df1.join(limitQtyDF, df1("family") === limitQtyDF("family")).drop(limitQtyDF("family"))
df2: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 3 more fields]
scala> df2.show
+------+--------------------+--------------------+--------------------+--------+
|family| comb_qty| comb_prior| comb_elem|limitQty|
+------+--------------------+--------------------+--------------------+--------+
| f1|[20, 40, 10, 50, ...|[1, 2, 3, 4, 5, 6...|[elmt 1, elmt 2, ...| 100|
+------+--------------------+--------------------+--------------------+--------+
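Since limitQtyDF holds one row per family, broadcasting it avoids a shuffle, and joining on Seq("family") drops the duplicate key column automatically (an optional tweak, not part of the original run):

import org.apache.spark.sql.functions.broadcast

val df2b = df1.join(broadcast(limitQtyDF), Seq("family"))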
scala> def validCheck = (qty: Seq[Int], limit: Int) => {
     |   var sum = 0
     |   // Walk the quantities with a running sum: emit 1 while the element
     |   // still fits under the limit, 0 when it would overshoot.
     |   qty.map { elem =>
     |     if (elem + sum <= limit) {
     |       sum = sum + elem
     |       1
     |     } else {
     |       0
     |     }
     |   }
     | }
validCheck: (scala.collection.mutable.Seq[Int], Int) => scala.collection.mutable.Seq[Int]
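A quick sanity check of the logic on the sample quantities (limit 100):

// 20+40+10 = 70; adding 50 or 40 would exceed 100, so elmt 4 and elmt 5 are
// skipped; 70+10 = 80 and 80+20 = 100 still fit; the final 10 would overshoot.
validCheck(Seq(20, 40, 10, 50, 40, 10, 20, 10), 100)
// => List(1, 1, 1, 0, 0, 1, 1, 0), matching the "valid" column below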
scala> val newUdf = udf(validCheck)
newUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function2>,ArrayType(IntegerType,false),Some(List(ArrayType(IntegerType,false), IntegerType)))
scala> val df3 = df2.withColumn("valid", newUdf(col("comb_qty"), col("limitQty"))).drop("limitQty")
df3: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 3 more fields]
scala> df3.show
+------+--------------------+--------------------+--------------------+--------------------+
|family| comb_qty| comb_prior| comb_elem| valid|
+------+--------------------+--------------------+--------------------+--------------------+
| f1|[20, 40, 10, 50, ...|[1, 2, 3, 4, 5, 6...|[elmt 1, elmt 2, ...|[1, 1, 1, 0, 0, 1...|
+------+--------------------+--------------------+--------------------+--------------------+
scala> val myUdf = udf((qty: Seq[Int], prior: Seq[Int], elem: Seq[String], valid: Seq[Int]) => {
     |   // Zip the four parallel arrays into one array of
     |   // (element, priority, qty, valid) tuples.
     |   elem zip prior zip qty zip valid map {
     |     case (((a, b), c), d) => (a, b, c, d)
     |   }
     | })
scala> val df4 = df3.withColumn("combined", myUdf(col("comb_qty"),col("comb_prior"),col("comb_elem"),col("valid")))
df4: org.apache.spark.sql.DataFrame = [family: string, comb_qty: array<int> ... 4 more fields]
scala> val df5 = df4.drop("comb_qty","comb_prior","comb_elem","valid")
df5: org.apache.spark.sql.DataFrame = [family: string, combined: array<struct<_1:string,_2:int,_3:int,_4:int>>]
scala> df5.show(false)
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|family|combined |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|f1 |[[elmt 1, 1, 20, 1], [elmt 2, 2, 40, 1], [elmt 3, 3, 10, 1], [elmt 4, 4, 50, 0], [elmt 5, 5, 40, 0], [elmt 6, 6, 10, 1], [elmt 7, 7, 20, 1], [elmt 8, 8, 10, 0]]|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
scala> val df6 = df5.withColumn("combined",explode(col("combined")))
df6: org.apache.spark.sql.DataFrame = [family: string, combined: struct<_1: string, _2: int ... 2 more fields>]
scala> df6.show
+------+------------------+
|family| combined|
+------+------------------+
| f1|[elmt 1, 1, 20, 1]|
| f1|[elmt 2, 2, 40, 1]|
| f1|[elmt 3, 3, 10, 1]|
| f1|[elmt 4, 4, 50, 0]|
| f1|[elmt 5, 5, 40, 0]|
| f1|[elmt 6, 6, 10, 1]|
| f1|[elmt 7, 7, 20, 1]|
| f1|[elmt 8, 8, 10, 0]|
+------+------------------+
scala> val df7 = df6.select("family", "combined._1", "combined._2", "combined._3", "combined._4").
     |   withColumnRenamed("_1", "element").
     |   withColumnRenamed("_2", "priority").
     |   withColumnRenamed("_3", "qty").
     |   withColumnRenamed("_4", "ok")
df7: org.apache.spark.sql.DataFrame = [family: string, element: string ... 3 more fields]
scala> df7.show
+------+-------+--------+---+---+
|family|element|priority|qty| ok|
+------+-------+--------+---+---+
| f1| elmt 1| 1| 20| 1|
| f1| elmt 2| 2| 40| 1|
| f1| elmt 3| 3| 10| 1|
| f1| elmt 4| 4| 50| 0|
| f1| elmt 5| 5| 40| 0|
| f1| elmt 6| 6| 10| 1|
| f1| elmt 7| 7| 20| 1|
| f1| elmt 8| 8| 10| 0|
+------+-------+--------+---+---+
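As an aside, on Spark 2.4+ the zip/explode/rename steps (df4 through df7) can be replaced by the built-in arrays_zip, avoiding the second UDF entirely; a sketch under that version assumption:

import org.apache.spark.sql.functions._

// arrays_zip builds an array<struct> whose field names match the input column
// names, so the struct fields can be selected and renamed directly.
val flat = df3
  .withColumn("c", explode(arrays_zip(col("comb_elem"), col("comb_prior"), col("comb_qty"), col("valid"))))
  .select(
    col("family"),
    col("c.comb_elem").as("element"),
    col("c.comb_prior").as("priority"),
    col("c.comb_qty").as("qty"),
    col("c.valid").as("ok"))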
Let me know if this helps!