Используйте функции arrays_overlap (или array_intersect), чтобы передать array(<strings>), вместо array_contains.
Example:
1. Filter based on the texts variable:
// Example 1: keep the rows whose array column shares at least one element
// with a local Scala array of search values.
// NOTE(review): assumes a REPL session (e.g. spark-shell) where
// spark.implicits._ and org.apache.spark.sql.functions._ are in scope.

// Sample data: a single array<string> column named "textCol".
val df = Seq(
  Seq("text1"),
  Seq("text4", "text1"),
  Seq("text5")
).toDF("textCol")
df.show()
//+--------------+
//|       textCol|
//+--------------+
//|       [text1]|
//|[text4, text1]|
//|       [text5]|
//+--------------+

// Values to search for; lit(texts) passes the whole array as one literal column.
val texts = Array("text1", "text2", "text3")

// Using arrays_overlap.
// The column is referenced with its exact name ("textCol") so the lookup does
// not depend on Spark's case-insensitive column resolution.
df.filter(arrays_overlap(col("textCol"), lit(texts))).show(false)
//+--------------+
//|textCol       |
//+--------------+
//|[text1]       |
//|[text4, text1]|
//+--------------+

// Using array_intersect: a row matches when the intersection is non-empty.
df.filter(size(array_intersect(col("textCol"), lit(texts))) > 0).show(false)
//+--------------+
//|textCol       |
//+--------------+
//|[text1]       |
//|[text4, text1]|
//+--------------+
2. Adding the texts variable to the DataFrame:
// Example 2: carry the search values as an extra array column and filter
// with SQL expression strings.
// NOTE(review): assumes a REPL session (e.g. spark-shell) where
// spark.implicits._ and org.apache.spark.sql.functions._ are in scope.

// Comma-separated search values; split(...) below turns them into an array column.
val texts = "text1,text2,text3"

// texts is already a String, so it is passed to lit directly.
val df = Seq(
  Seq("text1"),
  Seq("text4", "text1"),
  Seq("text5")
).toDF("textCol").withColumn("texts", split(lit(texts), ","))
df.show(false)
//+--------------+---------------------+
//|textCol       |texts                |
//+--------------+---------------------+
//|[text1]       |[text1, text2, text3]|
//|[text4, text1]|[text1, text2, text3]|
//|[text5]       |[text1, text2, text3]|
//+--------------+---------------------+

// Using array_intersect: a row matches when the intersection is non-empty.
// The column is referenced with its exact name ("textCol") so the lookup does
// not depend on Spark's case-insensitive column resolution.
df.filter("""size(array_intersect(textCol,texts)) > 0""").show(false)
//+--------------+---------------------+
//|textCol       |texts                |
//+--------------+---------------------+
//|[text1]       |[text1, text2, text3]|
//|[text4, text1]|[text1, text2, text3]|
//+--------------+---------------------+

// Using arrays_overlap.
df.filter("""arrays_overlap(textCol,texts)""").show(false)
//+--------------+---------------------+
//|textCol       |texts                |
//+--------------+---------------------+
//|[text1]       |[text1, text2, text3]|
//|[text4, text1]|[text1, text2, text3]|
//+--------------+---------------------+