Решение с помощью оконных функций Spark SQL, без каких-либо файлов. DataFrame здесь создаётся в Scala, но SQL-часть будет точно такой же и в Python. Посмотрите:
// Sample events as (user_id, action, day) rows, built inline so no input data is required.
val events = Seq(
  ("d25as", "AB", 2), ("d25as", "AB", 3), ("d25as", "AB", 5),
  ("m3562", "AB", 1), ("m3562", "AB", 7), ("m3562", "AB", 9),
  ("ha42a", "AB", 3), ("ha42a", "AB", 4), ("ha42a", "AB", 5)
)
val df = events.toDF("user_id", "action", "day")

// Register the DataFrame under the view name that the SQL below reads from.
df.createOrReplaceTempView("qubix")

// t1: per user, row_number() ordered by day minus day ("diff").
// t2: per user, the set of distinct diff values ("diff2").
// The final select keeps only rows of users having more than one distinct diff;
// a user whose days form one unbroken run (no duplicate days) has a single diff
// and is therefore dropped.
val gapQuery =
  """ with t1( select user_id, action, day, row_number() over(partition by user_id order by day)-day diff from qubix),
t2( select user_id, action, day, collect_set(diff) over(partition by user_id) diff2 from t1)
select user_id, action, day from t2 where size(diff2) > 1
"""

// show(false) prints the result without truncating cell contents.
spark.sql(gapQuery).show(false)
Результаты:
+-------+------+---+
|user_id|action|day|
+-------+------+---+
|d25as  |AB    |2  |
|d25as  |AB    |3  |
|d25as  |AB    |5  |
|m3562  |AB    |1  |
|m3562  |AB    |7  |
|m3562  |AB    |9  |
+-------+------+---+
Версия для PySpark:
>>> from pyspark.sql.functions import *
>>> values = [('d25as','AB',2),('d25as','AB',3),('d25as','AB',5),
... ('m3562','AB',1),('m3562','AB',7),('m3562','AB',9),
... ('ha42a','AB',3),('ha42a','AB',4),('ha42a','AB',5)]
>>> df = spark.createDataFrame(values,['user_id','action','day'])
>>> df.show()
+-------+------+---+
|user_id|action|day|
+-------+------+---+
|  d25as|    AB|  2|
|  d25as|    AB|  3|
|  d25as|    AB|  5|
|  m3562|    AB|  1|
|  m3562|    AB|  7|
|  m3562|    AB|  9|
|  ha42a|    AB|  3|
|  ha42a|    AB|  4|
|  ha42a|    AB|  5|
+-------+------+---+
>>> df.createOrReplaceTempView("qubix")
>>> spark.sql(
... """ with t1( select user_id, action, day, row_number() over(partition by user_id order by day)-day diff from qubix),
... t2( select user_id, action, day, collect_set(diff) over(partition by user_id) diff2 from t1)
... select user_id, action, day from t2 where size(diff2) > 1
... """).show()
+-------+------+---+
|user_id|action|day|
+-------+------+---+
|  d25as|    AB|  2|
|  d25as|    AB|  3|
|  d25as|    AB|  5|
|  m3562|    AB|  1|
|  m3562|    AB|  7|
|  m3562|    AB|  9|
+-------+------+---+
>>>