Using lag, we can access values from previous rows. Here is a solution:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, when

df = spark.createDataFrame([(1, '10:10', 'red', 'open'),
                            (2, '11:12', 'blau', 'closed'),
                            (3, '11:30', 'black', 'closed'),
                            (4, '02:13', 'red', 'open'),
                            (5, '03:00', 'yellow', 'closed'),
                            (6, '03:18', 'white', 'closed'),
                            (7, '04:15', 'red', 'open'),
                            (8, '06:00', 'black', 'closed')]).toDF("a", "b", "c", "d")

window = Window.orderBy("a")

# lag("c", 1, "red") returns the value of column c from the previous row
# (ordered by a); the third argument is the default used for the first row,
# which has no previous row.
df = df.withColumn("prev_row", lag("c", 1, "red").over(window))

# A row is selected if its own c is red or the previous row's c is red.
df = df.withColumn("selected", when(col('c') == 'red', "true")
                   .when(col('prev_row') == 'red', "true")
                   .otherwise("false"))

df = df.filter(col("selected") == "true").drop("prev_row", "selected")
df.show()
which gives:
+---+-----+------+------+
| a| b| c| d|
+---+-----+------+------+
| 1|10:10| red| open|
| 2|11:12| blau|closed|
| 4|02:13| red| open|
| 5|03:00|yellow|closed|
| 7|04:15| red| open|
| 8|06:00| black|closed|
+---+-----+------+------+
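The string flag column is not strictly necessary; the same selection can also be written as a boolean filter. A compact sketch of the equivalent logic, reusing the df and window defined above:

# Same idea without the "selected" helper column.
(df
 .withColumn("prev_row", lag("c", 1, "red").over(window))
 .filter((col("c") == "red") | (col("prev_row") == "red"))
 .drop("prev_row")
 .show())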