Я пытаюсь получить 25,50 и 75 процентилей, но мой код работает не очень хорошо, я пытаюсь перевести некоторый код из SAS в Pyspark, но это меня бесит.
Iпопытался удалить группы на 1,2 для имени столбца, который я хочу, но в любом случае он не работает
spark.sql('select distinct tAuxiliar.canalidad, \
case \
when tAuxiliar.percentil < 0.25 then "p25" \
when tAuxiliar.percentil >= 0.25 and tAuxiliar.percentil < 0.50 then "p50" \
when tAuxiliar.percentil >= 0.50 and tAuxiliar.percentil < 0.75 then "p75" \
end as grupo, \
max(corte_max) as percentil \
from (select "02_Ambos" as canalidad, \
tPercen.percentil, \
min(tPercen.operaciones) as corte_min, \
max(tPercen.operaciones) as corte_max \
from (select operaciones, \
percentile_approx(operaciones, 1) as percentil \
from base_canalidad_unica \
where canalidad = "02_Ambos" \
group by operaciones) as tPercen group by operaciones,percentil \
) as tAuxiliar \
where tAuxiliar.percentil < 0.75 \
group by distinct tAuxiliar.canalidad , grupo')
исходный запрос SAS:
select distinct
tAuxiliar.canalidad,
case
when tAuxiliar.percentil < 25 then 'p25'
when tAuxiliar.percentil >= 25 and tAuxiliar.percentil < 50 then 'p50'
when tAuxiliar.percentil >= 50 and tAuxiliar.percentil < 75 then 'p75'
end as grupo,
max(corte_max) as percentil
From
(
select '00_Solo_Internet' as canalidad,
tPercen.percentil,
min(tPercen.operaciones) as corte_min,
max(tPercen.operaciones) as corte_max
from
(
select operaciones,
Quantile(100,operaciones) as
percentil
from BASE_CANALIDAD_UNICA
where canalidad = '00_Solo_Internet'
) as tPercen
group by 1,2
)
as tAuxiliar
where tAuxiliar.percentil < 75
group by 1,2