Я использовал несколько циклов и условных проверок и проверил решение не только на упомянутом вами «базовом» наборе данных, но и на дополнительном тестовом наборе.
library(tidyverse)
# Base dataset.
# NOTE: the five vectors below are overwritten by the test dataset that
# follows, so as written the script runs on the test dataset. Comment out the
# test dataset assignments to run on the base dataset instead.
billNumber <- c(1, 1, 2, 2, 3, 3, 3)
day <- c(10, 11, 23, 23, 11, 12, 13)
month <- c(4, 4, 11, 11, 3, 3, 3)
year <- c(1998, 1998, 2001, 2001, 2005, 2005, 2005)
agValue <- c(10, 14, 12, 9, 8, 9, 4)
# Test dataset: the base rows with the two agValue entries of bill 2 swapped
# (9, 12 instead of 12, 9), plus a fourth bill whose three rows share one date.
billNumber <- c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4)
day <- c(10, 11, 23, 23, 11, 12, 13, 15, 15, 15)
month <- c(4, 4, 11, 11, 3, 3, 3, 6, 6, 6)
year <- c(1998, 1998, 2001, 2001, 2005, 2005, 2005, 2020, 2020, 2020)
agValue <- c(10, 14, 9, 12, 8, 9, 4, 13, 15, 8)
# Build the dataset from the vectors defined above.
df <- data.frame(billNumber, day, month, year, agValue)
# Add two working columns:
#   concat - key built as "billNumber-day-month-year"
#   flag   - starts empty; the flagging steps further down set it to "Y"/"N"
# Columns are referenced by bare name so mutate() reads them from the piped
# data frame rather than from the global `df`; this stays correct even if the
# pipeline input is later filtered, reordered or grouped.
df_full <- df %>%
  mutate(
    concat = paste(billNumber, day, month, year, sep = "-"),
    flag = ""
  )
df_full
billNumber day month year agValue concat flag
1 1 10 4 1998 10 1-10-4-1998
2 1 11 4 1998 14 1-11-4-1998
3 2 23 11 2001 12 2-23-11-2001
4 2 23 11 2001 9 2-23-11-2001
5 3 11 3 2005 8 3-11-3-2005
6 3 12 3 2005 9 3-12-3-2005
7 3 13 3 2005 4 3-13-3-2005
# Partition the rows by how many times their concat key occurs in df_full:
# keys that occur exactly once versus keys shared by several rows.
row_single <- filter(count(df_full, concat), n == 1)
row_multi <- filter(count(df_full, concat), n > 1)
# Base `[` subsetting keeps the original row names of df_full.
df_full_single <- df_full[is.element(df_full$concat, row_single$concat), ]
df_full_multi <- df_full[is.element(df_full$concat, row_multi$concat), ]
# Flag the rows whose concat key occurs once: a row gets "Y" when it is the
# first row or its billNumber differs from the row directly above it, and "N"
# otherwise.
# Vectorized comparison against the previous row; unlike a `2:nrow(...)` loop
# it also works when the data frame has zero or one row.
n_single <- nrow(df_full_single)
bill_single <- df_full_single$billNumber
# c(NA, bill_single[-n_single]) is billNumber shifted down by one row.
starts_new_bill <- seq_len(n_single) == 1L |
  bill_single != c(NA, bill_single[-n_single])
df_full_single$flag <- rep("N", n_single)
df_full_single$flag[which(starts_new_bill)] <- "Y"
df_full_single
# Flag the rows whose concat key occurs more than once: within each
# billNumber exactly one row gets "Y" - the row with the highest agValue (the
# first such row on ties) - and every other row gets "N".
# A row-by-row comparison with the previous row is not enough here: the first
# row of every bill after the first would never be flagged "Y" unless a larger
# value followed it, so a bill whose maximum comes first (e.g. 15, 13, 8)
# would end up with no "Y" at all.
# Rows are grouped by billNumber, the same field the flagging compares on.
# Assumes agValue has no missing values within a bill - TODO confirm.
n_multi <- nrow(df_full_multi)
df_full_multi$flag <- rep("N", n_multi)
rows_by_bill <- split(seq_len(n_multi), df_full_multi$billNumber)
best_rows <- vapply(
  rows_by_bill,
  function(rows) rows[which.max(df_full_multi$agValue[rows])],
  integer(1)
)
df_full_multi$flag[best_rows] <- "Y"
df_full_multi
# Stack the two flagged subsets back together, keep only the rows flagged "Y"
# together with the first five columns (billNumber, day, month, year,
# agValue), and sort the result by billNumber.
df_full_final <- rbind(df_full_single, df_full_multi)
df_full_final <- df_full_final[df_full_final$flag == "Y", seq_len(5)]
df_full_final <- df_full_final[with(df_full_final, order(billNumber)), ]
df_full_final
billNumber day month year agValue
1 1 10 4 1998 10
3 2 23 11 2001 12
5 3 11 3 2005 8