Окончательное редактирование: я заменил `uniqueN` на `length(unique())`. Это ускорило вычисления. Кроме того, в предыдущей версии правила 7 у меня была опечатка. Я использовал `unique(am_data)`, чтобы удалить дубликаты, и это, похоже, исправило всё, кроме rule_4.
> res_2[, lapply(.SD, sum), .SDcols = 2:8]
rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 17167 10448 17165 2 606 16040 17072
> res[, lapply(.SD,sum), .SDcols = 2:8]
rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 17167 10448 17165 0 606 16040 17072
# Remove duplicate rows before building the self-joins.
am_data <- unique(am_data)

# Prepare for Rules 1 - 3 -------------------------------------------------
# A row subset already returns a new data.table, so wrapping am_data in
# copy() first would only duplicate the full table for nothing.
am_data2 <- am_data[!is.na(device_id)]

# `a` has to be a real copy: setnames() renames by reference, and am_data2
# must keep its original column names for the join below.
a <- copy(am_data2)
setnames(a, paste0("a.", names(a)))

# Make Rules 1-3 happen ---------------------------------------------------
# For every row of `a`, find the rows of am_data2 with the same device_id and
# a strictly earlier sent_at. Rows of `a` without any match are dropped
# (nomatch = 0L). The trailing filter keeps only pairs whose customer_id
# differs; pairs where either customer_id is NA are dropped as well, because
# data.table treats an NA in a logical `i` as FALSE.
self_join <- am_data2[
  a,
  on = .(device_id = a.device_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]

# Per a.app_id: number of distinct earlier customers on the same device.
rule_1 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Same count, restricted to earlier rows with rejected == 1.
rule_2 <- self_join[rejected == 1, length(unique(customer_id)), by = a.app_id]
# Per a.app_id: number of distinct person_id values among the earlier rows.
rule_3 <- self_join[, length(unique(person_id)), by = a.app_id]
# Prepare for Rule 4 ------------------------------------------------------
# A row subset already returns a new data.table; no copy() of am_data needed.
am_data2 <- am_data[!is.na(ip_address_id)]

# `a` must be a real copy because setnames() and `:=` modify by reference.
a <- copy(am_data2)
setnames(a, paste0("a.", names(a)))

# Lower bound of the look-back window used in the join.
# NOTE(review): `- 14` is 14 days only if sent_at is a Date/IDate; for a
# POSIXct column it would be 14 seconds. Confirm the type of sent_at.
a[, a.sent_at_range := a.sent_at - 14]

# Make Rule 4 happen ------------------------------------------------------
# For every row of `a`, find rows with rejected == 1 that share its
# ip_address_id and whose sent_at lies in [a.sent_at_range, a.sent_at).
# Rows of `a` without a match are dropped (nomatch = 0L), then only pairs
# with a different customer_id are kept (NA comparisons are dropped too).
self_join <- am_data2[rejected == 1][
  a,
  on = .(
    ip_address_id = a.ip_address_id,
    sent_at < a.sent_at,
    sent_at >= a.sent_at_range
  ),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]

# Per a.app_id: number of distinct matched customers.
rule_4 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Prepare for Rule 5 ------------------------------------------------------
# Row subsets already return new data.tables, so am_data is never touched by
# the by-reference setnames() below and no full-table copy() is required.
am_data2 <- am_data[!is.na(contact_phone_id)]
a <- am_data[!is.na(mobile_phone_id)]
setnames(a, paste0("a.", names(a)))

# Make Rule 5 happen ------------------------------------------------------
# For every row of `a`, find rows with rejected == 1 whose contact_phone_id
# equals that row's mobile_phone_id and whose sent_at is strictly earlier.
# Rows of `a` without a match are dropped (nomatch = 0L), then only pairs
# with a different customer_id are kept (NA comparisons are dropped too).
self_join <- am_data2[rejected == 1][
  a,
  on = .(contact_phone_id = a.mobile_phone_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]

# Per a.app_id: number of distinct matched customers.
rule_5 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Prepare for Rule 6 ------------------------------------------------------
# Row subsets already return new data.tables, so am_data is never touched by
# the by-reference setnames() below and no full-table copy() is required.
am_data2 <- am_data[!is.na(work_phone_id)]
a <- am_data[!is.na(mobile_phone_id)]
setnames(a, paste0("a.", names(a)))

# Make Rule 6 Happen ------------------------------------------------------
# For every row of `a`, find rows with rejected == 1 whose work_phone_id
# equals that row's mobile_phone_id and whose sent_at is strictly earlier.
# Rows of `a` without a match are dropped (nomatch = 0L), then only pairs
# with a different customer_id are kept (NA comparisons are dropped too).
self_join <- am_data2[rejected == 1][
  a,
  on = .(work_phone_id = a.mobile_phone_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]

# Per a.app_id: number of distinct matched customers.
rule_6 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Prepare for Rule 7 ------------------------------------------------------
# A row subset already returns a new data.table; no copy() of am_data needed.
am_data2 <- am_data[!is.na(person_id)]

# `a` must be a real copy because setnames() renames by reference.
a <- copy(am_data2)
setnames(a, paste0("a.", names(a)))

# Make Rule 7 Happen ------------------------------------------------------
# For every row of `a`, find rows with the same person_id and a strictly
# earlier sent_at, then keep only pairs where both customer_id and
# passport_id differ.
# nomatch = 0L is restored for consistency with the other rules. Unmatched
# rows carry NA in customer_id, so the filter below discarded them anyway;
# dropping them in the join gives the same result without materialising them.
self_join <- am_data2[
  a,
  on = .(person_id = a.person_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id & passport_id != a.passport_id]

# Per a.app_id: number of distinct matched customers.
rule_7 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Combine and cast the rules we made --------------------------------------
rule_names <- paste0("rule_", 1:7)

# Stack the seven per-rule tables. Naming the list makes the id column carry
# the final column name instead of a list position.
rules_long <- rbindlist(
  setNames(
    list(rule_1, rule_2, rule_3, rule_4, rule_5, rule_6, rule_7),
    rule_names
  ),
  idcol = "rule"
)

# One row per a.app_id, one column per rule. V1 is the default name of the
# unnamed count computed for each rule; stating it avoids dcast guessing.
res_2 <- dcast(rules_long, a.app_id ~ rule, value.var = "V1", fill = 0L)

# A rule that matched nothing contributes no rows and therefore no column.
# Renaming columns 2:8 by position would then fail or mislabel the rest, so
# add any missing rule as an all-zero column and fix the column order.
missing_rules <- setdiff(rule_names, names(res_2))
if (length(missing_rules) > 0L) {
  res_2[, (missing_rules) := 0L]
}
setcolorder(res_2, c("a.app_id", rule_names))
Результаты
> res_2
a.app_id rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 89033 0 0 0 0 0 1 0
2: 95775 0 0 0 0 0 1 0
3: 96542 0 0 0 0 0 1 0
4: 106447 0 0 0 0 0 1 0
5: 113040 0 0 0 0 0 1 0
---
21925: 34904219 1 1 1 0 0 1 0
21926: 34904725 1 1 1 0 0 0 1
21927: 34904750 1 0 1 0 0 1 1
21928: 34904921 1 0 1 0 0 0 1
21929: 34905033 0 0 0 0 0 1 1
> res[order(a.app_id) & (rule_1 > 0 | rule_2 > 0 | rule_3 > 0 |
rule_4 > 0 | rule_5 >0 | rule_6 > 0 | rule_7 > 0)]
a.app_id rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 89033 0 0 0 0 0 1 0
2: 95775 0 0 0 0 0 1 0
3: 96542 0 0 0 0 0 1 0
4: 106447 0 0 0 0 0 1 0
5: 113040 0 0 0 0 0 1 0
---
22403: 34904219 1 1 1 0 0 1 1
22404: 34904725 1 1 1 0 0 0 1
22405: 34904750 1 0 1 0 0 1 1
22406: 34904921 1 0 1 0 0 0 1
22407: 34905033 0 0 0 0 0 1 1
Оригинал: оставлен, так как он сгруппирован по устройству (`keyby = device_id`) и может быть полезен.
Это эквивалент SQL для правила 1 на data.table. Я проверил первые 5 и последние 5 результатов, и они совпадают.
# Rows with a known device_id, limited to the columns listed in `cols`
# (`cols` is defined outside this snippet).
tmp2 <- am_data[!is.na(device_id), ..cols]

# Self-join on device_id: each row (the `i.` side) is paired with the rows of
# the same device whose sent_at is strictly later. No nomatch is given, so
# rows without a later partner stay in the result with NA in the joined
# columns. The filter keeps those unmatched rows plus every pair whose
# customer_id differs, and the result is the row count per device_id.
tmp2[
  tmp2,
  on = .(device_id, sent_at > sent_at),
  allow.cartesian = TRUE
][
  is.na(customer_id) | customer_id != i.customer_id,
  .N,
  keyby = device_id
]