Создайте запрос, используя клей, но сделайте его не строкой - PullRequest
1 голос
/ 15 февраля 2020

Я работаю в rstudio с помощью скрипта ar, который читает скрипт sql, используя клей. Мой скрипт r:

library(tidyverse)
library(dbplyr)
library(DBI)
library(glue)
con <- dbConnect(odbc(), "Athena") 

day_from <- 7
day_to <- 30
game_name <- 'fungame'
query <- read_lines("example.sql") %>% glue_collapse(sep = "\n") %>% glue_sql(.con = con)
setosa <- dbGetQuery(con, query)
setosa %>% glimpse()

Мой запрос скрипта sql. sql:

/*a comment describing the query*/
with


/*
Athena installs data. After deducting the training period, get the preceding 90 days of installs for training.
*/
installs as (
select s, 
       install_dt, 
       split(game_name, '_')[2] as platform,
       case when country = 'United States' then 1 else 0 end as usa
from device_metrics.game_install
where year || '-' || month || '-' || day >= date_format(date_add('day', -({day_to} + 91), current_date), '%Y-%m-%d')
and year || '-' || month || '-' || day <= date_format(date_add('day', -({day_to} + 1), current_date), '%Y-%m-%d')
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
),


/* 
Get marketing data from adx.
Full quarter of training data with at least one full day_m day cycle i.e. last full quarter + day_m days 
*/
adx_min as (
select 
  adx_id,
  publisher_name,
  row_number() over(partition by adx_id order by time_stamp asc) rn -- some dups, get first instance of an install
from blah_ui_dev.adxdata_match_v2
where lower(game_name) = {game_name}
and concat(yy,'-',mm,'-',dd) >= date_format(date_add('day', -({day_to} + 91), current_date), '%Y-%m-%d')
and concat(yy,'-',mm,'-',dd) <= date_format(date_add('day', -({day_to} + 1), current_date), '%Y-%m-%d')
),


/*
Dedupped installs based on earliest timestamp
*/
adx as (
select 
  adx_id,
  publisher_name
from adx_min
where rn = 1
),


/*
installs and marketing dta where exists
use min/max to dedup, some cases with a single s assoociated with multiple platforms
*/
installs_base as (
select 
  i.s,
  i.usa,
  min(i.install_dt) as install_dt,
  min(i.platform) as platform,
  min(a.publisher_name) as publisher_name
from installs i 
left join adx a on upper(if(i.s like 'IDFV%', substr(i.s,6), i.s)) = a.adx_id 
group by i.s, i.usa
),


/*
day n sessions count
*/
sessions_day_from as (
select i.s,
       count(1) as sessions_day_from,
       sum(session_length) / 1000 as sum_session_time_day_from
from installs_base i        
join device_metrics.user_game_session sess on sess.s = i.s
where regexp_like(lower(sess.game_name), '^(?!.*QA).*' || {game_name} || '.*')
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(sess.activity_date, '%Y-%m-%d')) <= {day_from}
group by i.s
),


/*
day n utility
*/
utility_day_from as (
select 
  u.s,
  sum(u.utility) as utility_day_from
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = {game_name}
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= {day_from}
group by u.s
),


/*
recent utility. Are users continuing to play the game more recently or did they drop off.
*/
utility_recent as (
select 
  u.s,
  sum(u.utility) as recent_utility_sum
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = {game_name}
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= floor({day_from} / 2)
group by u.s
),


/*
day n spend
*/
spend_day_from as (
select 
  i.s, 
  sum(dr.amt) as spend_day_from
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= {day_from}
group by 1
),


/*
day m target spend (target)
*/
spend_day_to as (
select 
  i.s, 
  sum(dr.amt) as spend_day_to
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= {day_to}
group by 1
)


select 
  i.s,
  i.install_dt,
  i.platform,
  i.usa,
  i.publisher_name,
  coalesce(sn.sessions_day_from, 0) as {glue('sessions_day_{day_from}')},
  coalesce(sn.sum_session_time_day_from, 0) as {glue('sum_session_time_day_{day_from}')},
  coalesce(un.utility_day_from, 0) as {glue('utility_day_{day_from}')},
  coalesce(spn.spend_day_from, 0) as {glue('spend_day_{day_from}')},
  round(coalesce(ru.recent_utility_sum / un.utility_day_from, 0), 2) as recent_utility_ratio,
  coalesce(spm.spend_day_to, 0) as {glue('spend_day_{day_to}')} 
from installs_base i 
left join sessions_day_from sn on sn.s = i.s 
left join utility_day_from un on un.s = i.s 
left join spend_day_from spn on spn.s = i.s
left join utility_recent ru on ru.s = i.s
left join spend_day_to spm on spm.s = i.s

Чтобы узнать, как выглядит sql в консоли, введите переменную запрос имени:

query
<SQL> -- !preview conn=con

/*DDA day day_from to day day_to training data query*/
with


/*
Athena installs data. After deducting the training period, get the preceeding 90 days of installs for training.
*/
installs as (
select s, 
       install_dt, 
       split(game_name, '_')[2] as platform,
       case when country = 'United States' then 1 else 0 end as usa
from device_metrics.game_install
where year || '-' || month || '-' || day >= date_format(date_add('day', -(30 + 91), current_date), '%Y-%m-%d')
and year || '-' || month || '-' || day <= date_format(date_add('day', -(30 + 1), current_date), '%Y-%m-%d')
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
),


/* 
Get marketing data from adx.
Full quarter of training data with at least one full day_m day cycle i.e. last full quarter + day_m days 
*/
adx_min as (
select 
  adx_id,
  publisher_name,
  row_number() over(partition by adx_id order by time_stamp asc) rn -- some dups, get first instance of an install
from glu_ui_dev.adxdata_match_v2
where lower(game_name) = 'fungame'
and concat(yy,'-',mm,'-',dd) >= date_format(date_add('day', -(30 + 91), current_date), '%Y-%m-%d')
and concat(yy,'-',mm,'-',dd) <= date_format(date_add('day', -(30 + 1), current_date), '%Y-%m-%d')
),


/*
Dedupped installs based on earliest timestamp
*/
adx as (
select 
  adx_id,
  publisher_name
from adx_min
where rn = 1
),


/*
installs and marketing dta where exists
use min/max to dedup, some cases with a single s assoociated with multiple platforms
*/
installs_base as (
select 
  i.s,
  i.usa,
  min(i.install_dt) as install_dt,
  min(i.platform) as platform,
  min(a.publisher_name) as publisher_name
from installs i 
left join adx a on upper(if(i.s like 'IDFV%', substr(i.s,6), i.s)) = a.adx_id 
group by i.s, i.usa
),


/*
day n sessions count
*/
sessions_day_from as (
select i.s,
       count(1) as sessions_day_from,
       sum(session_length) / 1000 as sum_session_time_day_from
from installs_base i        
join device_metrics.user_game_session sess on sess.s = i.s
where regexp_like(lower(sess.game_name), '^(?!.*QA).*' || 'fungame' || '.*')
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(sess.activity_date, '%Y-%m-%d')) <= 7
group by i.s
),


/*
day n utility
*/
utility_day_from as (
select 
  u.s,
  sum(u.utility) as utility_day_from
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = 'fungame'
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= 7
group by u.s
),


/*
recent utility. Are users continuing to play the game more recently or did they drop off.
*/
utility_recent as (
select 
  u.s,
  sum(u.utility) as recent_utility_sum
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = 'fungame'
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= floor(7 / 2)
group by u.s
),


/*
day n spend
*/
spend_day_from as (
select 
  i.s, 
  sum(dr.amt) as spend_day_from
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= 7
group by 1
),


/*
day m target spend (target)
*/
spend_day_to as (
select 
  i.s, 
  sum(dr.amt) as spend_day_to
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= 30
group by 1
)


select 
  i.s,
  i.install_dt,
  i.platform,
  i.usa,
  i.publisher_name,
  coalesce(sn.sessions_day_from, 0) as 'sessions_day_7',
  coalesce(sn.sum_session_time_day_from, 0) as 'sum_session_time_day_7',
  coalesce(un.utility_day_from, 0) as 'utility_day_7',
  coalesce(spn.spend_day_from, 0) as 'spend_day_7',
  round(coalesce(ru.recent_utility_sum / un.utility_day_from, 0), 2) as recent_utility_ratio,
  coalesce(spm.spend_day_to, 0) as 'spend_day_30' 
from installs_base i 
left join sessions_day_from sn on sn.s = i.s 
left join utility_day_from un on un.s = i.s 
left join spend_day_from spn on spn.s = i.s
left join utility_recent ru on ru.s = i.s
left join spend_day_to spm on spm.s = i.s

Проблема, о которой я пишу, специально находится в самом нижнем разделе имени поля выбора. После 'as' мои имена полей заключаются в кавычки.

Glue возвращает часть после 'as' в виде строки. Это, похоже, не вызывает проблем в этом случае, однако, в моем реальном сценарии я использую odb c с Афиной, и ему не нравится тот факт, что запрос имеет форму select field_name as 'string' в отличие от обычная select field_name as string // строка не заключена в кавычки как строка

Есть ли способ, чтобы клей составлял эту часть после 'as' без строковых кавычек? Желаемый запрос:

<SQL> 
    select
      Species as Species-setosa
    from iris
    where species = 'setosa'

1 Ответ

1 голос
/ 15 февраля 2020

Один из вариантов - заменить ', сопоставив кавычку в указанном c местоположении с поиском регулярных выражений. Здесь мы можем использовать str_replace_all

library(magrittr)
library(stringr)
query %>%
     str_replace_all("(?<=as )'|(?<![%])'(?=(\nfrom|,))", "") %>%
     glue_sql(.con = con)
#<SQL> select
#  Species as Species-setosa
#from iris
#where species = 'setosa'

Мы сопоставляем ' после слова as и пробел или (|) ' предшествующий запятой (*) 1010 *) или следующая строка и от (\nfrom)

...