BigQuery: как рассчитать количество посещений отдельных посетителей за последние 2 дня - PullRequest
0 голосов
/ 31 декабря 2018

Я хочу рассчитать уникальное количество пользователей за последние 2 дня для каждой даты.

Первый запрос: - Я пытался с помощью оператора CASE дать мне количество пользователей за этот день, который не ожидаетсярезультат, даже я пытался с оконной функцией.

Я знаю одно альтернативное решение с помощью самостоятельного соединения ( уже упоминалось как второй запрос ), которое дает мне правильный ответ, что я ожидал, но я хочусделайте это в одном запросе.

Причина, по которой в одном запросе нужно уменьшить размер обрабатываемых данных, если я сделаю сам объединение, он дважды прочитает полную таблицу, а исходный размер таблицы составляет несколько ТБ.

-- First attempt: a single pass over the data, without a self join.
-- NOTE: the predicate `dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt` compares
-- each row's date with itself, so it is true for every non-NULL dt. As a result
-- this query counts distinct visitors per single day (labelled with a 2-day
-- range), not over the trailing 2-day window.
SELECT
(CASE WHEN dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt THEN 
CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) END) AS Date_range,
COUNT(DISTINCT (CASE WHEN dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt THEN Visitor_Name END)) AS Visitor_Count
FROM
-- Inline sample data: one row per (visit date, visitor).
(SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
UNION ALL
SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
UNION ALL
SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
UNION ALL
SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
UNION ALL
SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
UNION ALL
SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
UNION ALL
SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
UNION ALL
SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
UNION ALL
SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
UNION ALL
SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
UNION ALL
SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name)
GROUP BY Date_range
ORDER BY Date_range;

Решение:

-- Self-join solution: for every date b.dt, count the distinct visitors seen on
-- b.dt or the day before it.
WITH visits AS (
    -- Inline sample data: one row per (visit date, visitor).
    SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name
)
SELECT
    -- Label of the 2-day window that ends on b.dt.
    CONCAT(CAST(DATE_SUB(b.dt, INTERVAL 1 DAY) AS STRING), '::', CAST(b.dt AS STRING)) AS Date_range,
    COUNT(DISTINCT a.Visitor_Name) AS Visitor_Count
FROM visits AS a
INNER JOIN visits AS b
    -- The window test is the join condition itself. Joining on `a.dt <= b.dt`
    -- and filtering with CASE afterwards pairs every row with all later dates
    -- and leaves a NULL Date_range group whose count is 0.
    ON a.dt BETWEEN DATE_SUB(b.dt, INTERVAL 1 DAY) AND b.dt
GROUP BY Date_range
ORDER BY Date_range;

Ответы [ 2 ]

0 голосов
/ 31 декабря 2018

Ниже для BigQuery Standard SQL

#standardSQL
-- Step 1: for every row, collect the visitor names of all rows whose date is
-- the same day or the day before (RANGE frame over the day number).
WITH visitors_in_window AS (
  SELECT
    dt,
    ARRAY_AGG(visitor_name) OVER (
      ORDER BY UNIX_DATE(dt)
      RANGE BETWEEN 1 PRECEDING AND CURRENT ROW
    ) AS arr_visitors
  FROM `project.dataset.your_table`
)
-- Step 2: one output row per window label; the distinct count is taken from
-- the collected array of any one row of the group.
SELECT
  CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) AS Date_range,
  ANY_VALUE((SELECT COUNT(DISTINCT visitor) FROM UNNEST(arr_visitors) AS visitor)) AS Visitor_Count
FROM visitors_in_window
GROUP BY Date_range

Вы можете протестировать / поиграть с ним, используя фиктивные данные из вашего вопроса, как показано ниже

#standardSQL
-- Same query as above, run against inline sample data instead of a real table.
WITH `project.dataset.your_table` AS (
  SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name UNION ALL
  SELECT '2018-01-01', 'B' UNION ALL
  SELECT '2018-01-01', 'C' UNION ALL
  SELECT '2018-01-01', 'D' UNION ALL
  SELECT '2018-01-02', 'B' UNION ALL
  SELECT '2018-01-02', 'C' UNION ALL
  SELECT '2018-01-02', 'E' UNION ALL
  SELECT '2018-01-03', 'A' UNION ALL
  SELECT '2018-01-03', 'P' UNION ALL
  SELECT '2018-01-04', 'A' UNION ALL
  SELECT '2018-01-04', 'C' UNION ALL
  SELECT '2018-01-05', 'D' UNION ALL
  SELECT '2018-01-05', 'B' UNION ALL
  SELECT '2018-01-05', 'B' UNION ALL
  SELECT '2018-01-06', 'P'
),
-- For every row, the visitor names of all rows dated the same day or the day
-- before (RANGE frame over the day number).
visitors_in_window AS (
  SELECT
    dt,
    ARRAY_AGG(visitor_name) OVER (
      ORDER BY UNIX_DATE(dt)
      RANGE BETWEEN 1 PRECEDING AND CURRENT ROW
    ) AS arr_visitors
  FROM `project.dataset.your_table`
)
-- One output row per window label, with the distinct count of its array.
SELECT
  CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) AS Date_range,
  ANY_VALUE((SELECT COUNT(DISTINCT visitor) FROM UNNEST(arr_visitors) AS visitor)) AS Visitor_Count
FROM visitors_in_window
GROUP BY Date_range
ORDER BY Date_range

с результатом

Row Date_range              Visitor_Count    
1   2017-12-31::2018-01-01  4    
2   2018-01-01::2018-01-02  5    
3   2018-01-02::2018-01-03  5    
4   2018-01-03::2018-01-04  3    
5   2018-01-04::2018-01-05  4    
6   2018-01-05::2018-01-06  3      
0 голосов
/ 31 декабря 2018

Вы можете сделать это, «умножив» записи перед агрегацией.То есть, дайте каждому пользователю запись для каждой даты, которую пользователь должен считать.

Вот пример:

WITH t AS (
      -- Inline sample data: one row per (visit date, visitor).
      SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name
     ),
-- Every visit is emitted twice: once for its own date (inc = 0) and once for
-- the following date (inc = 1), so each output date sees the visits of that
-- day and of the day before. The last input date therefore also produces a
-- row for the day after it.
visit_days AS (
      SELECT DISTINCT
             DATE_ADD(dt, INTERVAL inc DAY) AS dt,
             visitor_name
      FROM t
      CROSS JOIN UNNEST([0, 1]) AS inc
     )
SELECT
      dt,
      COUNT(DISTINCT visitor_name) AS num_visitors
FROM visit_days
GROUP BY dt
ORDER BY dt;
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...