BigQuery: преобразование устаревшего SQL-запроса в стандартный SQL - PullRequest
0 голосов
/ 28 мая 2018

Я конвертирую законный SQL-запрос в стандартный SQL-запрос в Bigquery, чтобы рассчитать показатель отказов Google Analytics.Но при преобразовании запроса есть некоторая разница в выходном результате.

Устаревший SQL-запрос

SELECT session_bounceRate,

  t1.source as source,t1.medium as medium,

  total_session

FROM (
  SELECT
    IFNULL(t1.session_bounceSessionCount, 0) / t2.session_sessionId_distinct_count AS session_bounceRate,
    t1.source,t1.medium,
    t2.total_session_distinct_count AS total_session
  FROM (
    SELECT
      INTEGER(session_bounceSession_distinct_count) AS session_bounceSessionCount,
    source,
        medium
    FROM (
      SELECT
        COUNT(DISTINCT session_sessionId, 10000000) AS session_bounceSession_distinct_count,    --Changes done (Count(x) in legacy sql stands for approx ,
      source,                                                                                   --,so replaced it with APPROX_COUNT_DISTINCT to get approx count
        medium from (
        SELECT
          SUM(IF(session_hitsType = 'event'
              AND session_isInteraction_first = 1, 1, 0)) AS session_isEventInteraction_sum,
          session_sessionId AS session_sessionId,
          SUM(session_pageViews_sum) AS session_pageViews_sum_sum,
           source,
           medium
        FROM (
          SELECT
            hits.type AS session_hitsType,
            sessionId AS session_sessionId,
            SUM(totals.pageviews) AS session_pageViews_sum,
            FIRST(hits.isInteraction) AS session_isInteraction_first,
            trafficSource.source AS source,
            trafficSource.medium AS medium,

          FROM
            TABLE_DATE_RANGE([[test:test.session_streaming_], TIMESTAMP('2018-04-01'), TIMESTAMP('2018-04-30')) AS session_streaming
          GROUP BY
            source,
            medium,
            session_hitsType,
            session_sessionId )
        GROUP BY
          source,
          medium,
          session_sessionId )
      WHERE
        (session_isEventInteraction_sum = 0
          AND session_pageViews_sum_sum = 1)
      GROUP BY
        source,
        medium ) ) AS t1
  JOIN EACH ( 
    SELECT
      COUNT(DISTINCT sessionId, 10000000) AS session_sessionId_distinct_count,
      trafficSource.source AS source,
      trafficSource.medium AS medium,
      COUNT(DISTINCT sessionId, 10000000) AS total_session_distinct_count

    FROM
      TABLE_DATE_RANGE([test:Test.session_streaming_], TIMESTAMP('2018-04-01'), TIMESTAMP('2018-04-30')) AS session_streaming
    GROUP BY
      source,
       medium ) AS t2
  ON
    t1.source = t2.source  
    and t1.medium=t2.medium)
where t1.medium='zadv_display'
ORDER BY
  total_session DESC

Стандартный SQL-запрос

SELECT
  IFNULL(t1.session_bounceSessionCount, 0) / t2.session_sessionId_distinct_count AS session_bounceRate,
  t1.source,
  t1.medium,
      t2.total_session_distinct_count AS total_session
FROM (
  SELECT
    CAST(session_bounceSession_distinct_count AS INT64) AS session_bounceSessionCount,
    source,
    medium

  FROM (
    SELECT
      APPROX_COUNT_DISTINCT(DISTINCT session_sessionId) AS session_bounceSession_distinct_count,
      source,
      medium

    FROM (
      SELECT
        SUM(IF(session_hitsType = 'event'
            AND session_isInteraction_first = 1, 1, 0)) AS session_isEventInteraction_sum,
        session_sessionId AS session_sessionId,
        SUM(session_pageViews_sum) AS session_pageViews_sum_sum,
        source,
        medium

      FROM (
        SELECT
          session_hitsType,
          session_sessionId,

          source,
          medium,
          CASE
            WHEN session_isInteraction_first = TRUE THEN 1
            ELSE 0
          END AS session_isInteraction_first,
          SUM(session_pageViews_sum) AS session_pageViews_sum
        FROM (
          SELECT
            hits.Type AS session_hitsType,
            sessionId AS session_sessionId,

            trafficSource.source AS source,
            trafficSource.medium AS medium,
            totals.pageviews AS session_pageViews_sum,
            FIRST_VALUE(hits.isInteraction) OVER(PARTITION BY sessionId ORDER BY TIMESTAMP_SECONDS(hits.Time)) AS session_isInteraction_first
          FROM
            `test.Test.session_streaming_*`,
            unNEST(hits) hits
          WHERE
            _table_suffix BETWEEN '20180401'
            AND '20180430' )
        GROUP BY
          session_hitsType,
          session_sessionId,

          source,
          medium,
          session_isInteraction_first )
      GROUP BY
        source,
        medium,
        session_sessionId)
    WHERE
      (session_isEventInteraction_sum = 0
        AND session_pageViews_sum_sum = 1)
    GROUP BY
      source,
      medium)) AS t1
JOIN (
  SELECT
    APPROX_COUNT_DISTINCT(DISTINCT sessionId) AS session_sessionId_distinct_count,
    trafficSource.source AS source,
    trafficSource.medium AS medium,
    APPROX_COUNT_DISTINCT(DISTINCT sessionId) AS total_session_distinct_count

  FROM
    `test.Test.session_streaming_*`
  WHERE _table_suffix BETWEEN '20180401' AND '20180430'
  GROUP BY
    source,
    medium
    ) AS t2
ON
  t1.source = t2.source
       AND t1.medium=t2.medium
  where t1.medium='zadv_display'
  order by total_session desc

Мы должны заменить функцию First взначение с First_value в стандартном sql, которое является заметным изменением, внесенным в запрос.

Может кто-нибудь подсказать мне, есть ли какая-то проблема при преобразовании, поскольку стандартный вывод запроса sql должен совпадать с устаревшим выводом?

1 Ответ

0 голосов
/ 25 июня 2018

Я не совсем уверен, что вам действительно нужны все эти вложенные запросы.Но вы должны использовать подзапросы для массивов - очень много.Например, информация о первом взаимодействии событий в сеансе выглядит следующим образом:

SELECT
  date,
  visitStartTime,
  (SELECT isInteraction FROM t.hits WHERE type='EVENT' ORDER BY hitNumber ASC LIMIT 1) AS isInteraction
FROM
  `project.dataset.ga_sessions_20180624` AS t
LIMIT
  1000

Так что просто относитесь к (struct-) массивам как к меньшим таблицам в большей таблице.Если у вас есть только агрегации до уровня сеанса, вам не следует выравнивать таблицу вообще.Вот также документ, объясняющий переход на стандарт SQL: https://cloud.google.com/bigquery/docs/reference/standard-sql/migrating-from-legacy-sql

...