Есть ли способ оптимизировать группу по срединному запросу? - PullRequest
0 голосов
/ 13 февраля 2020

Я написал запрос, который находит медиану значений для каждого месяца. Сделать это было достаточно сложно, потому что у MySQL нет встроенной медианной функции, поэтому я действительно должен был мыслить нестандартно со своими промежуточными навыками SQL. Но теперь проблема в том, что для выполнения запроса требуется очень много времени (иногда 1 или 2 минуты). Есть ли способ оптимизировать этот запрос? Или, возможно, мне следует написать скрипт python, который находит медиану и передает ее в базу данных с помощью соединителя?

Вот запрос:

SET @row_num_pos := 0;
SET @median_group_pos := '';
SET @row_num_neg := 0;
SET @median_group_neg := '';

SELECT 
    p.month_num AS 'month_num',
    CASE
        WHEN p.month_num = 1 THEN 'Jan'
        WHEN p.month_num = 2 THEN 'Feb'
        WHEN p.month_num = 3 THEN 'Mar'
        WHEN p.month_num = 4 THEN 'Apr'
        WHEN p.month_num = 5 THEN 'May'
        WHEN p.month_num = 6 THEN 'Jun'
        WHEN p.month_num = 7 THEN 'Jul'
        WHEN p.month_num = 8 THEN 'Aug'
        WHEN p.month_num = 9 THEN 'Sep'
        WHEN p.month_num = 10 THEN 'Oct'
        WHEN p.month_num = 11 THEN 'Nov'
        WHEN p.month_num = 12 THEN 'Dec'
    END AS 'Timeline',
    p.ck_pos_median AS 'CK+ Median',
    n.ck_neg_median AS 'CK- Median'
FROM
    (SELECT 
        s.median_month_pos AS 'month_num',
            ROUND(AVG(ck_pos), 1) AS 'ck_pos_median'
    FROM
        (SELECT 
        @row_num_pos:=CASE
                WHEN @median_group_pos = q.month_num THEN @row_num_pos + 1
                ELSE 1
            END AS 'count_of_group',
            @median_group_pos:=q.month_num AS 'median_month_pos',
            q.month_num,
            q.ck_pos,
            (SELECT 
                    COUNT(*)
                FROM
                    Biocept_DB.result_management_report
                WHERE
                    ck_pos IS NOT NULL
                        AND MONTH(order_date) = q.month_num) AS total_month
    FROM
        (SELECT 
        MONTH(order_date) AS 'month_num', ck_pos
    FROM
        Biocept_DB.result_management_report
    WHERE
        ck_pos IS NOT NULL
    ORDER BY MONTH(order_date) , ck_pos ASC) AS q) AS s
    WHERE
        s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
    GROUP BY s.median_month_pos) AS p
        JOIN
    (SELECT 
        s.median_month_neg AS 'month_num',
            ROUND(AVG(ck_neg), 1) AS 'ck_neg_median'
    FROM
        (SELECT 
        @row_num_neg:=CASE
                WHEN @median_group_neg = q.month_num THEN @row_num_neg + 1
                ELSE 1
            END AS 'count_of_group',
            @median_group_neg:=q.month_num AS 'median_month_neg',
            q.month_num,
            q.ck_neg,
            (SELECT 
                    COUNT(*)
                FROM
                    Biocept_DB.result_management_report
                WHERE
                    ck_neg IS NOT NULL
                        AND MONTH(order_date) = q.month_num) AS total_month
    FROM
        (SELECT 
        MONTH(order_date) AS 'month_num', ck_neg
    FROM
        Biocept_DB.result_management_report
    WHERE
        ck_neg IS NOT NULL
    ORDER BY MONTH(order_date) , ck_neg ASC) AS q) AS s
    WHERE
        s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
    GROUP BY s.median_month_neg) AS n ON p.month_num = n.month_num
ORDER BY p.month_num;

SET @row_num_pos := NULL;
SET @median_group_pos := NULL;
SET @row_num_neg := NULL;
SET @median_group_neg := NULL;

Вот таблица, которую он генерирует : enter image description here

1 Ответ

1 голос
/ 16 февраля 2020

Я немного изменил ваш запрос. Я надеюсь, что расчет верен. С вашей датой выборки результаты совпадают.

В моей среде ваш запрос занимает 6,22 секунды, а мои - только 20 мс. поэтому он выглядит в 300 раз быстрее.

Пожалуйста, проверьте мой запрос и дайте мне знать, работает ли он для вас. если скорость не очень хорошая, мы можем оптимизировать ее с помощью виртуальных столбцов.

Пожалуйста, не забудьте установить для group_concat_max_len подходящее для вас значение.

SET SESSION group_concat_max_len = 1000000;

Запрос

SELECT  r.Timeline AS `month_number`
        , SUBSTRING_INDEX(SUBSTRING_INDEX( 'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec', ',' , r.Timeline ) , ',' , -1)
            AS Timeline
        , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1)+1 ) , ',' , -1) +
            SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1) + 1 - ((r.cnt_ck_pos&1) XOR 1) ) , ',' , -1) 
            ) / 2 AS 'ck_pos_median'
        , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1)+1 ) , ',' , -1) +
            SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1) + 1 - ((r.cnt_ck_neg&1) XOR 1) ) , ',' , -1) 
            ) / 2 AS 'ck_neg_median'
FROM (
        SELECT  MONTH(`order_date`) AS 'Timeline'
            , SUM(IF(ck_pos is NULL,0,1)) AS cnt_ck_pos
            , GROUP_CONCAT(ck_pos ORDER BY ck_pos) as grp_ck_pos
            , SUM(IF(ck_neg is NULL,0,1)) AS cnt_ck_neg
            , GROUP_CONCAT(ck_neg ORDER BY ck_neg) as grp_ck_neg
        FROM result_management_report
        where (ck_pos is not null or ck_neg is not null) = 1
        GROUP BY Timeline
) r;

Определение таблицы

Образец нового запроса

mysql> SELECT r.Timeline AS `month_number`
    -> , SUBSTRING_INDEX(SUBSTRING_INDEX( 'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec', ',' , r.Timeline ) , ',' , -1)
    -> AS Timeline
    -> , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1)+1 ) , ',' , -1) +
    -> SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1) + 1 - ((r.cnt_ck_pos&1) XOR 1) ) , ',' , -1) 
    -> ) / 2 AS 'ck_pos_median'
    -> , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1)+1 ) , ',' , -1) +
    -> SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1) + 1 - ((r.cnt_ck_neg&1) XOR 1) ) , ',' , -1) 
    -> ) / 2 AS 'ck_neg_median'
    -> FROM (
    -> SELECT MONTH(`order_date`) AS 'Timeline'
    -> , SUM(IF(ck_pos is NULL,0,1)) AS cnt_ck_pos
    -> , GROUP_CONCAT(ck_pos ORDER BY ck_pos) as grp_ck_pos
    -> , SUM(IF(ck_neg is NULL,0,1)) AS cnt_ck_neg
    -> , GROUP_CONCAT(ck_neg ORDER BY ck_neg) as grp_ck_neg
    -> FROM result_management_report
    -> where (ck_pos is not null or ck_neg is not null) = 1
    -> GROUP BY Timeline
    -> ) r;
+--------------+----------+---------------+---------------+
| month_number | Timeline | ck_pos_median | ck_neg_median |
+--------------+----------+---------------+---------------+
|            1 | Jan      |             2 |             2 |
|            2 | Feb      |             2 |             3 |
|            3 | Mar      |             2 |             3 |
|            4 | Apr      |             4 |             4 |
|            5 | May      |             2 |             3 |
|            6 | Jun      |             3 |             3 |
|            7 | Jul      |             4 |             4 |
|            8 | Aug      |             3 |             7 |
|            9 | Sep      |             4 |            12 |
|           10 | Oct      |             5 |             8 |
|           11 | Nov      |             4 |             9 |
|           12 | Dec      |             2 |            12 |
+--------------+----------+---------------+---------------+
12 rows in set (0.02 sec)

Пример старого запроса

mysql> 
mysql> SET @row_num_pos := 0;
Query OK, 0 rows affected (0.00 sec)

mysql> SET @median_group_pos := '';
Query OK, 0 rows affected (0.00 sec)

mysql> SET @row_num_neg := 0;
Query OK, 0 rows affected (0.00 sec)

mysql> SET @median_group_neg := '';
Query OK, 0 rows affected (0.00 sec)

mysql> 
mysql> SELECT 
    ->     p.month_num AS 'month_num',
    ->     CASE
    ->         WHEN p.month_num = 1 THEN 'Jan'
    ->         WHEN p.month_num = 2 THEN 'Feb'
    ->         WHEN p.month_num = 3 THEN 'Mar'
    ->         WHEN p.month_num = 4 THEN 'Apr'
    ->         WHEN p.month_num = 5 THEN 'May'
    ->         WHEN p.month_num = 6 THEN 'Jun'
    ->         WHEN p.month_num = 7 THEN 'Jul'
    ->         WHEN p.month_num = 8 THEN 'Aug'
    ->         WHEN p.month_num = 9 THEN 'Sep'
    ->         WHEN p.month_num = 10 THEN 'Oct'
    ->         WHEN p.month_num = 11 THEN 'Nov'
    ->         WHEN p.month_num = 12 THEN 'Dec'
    ->     END AS 'Timeline',
    ->     p.ck_pos_median AS 'CK+ Median',
    ->     n.ck_neg_median AS 'CK- Median'
    -> FROM
    ->     (SELECT 
    ->         s.median_month_pos AS 'month_num',
    ->             ROUND(AVG(ck_pos), 1) AS 'ck_pos_median'
    ->     FROM
    ->         (SELECT 
    ->         @row_num_pos:=CASE
    ->                 WHEN @median_group_pos = q.month_num THEN @row_num_pos + 1
    ->                 ELSE 1
    ->             END AS 'count_of_group',
    ->             @median_group_pos:=q.month_num AS 'median_month_pos',
    ->             q.month_num,
    ->             q.ck_pos,
    ->             (SELECT 
    ->                     COUNT(*)
    ->                 FROM
    ->                     result_management_report
    ->                 WHERE
    ->                     ck_pos IS NOT NULL
    ->                         AND MONTH(order_date) = q.month_num) AS total_month
    ->     FROM
    ->         (SELECT 
    ->         MONTH(order_date) AS 'month_num', ck_pos
    ->     FROM
    ->         result_management_report
    ->     WHERE
    ->         ck_pos IS NOT NULL
    ->     ORDER BY MONTH(order_date) , ck_pos ASC) AS q) AS s
    ->     WHERE
    ->         s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
    ->     GROUP BY s.median_month_pos) AS p
    ->         JOIN
    ->     (SELECT 
    ->         s.median_month_neg AS 'month_num',
    ->             ROUND(AVG(ck_neg), 1) AS 'ck_neg_median'
    ->     FROM
    ->         (SELECT 
    ->         @row_num_neg:=CASE
    ->                 WHEN @median_group_neg = q.month_num THEN @row_num_neg + 1
    ->                 ELSE 1
    ->             END AS 'count_of_group',
    ->             @median_group_neg:=q.month_num AS 'median_month_neg',
    ->             q.month_num,
    ->             q.ck_neg,
    ->             (SELECT 
    ->                     COUNT(*)
    ->                 FROM
    ->                     result_management_report
    ->                 WHERE
    ->                     ck_neg IS NOT NULL
    ->                         AND MONTH(order_date) = q.month_num) AS total_month
    ->     FROM
    ->         (SELECT 
    ->         MONTH(order_date) AS 'month_num', ck_neg
    ->     FROM
    ->         result_management_report
    ->     WHERE
    ->         ck_neg IS NOT NULL
    ->     ORDER BY MONTH(order_date) , ck_neg ASC) AS q) AS s
    ->     WHERE
    ->         s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
    ->     GROUP BY s.median_month_neg) AS n ON p.month_num = n.month_num
    -> ORDER BY p.month_num;
+-----------+----------+------------+------------+
| month_num | Timeline | CK+ Median | CK- Median |
+-----------+----------+------------+------------+
|         1 | Jan      |        2.0 |        2.0 |
|         2 | Feb      |        2.0 |        3.0 |
|         3 | Mar      |        2.0 |        3.0 |
|         4 | Apr      |        4.0 |        4.0 |
|         5 | May      |        2.0 |        3.0 |
|         6 | Jun      |        3.0 |        3.0 |
|         7 | Jul      |        4.0 |        4.0 |
|         8 | Aug      |        3.0 |        7.0 |
|         9 | Sep      |        4.0 |       12.0 |
|        10 | Oct      |        5.0 |        8.0 |
|        11 | Nov      |        4.0 |        9.0 |
|        12 | Dec      |        2.0 |       12.0 |
+-----------+----------+------------+------------+
12 rows in set (6.22 sec)

mysql> 
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...