Question

Я тестировал одну из своих программ и заметил разное время выполнения между запусками программы. Я обнаружил некоторые проблемы, но все-таки одна и та же программа с одинаковыми входными данными может выполняться в два раза медленнее. Я протестировал свою программу на Windows 10 (с Intel Skylake 8700k на борту) и двух компиляторах: Visual Studio 2019 и llvm для windows. Результаты были одинаковыми для обоих компиляторов (только в разное время).

Наконец, я создал простую программу для воспроизведения проблемы (прилагается ниже). Мне действительно любопытно, что вызывает такую огромную разницу в производительности. Я уже исправил ложное совместное использование на vector<vector<myData>> (это было для переменных-членов вектора), но основная проблема все еще существует.

Контрольный показатель

#include <chrono>
#include <cstdlib>
#include <cstdio>
#include <vector>
#include <omp.h>
using namespace std;

struct pr{
    unsigned short a, b, c;
    pr() = default;
    pr(int a, int b, int c)
        : a((unsigned short)a)
        , b((unsigned short)b)
        , c((unsigned short)c)
    {}
};

struct image{
    image(unsigned short w, unsigned short h){
        data = new unsigned char[w * h];
        width = w;
        height = h;
    }
    ~image(){
        delete[] data;
    }
    unsigned char* Row(int row){
        return data + row * width;
    }
    unsigned char* data;
    unsigned short width;
    unsigned short height;
};

//align to prevent false sharing
struct alignas(64) myData
{
    vector<pr> data;
};

vector<myData> buffer;
void Process(image& inImage, vector<pr>& outResult)
{
    //allocate some temporary buffers
    //don't release memory between iterations
    //just clear at the end of the function
    while (buffer.size() < 20)
        buffer.resize(20);
    outResult.clear(); //clear result

    #pragma omp parallel for schedule(static)
    for (int i = 0; i < inImage.height; ++i)
    {
        //get buffer based on thread id
        int id = omp_get_thread_num();
        vector<pr>& localout = id == 0 ? outResult : buffer[id].data;

        //perform some computations to fill buffers
        unsigned char* data = inImage.Row(i);
        int begin = 0;
        for (int j = 0; j < inImage.width; ++j)
        {
            if (data[j] < 20) begin = j;
            else if (data[j] > 190)
                localout.emplace_back(j, i, j - begin);
        }
    }

    //combine partial results
    for (int i = 0; i < buffer.size(); ++i)
        outResult.insert(outResult.end(), buffer[i].data.begin(), buffer[i].data.end());

    for (auto& item : buffer)
        item.data.clear();
}

int main()
{
    image img{ 1000, 1000 };

    //init with some random values
    srand(123);
    for (int i = 0; i < img.height; ++i){
        unsigned char* row = img.Row(i);
        for (int j = 0; j < img.width; ++j)
            row[j] = (unsigned char)(rand() % 256);
    }

    //measure execution time and display best result
    vector<pr> result;
    std::chrono::milliseconds minTime = std::chrono::milliseconds::max();
    for(int it = 0; true; ++it){
        auto start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < 100; ++i)
            Process(img, result);
        auto finish = std::chrono::high_resolution_clock::now();
        auto d = std::chrono::duration_cast<std::chrono::milliseconds>(finish - start);

        if (d < minTime){
            minTime = d;
            printf("min: %lld ms\n", minTime.count());
        }
    }
    return 0;
}

Наконец после запуска программы несколько раз я получаю такие примеры выходных данных:

Прогон 1

min: 101 ms
min: 90 ms
min: 87 ms
min: 86 ms
min: 85 ms
min: 84 ms
min: 82 ms
min: 81 ms
min: 80 ms
min: 79 ms

Прогон 2

min: 66 ms
min: 51 ms
min: 50 ms
min: 49 ms
min: 48 ms

Выполнить 3

min: 101 ms
min: 93 ms
min: 92 ms
min: 88 ms

Выполнить 4

min: 130 ms
min: 122 ms
min: 120 ms
min: 119 ms
min: 118 ms

Когда минимальное время выполнения было большим, как 118, тогда программа работала намного дольше (поэтому я ждал, если возможно получить лучший результат, но без каких-либо существенных улучшений)

Как мы видим, минимальная разница во времени выполнения огромна (48 мс в лучшем случае против 118 мс в худшем случае). Я также записал среднее время. он ведет себя аналогичным образом.

При записи данных с помощью VTune я вижу, что программа привязана к L3 (когда программа работает медленно). В обоих случаях я выполняю программу с теми же данными и получаю доступ к данным с полностью предсказуемым шаблоном. Так, что вызывает такое большое замедление? Как это предотвратить?

Редактировать (основываясь на комментариях ниже)

Благодаря комментарию @PeterCordes, наконец, программа работает всегда быстро. Оптимизированная версия:

void Process(image& inImage, vector<pr>& outResult)
{
    //allocate some temporary buffers
    //don't release memory between iterations
    //just clear at the end of the function
    while (buffer.size() < 20)
        buffer.resize(20);
    outResult.clear(); //clear result

    #pragma omp parallel for schedule(static)
    for (int i = 0; i < inImage.height; ++i)
    {
        //get buffer based on thread id
        int id = omp_get_thread_num();
        vector<pr> localout = std::move(id == 0 ? outResult : buffer[id].data);

        //perform some computations to fill buffers
        unsigned char* data = inImage.Row(i);
        int width = inImage.width;
        int begin = 0;
        for (int j = 0; j < width; ++j)
        {
            if (data[j] < 20) begin = j;
            else if (data[j] > 190)
                localout.emplace_back(j, i, j - begin);
        }

        localout.swap(id == 0 ? outResult : buffer[id].data);
    }

    //combine partial results
    for (int i = 0; i < buffer.size(); ++i)
        outResult.insert(outResult.end(), buffer[i].data.begin(), buffer[i].data.end());

    for (auto& item : buffer)
        item.data.clear();
}

Сборка из первой версии:

; 56   :        vector<pr>& localout = id == 0 ? outResult : buffer[id].data;
    test    eax, eax
    jne SHORT $LN19@Process$om
    mov rbx, r13
    jmp SHORT $LN20@Process$om
$LN19@Process$om:
    movsxd  rbx, eax
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 1505 :         return _My_data._Myfirst[_Pos];
    shl rbx, 6
    add rbx, QWORD PTR ?buffer@@3V?$vector@UmyData@@V?$allocator@UmyData@@@std@@@std@@A
$LN20@Process$om:
; Source.cpp
; 28   :        return data + row * width;
    movzx   ecx, WORD PTR [r15+8]
; 61   :        for (int j = 0; j < inImage.width; ++j)
    xor edi, edi
; 28   :        return data + row * width;
    mov eax, ecx
; 61   :        for (int j = 0; j < inImage.width; ++j)
    mov DWORD PTR j$2[rsp], edi
; 28   :        return data + row * width;
    imul    eax, esi
; 57   : 
; 58   :        //perform some computations to fill buffers
; 59   :        unsigned char* data = inImage.Row(i);
; 60   :        int begin = 0;
    xor ebp, ebp
; 28   :        return data + row * width;
    movsxd  r14, eax
    add r14, QWORD PTR [r15]
; 61   :        for (int j = 0; j < inImage.width; ++j)
    test    ecx, ecx
    je  SHORT $LN4@Process$om
$LL8@Process$om:
; 62   :        {
; 63   :            if (data[j] < 20) begin = j;
    movsxd  rax, edi
    movzx   ecx, BYTE PTR [rax+r14]
    cmp cl, 20
    jae SHORT $LN15@Process$om
    mov ebp, edi
    jmp SHORT $LN6@Process$om
$LN15@Process$om:
; 64   :            else if (data[j] > 190)
    cmp cl, 190                 ; 000000beH
    jbe SHORT $LN6@Process$om
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 705  :         if (_Mylast != _My_data._Myend) {
    mov rdx, QWORD PTR [rbx+8]
; Source.cpp
; 65   :                localout.emplace_back(j, i, j - begin);
    mov eax, edi
    sub eax, ebp
    mov DWORD PTR $T5[rsp], eax
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 705  :         if (_Mylast != _My_data._Myend) {
    cmp rdx, QWORD PTR [rbx+16]
    je  SHORT $LN8@Process$om
; Source.cpp
; 12   :        : a((unsigned short)a)
    mov WORD PTR [rdx], di
; 13   :        , b((unsigned short)b)
    mov WORD PTR [rdx+2], si
; 14   :        , c((unsigned short)c)
    mov WORD PTR [rdx+4], ax
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 691  :         ++_Mylast;
    add QWORD PTR [rbx+8], 6
; 706  :             return _Emplace_back_with_unused_capacity(_STD forward<_Valty>(_Val)...);
    jmp SHORT $LN6@Process$om
$LN8@Process$om:
; 707  :         }
; 708  : 
; 709  :         _Ty& _Result = *_Emplace_reallocate(_Mylast, _STD forward<_Valty>(_Val)...);
    lea rax, QWORD PTR $T5[rsp]
    mov rcx, rbx
    lea r9, QWORD PTR i$3[rsp]
    mov QWORD PTR [rsp+32], rax
    lea r8, QWORD PTR j$2[rsp]
    call    ??$_Emplace_reallocate@AEAHAEAHH@?$vector@Upr@@V?$allocator@Upr@@@std@@@std@@QEAAPEAUpr@@QEAU2@AEAH1$$QEAH@Z ; std::vector<pr,std::allocator<pr> >::_Emplace_reallocate<int &,int &,int>
$LN6@Process$om:
; Source.cpp
; 61   :        for (int j = 0; j < inImage.width; ++j)
    movzx   eax, WORD PTR [r15+8]
    inc edi
    mov DWORD PTR j$2[rsp], edi
    cmp edi, eax
    jl  SHORT $LL8@Process$om
$LN4@Process$om:

Сборка из окончательной версии:

; 56   :        vector<pr> localout = std::move(id == 0 ? outResult : buffer[id].data);
    test    r15d, r15d
    jne SHORT $LN19@Process$om
    mov rcx, r13
    jmp SHORT $LN20@Process$om
$LN19@Process$om:
    mov rcx, r15
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 1505 :         return _My_data._Myfirst[_Pos];
    shl rcx, 6
    add rcx, QWORD PTR ?buffer@@3V?$vector@UmyData@@V?$allocator@UmyData@@@std@@@std@@A
$LN20@Process$om:
; 385  :         _Myfirst = _Right._Myfirst;
    mov r8, QWORD PTR [rcx]
    mov QWORD PTR localout$6[rbp-112], r8
; 386  :         _Mylast  = _Right._Mylast;
    mov rdx, QWORD PTR [rcx+8]
    mov QWORD PTR localout$6[rbp-104], rdx
; 387  :         _Myend   = _Right._Myend;
    mov r9, QWORD PTR [rcx+16]
    mov QWORD PTR localout$6[rbp-96], r9
; 388  : 
; 389  :         _Right._Myfirst = pointer();
    mov QWORD PTR [rcx], rsi
; 390  :         _Right._Mylast  = pointer();
    mov QWORD PTR [rcx+8], rsi
; 391  :         _Right._Myend   = pointer();
    mov QWORD PTR [rcx+16], rsi
Source.cpp
; 28   :        return data + row * width;
    movzx   r14d, WORD PTR [rbx+8]
    mov eax, r14d
    imul    eax, edi
    movsxd  r12, eax
    add r12, QWORD PTR [rbx]
; 62   :        for (int j = 0; j < width; ++j)
    xor ebx, ebx
    mov DWORD PTR j$3[rbp-112], ebx
    test    r14d, r14d
    je  SHORT $LN7@Process$om
    npad    5
$LL8@Process$om:
; 63   :        {
; 64   :            if (data[j] < 20) begin = j;
    movsxd  rax, ebx
    movzx   ecx, BYTE PTR [rax+r12]
    cmp cl, 20
    jae SHORT $LN15@Process$om
    mov esi, ebx
    jmp SHORT $LN6@Process$om
$LN15@Process$om:
; 65   :            else if (data[j] > 190)
    cmp cl, 190                 ; 000000beH
    jbe SHORT $LN6@Process$om
; 66   :                localout.emplace_back(j, i, j - begin);
    mov eax, ebx
    sub eax, esi
    mov DWORD PTR $T5[rbp-112], eax
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 705  :         if (_Mylast != _My_data._Myend) {
    cmp rdx, r9
    je  SHORT $LN21@Process$om
; Source.cpp
; 12   :        : a((unsigned short)a)
    mov WORD PTR [rdx], bx
; 13   :        , b((unsigned short)b)
    mov WORD PTR [rdx+2], di
; 14   :        , c((unsigned short)c)
    mov WORD PTR [rdx+4], ax
; File C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\vector
; 691  :         ++_Mylast;
    add rdx, 6
    mov QWORD PTR localout$6[rbp-104], rdx
; 706  :             return _Emplace_back_with_unused_capacity(_STD forward<_Valty>(_Val)...);
    jmp SHORT $LN6@Process$om
$LN21@Process$om:
; 707  :         }
; 708  : 
; 709  :         _Ty& _Result = *_Emplace_reallocate(_Mylast, _STD forward<_Valty>(_Val)...);
    lea rax, QWORD PTR $T5[rbp-112]
    mov QWORD PTR [rsp+32], rax
    lea r9, QWORD PTR i$2[rbp-112]
    lea r8, QWORD PTR j$3[rbp-112]
    lea rcx, QWORD PTR localout$6[rbp-112]
    call    ??$_Emplace_reallocate@AEAHAEAHH@?$vector@Upr@@V?$allocator@Upr@@@std@@@std@@QEAAPEAUpr@@QEAU2@AEAH1$$QEAH@Z ; std::vector<pr,std::allocator<pr> >::_Emplace_reallocate<int &,int &,int>
    mov r9, QWORD PTR localout$6[rbp-96]
    mov rdx, QWORD PTR localout$6[rbp-104]
$LN6@Process$om:
; Source.cpp
; 62   :        for (int j = 0; j < width; ++j)
    inc ebx
    mov DWORD PTR j$3[rbp-112], ebx
    cmp ebx, r14d
    jl  SHORT $LL8@Process$om
    mov r8, QWORD PTR localout$6[rbp-112]
$LN7@Process$om:
; 67   :        }
; 68   : 
; 69   :        localout.swap(id == 0 ? outResult : buffer[id].data);

Огромная минимальная разница во времени выполнения алгоритма при многократном запуске одной и той же программы на одних и тех же данных.

Контрольный показатель

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

Огромная минимальная разница во времени выполнения алгоритма при многократном запуске одной и той же программы на одних и тех же данных.

Контрольный показатель

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

Нет похожих вопросов