Я изучил дизассемблированный листинг полученного машинного кода. Добавление проверки if (left > 0 || right > 0)
заметно меняет код, который компилятор генерирует для последующего цикла. Похоже, во втором случае компилятор создаёт менее оптимальный код, что может объяснить снижение производительности на 20%. Исходный вариант (без проверки):
184 /* fill left and right borders from top to bottom border */
185
186 for (int y = top2; y < bottom2; y += lz) {
0x00000000002004ca <+138>: cmp %r14d,%esi
0x00000000002004d1 <+145>: jge 0x2005ad <mirror_borders16+365>
0x00000000002004d7 <+151>: movslq %ebx,%rax
0x00000000002004da <+154>: lea -0x2(%rbp),%r15
0x00000000002004de <+158>: lea -0x1(%r11),%r12d
0x00000000002004e2 <+162>: lea (%rax,%rax,1),%r8
0x00000000002004e6 <+166>: movslq %esi,%rax
0x000000000020050e <+206>: mov $0x1,%r12d
0x0000000000200514 <+212>: mov %ebx,%r9d
0x0000000000200517 <+215>: mov %rbx,0x30(%rsp)
0x000000000020051c <+220>: sub %rax,%r15
0x000000000020051f <+223>: sub %edx,%r12d
0x0000000000200522 <+226>: mov %r14d,%ebx
0x0000000000200525 <+229>: nopl (%rax)
0x00000000002005a0 <+352>: lea (%r12,%rsi,1),%eax
0x00000000002005a4 <+356>: cmp %eax,%ebx
0x00000000002005a6 <+358>: jg 0x200528 <mirror_borders16+232>
0x00000000002005a8 <+360>: mov 0x30(%rsp),%rbx
187 for (int x = 0; x < left; x++)
0x0000000000200528 <+232>: test %r11d,%r11d
0x000000000020052b <+235>: jle 0x20055f <mirror_borders16+287>
0x000000000020052d <+237>: movslq %esi,%r14
0x0000000000200530 <+240>: mov %rdi,%rdx
0x0000000000200533 <+243>: mov %ecx,(%rsp)
0x0000000000200536 <+246>: add %r14,%r14
0x0000000000200539 <+249>: lea 0x0(%rbp,%r14,1),%rax
0x000000000020053e <+254>: add %r13,%r14
0x0000000000200541 <+257>: nopl 0x0(%rax)
0x0000000000200557 <+279>: cmp %rax,%r14
0x000000000020055a <+282>: jne 0x200548 <mirror_borders16+264>
0x000000000020055c <+284>: mov (%rsp),%ecx
188 data[y + x] = data[y + left * 2 - 1 - x];
0x00000000002004e9 <+169>: lea (%r11,%r11,1),%edx
0x00000000002004ed <+173>: sub $0x1,%ecx
0x00000000002004f0 <+176>: lea 0x0(%rbp,%rax,2),%rdi
0x00000000002004f5 <+181>: lea -0x1(%r10),%eax
0x00000000002004f9 <+185>: add %r12,%r12
0x00000000002004fc <+188>: mov %r15,%r13
0x00000000002004ff <+191>: sub %r10d,%ecx
0x0000000000200502 <+194>: add %esi,%ecx
0x0000000000200504 <+196>: add %rax,%rax
0x0000000000200507 <+199>: sub %r12,%r13
0x000000000020050a <+202>: lea -0x1(%rsi,%rdx,1),%esi
0x0000000000200548 <+264>: movzwl (%rax),%ecx
0x000000000020054b <+267>: sub $0x2,%rax
0x000000000020054f <+271>: add $0x2,%rdx
0x0000000000200553 <+275>: mov %cx,-0x2(%rdx)
189 for (int x = 0; x < right; x++)
0x000000000020055f <+287>: test %r10d,%r10d
0x0000000000200562 <+290>: jle 0x200597 <mirror_borders16+343>
0x0000000000200564 <+292>: lea 0x1(%rcx),%edx
0x0000000000200567 <+295>: movslq %ecx,%r14
0x000000000020056a <+298>: mov %ecx,(%rsp)
0x000000000020056d <+301>: add %r14,%r14
0x0000000000200570 <+304>: movslq %edx,%rdx
0x0000000000200573 <+307>: lea 0x0(%rbp,%r14,1),%rax
0x0000000000200578 <+312>: add %r15,%r14
0x000000000020057b <+315>: lea 0x0(%rbp,%rdx,2),%rdx
0x000000000020058f <+335>: cmp %rax,%r14
0x0000000000200592 <+338>: jne 0x200580 <mirror_borders16+320>
0x0000000000200594 <+340>: mov (%rsp),%ecx
0x0000000000200597 <+343>: add %r9d,%esi
0x000000000020059a <+346>: add %r9d,%ecx
0x000000000020059d <+349>: add %r8,%rdi
190 data[y + width - right + x] = data[y + width - right - 1 - x];
0x0000000000200580 <+320>: movzwl (%rax),%ecx
0x0000000000200583 <+323>: sub $0x2,%rax
0x0000000000200587 <+327>: add $0x2,%rdx
0x000000000020058b <+331>: mov %cx,-0x2(%rdx)
191 }
Вариант с проверкой, позволяющей пропустить бесполезные циклы:
184 /* fill left and right borders from top to bottom border */
185 if (left > 0 || right > 0) // in case skip for performance
0x00000000002004f7 <+135>: test %r8d,%r8d
0x00000000002004fe <+142>: jg 0x200640 <mirror_borders16+464>
0x0000000000200504 <+148>: test %ecx,%ecx
0x0000000000200506 <+150>: jg 0x200640 <mirror_borders16+464>
186 for (int y = top2; y < bottom2; y += lz) {
0x0000000000200640 <+464>: cmp 0x24(%rsp),%r15d
0x0000000000200645 <+469>: jge 0x20050c <mirror_borders16+156>
0x000000000020064b <+475>: mov 0x20(%rsp),%ebp
0x0000000000200661 <+497>: mov %r15d,0x4c(%rsp)
0x0000000000200666 <+502>: sub %ecx,%r9d
0x0000000000200669 <+505>: lea -0x1(%rax,%r15,1),%esi
0x000000000020066e <+510>: mov 0x24(%rsp),%r15d
0x0000000000200673 <+515>: sub %eax,%ebp
0x0000000000200675 <+517>: lea (%r11,%rdx,2),%rdi
0x0000000000200679 <+521>: lea -0x1(%r8),%edx
0x000000000020067d <+525>: mov %ebp,%r10d
0x0000000000200680 <+528>: add %r9d,%ebp
0x0000000000200683 <+531>: lea -0x2(%r11),%r9
0x0000000000200687 <+535>: movslq %ebx,%r13
0x000000000020068a <+538>: add %rdx,%rdx
0x000000000020068d <+541>: mov %ebx,%r12d
0x0000000000200690 <+544>: mov %r9,%r14
0x0000000000200693 <+547>: mov $0x1,%r9d
0x0000000000200699 <+553>: add %r13,%r13
0x000000000020069c <+556>: sub %ecx,%r10d
0x000000000020069f <+559>: sub %rdx,%r14
0x00000000002006a2 <+562>: sub %eax,%r9d
0x00000000002006a5 <+565>: mov %rbx,0x38(%rsp)
0x00000000002006aa <+570>: nopw 0x0(%rax,%rax,1)
0x000000000020072d <+701>: lea (%r9,%rsi,1),%eax
0x0000000000200731 <+705>: cmp %eax,%r15d
0x0000000000200734 <+708>: jg 0x2006b0 <mirror_borders16+576>
0x000000000020073a <+714>: mov 0x38(%rsp),%rbx
0x000000000020073f <+719>: mov 0x4c(%rsp),%r15d
0x0000000000200744 <+724>: jmpq 0x20050c <mirror_borders16+156>
0x0000000000200749 <+729>: repz retq
0x000000000020074b: nopl 0x0(%rax,%rax,1)
187 for (int x = 0; x < left; x++)
0x00000000002006b0 <+576>: test %r8d,%r8d
0x00000000002006b3 <+579>: jle 0x2006ec <mirror_borders16+636>
0x00000000002006b5 <+581>: movslq %esi,%rbx
0x00000000002006b8 <+584>: mov %rdi,%rdx
0x00000000002006bb <+587>: mov %ecx,0x8(%rsp)
0x00000000002006bf <+591>: add %rbx,%rbx
0x00000000002006c2 <+594>: lea (%r11,%rbx,1),%rax
0x00000000002006c6 <+598>: add %r14,%rbx
0x00000000002006c9 <+601>: nopl 0x0(%rax)
0x00000000002006df <+623>: cmp %rax,%rbx
0x00000000002006e2 <+626>: jne 0x2006d0 <mirror_borders16+608>
0x00000000002006e4 <+628>: mov 0x8(%rsp),%ecx
0x00000000002006f0 <+640>: mov %esi,0x8(%rsp)
0x00000000002006f4 <+644>: cltq
0x00000000002006f6 <+646>: lea (%r11,%rax,2),%rdx
0x00000000002006fa <+650>: lea 0x0(%rbp,%rsi,1),%eax
0x00000000002006fe <+654>: cltq
0x0000000000200700 <+656>: lea (%r11,%rax,2),%rbx
0x0000000000200704 <+660>: xor %eax,%eax
0x0000000000200706 <+662>: nopw %cs:0x0(%rax,%rax,1)
188 data[y + x] = data[y + left * 2 - 1 - x];
0x000000000020064f <+479>: lea (%r8,%r8,1),%eax
0x0000000000200653 <+483>: mov 0x18(%rsp),%r11
0x0000000000200658 <+488>: mov $0x1,%r9d
0x000000000020065e <+494>: movslq %r15d,%rdx
0x00000000002006d0 <+608>: movzwl (%rax),%ecx
0x00000000002006d3 <+611>: sub $0x2,%rax
0x00000000002006d7 <+615>: add $0x2,%rdx
0x00000000002006db <+619>: mov %cx,-0x2(%rdx)
189 for (int x = 0; x < right; x++)
0x00000000002006e8 <+632>: test %ecx,%ecx
0x00000000002006ea <+634>: jle 0x200727 <mirror_borders16+695>
0x00000000002006ec <+636>: lea (%r10,%rsi,1),%eax
0x000000000020071f <+687>: cmp %eax,%ecx
0x0000000000200721 <+689>: jg 0x200710 <mirror_borders16+672>
0x0000000000200723 <+691>: mov 0x8(%rsp),%esi
0x0000000000200727 <+695>: add %r12d,%esi
0x000000000020072a <+698>: add %r13,%rdi
190 data[y + width - right + x] = data[y + width - right - 1 - x];
0x0000000000200710 <+672>: movzwl (%rdx),%esi
0x0000000000200713 <+675>: sub $0x2,%rdx
0x0000000000200717 <+679>: mov %si,(%rbx,%rax,2)
0x000000000020071b <+683>: add $0x1,%rax
191 }