Я реализовал простой тест для обоих методов.
Результат: полосатый макет в лучшем случае на 10% быстрее стандартного макета (SoA)*. Но с SSE4.1 можно добиться гораздо большего.
* При компиляции с gcc -Ofast на процессоре i5-7200U.
Со структурой работать несколько проще, но она гораздо менее универсальна. Однако это может дать некоторое преимущество в реальном сценарии, если распределитель памяти достаточно загружен.
Полосатый макет
Time 4624 ms
Memory usage summary: heap total: 713728, heap peak: 713728, stack peak: 2896
total calls total memory failed calls
malloc| 3 713728 0
realloc| 0 0 0 (nomove:0, dec:0, free:0)
calloc| 0 0 0
free| 1 640000
#include <chrono>
#include <cstdio>
#include <random>
#include <vector>
#include <xmmintrin.h>
/* -----------------------------------------------------------------------------
Striped layout [X,X,X,X,y,y,y,y,Z,Z,Z,Z,w,w,w,w,X,X,X,X...]

Four consecutive __m128 elements form one group. normalize() reads the first
element of a group as the x components of four 4D vectors and the next three
as their y, z and w components.
----------------------------------------------------------------------------- */
using AoSoA_scene = std::vector<__m128>;
/**
 * Prints up to the first 8 four-component vectors of a striped scene, one
 * vector per line, followed by the number of vectors that were not shown.
 *
 * Vector j is stored in group j / 4 (four consecutive __m128 holding the x,
 * y, z and w components of four vectors) at lane j % 4. Only vectors that
 * belong to a complete group of four registers are printed, so a trailing
 * partial group is never indexed.
 *
 * @param scene Striped scene; its element count equals the number of 4D
 *              vectors it stores (four registers describe four vectors).
 */
void print_scene(AoSoA_scene const &scene)
{
    constexpr size_t max_rows = 8;
    constexpr size_t group = 4; // registers per group == lanes per register

    const size_t scene_size = scene.size();
    const size_t complete = scene_size - scene_size % group;
    const size_t rows = complete < max_rows ? complete : max_rows;

    for(size_t j = 0; j < rows; ++j) {
        const size_t base = (j / group) * group; // first register of the group
        const size_t lane = j % group;           // position inside each register
        for(size_t i = 0; i < group; ++i) {
            // Copy the register into a float array rather than type-punning
            // the vector's storage through a float pointer.
            float lanes[4];
            _mm_storeu_ps(lanes, scene[base + i]);
            printf("%10.3e ", lanes[lane]);
        }
        printf("\n");
    }
    if(scene_size > max_rows) {
        printf("(%zu more)...\n", scene_size - max_rows);
    }
    printf("\n");
}
/**
 * Scales every 4D vector of a striped scene to unit Euclidean length, in place.
 *
 * Registers are consumed in groups of four (x, y, z and w components), so
 * each iteration normalizes four vectors at once. A trailing group with fewer
 * than four registers is left untouched instead of being read out of bounds.
 *
 * A zero norm is not special-cased: the components are divided by whatever
 * the square root yields.
 *
 * @param scene Striped scene to normalize.
 */
void normalize(AoSoA_scene &scene)
{
    constexpr size_t group = 4;
    const size_t count = scene.size();

    for(size_t i = 0; i + group <= count; i += group) {
        const __m128 xs = scene[i + 0];
        const __m128 ys = scene[i + 1];
        const __m128 zs = scene[i + 2];
        const __m128 ws = scene[i + 3];

        // Squared length per lane: (x*x + y*y) + (z*z + w*w).
        const __m128 xy = _mm_add_ps(_mm_mul_ps(xs, xs), _mm_mul_ps(ys, ys));
        const __m128 zw = _mm_add_ps(_mm_mul_ps(zs, zs), _mm_mul_ps(ws, ws));
        const __m128 norms = _mm_sqrt_ps(_mm_add_ps(xy, zw));

        scene[i + 0] = _mm_div_ps(xs, norms);
        scene[i + 1] = _mm_div_ps(ys, norms);
        scene[i + 2] = _mm_div_ps(zs, norms);
        scene[i + 3] = _mm_div_ps(ws, norms);
    }
}
/**
 * Returns a pseudo-random float from a uniform distribution over [-10, 10).
 *
 * The engine is seeded from std::random_device once per thread and then
 * reused, instead of constructing a new random_device and engine on each call.
 */
float randf()
{
    thread_local std::default_random_engine engine{std::random_device{}()};
    std::uniform_real_distribution<float> distribution(-10.0f, 10.0f);
    return distribution(engine);
}
/**
 * Benchmark driver for the striped layout: fills a scene with random values,
 * prints it, runs normalize() 100'000 times while timing the loop, then
 * prints the elapsed milliseconds and the scene again.
 */
int main()
{
    // Scene description, e.g. cameras, particles or boids. normalize()
    // consumes the registers in groups of four, so keep the count a multiple
    // of 4.
    std::vector<__m128> scene(40'000);
    for(auto &reg : scene) {
        reg = _mm_set_ps(randf(), randf(), randf(), randf());
    }

    using std::chrono::duration_cast;
    using std::chrono::milliseconds;
    using std::chrono::steady_clock;

    print_scene(scene);
    printf("Working...\n");

    // Every pass after the first operates on already-normalized data; the
    // benchmark relies on the compiler not eliding the repeated passes.
    const auto begin = steady_clock::now();
    for(int pass = 0; pass < 100'000; ++pass) {
        normalize(scene);
    }
    const auto end = steady_clock::now();

    const auto elapsed = duration_cast<milliseconds>(end - begin);
    // count() returns a signed integer whose exact type is implementation
    // defined, so cast it to match the format specifier.
    printf("Time %lld ms\n", static_cast<long long>(elapsed.count()));

    print_scene(scene);
    return 0;
}
макет SoA
Time 4982 ms
Memory usage summary: heap total: 713728, heap peak: 713728, stack peak: 2992
total calls total memory failed calls
malloc| 6 713728 0
realloc| 0 0 0 (nomove:0, dec:0, free:0)
calloc| 0 0 0
free| 4 640000
#include <chrono>
#include <cstdio>
#include <random>
#include <vector>
#include <xmmintrin.h>
/* -----------------------------------------------------------------------------
SoA layout [X,X,X,X,...], [y,y,y,y,...], [Z,Z,Z,Z,...], ...
----------------------------------------------------------------------------- */
/**
 * Non-owning view of a scene stored as four parallel component arrays.
 *
 * `size` is the number of 4D vectors; `xs`, `ys`, `zs` and `ws` each point to
 * `size` floats holding the respective component. The default member
 * initializers make a default-constructed scene empty rather than leaving
 * the pointers indeterminate; aggregate initialization keeps working.
 */
struct SoA_scene {
    size_t size = 0;
    float *xs = nullptr;
    float *ys = nullptr;
    float *zs = nullptr;
    float *ws = nullptr;
};
/**
 * Prints up to the first 8 vectors of an SoA scene, one "x y z w" line per
 * vector, followed by the number of vectors that were not shown.
 *
 * @param scene Scene whose component arrays each hold `scene.size` floats.
 */
void print_scene(SoA_scene const &scene)
{
    constexpr size_t max_rows = 8;
    const size_t rows = scene.size < max_rows ? scene.size : max_rows;

    for(size_t j = 0; j < rows; ++j) {
        printf("%10.3e %10.3e %10.3e %10.3e \n",
               scene.xs[j], scene.ys[j], scene.zs[j], scene.ws[j]);
    }
    if(scene.size > max_rows) {
        // %zu is the portable conversion for size_t.
        printf("(%zu more)...\n", scene.size - max_rows);
    }
    printf("\n");
}
/**
 * Scales every 4D vector of an SoA scene to unit Euclidean length, in place.
 *
 * Four vectors are processed per SIMD iteration. Unaligned loads and stores
 * are used because the component arrays are plain float pointers with no
 * visible 16-byte alignment guarantee. When `scene.size` is not a multiple of
 * four, the remaining vectors are handled one at a time with the scalar
 * forms of the same instructions, so nothing past the end is touched.
 *
 * A zero norm is not special-cased: the components are divided by whatever
 * the square root yields.
 *
 * @param scene Scene to normalize; each component array holds `scene.size`
 *              floats.
 */
void normalize(SoA_scene &scene)
{
    constexpr size_t lanes = 4;
    const size_t count = scene.size;
    const size_t simd_end = count - count % lanes;

    size_t i = 0;
    for(; i < simd_end; i += lanes) {
        const __m128 xs = _mm_loadu_ps(scene.xs + i);
        const __m128 ys = _mm_loadu_ps(scene.ys + i);
        const __m128 zs = _mm_loadu_ps(scene.zs + i);
        const __m128 ws = _mm_loadu_ps(scene.ws + i);

        // Squared length per lane: (x*x + y*y) + (z*z + w*w).
        const __m128 xy = _mm_add_ps(_mm_mul_ps(xs, xs), _mm_mul_ps(ys, ys));
        const __m128 zw = _mm_add_ps(_mm_mul_ps(zs, zs), _mm_mul_ps(ws, ws));
        const __m128 norms = _mm_sqrt_ps(_mm_add_ps(xy, zw));

        _mm_storeu_ps(scene.xs + i, _mm_div_ps(xs, norms));
        _mm_storeu_ps(scene.ys + i, _mm_div_ps(ys, norms));
        _mm_storeu_ps(scene.zs + i, _mm_div_ps(zs, norms));
        _mm_storeu_ps(scene.ws + i, _mm_div_ps(ws, norms));
    }

    // Scalar tail: same operations in the same order, one vector at a time.
    for(; i < count; ++i) {
        const __m128 x = _mm_set_ss(scene.xs[i]);
        const __m128 y = _mm_set_ss(scene.ys[i]);
        const __m128 z = _mm_set_ss(scene.zs[i]);
        const __m128 w = _mm_set_ss(scene.ws[i]);

        const __m128 xy = _mm_add_ss(_mm_mul_ss(x, x), _mm_mul_ss(y, y));
        const __m128 zw = _mm_add_ss(_mm_mul_ss(z, z), _mm_mul_ss(w, w));
        const __m128 norm = _mm_sqrt_ss(_mm_add_ss(xy, zw));

        scene.xs[i] = _mm_cvtss_f32(_mm_div_ss(x, norm));
        scene.ys[i] = _mm_cvtss_f32(_mm_div_ss(y, norm));
        scene.zs[i] = _mm_cvtss_f32(_mm_div_ss(z, norm));
        scene.ws[i] = _mm_cvtss_f32(_mm_div_ss(w, norm));
    }
}
/**
 * Returns a pseudo-random float from a uniform distribution over [-10, 10).
 *
 * The engine is seeded from std::random_device once per thread and then
 * reused, instead of constructing a new random_device and engine on each call.
 */
float randf()
{
    thread_local std::default_random_engine engine{std::random_device{}()};
    std::uniform_real_distribution<float> distribution(-10.0f, 10.0f);
    return distribution(engine);
}
/**
 * Benchmark driver for the SoA layout: fills four component arrays with
 * random values, prints the scene, runs normalize() 100'000 times while
 * timing the loop, then prints the elapsed milliseconds and the scene again.
 */
int main()
{
    // Scene description, e.g. cameras, particles or boids.
    const size_t scene_size = 40'000;
    std::vector<float> xs(scene_size);
    std::vector<float> ys(scene_size);
    std::vector<float> zs(scene_size);
    std::vector<float> ws(scene_size);
    for(size_t i = 0; i < scene_size; ++i) {
        xs[i] = randf();
        ys[i] = randf();
        zs[i] = randf();
        ws[i] = randf();
    }

    // The scene only borrows the vectors' storage; the vectors above must
    // outlive it.
    SoA_scene scene{
        scene_size,
        std::data(xs),
        std::data(ys),
        std::data(zs),
        std::data(ws)
    };

    using std::chrono::duration_cast;
    using std::chrono::milliseconds;
    using std::chrono::steady_clock;

    print_scene(scene);
    printf("Working...\n");

    // Every pass after the first operates on already-normalized data; the
    // benchmark relies on the compiler not eliding the repeated passes.
    const auto begin = steady_clock::now();
    for(int pass = 0; pass < 100'000; ++pass) {
        normalize(scene);
    }
    const auto end = steady_clock::now();

    const auto elapsed = duration_cast<milliseconds>(end - begin);
    // count() returns a signed integer whose exact type is implementation
    // defined, so cast it to match the format specifier.
    printf("Time %lld ms\n", static_cast<long long>(elapsed.count()));

    print_scene(scene);
    return 0;
}
макет AoS
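Начиная с SSE4.1, кажется, существует третий вариант — безусловно, самый простой и самый быстрый. (Примечание: приведённое ниже время, по-видимому, измерено для цикла с шагом 4, который нормализует лишь каждый четвёртый вектор, поэтому сравнивать его напрямую с предыдущими результатами нельзя.)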
Time 3074 ms
Memory usage summary: heap total: 746552, heap peak: 713736, stack peak: 2720
total calls total memory failed calls
malloc| 5 746552 0
realloc| 0 0 0 (nomove:0, dec:0, free:0)
calloc| 0 0 0
free| 2 672816
Histogram for block sizes:
0-15 1 20% =========================
1024-1039 1 20% =========================
32816-32831 1 20% =========================
large 2 40% ==================================================
/* -----------------------------------------------------------------------------
AoS layout [{X,y,Z,w},{X,y,Z,w},{X,y,Z,w},{X,y,Z,w},...]

Each __m128 element holds all four components of a single 4D vector.
----------------------------------------------------------------------------- */
using AoS_scene = std::vector<__m128>;
/**
 * Prints up to the first 8 vectors of an AoS scene, one vector per line in
 * memory order, followed by the number of vectors that were not shown.
 *
 * @param scene Scene in which every element is one 4D vector.
 */
void print_scene(AoS_scene const &scene)
{
    constexpr size_t max_rows = 8;
    const size_t scene_size = scene.size();
    const size_t rows = scene_size < max_rows ? scene_size : max_rows;

    for(size_t j = 0; j < rows; ++j) {
        // Copy the register into a float array rather than type-punning the
        // vector's storage through a float pointer.
        float components[4];
        _mm_storeu_ps(components, scene[j]);
        for(const float component : components) {
            printf("%10.3e ", component);
        }
        printf("\n");
    }
    if(scene_size > max_rows) {
        // %zu is the portable conversion for size_t.
        printf("(%zu more)...\n", scene_size - max_rows);
    }
    printf("\n");
}
/**
 * Scales every 4D vector of an AoS scene to unit Euclidean length, in place.
 *
 * Each element is one complete vector, so the loop visits every element.
 * Stepping by four, as this function previously did, normalized only every
 * fourth vector.
 *
 * A zero norm is not special-cased: the components are divided by whatever
 * the square root yields.
 *
 * @param scene Scene to normalize.
 */
void normalize(AoS_scene &scene)
{
    for(size_t i = 0; i < scene.size(); ++i) {
        const __m128 vec = scene[i];
        // Mask 0xFF: multiply all four lanes and broadcast the sum to all
        // four lanes of the result.
        const __m128 dot = _mm_dp_ps(vec, vec, 0xFF);
        const __m128 norms = _mm_sqrt_ps(dot);
        scene[i] = _mm_div_ps(vec, norms);
    }
}
/**
 * Returns a pseudo-random float from a uniform distribution over [-10, 10).
 *
 * The engine is seeded from std::random_device once per thread and then
 * reused, instead of constructing a new random_device and engine on each call.
 */
float randf()
{
    thread_local std::default_random_engine engine{std::random_device{}()};
    std::uniform_real_distribution<float> distribution(-10.0f, 10.0f);
    return distribution(engine);
}
/**
 * Benchmark driver for the AoS layout: fills a scene with random vectors,
 * prints it, runs normalize() 100'000 times while timing the loop, then
 * prints the elapsed milliseconds and the scene again.
 */
int main()
{
    // Scene description, e.g. cameras, particles or boids.
    std::vector<__m128> scene(40'000);
    for(auto &vec : scene) {
        vec = _mm_set_ps(randf(), randf(), randf(), randf());
    }

    using std::chrono::duration_cast;
    using std::chrono::milliseconds;
    using std::chrono::steady_clock;

    print_scene(scene);
    printf("Working...\n");

    // Every pass after the first operates on already-normalized data; the
    // benchmark relies on the compiler not eliding the repeated passes.
    const auto begin = steady_clock::now();
    for(int pass = 0; pass < 100'000; ++pass) {
        normalize(scene);
    }
    const auto end = steady_clock::now();

    const auto elapsed = duration_cast<milliseconds>(end - begin);
    // count() returns a signed integer whose exact type is implementation
    // defined, so cast it to match the format specifier.
    printf("Time %lld ms\n", static_cast<long long>(elapsed.count()));

    print_scene(scene);
    return 0;
}