由于用c数组,栈内存有限,以问题中能容纳的数据量,基本时间都是0,所以改了个结构。
结论是在我的系统下(windows10,vscode,clang64,debug编译模式)构造对象时间差异巨大,以下文中的数量算,至少有4倍差距。但计算时间,方法二稍好。
开O3后,构造对象无差异,计算时间方法2少30%
所以纯耗时,没有优化时:方法1 << 方法2.
但优化后比较,方法1 耗时稍大于方法 2.
#include <iostream>
#include <vector>
#define N 10000000
#define dt 0.00001f
struct Object
{
float x = 2.36;
float y = 3.988;
float z = 5.125;
float vx = 0;
float vy = 0;
float vz = 0;
};
struct ObjectArray
{
std::vector<float> x = std::vector<float>(N, 2.36);
std::vector<float> y = std::vector<float>(N, 3.988);
std::vector<float> z = std::vector<float>(N, 5.125);
std::vector<float> vx = std::vector<float>(N);
std::vector<float> vy = std::vector<float>(N);
std::vector<float> vz = std::vector<float>(N);
};
int main()
{
auto start = clock();
std::vector<Object> obj(N);
std::cout << "construct objectVector time: " << double(clock() - start)
<< std::endl;
start = clock();
for (auto &i : obj)
{
i.x += i.vx * dt;
i.y += i.vy * dt;
i.z += i.vz * dt;
}
std::cout << "objectVector compute time: " << double(clock() - start)
<< std::endl;
start = clock();
ObjectArray obj_all;
std::cout << "construct ObjectArray time: " << double(clock() - start)
<< std::endl;
start = clock();
for (int i = 0; i < N; i++)
{
obj_all.x[i] += obj_all.vx[i] * dt;
obj_all.y[i] += obj_all.vy[i] * dt;
obj_all.z[i] += obj_all.vz[i] * dt;
}
std::cout << "ObjectArray compute time: " << double(clock() - start)
<< std::endl;
}
做了一下指令集优化,结果差不多,基本可以肯定,编译器已经自动优化了。
#include <immintrin.h>
#include <iostream>
#include <vector>
#define N 100000000
#define DT 0.00001f
struct Obj
{
float x = 2.36;
float y = 3.988;
float z = 5.125;
float vx = 1;
float vy = 1;
float vz = 1;
};
struct ObjArr
{
ObjArr() = default;
~ObjArr()
{
_mm_free(x);
_mm_free(y);
_mm_free(z);
_mm_free(vx);
_mm_free(vy);
_mm_free(vz);
}
float *x = static_cast<float *>(_mm_malloc(N * sizeof(float), 32));
float *y = static_cast<float *>(_mm_malloc(N * sizeof(float), 32));
float *z = static_cast<float *>(_mm_malloc(N * sizeof(float), 32));
float *vx = static_cast<float *>(_mm_malloc(N * sizeof(float), 32));
float *vy = static_cast<float *>(_mm_malloc(N * sizeof(float), 32));
float *vz = static_cast<float *>(_mm_malloc(N * sizeof(float), 32));
};
int main()
{
auto start = clock();
std::vector<Obj> obj(N);
std::cout << "construct objectVector time: " << double(clock() - start)
<< std::endl;
start = clock();
for (auto &i : obj)
{
i.x += i.vx * DT;
i.y += i.vy * DT;
i.z += i.vz * DT;
}
std::cout << "objectVector compute time: " << double(clock() - start)
<< std::endl;
start = clock();
ObjArr objAll;
for (int i = 0; i != N; ++i)
{
objAll.x[i] = 2.36;
objAll.y[i] = 3.988;
objAll.z[i] = 5.125;
objAll.vx[i] = 1;
objAll.vy[i] = 1;
objAll.vz[i] = 1;
}
std::cout << "construct ObjArr time: " << double(clock() - start)
<< std::endl;
start = clock();
__m256 x;
__m256 y;
__m256 z;
__m256 rx;
__m256 ry;
__m256 rz;
__m256 mdt;
float dtArr[8]{DT, DT, DT, DT, DT, DT, DT, DT};
mdt = _mm256_load_ps(dtArr);
for (int i = 0; i + 8 < N; i += 8)
{
x = _mm256_load_ps(&objAll.x[i]);
rx = _mm256_load_ps(&objAll.vx[i]);
x = _mm256_add_ps(x, _mm256_mul_ps(rx, mdt));
_mm256_store_ps(&objAll.x[i], x);
y = _mm256_load_ps(&objAll.y[i]);
ry = _mm256_load_ps(&objAll.vy[i]);
y = _mm256_add_ps(y, _mm256_mul_ps(ry, mdt));
_mm256_store_ps(&objAll.y[i], y);
z = _mm256_load_ps(&objAll.z[i]);
rz = _mm256_load_ps(&objAll.vz[i]);
z = _mm256_add_ps(z, _mm256_mul_ps(rz, mdt));
_mm256_store_ps(&objAll.z[i], z);
}
std::cout << "ObjArr compute time: " << double(clock() - start)
<< std::endl;
return 0;
}