From c2048c0590e65c019a321d78e4f0b991778bb8f8 Mon Sep 17 00:00:00 2001 From: Sduby Date: Sat, 12 Feb 2022 19:08:52 +0800 Subject: [PATCH] hw04 --- CMakeLists.txt | 4 +++ answer.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++ main.cpp | 75 +++++++++++++++++++++++++++---------------------- 3 files changed, 121 insertions(+), 34 deletions(-) create mode 100644 answer.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..9a369fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,4 +6,8 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +find_package(OpenMP REQUIRED) + add_executable(main main.cpp) +target_compile_options(main PUBLIC -ffast-math -march=native) +target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) diff --git a/answer.md b/answer.md new file mode 100644 index 0000000..dd4fb39 --- /dev/null +++ b/answer.md @@ -0,0 +1,76 @@ +# 优化结果 + +1. 未优化 + +``` +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 1946 ms +``` + +2. 将全局命名空间的`sqrt()`替换为`std::sqrt()` + - 这样会默认使用`float`的重载版本而不是`double` + +``` +Initial energy: -13.414000 +Final energy: -13.356841 +Time elapsed: 1571 ms +``` + +3. 使用SOA + SIMD优化 + - 这样做的话不同对象的相同属性会连续存储,在循环中连续访问的时候便于使用AVX指令集优化。 + +步骤: + - 将Star结构体内的`float`改为`vector` + - 将循环中的迭代器改为下标访问 + - CMakeLists中添加OpenMP包,添加`-ffast-math -march=native`选项启用AVX指令集 + - 使用`#pragma omp simd`避免pointer aliasing影响优化 + +``` +Initial energy: -13.414010 +Final energy: -13.356913 +Time elapsed: 234 ms +``` + +4. 将循环中的不变量提取出来,这样可以避免重复计算 + +对于`step()`其中的不变量为`G*dt`和`eps*eps`, 提取到全局变量后加快了速度 + +``` +Initial energy: -13.414012 +Final energy: -13.356912 +Time elapsed: 217 ms +``` +5. 进行某些代数化简 + +```cpp +float x = stars.mass[j] * Gdt / d2; +stars.vx[i] += dx * x; +stars.vy[i] += dy * x; +stars.vz[i] += dz * x; +``` + +``` +Initial energy: -13.414010 +Final energy: -13.356913 +Time elapsed: 201 ms +``` + +6. 不使用STL容器,使用C数组 + +有一定提升,但感觉不是很必要 + +```cpp +Initial energy: -13.414012 +Final energy: -13.356912 +Time elapsed: 170 ms +``` + +7. 尝试使用循环展开,然而无效果 + +# 结果&总结 + +最终结果:`1946ms -> 170ms` (11.4倍) + +- 效果最显著优化:开启SIMD指令 +- 其他优化:使用`std::sqrt()`, 进行代数化简、循环无关量提出,不使用STL容器 \ No newline at end of file diff --git a/main.cpp b/main.cpp index cf6369b..8f54b8b 100644 --- a/main.cpp +++ b/main.cpp @@ -8,59 +8,66 @@ float frand() { return (float)rand() / RAND_MAX * 2 - 1; } -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; -}; +constexpr int size = 48; -std::vector stars; +struct Stars { + float px[size], py[size], pz[size]; + float vx[size], vy[size], vz[size]; + float mass[size]; +}; +Stars stars; void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); + for (int i = 0; i < size; i++) { + stars.px[i] = frand(); + stars.py[i] = frand(); + stars.pz[i] = frand(); + stars.vx[i] = frand(); + stars.vy[i] = frand(); + stars.vz[i] = frand(); + stars.mass[i] = frand()+1; } } float G = 0.001; float eps = 0.001; float dt = 0.01; +float Gdt = G*dt; +float epssqure = eps*eps; void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; +#pragma omp simd + for (size_t i = 0; i != size; i++) { + for (size_t j = 0; j != size; j++) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + epssqure; + d2 *= std::sqrt(d2); + float x = stars.mass[j] * Gdt / d2; + stars.vx[i] += dx * x; + stars.vy[i] += dy * x; + stars.vz[i] += dz * x; } } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; + for (size_t i = 0; i != size; i++) { + stars.px[i] += stars.vx[i] * dt; + stars.py[i] += stars.vy[i] * dt; + stars.pz[i] += stars.vz[i] * dt; } } float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; + for (size_t i = 0; i != size; i++) { + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += stars.mass[i] * v2 / 2; + for (size_t j = 0; j != size; j++) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; + energy -= stars.mass[j] * stars.mass[i] * G / std::sqrt(d2) / 2; } } return energy;