首页 技术 正文
技术 2022年11月21日
0 收藏 950 点赞 2,938 浏览 6353 个字

▶ 第四章,逐步优化了一个三维卷积计算的过程

● 基准代码

 #include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>
#include <assert.h>
#include <sys/mman.h> #define REAL float
#define NX (64) #ifndef M_PI
#define M_PI (3.1415926535897932384626)
#endif // 初始化格点矩阵
void init(REAL *buff, const int nx, const int ny, const int nz, const REAL kx, const REAL ky, const REAL kz,
const REAL dx, const REAL dy, const REAL dz, const REAL kappa, const REAL time)
{
REAL ax = exp(-kappa * time*(kx*kx)), ay = exp(-kappa * time*(ky*ky)), az = exp(-kappa * time*(kz*kz));
for (int jz = ; jz < nz; jz++)
{
for (int jy = ; jy < ny; jy++)
{
for (int jx = ; jx < nx; jx++)
{
int j = (jz * ny + jy) * NX + jx;
REAL x = dx * ((REAL)(jx + 0.5)), y = dy * ((REAL)(jy + 0.5)), z = dz * ((REAL)(jz + 0.5));
buff[j] = (REAL)0.125*(1.0 - ax * cos(kx * x))*(1.0 - ay * cos(ky * y))*(1.0 - az * cos(kz * z));;
}
}
}
} // 计算卷积
void diffusion(REAL *f1, REAL *f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
{
for (int i = ; i < count; ++i)
{
for (int z = ; z < nz; z++)
{
for (int y = ; y < ny; y++)
{
for (int x = ; x < nx; x++)
{
int c = (z * ny + y) * NX + x;
int w = (x == ) ? c : c - ;
int e = (x == NX - ) ? c : c + ;
int n = (y == ) ? c : c - NX;
int s = (y == ny - ) ? c : c + NX;
int b = (z == ) ? c : c - NX * ny;
int t = (z == nz - ) ? c : c + NX * ny;
f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e] + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t];
}
}
}
REAL *t = f1;
f1 = f2;
f2 = t;
}
return;
} static double cur_second(void) // 计时器,返回一个秒数
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
} REAL accuracy(const REAL *b1, REAL *b2, const int len) //计算两个数组的差距
{
REAL err = 0.0;
for (int i = ; i < len; i++)
err += (b1[i] - b2[i]) * (b1[i] - b2[i]);
return (REAL)sqrt(err / len);
} void dump_result(REAL *f, int nx, int ny, int nz, char *out_path) // 将结果写到文件中
{
FILE *out = fopen(out_path, "w");
assert(out);
fwrite(f, sizeof(REAL), nx * ny * nz, out);
fclose(out);
} int main(int argc, char *argv[])
{
int nx = NX, ny = NX, nz = NX;
REAL *f1 = (REAL *)malloc(sizeof(REAL) * NX * NX * NX);
REAL *f2 = (REAL *)malloc(sizeof(REAL) * NX * NX * NX);
REAL *f3 = (REAL *)malloc(sizeof(REAL) * NX * ny * nz);
assert(f1 != MAP_FAILED);
assert(f2 != MAP_FAILED);
assert(f3 != MAP_FAILED); REAL dx, dy, dz, kx, ky, kz;
dx = dy = dz = 1.0 / nx; // 边长 1.0
kx = ky = kz = 2.0 * M_PI;
REAL kappa = 0.1;
REAL dt = 0.1 * dx * dx / kappa;
int count = 0.1 / dt; init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, 0.0); REAL ce, cw, cn, cs, ct, cb, cc;
ce = cw = kappa * dt / (dx * dx);
cn = cs = kappa * dt / (dy * dy);
ct = cb = kappa * dt / (dz * dz);
cc = 1.0 - (ce + cw + cn + cs + ct + cb); printf("Running diffusion kernel %d times\n", count);
fflush(stdout);
struct timeval time_b, time_e;
gettimeofday(&time_b, NULL);
diffusion(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, dt, count);
gettimeofday(&time_e, NULL);
//dump_result((count % 2) ? f2 : f1, nx, ny, nz, "diffusion_result.dat"); init(f3, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, count * dt); // 对比基准结果
REAL err = accuracy((count % ) ? f2 : f1, f3, nx*ny*nz);
double elapsed_time = (time_e.tv_sec - time_b.tv_sec) + (time_e.tv_usec - time_b.tv_usec) * 1.0e-6;
REAL mflops = (nx*ny*nz)*13.0*count / elapsed_time * 1.0e-06;
double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count / elapsed_time * 1.0e-09; printf("Elapsed time : %.3f (s)\nFLOPS : %.3f (MFlops)\n", elapsed_time, mflops);
printf("Throughput : %.3f (GB/s)\nAccuracy : %e\n", thput, err); free(f1);
free(f2);
return ;
}

■ 输出结果

Running diffusion kernel  times
Elapsed time : 177.015 (s)
FLOPS : 252.276 (MFlops)
Throughput : 0.233 (GB/s)
Accuracy : 5.068947e-06

● 计算内核加入 OpenMP

 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)// 加了 restrict
{
#pragma omp parallel // openMP 并行域
{
REAL *f1_t = f1, *f2_t = f2; // 使用局部的指针
for (int i = ; i < count; ++i)
{
#pragma omp for collapse(2) // 展开外两层循环
for (int z = ; z < nz; z++)
{
for (int y = ; y < ny; y++)
{
for (int x = ; x < nx; x++)
{
int c = (z * ny + y) * NX + x;
int w = (x == ) ? c : c - ;
int e = (x == NX - ) ? c : c + ;
int n = (y == ) ? c : c - NX;
int s = (y == ny - ) ? c : c + NX;
int b = (z == ) ? c : c - NX * ny;
int t = (z == nz - ) ? c : c + NX * ny;
f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
}
}
REAL *t = f1_t;
f1_t = f2_t;
f2_t = t;
}
}
return;
}

■ 输出结果

Running diffusion kernel  times
Elapsed time : 2.936 (s)
FLOPS : 15209.439 (MFlops)
Throughput : 14.039 (GB/s)
Accuracy : 4.789139e-06

● 保证向量化

 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
{
#pragma omp parallel
{
REAL *f1_t = f1, *f2_t = f2;
for (int i = ; i < count; ++i)
{
#pragma omp for collapse(2)
for (int z = ; z < nz; z++)
{
for (int y = ; y < ny; y++)
{
#pragma simd // 保证向量化,不考虑 f1_t 和 f2_t 之间的独立子性
for (int x = ; x < nx; x++)
{
int c = (z * ny + y) * NX + x;
int w = (x == ) ? c : c - ;
int e = (x == NX - ) ? c : c + ;
int n = (y == ) ? c : c - NX;
int s = (y == ny - ) ? c : c + NX;
int b = (z == ) ? c : c - NX * ny;
int t = (z == nz - ) ? c : c + NX * ny;
f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
}
}
REAL *t = f1_t;
f1_t = f2_t;
f2_t = t;
}
}
return;
}

■ 输出结果

Running diffusion kernel  times
Elapsed time : 0.865 (s)
FLOPS : 51651.863 (MFlops)
Throughput : 47.679 (GB/s)
Accuracy : 4.427611e-06

● 手动剥离边界

 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
{
#pragma omp parallel
{
REAL *f1_t = f1, *f2_t = f2;
for (int i = ; i < count; ++i)
{
#pragma omp for collapse(2)
for (int z = ; z < nz; z++)
{
for (int y = ; y < ny; y++)
{
int x = ; // 每行首次
int c = (z * ny + y) * NX + x; // 注意 w 方向的下标是 c
int n = (y == ) ? c : c - NX;
int s = (y == ny - ) ? c : c + NX;
int b = (z == ) ? c : c - NX * ny;
int t = (z == nz - ) ? c : c + NX * ny;
f2_t[c] = cc * f1_t[c] + cw * f1_t[c] + ce * f1_t[c + ] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
#pragma simd
for (x = ; x < nx - ; x++) // 中间部分,注意循环要按照 OpenMP 格式书写
{
c++;
n++;
s++;
b++;
t++;
f2_t[c] = cc * f1_t[c] + cw * f1_t[c - ] + ce * f1_t[c + ] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
c++; // 每行末次
n++; // 注意 e 方向的下标是 c
s++;
b++;
t++;
f2_t[c] = cc * f1_t[c] + cw * f1_t[c - ] + ce * f1_t[c] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
}
REAL *t = f1_t;
f1_t = f2_t;
f2_t = t;
}
}
return;
}

■ 输出结果

Running diffusion kernel  times
Elapsed time : 0.565 (s)
FLOPS : 79071.250 (MFlops)
Throughput : 72.989 (GB/s)
Accuracy : 4.577150e-06

● 数据切片

 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
{
#pragma omp parallel
{
REAL *f1_t = f1, *f2_t = f2;
for (int i = ; i < count; ++i)
{
#define YBF 16 // 分块大小
#pragma omp for collapse(2)
for (int yy = ; yy < ny; yy += YBF) // 在循环之外放入分块
{
for (int z = ; z < nz; z++)
{
int yyy = (yy + YBF) >= ny ? ny : (yy + YBF); // 该分块的末端
for (int y = yy; y < yyy; y++) // y 限定在分块内循环
{
int x = ;
int c = (z * ny + y) * NX + x;
int n = (y == ) ? c : c - NX;
int s = (y == ny - ) ? c : c + NX;
int b = (z == ) ? c : c - NX * ny;
int t = (z == nz - ) ? c : c + NX * ny;
f2_t[c] = cc * f1_t[c] + cw * f1_t[c] + ce * f1_t[c + ] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
#pragma simd
for (x = ; x < nx - ; x++)
{
c++;
n++;
s++;
b++;
t++;
f2_t[c] = cc * f1_t[c] + cw * f1_t[c - ] + ce * f1_t[c + ] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
c++;
n++;
s++;
b++;
t++;
f2_t[c] = cc * f1_t[c] + cw * f1_t[c - ] + ce * f1_t[c] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
}
}
REAL *t = f1_t;
f1_t = f2_t;
f2_t = t;
}
}
return;
}

■ 输出结果,没有明显优化

Running diffusion kernel  times
Elapsed time : 0.594 (s)
FLOPS : 75224.680 (MFlops)
Throughput : 69.438 (GB/s)
Accuracy : 4.577150e-06
相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件bsddb3:BerkeleyDB的连接组件Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:9,082
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,557
下载Ubuntn 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,406
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号Name: www.greendown.cn Code: &nb…
日期:2022-11-24 点赞:733 阅读:6,179
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:7,815
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:4,898