Implementing and Optimizing the DethSepConv Function with Vitis HLS
1 Initial Functional Verification
1.0 Design Block Diagram
+-----------------------------------+
|            Input Data             |
+----------------+------------------+
                 | Data Split / Parallel Ops
+----------------V------------------+
|   Depthwise Convolution Layer     |  Pipelined / Parallel Execution using DSP
+----------------+------------------+
                 |
+----------------V------------------+
|       Batch Normalization         |
+----------------+------------------+
                 |
+----------------V------------------+
|       Activation Function         |
+----------------+------------------+
                 |
+----------------V------------------+
|        SE Block (Optional)        |
+----------------+------------------+
                 |
+----------------V------------------+
|   Pointwise Convolution Layer     |  Pipelined / Parallel Execution using DSP
+----------------+------------------+
                 |
+----------------V------------------+
|       Batch Normalization         |
+----------------+------------------+
                 |
+----------------V------------------+
|       Activation Function         |
+----------------+------------------+
                 |
+----------------V------------------+
|           Output Data             |
+-----------------------------------+
1.1 Source Code
// Activation.h
#ifndef ACTIVATION_H
#define ACTIVATION_H

// NOTE: despite its name, this function implements ReLU (max(x, 0)),
// not hard-swish; see the note after this file.
inline float hard_swich(float x) {
    return x > 0 ? x : 0;
}

template<int CHANNELS>
void Activation(float input[CHANNELS][32][32],
                float output[CHANNELS][32][32],
                float (*activation_function)(float)) {
    for (int ch = 0; ch < CHANNELS; ch++) {
        for (int row = 0; row < 32; row++) {
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = activation_function(input[ch][row][col]);
            }
        }
    }
}

#endif
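If a true hard-swish was intended, as the name suggests, a minimal sketch would look like the following; hard_swish_ref is a name introduced here, and the constants come from the standard definition hard_swish(x) = x * ReLU6(x + 3) / 6, not from the original code:

// Hypothetical hard-swish, shown only to contrast with the ReLU above.
// hard_swish(x) = x * min(max(x + 3, 0), 6) / 6
inline float hard_swish_ref(float x) {
    float t = x + 3.0f;
    if (t < 0.0f) t = 0.0f;   // lower clamp of ReLU6
    if (t > 6.0f) t = 6.0f;   // upper clamp of ReLU6
    return x * t / 6.0f;
}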
// BatchNormalization.h
#ifndef BATCHNORMALIZATION_H
#define BATCHNORMALIZATION_H

#include "hls_math.h"   // for hls::sqrt (previously pulled in indirectly via Conv2D.h)

template<int CHANNELS>
void BatchNormalization(float input[CHANNELS][32][32],
                        float output[CHANNELS][32][32],
                        float mean[CHANNELS],
                        float variance[CHANNELS],
                        float gamma[CHANNELS],
                        float beta[CHANNELS],
                        float epsilon = 1e-5f) {
    for (int ch = 0; ch < CHANNELS; ch++) {
        for (int row = 0; row < 32; row++) {
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = gamma[ch]
                    * ((input[ch][row][col] - mean[ch]) / hls::sqrt(variance[ch] + epsilon))
                    + beta[ch];
            }
        }
    }
}

#endif
// Conv2D.h
#ifndef CONV2D_H
#define CONV2D_H

#include "ap_int.h"
#include "hls_stream.h"
#include "hls_math.h"

// Standard convolution with zero padding ("same" output size).
// Note: for STRIDE > 1 this writes only the strided positions of the 32x32
// output buffer; the output is not compacted. With STRIDE = 1, as used in
// this design, the behavior is the usual same-size convolution.
template<int INPUT_CHANNELS, int OUTPUT_CHANNELS, int KERNEL_SIZE, int STRIDE>
void Conv2D(float input[INPUT_CHANNELS][32][32],
            float output[OUTPUT_CHANNELS][32][32],
            float weights[OUTPUT_CHANNELS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE]) {
    for (int out_ch = 0; out_ch < OUTPUT_CHANNELS; out_ch++) {
        for (int row = 0; row < 32; row += STRIDE) {
            for (int col = 0; col < 32; col += STRIDE) {
                float acc = 0;
                for (int in_ch = 0; in_ch < INPUT_CHANNELS; in_ch++) {
                    for (int k_row = 0; k_row < KERNEL_SIZE; k_row++) {
                        for (int k_col = 0; k_col < KERNEL_SIZE; k_col++) {
                            int in_row = row + k_row - KERNEL_SIZE / 2;
                            int in_col = col + k_col - KERNEL_SIZE / 2;
                            if (in_row >= 0 && in_row < 32 && in_col >= 0 && in_col < 32) {
                                acc += input[in_ch][in_row][in_col]
                                     * weights[out_ch][in_ch][k_row][k_col];
                            }
                        }
                    }
                }
                output[out_ch][row][col] = acc;
            }
        }
    }
}

// Depthwise convolution: each channel is convolved with its own kernel.
template<int CHANNELS, int KERNEL_SIZE, int STRIDE>
void DepthwiseConv2D(float input[CHANNELS][32][32],
                     float output[CHANNELS][32][32],
                     float weights[CHANNELS][1][KERNEL_SIZE][KERNEL_SIZE]) {
    for (int ch = 0; ch < CHANNELS; ch++) {
        for (int row = 0; row < 32; row += STRIDE) {
            for (int col = 0; col < 32; col += STRIDE) {
                float acc = 0;
                for (int k_row = 0; k_row < KERNEL_SIZE; k_row++) {
                    for (int k_col = 0; k_col < KERNEL_SIZE; k_col++) {
                        int in_row = row + k_row - KERNEL_SIZE / 2;
                        int in_col = col + k_col - KERNEL_SIZE / 2;
                        if (in_row >= 0 && in_row < 32 && in_col >= 0 && in_col < 32) {
                            acc += input[ch][in_row][in_col] * weights[ch][0][k_row][k_col];
                        }
                    }
                }
                output[ch][row][col] = acc;
            }
        }
    }
}

#endif
// DethSepConv.h
#ifndef DETHSEPCONV_H
#define DETHSEPCONV_H

#include "Conv2D.h"
#include "BatchNormalization.h"
#include "Activation.h"
#include "SE_block.h"

template<int INPUT_CHANNELS, int OUTPUT_CHANNELS, int KERNEL_SIZE, int STRIDE>
void DethSepConv(float input[INPUT_CHANNELS][32][32],
                 float output[OUTPUT_CHANNELS][32][32],
                 float depthwise_weights[INPUT_CHANNELS][1][KERNEL_SIZE][KERNEL_SIZE],
                 float pointwise_weights[OUTPUT_CHANNELS][INPUT_CHANNELS][1][1],
                 float bn_mean1[INPUT_CHANNELS], float bn_var1[INPUT_CHANNELS],
                 float bn_gamma1[INPUT_CHANNELS], float bn_beta1[INPUT_CHANNELS],
                 float bn_mean2[OUTPUT_CHANNELS], float bn_var2[OUTPUT_CHANNELS],
                 float bn_gamma2[OUTPUT_CHANNELS], float bn_beta2[OUTPUT_CHANNELS],
                 bool use_se = false,
                 float se_fc1_weights[INPUT_CHANNELS/16][INPUT_CHANNELS] = nullptr,
                 float se_fc1_bias[INPUT_CHANNELS/16] = nullptr,
                 float se_fc2_weights[INPUT_CHANNELS][INPUT_CHANNELS/16] = nullptr,
                 float se_fc2_bias[INPUT_CHANNELS] = nullptr) {
// Note: INTERFACE pragmas take effect only on the arguments of the top-level
// function; on this sub-function Vitis HLS ignores them with a warning. They
// are repeated in top.cpp, which is where they actually apply.
#pragma HLS INTERFACE m_axi port = input offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = output offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = depthwise_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = pointwise_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_mean1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_var1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_gamma1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_beta1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_mean2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_var2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_gamma2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_beta2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc1_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc1_bias offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc2_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc2_bias offset = slave bundle = gmem
#pragma HLS INTERFACE s_axilite port = use_se bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control

    float depthwise_out[INPUT_CHANNELS][32][32];
    float bn1_out[INPUT_CHANNELS][32][32];
    float activation1_out[INPUT_CHANNELS][32][32];
    float se_out[INPUT_CHANNELS][32][32];
    float pointwise_out[OUTPUT_CHANNELS][32][32];
    float bn2_out[OUTPUT_CHANNELS][32][32];

    DepthwiseConv2D<INPUT_CHANNELS, KERNEL_SIZE, STRIDE>(input, depthwise_out, depthwise_weights);
    BatchNormalization<INPUT_CHANNELS>(depthwise_out, bn1_out, bn_mean1, bn_var1, bn_gamma1, bn_beta1);
    Activation<INPUT_CHANNELS>(bn1_out, activation1_out, hard_swich);

    if (use_se) {
        SE_block<INPUT_CHANNELS>(activation1_out, se_out,
                                 se_fc1_weights, se_fc1_bias, se_fc2_weights, se_fc2_bias);
    } else {
        for (int ch = 0; ch < INPUT_CHANNELS; ch++) {
            for (int row = 0; row < 32; row++) {
                for (int col = 0; col < 32; col++) {
                    se_out[ch][row][col] = activation1_out[ch][row][col];
                }
            }
        }
    }

    Conv2D<INPUT_CHANNELS, OUTPUT_CHANNELS, 1, 1>(se_out, pointwise_out, pointwise_weights);
    BatchNormalization<OUTPUT_CHANNELS>(pointwise_out, bn2_out, bn_mean2, bn_var2, bn_gamma2, bn_beta2);
    Activation<OUTPUT_CHANNELS>(bn2_out, output, hard_swich);
}

#endif
// SE_block.h
#ifndef SE_BLOCK_H
#define SE_BLOCK_H

#include "Activation.h"  // for hard_swich (previously relied on include order in DethSepConv.h)

template<int CHANNELS>
void SE_block(float input[CHANNELS][32][32],
              float output[CHANNELS][32][32],
              float fc1_weights[CHANNELS/16][CHANNELS],
              float fc1_bias[CHANNELS/16],
              float fc2_weights[CHANNELS][CHANNELS/16],
              float fc2_bias[CHANNELS]) {
    // Global average pooling
    float pooled[CHANNELS] = {0};
    for (int ch = 0; ch < CHANNELS; ch++) {
        for (int row = 0; row < 32; row++) {
            for (int col = 0; col < 32; col++) {
                pooled[ch] += input[ch][row][col];
            }
        }
        pooled[ch] /= 32 * 32;
    }

    // First fully connected layer (squeeze)
    float fc1_out[CHANNELS / 16] = {0};
    for (int i = 0; i < CHANNELS / 16; i++) {
        for (int j = 0; j < CHANNELS; j++) {
            fc1_out[i] += pooled[j] * fc1_weights[i][j];
        }
        fc1_out[i] += fc1_bias[i];
        fc1_out[i] = hard_swich(fc1_out[i]);
    }

    // Second fully connected layer (excite)
    float fc2_out[CHANNELS] = {0};
    for (int i = 0; i < CHANNELS; i++) {
        for (int j = 0; j < CHANNELS / 16; j++) {
            fc2_out[i] += fc1_out[j] * fc2_weights[i][j];
        }
        fc2_out[i] += fc2_bias[i];
        fc2_out[i] = hard_swich(fc2_out[i]);
    }

    // Channel-wise rescaling
    for (int ch = 0; ch < CHANNELS; ch++) {
        for (int row = 0; row < 32; row++) {
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = input[ch][row][col] * fc2_out[ch];
            }
        }
    }
}

#endif
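One structural note: in the standard SE block the second fully connected layer is followed by a sigmoid, so the per-channel scale is bounded in (0, 1); here hard_swich (effectively ReLU) leaves the scale unbounded. If the conventional gate is desired, a minimal sketch under that assumption (sigmoid_gate is a name introduced here; hls::exp comes from hls_math.h):

#include "hls_math.h"

// Hypothetical sigmoid gate for the excite stage, bounding the scale to (0, 1).
inline float sigmoid_gate(float x) {
    return 1.0f / (1.0f + hls::exp(-x));
}
// Usage: replace hard_swich in the fc2 loop, i.e.
//   fc2_out[i] = sigmoid_gate(fc2_out[i]);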
// top.cpp
#include "DethSepConv.h"

extern "C" {
void top_function(float input[32][32][32], float output[64][32][32],
                  float depthwise_weights[32][1][3][3], float pointwise_weights[64][32][1][1],
                  float bn_mean1[32], float bn_var1[32], float bn_gamma1[32], float bn_beta1[32],
                  float bn_mean2[64], float bn_var2[64], float bn_gamma2[64], float bn_beta2[64],
                  bool use_se,
                  float se_fc1_weights[2][32], float se_fc1_bias[2],
                  float se_fc2_weights[32][2], float se_fc2_bias[32]) {
#pragma HLS INTERFACE m_axi port = input offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = output offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = depthwise_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = pointwise_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_mean1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_var1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_gamma1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_beta1 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_mean2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_var2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_gamma2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = bn_beta2 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc1_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc1_bias offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc2_weights offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = se_fc2_bias offset = slave bundle = gmem
#pragma HLS INTERFACE s_axilite port = use_se bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control

    DethSepConv<32, 64, 3, 1>(input, output, depthwise_weights, pointwise_weights,
                              bn_mean1, bn_var1, bn_gamma1, bn_beta1,
                              bn_mean2, bn_var2, bn_gamma2, bn_beta2,
                              use_se,
                              se_fc1_weights, se_fc1_bias, se_fc2_weights, se_fc2_bias);
}
}
1.2 Simulation Testbench
// TestDethSepConv.cpp
#include <iostream>
#include <cstdio>
#include <cstdlib>   // rand(), RAND_MAX
#include <cmath>     // std::fabs
#include "DethSepConv.h"

void initialize_array(float* array, int size) {
    for (int i = 0; i < size; i++) {
        // Random values in [-0.5, 0.5]
        array[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) - 0.5f;
    }
}

int main() {
    constexpr int INPUT_CHANNELS = 32;   // 32 input channels
    constexpr int OUTPUT_CHANNELS = 64;  // 64 output channels
    constexpr int KERNEL_SIZE = 3;
    constexpr int STRIDE = 1;

    float input[INPUT_CHANNELS][32][32];
    float output[OUTPUT_CHANNELS][32][32];
    float depthwise_weights[INPUT_CHANNELS][1][KERNEL_SIZE][KERNEL_SIZE];
    float pointwise_weights[OUTPUT_CHANNELS][INPUT_CHANNELS][1][1];
    float bn_mean1[INPUT_CHANNELS];
    float bn_var1[INPUT_CHANNELS];
    float bn_gamma1[INPUT_CHANNELS];
    float bn_beta1[INPUT_CHANNELS];
    float bn_mean2[OUTPUT_CHANNELS];
    float bn_var2[OUTPUT_CHANNELS];
    float bn_gamma2[OUTPUT_CHANNELS];
    float bn_beta2[OUTPUT_CHANNELS];
    float se_fc1_weights[INPUT_CHANNELS/16][INPUT_CHANNELS];
    float se_fc1_bias[INPUT_CHANNELS/16];
    float se_fc2_weights[INPUT_CHANNELS][INPUT_CHANNELS/16];
    float se_fc2_bias[INPUT_CHANNELS];

    // Initialize the arrays
    initialize_array((float*)input, INPUT_CHANNELS * 32 * 32);
    initialize_array((float*)depthwise_weights, INPUT_CHANNELS * 1 * KERNEL_SIZE * KERNEL_SIZE);
    initialize_array((float*)pointwise_weights, OUTPUT_CHANNELS * INPUT_CHANNELS * 1 * 1);
    initialize_array(bn_mean1, INPUT_CHANNELS);
    initialize_array(bn_var1, INPUT_CHANNELS);
    initialize_array(bn_gamma1, INPUT_CHANNELS);
    initialize_array(bn_beta1, INPUT_CHANNELS);
    initialize_array(bn_mean2, OUTPUT_CHANNELS);
    initialize_array(bn_var2, OUTPUT_CHANNELS);
    initialize_array(bn_gamma2, OUTPUT_CHANNELS);
    initialize_array(bn_beta2, OUTPUT_CHANNELS);
    initialize_array((float*)se_fc1_weights, (INPUT_CHANNELS/16) * INPUT_CHANNELS);
    initialize_array(se_fc1_bias, (INPUT_CHANNELS/16));
    initialize_array((float*)se_fc2_weights, INPUT_CHANNELS * (INPUT_CHANNELS/16));
    initialize_array(se_fc2_bias, INPUT_CHANNELS);

    // A variance must be non-negative: a negative random value would make
    // sqrt(variance + epsilon) NaN, which hard_swich then silently maps to 0.
    for (int i = 0; i < INPUT_CHANNELS; i++)  bn_var1[i] = std::fabs(bn_var1[i]);
    for (int i = 0; i < OUTPUT_CHANNELS; i++) bn_var2[i] = std::fabs(bn_var2[i]);

    // Call DethSepConv
    DethSepConv<INPUT_CHANNELS, OUTPUT_CHANNELS, KERNEL_SIZE, STRIDE>(
        input, output, depthwise_weights, pointwise_weights,
        bn_mean1, bn_var1, bn_gamma1, bn_beta1,
        bn_mean2, bn_var2, bn_gamma2, bn_beta2,
        true,  // enable SE
        se_fc1_weights, se_fc1_bias, se_fc2_weights, se_fc2_bias);

    // Print the outputs as a (simplified) correctness check
    for (int ch = 0; ch < OUTPUT_CHANNELS; ch++) {
        for (int row = 0; row < 32; row++) {
            for (int col = 0; col < 32; col++) {
                std::cout << "output[" << ch << "][" << row << "][" << col << "] = "
                          << output[ch][row][col] << std::endl;
            }
        }
    }
    std::cout << "Test complete!" << std::endl;
    return 0;
}
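Printing all 64 x 32 x 32 outputs makes the csim log hard to scan. A more compact smoke test, sketched here under the assumption that a NaN-free, not-all-zero output is a sufficient pass criterion (sanity_check is a helper introduced here, not part of the original testbench):

#include <cmath>  // std::isnan

// Hypothetical helper: flag NaNs and count non-zero outputs instead of
// printing every element. Returns true if the output looks sane.
template<int OUT_CH>
bool sanity_check(float output[OUT_CH][32][32]) {
    int nonzero = 0;
    for (int ch = 0; ch < OUT_CH; ch++)
        for (int row = 0; row < 32; row++)
            for (int col = 0; col < 32; col++) {
                float v = output[ch][row][col];
                if (std::isnan(v)) return false;  // any NaN fails the test
                if (v != 0.0f) nonzero++;
            }
    return nonzero > 0;  // an all-zero output would also be suspicious
}
// Usage in main():
//   if (!sanity_check<OUTPUT_CHANNELS>(output)) { std::cout << "FAIL\n"; return 1; }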
1.3 Initial Build (unoptimized, functional verification only)
1.4 Functional Verification
INFO: [SIM 2] *************** CSIM start ***************
INFO: [SIM 4] CSIM will launch GCC as the compiler.
Compiling ../../../../src/TestDethSepConv.cpp in debug mode
Generating csim.exe
output[0][0][0] = 0
output[0][0][1] = 0
output[0][0][2] = 0
output[0][0][3] = 0
output[0][0][4] = 0
......
output[11][23][23] = 0.655102
output[11][23][24] = 0.65415
output[11][23][25] = 0.65152
......
output[63][31][28] = 0
output[63][31][29] = 0
output[63][31][30] = 0
output[63][31][31] = 0
Test complete!
INFO: [SIM 1] CSim done with 0 errors.
INFO: [SIM 3] *************** CSIM finish ***************
Functional verification passed!
2 Implementation Optimization
2.1 First-Round Optimization
The two main directions for optimizing this Vitis HLS depthwise separable convolution code are parallelizing the computation and optimizing memory access. The detailed optimization steps and suggested code adjustments follow.
2.1.1 Adding Pipeline and Unroll Directives to the Activation Module
Using the #pragma HLS pipeline and #pragma HLS unroll directives in the Activation function raises the degree of parallelism and thus speeds up the computation.
Activation.h
inline float hard_swich(float x) {
    return x > 0 ? x : 0;
}

template<int CHANNELS>
void Activation(float input[CHANNELS][32][32],
                float output[CHANNELS][32][32],
                float (*activation_function)(float)) {
#pragma HLS inline
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = activation_function(input[ch][row][col]);
            }
        }
    }
}
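A synthesis caveat: Vitis HLS does not support function pointers in synthesizable code, so passing activation_function at run time is fragile even if it resolves here because the pointer is a compile-time constant and the function is inlined. A safer pattern binds the activation at compile time through a template parameter; a sketch (ActivationT and ActFn are names introduced here):

// Sketch: bind the activation at compile time via a template parameter
// instead of a runtime function pointer.
template<int CHANNELS, float (*ActFn)(float)>
void ActivationT(float input[CHANNELS][32][32],
                 float output[CHANNELS][32][32]) {
#pragma HLS inline
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = ActFn(input[ch][row][col]);
            }
        }
    }
}
// Usage: ActivationT<INPUT_CHANNELS, hard_swich>(bn1_out, activation1_out);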
2.1.2 Pipelining and Loop Unrolling in the BatchNormalization Module
We can apply the same optimizations to the BatchNormalization function.
BatchNormalization.h
template<int CHANNELS>
void BatchNormalization(float input[CHANNELS][32][32],
                        float output[CHANNELS][32][32],
                        float mean[CHANNELS],
                        float variance[CHANNELS],
                        float gamma[CHANNELS],
                        float beta[CHANNELS],
                        float epsilon = 1e-5f) {
#pragma HLS inline
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = gamma[ch]
                    * ((input[ch][row][col] - mean[ch]) / hls::sqrt(variance[ch] + epsilon))
                    + beta[ch];
            }
        }
    }
}
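Since mean, variance, gamma, and beta are constants at inference time, the divide and hls::sqrt can also be hoisted out of the pixel loops entirely by folding batch normalization into one multiply-add per element, y = scale * x + shift. A sketch of this common rewrite, not part of the original code (BatchNormalizationFolded is a name introduced here):

template<int CHANNELS>
void BatchNormalizationFolded(float input[CHANNELS][32][32],
                              float output[CHANNELS][32][32],
                              float mean[CHANNELS], float variance[CHANNELS],
                              float gamma[CHANNELS], float beta[CHANNELS],
                              float epsilon = 1e-5f) {
#pragma HLS inline
    // Precompute per-channel scale/shift once: y = scale * x + shift,
    // where scale = gamma / sqrt(var + eps) and shift = beta - mean * scale.
    float scale[CHANNELS], shift[CHANNELS];
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS pipeline
        scale[ch] = gamma[ch] / hls::sqrt(variance[ch] + epsilon);
        shift[ch] = beta[ch] - mean[ch] * scale[ch];
    }
    for (int ch = 0; ch < CHANNELS; ch++) {
        for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = scale[ch] * input[ch][row][col] + shift[ch];
            }
        }
    }
}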
2.1.3 Pipeline, Unroll, and Dataflow Directives in Conv2D and DepthwiseConv2D
For the Conv2D and DepthwiseConv2D functions, the optimization focuses on the inner convolution computation.
Conv2D.h
template<int INPUT_CHANNELS, int OUTPUT_CHANNELS, int KERNEL_SIZE, int STRIDE>
void Conv2D(float input[INPUT_CHANNELS][32][32],
            float output[OUTPUT_CHANNELS][32][32],
            float weights[OUTPUT_CHANNELS][INPUT_CHANNELS][KERNEL_SIZE][KERNEL_SIZE]) {
#pragma HLS inline
#pragma HLS dataflow
    for (int out_ch = 0; out_ch < OUTPUT_CHANNELS; out_ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row += STRIDE) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col += STRIDE) {
                float acc = 0;
                for (int in_ch = 0; in_ch < INPUT_CHANNELS; in_ch++) {
#pragma HLS unroll
                    for (int k_row = 0; k_row < KERNEL_SIZE; k_row++) {
#pragma HLS unroll
                        for (int k_col = 0; k_col < KERNEL_SIZE; k_col++) {
                            int in_row = row + k_row - KERNEL_SIZE / 2;
                            int in_col = col + k_col - KERNEL_SIZE / 2;
                            if (in_row >= 0 && in_row < 32 && in_col >= 0 && in_col < 32) {
                                acc += input[in_ch][in_row][in_col]
                                     * weights[out_ch][in_ch][k_row][k_col];
                            }
                        }
                    }
                }
                output[out_ch][row][col] = acc;
            }
        }
    }
}

template<int CHANNELS, int KERNEL_SIZE, int STRIDE>
void DepthwiseConv2D(float input[CHANNELS][32][32],
                     float output[CHANNELS][32][32],
                     float weights[CHANNELS][1][KERNEL_SIZE][KERNEL_SIZE]) {
#pragma HLS inline
#pragma HLS dataflow
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row += STRIDE) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col += STRIDE) {
                float acc = 0;
                for (int k_row = 0; k_row < KERNEL_SIZE; k_row++) {
#pragma HLS unroll
                    for (int k_col = 0; k_col < KERNEL_SIZE; k_col++) {
                        int in_row = row + k_row - KERNEL_SIZE / 2;
                        int in_col = col + k_col - KERNEL_SIZE / 2;
                        if (in_row >= 0 && in_row < 32 && in_col >= 0 && in_col < 32) {
                            acc += input[ch][in_row][in_col] * weights[ch][0][k_row][k_col];
                        }
                    }
                }
                output[ch][row][col] = acc;
            }
        }
    }
}
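Two caveats apply to the directives above. First, #pragma HLS dataflow only overlaps distinct producer/consumer tasks, and these functions each contain a single loop nest, so the pragma has nothing to overlap here. Second, fully unrolling the channel and kernel loops multiplies the number of simultaneous array reads per cycle, which on-chip memories can only serve if the arrays are partitioned. A sketch of the partition directives this unrolling implies, placed inside Conv2D before the loops; the cyclic factor is an assumption, not a measured value:

// Sketch: give the unrolled dimensions their own memory ports.
#pragma HLS array_partition variable=weights complete dim=1   // out_ch unroll
#pragma HLS array_partition variable=weights complete dim=3   // k_row unroll
#pragma HLS array_partition variable=weights complete dim=4   // k_col unroll
#pragma HLS array_partition variable=input cyclic factor=4 dim=1  // partial: a complete partition would be costly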
2.1.4 Dataflow Directive in the DethSepConv Module
Apply dataflow across the layers inside DethSepConv so that successive stages can stream data to one another.
DethSepConv.h
template<int INPUT_CHANNELS, int OUTPUT_CHANNELS, int KERNEL_SIZE, int STRIDE>
void DethSepConv(float input[INPUT_CHANNELS][32][32],
                 float output[OUTPUT_CHANNELS][32][32],
                 float depthwise_weights[INPUT_CHANNELS][1][KERNEL_SIZE][KERNEL_SIZE],
                 float pointwise_weights[OUTPUT_CHANNELS][INPUT_CHANNELS][1][1],
                 float bn_mean1[INPUT_CHANNELS], float bn_var1[INPUT_CHANNELS],
                 float bn_gamma1[INPUT_CHANNELS], float bn_beta1[INPUT_CHANNELS],
                 float bn_mean2[OUTPUT_CHANNELS], float bn_var2[OUTPUT_CHANNELS],
                 float bn_gamma2[OUTPUT_CHANNELS], float bn_beta2[OUTPUT_CHANNELS],
                 bool use_se = false,
                 float se_fc1_weights[INPUT_CHANNELS/16][INPUT_CHANNELS] = nullptr,
                 float se_fc1_bias[INPUT_CHANNELS/16] = nullptr,
                 float se_fc2_weights[INPUT_CHANNELS][INPUT_CHANNELS/16] = nullptr,
                 float se_fc2_bias[INPUT_CHANNELS] = nullptr) {
#pragma HLS dataflow
    float depthwise_out[INPUT_CHANNELS][32][32];
    float bn1_out[INPUT_CHANNELS][32][32];
    float activation1_out[INPUT_CHANNELS][32][32];
    float se_out[INPUT_CHANNELS][32][32];
    float pointwise_out[OUTPUT_CHANNELS][32][32];
    float bn2_out[OUTPUT_CHANNELS][32][32];

    DepthwiseConv2D<INPUT_CHANNELS, KERNEL_SIZE, STRIDE>(input, depthwise_out, depthwise_weights);
    BatchNormalization<INPUT_CHANNELS>(depthwise_out, bn1_out, bn_mean1, bn_var1, bn_gamma1, bn_beta1);
    Activation<INPUT_CHANNELS>(bn1_out, activation1_out, hard_swich);

    if (use_se) {
        SE_block<INPUT_CHANNELS>(activation1_out, se_out,
                                 se_fc1_weights, se_fc1_bias, se_fc2_weights, se_fc2_bias);
    } else {
        for (int ch = 0; ch < INPUT_CHANNELS; ch++) {
#pragma HLS unroll
            for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
                for (int col = 0; col < 32; col++) {
                    se_out[ch][row][col] = activation1_out[ch][row][col];
                }
            }
        }
    }

    Conv2D<INPUT_CHANNELS, OUTPUT_CHANNELS, 1, 1>(se_out, pointwise_out, pointwise_weights);
    BatchNormalization<OUTPUT_CHANNELS>(pointwise_out, bn2_out, bn_mean2, bn_var2, bn_gamma2, bn_beta2);
    Activation<OUTPUT_CHANNELS>(bn2_out, output, hard_swich);
}
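A caveat on this dataflow region: the canonical form expects a straight chain of tasks in which each intermediate buffer has exactly one producer and one consumer, and conditionally executed tasks such as the if (use_se) branch violate that form, typically making Vitis HLS warn and fall back to sequential execution. One common fix folds the branch into a single task that is always called; a sketch under that assumption (SE_or_copy is a name introduced here):

// Hypothetical wrapper: always called, so the dataflow region sees a fixed task chain.
template<int CHANNELS>
void SE_or_copy(bool use_se,
                float input[CHANNELS][32][32], float output[CHANNELS][32][32],
                float fc1_w[CHANNELS/16][CHANNELS], float fc1_b[CHANNELS/16],
                float fc2_w[CHANNELS][CHANNELS/16], float fc2_b[CHANNELS]) {
    if (use_se) {
        SE_block<CHANNELS>(input, output, fc1_w, fc1_b, fc2_w, fc2_b);
    } else {
        // Bypass: plain copy when SE is disabled
        for (int ch = 0; ch < CHANNELS; ch++) {
            for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
                for (int col = 0; col < 32; col++) {
                    output[ch][row][col] = input[ch][row][col];
                }
            }
        }
    }
}
// In DethSepConv the if/else is then replaced by one unconditional call:
//   SE_or_copy<INPUT_CHANNELS>(use_se, activation1_out, se_out,
//                              se_fc1_weights, se_fc1_bias, se_fc2_weights, se_fc2_bias);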
2.1.5 Directives in the SE_block Module
Add pipeline and unroll directives to the SE_block function.
template<int CHANNELS>
void SE_block(float input[CHANNELS][32][32],
              float output[CHANNELS][32][32],
              float fc1_weights[CHANNELS/16][CHANNELS],
              float fc1_bias[CHANNELS/16],
              float fc2_weights[CHANNELS][CHANNELS/16],
              float fc2_bias[CHANNELS]) {
    // Global average pooling
    float pooled[CHANNELS] = {0};
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col++) {
                pooled[ch] += input[ch][row][col];
            }
        }
        pooled[ch] /= 32 * 32;
    }

    // First fully connected layer
    float fc1_out[CHANNELS / 16] = {0};
    for (int i = 0; i < CHANNELS / 16; i++) {
#pragma HLS unroll
        for (int j = 0; j < CHANNELS; j++) {
#pragma HLS pipeline
            fc1_out[i] += pooled[j] * fc1_weights[i][j];
        }
        fc1_out[i] += fc1_bias[i];
        fc1_out[i] = hard_swich(fc1_out[i]);
    }

    // Second fully connected layer
    float fc2_out[CHANNELS] = {0};
    for (int i = 0; i < CHANNELS; i++) {
#pragma HLS unroll
        for (int j = 0; j < CHANNELS / 16; j++) {
#pragma HLS pipeline
            fc2_out[i] += fc1_out[j] * fc2_weights[i][j];
        }
        fc2_out[i] += fc2_bias[i];
        fc2_out[i] = hard_swich(fc2_out[i]);
    }

    // Channel-wise rescaling
    for (int ch = 0; ch < CHANNELS; ch++) {
#pragma HLS unroll
        for (int row = 0; row < 32; row++) {
#pragma HLS pipeline
            for (int col = 0; col < 32; col++) {
                output[ch][row][col] = input[ch][row][col] * fc2_out[ch];
            }
        }
    }
}
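Even with #pragma HLS pipeline, the reduction loops above carry a dependency on a single floating-point accumulator (pooled[ch], fc1_out[i], fc2_out[i]), so the achievable II is bounded by the fadd latency. Interleaved partial sums are a standard workaround; a sketch for the pooling stage, meant to replace the body of the ch loop (the factor 4 is an assumption tied to the adder latency):

// Sketch: accumulate into 4 interleaved partial sums so consecutive
// iterations hit different accumulators, relaxing the fadd dependency.
float partial[4] = {0, 0, 0, 0};
#pragma HLS array_partition variable=partial complete
for (int row = 0; row < 32; row++) {
    for (int col = 0; col < 32; col++) {
#pragma HLS pipeline II=1
        partial[col % 4] += input[ch][row][col];
    }
}
pooled[ch] = (partial[0] + partial[1] + partial[2] + partial[3]) / (32 * 32);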
2.1.6 Applying These Optimizations in top_function
Finally, we make sure top_function uses the optimized submodules.
top.cpp
extern "C" {void top_function(float input[32][32][32], float output[64][32][32], float depthwise_weights[32][1][3][3], float pointwise_weights[64][32][1][1], float bn_mean1[32], float bn_var1[32], float bn_gamma1[32], float bn_beta1[32], float bn_mean2[64], float bn_var2[64], float bn_gamma2[64], float bn_beta2[64], bool use_se, float se_fc1_weights[2][32], float se_fc1_bias[2], float se_fc2_weights[32][2], float se_fc2_bias[32]) {#pragma HLS INTERFACE m_axi port = input offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = output offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = depthwise_weights offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = pointwise_weights offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_mean1 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_var1 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_gamma1 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_beta1 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_mean2 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_var2 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_gamma2 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = bn_beta2 offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = se_fc1_weights offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = se_fc1_bias offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = se_fc2_weights offset = slave bundle = gmem#pragma HLS INTERFACE m_axi port = se_fc2_bias offset = slave bundle = gmem#pragma HLS INTERFACE s_axilite port = use_se bundle = control#pragma HLS INTERFACE s_axilite port = return bundle = control#pragma HLS dataflow DethSepConv<32, 64, 3, 1>(input, output, depthwise_weights, pointwise_weights, bn_mean1, bn_var1, bn_gamma1, bn_beta1, bn_mean2, bn_var2, bn_gamma2, bn_beta2, use_se, se_fc1_weights, se_fc1_bias, se_fc2_weights, se_fc2_bias);}}
With these modifications, the Vitis HLS implementation of the depthwise separable convolution runs significantly faster: #pragma HLS pipeline raises parallelism within each function, #pragma HLS unroll removes loop overhead, and #pragma HLS dataflow streams data between modules where task-level overlap is possible, ensuring efficient execution on the FPGA.