-
Notifications
You must be signed in to change notification settings - Fork 132
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add baseline implementation * support onednn * kernel timing * performance * int8 onednn * utils * minor * fix * minor * avx imp for int8 gemm * add file * fix * cuda compiler flags * compilation for int8 * minor * minor * minor * 2x2 unroll * omp imp * unroll 32 elements * min/max params * bias support * minor * minor * fix * bf32 fp32 ops * bmm * fix * rounding * fix
- Loading branch information
Showing
11 changed files
with
2,168 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Check operating system
OS := $(shell uname)

# OneDNN availability.
# On macOS, otool prints the library's link lines when it exists (non-empty
# output means "available"). On Linux, `pkg-config --exists` prints NOTHING
# and only signals via its exit status, so we must capture $? explicitly;
# the original assignment from its (always empty) stdout never detected it.
ONEDNN_AVAILABLE =
ifeq ($(OS), Darwin)  # macOS
    $(info Detected macOS)
    ONEDNN_AVAILABLE := $(shell otool -L /usr/local/lib/libdnnl* 2> /dev/null)
else ifeq ($(OS), Linux)  # Ubuntu or other Linux distributions
    $(info Detected Linux)
    ONEDNN_AVAILABLE_CHK := $(shell pkg-config --exists dnnl; echo $$?)
    ifeq ($(ONEDNN_AVAILABLE_CHK), 0)
        ONEDNN_AVAILABLE := 1
    endif
else
    $(error Unsupported operating system)
endif

# Check if CUDA is available (non-empty when nvcc is on the expected path).
CUDA_AVAILABLE := $(shell command -v /usr/local/cuda/bin/nvcc 2> /dev/null)

# Compiler and flags
CC_FLAGS = -O3 -std=c++11  #-g
#CC_FLAGS = -O3 -std=c++11 -Xclang -fopenmp -g
ifdef CUDA_AVAILABLE
    CC = /usr/local/cuda/bin/nvcc
    CC_FLAGS += -DCUDA_ENABLE
    $(info CUDA is available)
else
    CC = g++
    CC_FLAGS += -mavx2 -mfma
endif
ifdef ONEDNN_AVAILABLE
    CC_FLAGS += -DONEDNN_ENABLE
    $(info ONEDNN is available)
endif

# Include directories
# INCLUDE_DIRS = -I./ -I/usr/local/opt/libomp/include
INCLUDE_DIRS = -I./

# Library directories (used on the link line below)
LIBRARY_DIRS = -L/usr/local/cuda/lib64

# Library flags
LDFLAGS =
ifdef ONEDNN_AVAILABLE
    LDFLAGS += -ldnnl
endif

# TODO: openmp flag
OMP_FLAGS = -L/usr/local/opt/libomp/lib/ -lomp
# LDFLAGS += $(OMP_FLAGS)

# Files
TARGET = benchmark_run
CUDA_SRCS = lib/matmul.cu
CPP_SRCS = benchmark/main.cc lib/matmul_imp.cc lib/utils.cc lib/matmul_int8.cc lib/matmul_avx_int8.cc
ONEDNN_SRCS = lib/matmul_onednn.cc

# Objects
# (a dead INT8_OBJS list built from the undefined INT8_CPP_SRCS was removed)
OBJS = $(CPP_SRCS:.cc=.o)
ifdef CUDA_AVAILABLE
    OBJS += $(CUDA_SRCS:.cu=.o)
endif
ifdef ONEDNN_AVAILABLE
    OBJS += $(ONEDNN_SRCS:.cc=.o)
endif

# $(info ONEDNN_AVAILABLE: $(ONEDNN_AVAILABLE))
$(info CC_FLAGS: $(CC_FLAGS))

# Targets
.PHONY: all clean

all: $(TARGET)

# Link: libraries must come AFTER the object files that reference them,
# or GNU ld may discard them before their symbols are requested.
$(TARGET): $(OBJS)
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) $(LIBRARY_DIRS) -o $(TARGET) $(OBJS) $(LDFLAGS)

# Compile-only rules: linker flags are meaningless with -c, so omit them.
%.o: %.cu
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -c $< -o $@

ifdef CUDA_AVAILABLE
%.o: %.cc
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -x cu -c $< -o $@
else
%.o: %.cc
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -c $< -o $@
#	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -c $< -o $@ $(OMP_FLAGS)
endif

clean:
	rm -f $(TARGET) $(OBJS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Build onednn (enable openmp on mac)
#
# Run from a build directory inside the oneDNN source tree (`cmake ..`).
# Apple clang has no built-in OpenMP runtime, so every OpenMP variable is
# pointed at Homebrew's libomp explicitly (-Xclang -fopenmp plus include,
# library name, and dylib path), and the shared linker gets an rpath so the
# built libdnnl can find libomp at load time.
# NOTE(review): assumes libomp lives under /usr/local/opt/libomp (Intel-mac
# Homebrew prefix) — confirm for Apple-silicon installs (/opt/homebrew).

cmake .. -DOpenMP_C_FLAGS="-Xclang -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_C_LIB_NAMES="libomp" -DDNNL_CPU_RUNTIME=OMP -DOpenMP_CXX_FLAGS="-Xclang -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_CXX_LIB_NAMES="libomp" -DOpenMP_libomp_LIBRARY=/usr/local/opt/libomp/lib/libomp.dylib -DCMAKE_SHARED_LINKER_FLAGS="-L/usr/local/opt/libomp/lib/ -lomp -Wl,-rpath,/usr/local/opt/libomp/lib/"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
|
||
#include <cstdlib> | ||
#include <iostream> | ||
|
||
#include "lib/matmul.h" | ||
|
||
#define BLK_SIZE 16 | ||
#define MAX_PRECISION_ERROR 0.01 | ||
|
||
#define M 1024 | ||
#define N 1024 | ||
#define K 1024 | ||
#define A_ROW M | ||
#define A_COLUMN K | ||
#define B_ROW K | ||
#define B_COLUMN N | ||
#define C_ROW M | ||
#define C_COLUMN N | ||
#define NUM_THREAD 16 | ||
|
||
float MAT_A[A_ROW * A_COLUMN]; | ||
float MAT_B[B_ROW * B_COLUMN]; | ||
float transpose_B[B_ROW * B_COLUMN]; | ||
float native_C[C_ROW * C_COLUMN]; | ||
float output_C[C_ROW * C_COLUMN]; | ||
|
||
int8_t MAT_A_s8[A_ROW * A_COLUMN]; | ||
int8_t MAT_B_s8[B_ROW * B_COLUMN]; | ||
int32_t bias_s32[C_COLUMN]; | ||
int8_t transpose_B_s8[B_ROW * B_COLUMN]; | ||
int8_t native_C_s8[C_ROW * C_COLUMN]; | ||
int8_t output_C_s8[C_ROW * C_COLUMN]; | ||
|
||
bool check_identical(float matA[], float matB[], int size) { | ||
for (int i = 0; i < size; i++) { | ||
if (abs((matA[i] - matB[i]) / (matA[i])) > MAX_PRECISION_ERROR) { | ||
printf("%d: %f, %f", i, matA[i], matB[i]); | ||
return false; | ||
} | ||
} | ||
return true; | ||
} | ||
|
||
// Exact element-wise comparison of two int8 arrays; on the first mismatch,
// print its index and the two values, then report failure.
bool check_identical(int8_t matA[], int8_t matB[], int size) {
    int idx = 0;
    while (idx < size) {
        if (matA[idx] != matB[idx]) {
            printf("%d: %d, %d", idx, matA[idx], matB[idx]);
            return false;
        }
        ++idx;
    }
    return true;
}
|
||
// Debug helper: print an integer array as a comma-separated line.
// T is expected to be an integer type narrow enough to promote to int
// for the %d varargs conversion (int8_t/int32_t in this file).
template <typename T>
void dump_integer_array(T matA[], int size) {
    int idx = 0;
    while (idx < size) {
        printf("%d,", matA[idx]);
        ++idx;
    }
    printf("\n");
}
|
||
// Fill A with pseudo-random floats uniformly distributed in [0, 1].
// Uses the global rand() stream, so results depend on the current seed.
void initialize_matrix(float A[], int size) {
    int idx = 0;
    while (idx < size) {
        A[idx] = (float)rand() / (float)RAND_MAX;
        ++idx;
    }
}
|
||
// Fill A with pseudo-random values from {0, 1} via the global rand() stream.
// The tiny range keeps int8 matmul accumulations far from overflow.
void initialize_matrix(int8_t A[], int size) {
    int idx = 0;
    while (idx < size) {
        // A[idx] = (rand() % 2) - 1;
        A[idx] = rand() % 2;
        ++idx;
    }
}
|
||
// Fill A with pseudo-random values from {0, 1} via the global rand() stream.
// Mirrors the int8 overload so mixed-type operands share the same value range.
void initialize_matrix(int32_t A[], int size) {
    int idx = 0;
    while (idx < size) {
        // A[idx] = (rand() % 2) - 1;
        A[idx] = rand() % 2;
        ++idx;
    }
}
|
||
using namespace matmul; | ||
|
||
// Benchmark driver: fills random fp32 and int8 operands, runs every matmul
// variant through MatmulOperator::evaluate, and flags any variant whose
// output differs from the NAIVE (fp32) or INT8_BASELINE (int8) reference.
// NOTE(review): evaluate() is declared elsewhere (lib/matmul.h); it appears
// to both run and time each kernel — confirm against its implementation.
int main() {
    // Initialize fp32 operands; native_C is pre-filled here but fully
    // overwritten by the NAIVE reference run below.
    initialize_matrix(MAT_A, A_ROW * A_COLUMN);
    initialize_matrix(MAT_B, B_ROW * B_COLUMN);
    initialize_matrix(native_C, C_ROW * C_COLUMN);

    // Initialize int8 operands. bias_s32 is left zero-initialized (static
    // storage), so the bias term is effectively disabled in this run.
    initialize_matrix(MAT_A_s8, A_ROW * A_COLUMN);
    initialize_matrix(MAT_B_s8, B_ROW * B_COLUMN);
    initialize_matrix(native_C_s8, C_ROW * C_COLUMN);
    // initialize_matrix(bias_s32, C_ROW * C_COLUMN);

    MatmulOperator matmul_op = MatmulOperator();

    // Shared parameter structs: `params` for fp32 kernels, `params_int8`
    // for the quantized kernels.
    struct matmul_params params, params_int8;
    params.A.row = A_ROW;
    params.A.column = A_COLUMN;
    params.A.data_ptr = MAT_A;
    params.B.row = B_ROW;
    params.B.column = B_COLUMN;
    params.B.data_ptr = MAT_B;
    params.C.row = C_ROW;
    params.C.column = C_COLUMN;
    params.opt_params.blk_size = BLK_SIZE;
    params.opt_params.num_thread = NUM_THREAD;

    // int8 parameters: identity quantization (scale 1.0, zero_point 0) so
    // the integer kernels can be checked for exact equality.
    params_int8.A.row = A_ROW;
    params_int8.A.column = A_COLUMN;
    params_int8.A.int8_data_ptr = MAT_A_s8;
    params_int8.A.qparams.scale = 1.0;
    params_int8.A.qparams.zero_point = 0;
    params_int8.B.row = B_ROW;
    params_int8.B.column = B_COLUMN;
    params_int8.B.int8_data_ptr = MAT_B_s8;
    params_int8.B.qparams.scale = 1.0;
    params_int8.B.qparams.zero_point = 0;
    params_int8.C.row = C_ROW;
    params_int8.C.column = C_COLUMN;
    params_int8.C.int8_data_ptr = native_C_s8;
    params_int8.C.qparams.scale = 1.0;
    params_int8.C.qparams.q_max = 127;   // int8 saturation bounds
    params_int8.C.qparams.q_min = -128;
    params_int8.C.qparams.zero_point = 0;
    params_int8.opt_params.blk_size = BLK_SIZE;
    params_int8.opt_params.num_thread = NUM_THREAD;
    params_int8.bias.row = 1;
    params_int8.bias.column = C_COLUMN;
    params_int8.bias.int32_data_ptr = bias_s32;

    // Baseline: NAIVE writes the fp32 reference into native_C.
    params.C.data_ptr = native_C;
    matmul_op.evaluate(MatmulOperator::NAIVE, &params);

    // All subsequent fp32 variants write into output_C and are compared
    // against native_C.
    params.C.data_ptr = output_C;
    // unrolling
    matmul_op.evaluate(MatmulOperator::UNROLL, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_unrolling\n");

    // reordering
    matmul_op.evaluate(MatmulOperator::REORDER, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_reordering\n");

    // tiling
    matmul_op.evaluate(MatmulOperator::TILING, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_tiling\n");

    // multithreading
    matmul_op.evaluate(MatmulOperator::MULTITHREAD, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_multithreading\n");

    // transpose
    matmul_op.evaluate(MatmulOperator::TRANSPOSE, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_transpose\n");

    // transpose + simd: output_C is re-randomized first so a kernel that
    // silently writes nothing cannot pass on stale correct data.
    initialize_matrix(output_C, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::TRANSPOSE_SIMD, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_transpose_simd\n");

    // cuda (compiled only when the Makefile detected nvcc)
#ifdef CUDA_ENABLE
    matmul_op.evaluate(MatmulOperator::CUDA, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_cuda\n");
#endif

    // ONEDNN fp32 (compiled only when the Makefile detected oneDNN)
#ifdef ONEDNN_ENABLE
    initialize_matrix(output_C, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::ONEDNN_FP32, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("\nincorrect output of mat_mul_onedenn\n");
#endif

    // For fast, we need to transpose B first: the FAST kernel expects B in
    // column-major (B^T) layout, so params.B is repointed at transpose_B
    // with swapped row/column dimensions.
    for (int i = 0; i < B_COLUMN; i++)
        for (int j = 0; j < B_ROW; j++) transpose_B[i * B_ROW + j] = MAT_B[j * B_COLUMN + i];
    params.B.column = B_ROW;
    params.B.row = B_COLUMN;
    params.B.data_ptr = transpose_B;
    params.opt_params.blk_size = BLK_SIZE;
    params.opt_params.num_thread = NUM_THREAD;

    // fast
    initialize_matrix(output_C, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::FAST, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_fast\n");

    // int8 baseline writes the integer reference into native_C_s8.
    matmul_op.evaluate(MatmulOperator::INT8_BASELINE, &params_int8);

    // Subsequent int8 variants write into output_C_s8.
    params_int8.C.int8_data_ptr = output_C_s8;

    // For int8 SIMD, we need to transpose B first (same B^T layout
    // requirement as the fp32 FAST kernel above).
    for (int i = 0; i < B_COLUMN; i++)
        for (int j = 0; j < B_ROW; j++) transpose_B_s8[i * B_ROW + j] = MAT_B_s8[j * B_COLUMN + i];

    params_int8.B.int8_data_ptr = transpose_B_s8;
    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST_2x2, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast_2x2\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST_2x2_32UNROLL, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast_2x2_32unroll\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST_2x2_OMP, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast_2x2_omp\n");

    // ONEDNN int8
#ifdef ONEDNN_ENABLE
    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::ONEDNN_INT8, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_onednn_int8\n");
#endif
    // Debugging
    // dump_integer_array(MAT_A_s8, A_ROW * A_COLUMN);
    // dump_integer_array(MAT_B_s8, B_ROW * B_COLUMN);
    // dump_integer_array(native_C_s8, C_ROW * C_COLUMN);
    // dump_integer_array(output_C_s8, C_ROW * C_COLUMN);

    return 0;
}
Oops, something went wrong.