-
Notifications
You must be signed in to change notification settings - Fork 132
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add baseline implementation * support onednn * kernel timing * performance * int8 onednn * utils * minor * fix * minor * avx imp for int8 gemm * add file * fix * cuda compiler flags * compilation for int8 * minor * minor * minor * 2x2 unroll * omp imp * unroll 32 elements * min/max params * bias support * minor * minor * fix * bf32 fp32 ops * bmm * fix * rounding * fix
- Loading branch information
Showing
11 changed files
with
2,168 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Check operating system
OS := $(shell uname)

# OneDNN availability.
# On macOS, otool prints the library's link lines when it exists (non-empty
# output means "available"). On Linux, `pkg-config --exists` prints NOTHING
# and only signals via its exit status, so we must capture $? explicitly;
# the original assignment from its (always empty) stdout never detected it.
ONEDNN_AVAILABLE =
ifeq ($(OS), Darwin)  # macOS
    $(info Detected macOS)
    ONEDNN_AVAILABLE := $(shell otool -L /usr/local/lib/libdnnl* 2> /dev/null)
else ifeq ($(OS), Linux)  # Ubuntu or other Linux distributions
    $(info Detected Linux)
    ONEDNN_AVAILABLE_CHK := $(shell pkg-config --exists dnnl; echo $$?)
    ifeq ($(ONEDNN_AVAILABLE_CHK), 0)
        ONEDNN_AVAILABLE := 1
    endif
else
    $(error Unsupported operating system)
endif

# Check if CUDA is available (non-empty when nvcc is on the expected path).
CUDA_AVAILABLE := $(shell command -v /usr/local/cuda/bin/nvcc 2> /dev/null)

# Compiler and flags
CC_FLAGS = -O3 -std=c++11  #-g
#CC_FLAGS = -O3 -std=c++11 -Xclang -fopenmp -g
ifdef CUDA_AVAILABLE
    CC = /usr/local/cuda/bin/nvcc
    CC_FLAGS += -DCUDA_ENABLE
    $(info CUDA is available)
else
    CC = g++
    CC_FLAGS += -mavx2 -mfma
endif
ifdef ONEDNN_AVAILABLE
    CC_FLAGS += -DONEDNN_ENABLE
    $(info ONEDNN is available)
endif

# Include directories
# INCLUDE_DIRS = -I./ -I/usr/local/opt/libomp/include
INCLUDE_DIRS = -I./

# Library directories (used on the link line below)
LIBRARY_DIRS = -L/usr/local/cuda/lib64

# Library flags
LDFLAGS =
ifdef ONEDNN_AVAILABLE
    LDFLAGS += -ldnnl
endif

# TODO: openmp flag
OMP_FLAGS = -L/usr/local/opt/libomp/lib/ -lomp
# LDFLAGS += $(OMP_FLAGS)

# Files
TARGET = benchmark_run
CUDA_SRCS = lib/matmul.cu
CPP_SRCS = benchmark/main.cc lib/matmul_imp.cc lib/utils.cc lib/matmul_int8.cc lib/matmul_avx_int8.cc
ONEDNN_SRCS = lib/matmul_onednn.cc

# Objects
# (a dead INT8_OBJS list built from the undefined INT8_CPP_SRCS was removed)
OBJS = $(CPP_SRCS:.cc=.o)
ifdef CUDA_AVAILABLE
    OBJS += $(CUDA_SRCS:.cu=.o)
endif
ifdef ONEDNN_AVAILABLE
    OBJS += $(ONEDNN_SRCS:.cc=.o)
endif

# $(info ONEDNN_AVAILABLE: $(ONEDNN_AVAILABLE))
$(info CC_FLAGS: $(CC_FLAGS))

# Targets
.PHONY: all clean

all: $(TARGET)

# Link: libraries must come AFTER the object files that reference them,
# or GNU ld may discard them before their symbols are requested.
$(TARGET): $(OBJS)
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) $(LIBRARY_DIRS) -o $(TARGET) $(OBJS) $(LDFLAGS)

# Compile-only rules: linker flags are meaningless with -c, so omit them.
%.o: %.cu
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -c $< -o $@

ifdef CUDA_AVAILABLE
%.o: %.cc
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -x cu -c $< -o $@
else
%.o: %.cc
	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -c $< -o $@
#	$(CC) $(CC_FLAGS) $(INCLUDE_DIRS) -c $< -o $@ $(OMP_FLAGS)
endif

clean:
	rm -f $(TARGET) $(OBJS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Build onednn (enable openmp on mac)
#
# Run from a build directory inside the oneDNN source tree (`cmake ..`).
# Apple clang has no built-in OpenMP runtime, so every OpenMP variable is
# pointed at Homebrew's libomp explicitly (-Xclang -fopenmp plus include,
# library name, and dylib path), and the shared linker gets an rpath so the
# built libdnnl can find libomp at load time.
# NOTE(review): assumes libomp lives under /usr/local/opt/libomp (Intel-mac
# Homebrew prefix) — confirm for Apple-silicon installs (/opt/homebrew).

cmake .. -DOpenMP_C_FLAGS="-Xclang -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_C_LIB_NAMES="libomp" -DDNNL_CPU_RUNTIME=OMP -DOpenMP_CXX_FLAGS="-Xclang -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_CXX_LIB_NAMES="libomp" -DOpenMP_libomp_LIBRARY=/usr/local/opt/libomp/lib/libomp.dylib -DCMAKE_SHARED_LINKER_FLAGS="-L/usr/local/opt/libomp/lib/ -lomp -Wl,-rpath,/usr/local/opt/libomp/lib/"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
|
||
#include <cstdlib> | ||
#include <iostream> | ||
|
||
#include "lib/matmul.h" | ||
|
||
#define BLK_SIZE 16 | ||
#define MAX_PRECISION_ERROR 0.01 | ||
|
||
#define M 1024 | ||
#define N 1024 | ||
#define K 1024 | ||
#define A_ROW M | ||
#define A_COLUMN K | ||
#define B_ROW K | ||
#define B_COLUMN N | ||
#define C_ROW M | ||
#define C_COLUMN N | ||
#define NUM_THREAD 16 | ||
|
||
float MAT_A[A_ROW * A_COLUMN]; | ||
float MAT_B[B_ROW * B_COLUMN]; | ||
float transpose_B[B_ROW * B_COLUMN]; | ||
float native_C[C_ROW * C_COLUMN]; | ||
float output_C[C_ROW * C_COLUMN]; | ||
|
||
int8_t MAT_A_s8[A_ROW * A_COLUMN]; | ||
int8_t MAT_B_s8[B_ROW * B_COLUMN]; | ||
int32_t bias_s32[C_COLUMN]; | ||
int8_t transpose_B_s8[B_ROW * B_COLUMN]; | ||
int8_t native_C_s8[C_ROW * C_COLUMN]; | ||
int8_t output_C_s8[C_ROW * C_COLUMN]; | ||
|
||
bool check_identical(float matA[], float matB[], int size) { | ||
for (int i = 0; i < size; i++) { | ||
if (abs((matA[i] - matB[i]) / (matA[i])) > MAX_PRECISION_ERROR) { | ||
printf("%d: %f, %f", i, matA[i], matB[i]); | ||
return false; | ||
} | ||
} | ||
return true; | ||
} | ||
|
||
// Exact element-wise comparison of two int8 arrays; on the first mismatch,
// print its index and the two values, then report failure.
bool check_identical(int8_t matA[], int8_t matB[], int size) {
    int idx = 0;
    while (idx < size) {
        if (matA[idx] != matB[idx]) {
            printf("%d: %d, %d", idx, matA[idx], matB[idx]);
            return false;
        }
        ++idx;
    }
    return true;
}
|
||
// Debug helper: print an integer array as a comma-separated line.
// T is expected to be an integer type narrow enough to promote to int
// for the %d varargs conversion (int8_t/int32_t in this file).
template <typename T>
void dump_integer_array(T matA[], int size) {
    int idx = 0;
    while (idx < size) {
        printf("%d,", matA[idx]);
        ++idx;
    }
    printf("\n");
}
|
||
// Fill A with pseudo-random floats uniformly distributed in [0, 1].
// Uses the global rand() stream, so results depend on the current seed.
void initialize_matrix(float A[], int size) {
    int idx = 0;
    while (idx < size) {
        A[idx] = (float)rand() / (float)RAND_MAX;
        ++idx;
    }
}
|
||
// Fill A with pseudo-random values from {0, 1} via the global rand() stream.
// The tiny range keeps int8 matmul accumulations far from overflow.
void initialize_matrix(int8_t A[], int size) {
    int idx = 0;
    while (idx < size) {
        // A[idx] = (rand() % 2) - 1;
        A[idx] = rand() % 2;
        ++idx;
    }
}
|
||
// Fill A with pseudo-random values from {0, 1} via the global rand() stream.
// Mirrors the int8 overload so mixed-type operands share the same value range.
void initialize_matrix(int32_t A[], int size) {
    int idx = 0;
    while (idx < size) {
        // A[idx] = (rand() % 2) - 1;
        A[idx] = rand() % 2;
        ++idx;
    }
}
|
||
using namespace matmul; | ||
|
||
// Benchmark driver: fills random fp32 and int8 operands, runs every matmul
// variant through MatmulOperator::evaluate, and flags any variant whose
// output differs from the NAIVE (fp32) or INT8_BASELINE (int8) reference.
// NOTE(review): evaluate() is declared elsewhere (lib/matmul.h); it appears
// to both run and time each kernel — confirm against its implementation.
int main() {
    // Initialize fp32 operands; native_C is pre-filled here but fully
    // overwritten by the NAIVE reference run below.
    initialize_matrix(MAT_A, A_ROW * A_COLUMN);
    initialize_matrix(MAT_B, B_ROW * B_COLUMN);
    initialize_matrix(native_C, C_ROW * C_COLUMN);

    // Initialize int8 operands. bias_s32 is left zero-initialized (static
    // storage), so the bias term is effectively disabled in this run.
    initialize_matrix(MAT_A_s8, A_ROW * A_COLUMN);
    initialize_matrix(MAT_B_s8, B_ROW * B_COLUMN);
    initialize_matrix(native_C_s8, C_ROW * C_COLUMN);
    // initialize_matrix(bias_s32, C_ROW * C_COLUMN);

    MatmulOperator matmul_op = MatmulOperator();

    // Shared parameter structs: `params` for fp32 kernels, `params_int8`
    // for the quantized kernels.
    struct matmul_params params, params_int8;
    params.A.row = A_ROW;
    params.A.column = A_COLUMN;
    params.A.data_ptr = MAT_A;
    params.B.row = B_ROW;
    params.B.column = B_COLUMN;
    params.B.data_ptr = MAT_B;
    params.C.row = C_ROW;
    params.C.column = C_COLUMN;
    params.opt_params.blk_size = BLK_SIZE;
    params.opt_params.num_thread = NUM_THREAD;

    // int8 parameters: identity quantization (scale 1.0, zero_point 0) so
    // the integer kernels can be checked for exact equality.
    params_int8.A.row = A_ROW;
    params_int8.A.column = A_COLUMN;
    params_int8.A.int8_data_ptr = MAT_A_s8;
    params_int8.A.qparams.scale = 1.0;
    params_int8.A.qparams.zero_point = 0;
    params_int8.B.row = B_ROW;
    params_int8.B.column = B_COLUMN;
    params_int8.B.int8_data_ptr = MAT_B_s8;
    params_int8.B.qparams.scale = 1.0;
    params_int8.B.qparams.zero_point = 0;
    params_int8.C.row = C_ROW;
    params_int8.C.column = C_COLUMN;
    params_int8.C.int8_data_ptr = native_C_s8;
    params_int8.C.qparams.scale = 1.0;
    params_int8.C.qparams.q_max = 127;   // int8 saturation bounds
    params_int8.C.qparams.q_min = -128;
    params_int8.C.qparams.zero_point = 0;
    params_int8.opt_params.blk_size = BLK_SIZE;
    params_int8.opt_params.num_thread = NUM_THREAD;
    params_int8.bias.row = 1;
    params_int8.bias.column = C_COLUMN;
    params_int8.bias.int32_data_ptr = bias_s32;

    // Baseline: NAIVE writes the fp32 reference into native_C.
    params.C.data_ptr = native_C;
    matmul_op.evaluate(MatmulOperator::NAIVE, &params);

    // All subsequent fp32 variants write into output_C and are compared
    // against native_C.
    params.C.data_ptr = output_C;
    // unrolling
    matmul_op.evaluate(MatmulOperator::UNROLL, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_unrolling\n");

    // reordering
    matmul_op.evaluate(MatmulOperator::REORDER, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_reordering\n");

    // tiling
    matmul_op.evaluate(MatmulOperator::TILING, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_tiling\n");

    // multithreading
    matmul_op.evaluate(MatmulOperator::MULTITHREAD, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_multithreading\n");

    // transpose
    matmul_op.evaluate(MatmulOperator::TRANSPOSE, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_transpose\n");

    // transpose + simd: output_C is re-randomized first so a kernel that
    // silently writes nothing cannot pass on stale correct data.
    initialize_matrix(output_C, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::TRANSPOSE_SIMD, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_transpose_simd\n");

    // cuda (compiled only when the Makefile detected nvcc)
#ifdef CUDA_ENABLE
    matmul_op.evaluate(MatmulOperator::CUDA, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_cuda\n");
#endif

    // ONEDNN fp32 (compiled only when the Makefile detected oneDNN)
#ifdef ONEDNN_ENABLE
    initialize_matrix(output_C, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::ONEDNN_FP32, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("\nincorrect output of mat_mul_onedenn\n");
#endif

    // For fast, we need to transpose B first: the FAST kernel expects B in
    // column-major (B^T) layout, so params.B is repointed at transpose_B
    // with swapped row/column dimensions.
    for (int i = 0; i < B_COLUMN; i++)
        for (int j = 0; j < B_ROW; j++) transpose_B[i * B_ROW + j] = MAT_B[j * B_COLUMN + i];
    params.B.column = B_ROW;
    params.B.row = B_COLUMN;
    params.B.data_ptr = transpose_B;
    params.opt_params.blk_size = BLK_SIZE;
    params.opt_params.num_thread = NUM_THREAD;

    // fast
    initialize_matrix(output_C, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::FAST, &params);
    if (!check_identical(native_C, output_C, C_ROW * C_COLUMN)) printf("incorrect output of mat_mul_fast\n");

    // int8 baseline writes the integer reference into native_C_s8.
    matmul_op.evaluate(MatmulOperator::INT8_BASELINE, &params_int8);

    // Subsequent int8 variants write into output_C_s8.
    params_int8.C.int8_data_ptr = output_C_s8;

    // For int8 SIMD, we need to transpose B first (same B^T layout
    // requirement as the fp32 FAST kernel above).
    for (int i = 0; i < B_COLUMN; i++)
        for (int j = 0; j < B_ROW; j++) transpose_B_s8[i * B_ROW + j] = MAT_B_s8[j * B_COLUMN + i];

    params_int8.B.int8_data_ptr = transpose_B_s8;
    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST_2x2, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast_2x2\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST_2x2_32UNROLL, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast_2x2_32unroll\n");

    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::INT8_AVX_FAST_2x2_OMP, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_avx_int8_fast_2x2_omp\n");

    // ONEDNN int8
#ifdef ONEDNN_ENABLE
    initialize_matrix(output_C_s8, C_ROW * C_COLUMN);
    matmul_op.evaluate(MatmulOperator::ONEDNN_INT8, &params_int8);
    if (!check_identical(native_C_s8, output_C_s8, C_ROW * C_COLUMN))
        printf("incorrect output from mat_mul_onednn_int8\n");
#endif
    // Debugging
    // dump_integer_array(MAT_A_s8, A_ROW * A_COLUMN);
    // dump_integer_array(MAT_B_s8, B_ROW * B_COLUMN);
    // dump_integer_array(native_C_s8, C_ROW * C_COLUMN);
    // dump_integer_array(output_C_s8, C_ROW * C_COLUMN);

    return 0;
}
Oops, something went wrong.