Merge branch 'release/rocm-rel-5.6-staging' into release/rocm-rel-5.6
junliume committed Aug 17, 2023
2 parents 5b967b9 + 2d5df56 commit a3a8c84
Showing 7 changed files with 163 additions and 44 deletions.
62 changes: 48 additions & 14 deletions src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
@@ -382,12 +382,28 @@ GetBwdXdlopsNHWCConfigList()
return kernel_param_list;
}

static std::tuple<std::string, // kernel_name
size_t, // block_size
size_t, // grid_size
size_t> // splits_4G
// clang-format off
static inline PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
GetBwdXdlopsNHWCConfigLargestTileFp32()
{
return {"bwd", "nhwc", miopenFloat, 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}};
}
static inline PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
GetBwdXdlopsNHWCConfigLargestTileFp16()
{
return {"bwd", "nhwc", miopenHalf, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}};
}
static inline PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
GetBwdXdlopsNHWCConfigLargestTileBf16()
{
return {"bwd", "nhwc", miopenBFloat16, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}};
}
// clang-format on

static std::tuple<size_t, // block_size
size_t, // grid_size
size_t> // splits_4G
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(
const ConvolutionContext& ctx,
const ProblemDescription& problem,
const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config)
{
@@ -441,12 +457,11 @@ GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(
integer_divide_ceil(gemm_n, config.gemm_n_per_block) * (1 << config.gemm_k_global_split);
if(config.multihead != 0)
grid_size *= num_of_gemm;
std::string kernel_name = config.ToKernelName(ctx);
return std::make_tuple(kernel_name, block_size, grid_size, splits_4G);
return std::make_tuple(block_size, grid_size, splits_4G);
}

void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(
const ConvolutionContext& ctx, const ProblemDescription& problem)
const ConvolutionContext&, const ProblemDescription& problem)
{
static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
std::make_tuple(128, 128, 16),
@@ -693,8 +708,8 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(
}
}
size_t current_grid_size;
std::tie(std::ignore, std::ignore, current_grid_size, std::ignore) =
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(ctx, problem, config);
std::tie(std::ignore, current_grid_size, std::ignore) =
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, config);
size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size,
1200,
k / group,
@@ -809,6 +824,13 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(
if(problem.IsFp16() && gemm_k_global_split != 0 && vector_store != 1 && splits_4G > 1)
return false;

size_t current_block_size, current_grid_size, current_splits_4G;
std::tie(current_block_size, current_grid_size, current_splits_4G) =
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, *this);

if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
return false;

bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
(dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);

@@ -934,7 +956,18 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(
problem.n_outputs,
miopen::GetTypeSize(problem.in_data_type)))
return false;

{
auto largest_config = problem.IsFp32()
? GetBwdXdlopsNHWCConfigLargestTileFp32()
: (problem.IsFp16() ? GetBwdXdlopsNHWCConfigLargestTileFp16()
: GetBwdXdlopsNHWCConfigLargestTileBf16());
size_t current_block_size, current_grid_size, current_splits_4G;
std::tie(current_block_size, current_grid_size, current_splits_4G) =
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, largest_config);

if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
return false;
}
return true;
}

@@ -1000,14 +1033,15 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution(
ConvSolution result;
KernelInfo kernel;

std::string kernel_name;
size_t block_size;
size_t grid_size;

int splits_4G;

std::tie(kernel_name, block_size, grid_size, splits_4G) =
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(ctx, problem, config);
std::tie(block_size, grid_size, splits_4G) =
GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, config);

std::string kernel_name = config.ToKernelName(ctx);

const auto required_workspace_size = GetWorkspaceSize(ctx, problem);
result.workspace_sz = required_workspace_size;
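The recurring change in this file is a launch-size guard: both `IsValid` and `IsApplicable` now compute the launch dimensions and reject any case where `block_size * grid_size * splits_4G` exceeds `0xffffffff`, i.e. where the total work-item count would overflow a 32-bit dispatch field. A minimal sketch of that guard follows; the struct and function names are invented for illustration, only the bound comes from the diff:

```cpp
#include <cstddef>

// Illustrative stand-in for the tuple returned by
// GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(problem, config).
struct LaunchDims
{
    std::size_t block_size;
    std::size_t grid_size;
    std::size_t splits_4G;
};

// Rejects launches whose total work-item count would not fit in 32 bits,
// mirroring the 0xffffffffULL bound used in the hunks above.
inline bool FitsIn32BitDispatch(const LaunchDims& d)
{
    return d.block_size * d.grid_size * d.splits_4G <= 0xffffffffULL;
}
```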
62 changes: 49 additions & 13 deletions src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
@@ -312,12 +312,28 @@ GetFwdXdlopsNHWCConfigList()
return kernel_param_list;
}

static std::tuple<std::string, // kernel_name
size_t, // block_size
size_t, // grid_size
size_t> // splits_4G
// clang-format off
static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
GetFwdXdlopsNHWCConfigLargestTileFp32()
{
return {"fwd", "nhwc", miopenFloat, 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}};
}
static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
GetFwdXdlopsNHWCConfigLargestTileFp16()
{
return {"fwd", "nhwc", miopenHalf, 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}};
}
static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
GetFwdXdlopsNHWCConfigLargestTileBf16()
{
return {"fwd", "nhwc", miopenBFloat16, 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}};
}
// clang-format on

static std::tuple<size_t, // block_size
size_t, // grid_size
size_t> // splits_4G
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(
const ConvolutionContext& ctx,
const ProblemDescription& problem,
const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config)
{
@@ -340,12 +356,11 @@ GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(
size_t grid_size =
static_cast<size_t>(group) * integer_divide_ceil(gemm_m, config.gemm_m_per_block) *
integer_divide_ceil(gemm_n, config.gemm_n_per_block) * (1 << config.gemm_k_global_split);
std::string kernel_name = config.ToKernelName(ctx);
return std::make_tuple(kernel_name, block_size, grid_size, splits_4G);
return std::make_tuple(block_size, grid_size, splits_4G);
}

void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(
const ConvolutionContext& ctx, const ProblemDescription& problem)
const ConvolutionContext&, const ProblemDescription& problem)
{
static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
std::make_tuple(128, 128, 16),
@@ -567,8 +582,8 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(
}
}
size_t current_grid_size;
std::tie(std::ignore, std::ignore, current_grid_size, std::ignore) =
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(ctx, problem, config);
std::tie(std::ignore, current_grid_size, std::ignore) =
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, config);
size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size,
1200,
c / group,
@@ -683,6 +698,13 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(
if(problem.IsFp16() && gemm_k_global_split != 0 && vector_store != 1 && splits_4G > 1)
return false;

size_t current_block_size, current_grid_size, current_splits_4G;
std::tie(current_block_size, current_grid_size, current_splits_4G) =
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, *this);

if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
return false;

bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
(dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);

@@ -873,6 +895,19 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(
miopen::GetTypeSize(problem.in_data_type)))
return false;

{
auto largest_config = problem.IsFp32()
? GetFwdXdlopsNHWCConfigLargestTileFp32()
: (problem.IsFp16() ? GetFwdXdlopsNHWCConfigLargestTileFp16()
: GetFwdXdlopsNHWCConfigLargestTileBf16());
size_t current_block_size, current_grid_size, current_splits_4G;
std::tie(current_block_size, current_grid_size, current_splits_4G) =
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, largest_config);

if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
return false;
}

return true;
}
ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution(
@@ -883,14 +918,15 @@
ConvSolution result;
KernelInfo kernel;

std::string kernel_name;
size_t block_size;
size_t grid_size;

int splits_4G;

std::tie(kernel_name, block_size, grid_size, splits_4G) =
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(ctx, problem, config);
std::tie(block_size, grid_size, splits_4G) =
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, config);

std::string kernel_name = config.ToKernelName(ctx);

const auto required_workspace_size = GetWorkspaceSize(ctx, problem);
result.workspace_sz = required_workspace_size;
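Why does `IsApplicable` test only the "largest tile" config? `grid_size` scales as `ceil(gemm_m / gemm_m_per_block) * ceil(gemm_n / gemm_n_per_block)`, so the config with the largest per-block tile yields the smallest possible grid; if even that one overflows the 32-bit bound, no entry in the tuning list can fit. A small self-contained sketch of this lower-bound argument; the tile sizes echo the largest-tile configs above, while the problem sizes are made-up examples:

```cpp
#include <cstddef>

// integer_divide_ceil as used throughout these implicit-GEMM solvers.
constexpr std::size_t integer_divide_ceil(std::size_t a, std::size_t b)
{
    return (a + b - 1) / b;
}

int main()
{
    const std::size_t gemm_m = 1 << 22, gemm_n = 1 << 18, group = 1;

    // A smaller per-block tile produces a larger grid...
    const std::size_t grid_small_tile =
        group * integer_divide_ceil(gemm_m, 64) * integer_divide_ceil(gemm_n, 64);
    // ...so the largest tile (256x128 here) gives a lower bound on grid_size.
    const std::size_t grid_largest_tile =
        group * integer_divide_ceil(gemm_m, 256) * integer_divide_ceil(gemm_n, 128);

    // grid_largest_tile <= grid_small_tile always holds, which is why
    // IsApplicable only needs to check the largest-tile config.
    return grid_largest_tile <= grid_small_tile ? 0 : 1;
}
```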
66 changes: 51 additions & 15 deletions src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
@@ -309,12 +309,28 @@ GetWrwXdlopsNHWCConfigList()
return kernel_param_list;
}

static std::tuple<std::string, // kernel_name
size_t, // block_size
size_t, // grid_size
size_t> // occupancy
// clang-format off
static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
GetWrwXdlopsNHWCConfigLargestTileFp32()
{
return {"wrw", "nhwc", miopenFloat, 0, 0, 256, 128, 16, 32, 32, 2, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, { 1, 16, 1, 16}, { 1, 1, 1, 8}, { 1, 16, 1, 16}};
}
static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
GetWrwXdlopsNHWCConfigLargestTileFp16()
{
return {"wrw", "nhwc", miopenHalf, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, { 1, 8, 1, 32}, { 1, 4, 1, 8}, { 1, 8, 1, 32}};
}
static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
GetWrwXdlopsNHWCConfigLargestTileBf16()
{
return {"wrw", "nhwc", miopenBFloat16, 0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, { 1, 8, 1, 32}, { 1, 4, 1, 8}, { 1, 8, 1, 32}};
}
// clang-format on

static std::tuple<size_t, // block_size
size_t, // grid_size
size_t> // occupancy
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(
const ConvolutionContext& ctx,
const ProblemDescription& problem,
const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config)
{
@@ -338,9 +354,8 @@ GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(
size_t grid_size = static_cast<size_t>(group) *
integer_divide_ceil(gemm_m, config.gemm_m_per_block) *
integer_divide_ceil(gemm_n, config.gemm_n_per_block);
std::string kernel_name = config.ToKernelName(ctx);
size_t occupancy = config.ComputeKernelOccupancy();
return std::make_tuple(kernel_name, block_size, grid_size, occupancy);
size_t occupancy = config.ComputeKernelOccupancy();
return std::make_tuple(block_size, grid_size, occupancy);
}

size_t PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::ComputeKernelOccupancy() const
@@ -624,8 +639,8 @@ void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit(

size_t current_grid_size;
size_t occupancy;
std::tie(std::ignore, std::ignore, current_grid_size, occupancy) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, problem, config_list[selected_index]);
std::tie(std::ignore, current_grid_size, occupancy) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config_list[selected_index]);
bool need_k_split = current_grid_size <= non_split_gridsize;
size_t gks = ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu);
need_k_split |= gks != 0;
@@ -658,8 +673,8 @@ void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit(
{
size_t current_grid_size;
size_t occupancy;
std::tie(std::ignore, std::ignore, current_grid_size, occupancy) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, problem, config);
std::tie(std::ignore, current_grid_size, occupancy) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config);
bool need_k_split = current_grid_size <= non_split_gridsize;
size_t gks =
ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu);
@@ -787,6 +802,13 @@ bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValid(
return false;
}

size_t current_block_size, current_grid_size, current_splits_4G;
std::tie(current_block_size, current_grid_size, current_splits_4G) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, *this);

if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
return false;

return true;
}

@@ -861,6 +883,19 @@ bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(
miopen::GetTypeSize(problem.in_data_type)))
return false;

{
auto largest_config = problem.IsFp32()
? GetWrwXdlopsNHWCConfigLargestTileFp32()
: (problem.IsFp16() ? GetWrwXdlopsNHWCConfigLargestTileFp16()
: GetWrwXdlopsNHWCConfigLargestTileBf16());
size_t current_block_size, current_grid_size, current_splits_4G;
std::tie(current_block_size, current_grid_size, current_splits_4G) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, largest_config);

if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
return false;
}

return true;
}

@@ -975,12 +1010,13 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
ConvSolution result;
KernelInfo kernel;

std::string kernel_name;
size_t block_size;
size_t grid_size;

std::tie(kernel_name, block_size, grid_size, std::ignore) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, problem, config);
std::tie(block_size, grid_size, std::ignore) =
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config);

std::string kernel_name = config.ToKernelName(ctx);

const auto& hi = problem.out_height;
const auto& wi = problem.out_width;
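Across all three files the kernel-dimension helpers stop returning the kernel name; callers obtain it separately via `config.ToKernelName(ctx)`, which is also why the `ConvolutionContext` parameter disappears from the helpers. (Note that for wrw the third tuple element is the occupancy, per the comment on the return type, even though the new `IsValid` guard names it `current_splits_4G`.) A sketch of the resulting caller pattern, with example values in place of the real computation:

```cpp
#include <cstddef>
#include <tuple>

// Illustrative helper with the refactored contract: launch dimensions only,
// no kernel name and no ConvolutionContext.
static std::tuple<std::size_t, std::size_t, std::size_t> GetKernelDims()
{
    return {256, 1024, 2}; // block_size, grid_size, occupancy (example values)
}

int main()
{
    std::size_t grid_size = 0;
    // Callers needing a single element discard the others with std::ignore,
    // as HeuristicInit does after this change.
    std::tie(std::ignore, grid_size, std::ignore) = GetKernelDims();
    return grid_size == 1024 ? 0 : 1;
}
```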
6 changes: 6 additions & 0 deletions src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
@@ -33,6 +33,8 @@

#include "../composable_kernel/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"

#define WORKAROUND_SWDEV_411729 1

MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW)

namespace miopen {
@@ -82,7 +84,11 @@ bool PerformanceConvCkIgemmFwdV6r1DlopsNchw::IsValid(const ProblemDescription& p
bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx,
const ProblemDescription& problem) const
{
#if WORKAROUND_SWDEV_411729
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW{}))
#else
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW{}))
#endif
return false;
if(!ctx.use_hip_kernels)
return false;
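Under `WORKAROUND_SWDEV_411729` the gating of this solver flips from opt-out (run unless `MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW` disables it) to opt-in (run only if that variable enables it), which turns the solver off by default. A rough sketch of the two polarities; the helper below is a deliberate simplification of MIOpen's `IsEnabled`/`IsDisabled` machinery, not its actual implementation:

```cpp
#include <cstdlib>
#include <string>

// Simplified polarity demo for a single debug variable; the real env-var
// parsing in MIOpen is richer than an exact string match.
static bool EnvIs(const char* name, const std::string& value)
{
    const char* v = std::getenv(name);
    return v != nullptr && v == value;
}

// Old behaviour (opt-out): applicable unless the user sets the variable to 0.
static bool ApplicableOptOut()
{
    return !EnvIs("MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW", "0");
}

// Workaround behaviour (opt-in): applicable only if the user sets it to 1,
// i.e. the solver stays off by default until SWDEV-411729 is resolved.
static bool ApplicableOptIn()
{
    return EnvIs("MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW", "1");
}
```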
2 changes: 2 additions & 0 deletions src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -372,6 +372,8 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx,
return false;
if(!IsIndexRangeLargeEnough(problem))
return false;
if(problem.GetGroupCount() > 1)
return false;
switch(problem.conv_problem.GetInDataType())
{
case miopenHalf: return CheckCKApplicability<ck::half_t>(problem);
2 changes: 2 additions & 0 deletions src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -348,6 +348,8 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx,
return false;
if(!problem.IsLayoutNHWC())
return false;
if(problem.GetGroupCount() > 1)
return false;
switch(problem.conv_problem.GetInDataType())
{
case miopenInt8: return CheckCKApplicability<int8_t>(problem);
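The last two files add the same one-line applicability guard to the CK-backed forward and backward xdlops solvers: grouped convolutions (group count above 1) are rejected before any composable_kernel applicability check runs. The diff does not state the rationale; a guard of this shape simply keeps unsupported cases away from the CK path. A sketch, with a hypothetical `Problem` type standing in for MIOpen's `ProblemDescription`:

```cpp
// Hypothetical stand-in for the relevant slice of ProblemDescription.
struct Problem
{
    int GetGroupCount() const { return group_count; }
    int group_count = 1;
};

inline bool IsApplicableToProblem(const Problem& problem)
{
    if(problem.GetGroupCount() > 1)
        return false; // grouped convolution: bail out before the CK checks
    return true;      // remaining checks (data type, layout, ...) follow
}
```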