forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CatKernel.cpp
83 lines (72 loc) · 2.32 KB
/
CatKernel.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/native/cpu/CatKernel.h>
#include <c10/util/irange.h>
namespace at::native {
namespace {
struct InputMeta {
const void* data_ptr;
int64_t inner_size;
InputMeta(const Tensor& t, int64_t dim, int64_t inner)
: data_ptr(t.const_data_ptr()), inner_size(t.sizes()[dim] * inner) {}
};
template <typename scalar_t>
void cat_serial_kernel_impl(
const Tensor& result,
const MaterializedITensorListRef& tensors,
int64_t dim) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
dim >= 0 && dim < result.dim(),
"dim out of range in cat_serial_kernel_impl");
int64_t outer =
result.numel() / (result.sizes()[dim] * result.strides()[dim]);
scalar_t* result_data = result.data_ptr<scalar_t>();
int64_t ninputs = static_cast<int64_t>(tensors.size());
std::vector<InputMeta> inputs;
inputs.reserve(ninputs);
for (const Tensor& tensor : tensors) {
inputs.emplace_back(tensor, dim, result.strides()[dim]);
}
using Vec = vec::Vectorized<scalar_t>;
scalar_t* result_ptr = result_data;
for (const auto i : c10::irange(outer)) {
for (const auto j : c10::irange(ninputs)) {
int64_t local_inner = inputs[j].inner_size;
const scalar_t* input_ptr =
(const scalar_t*)(inputs[j].data_ptr) + i * local_inner;
int64_t d = 0;
for (; d < local_inner - (local_inner % Vec::size()); d += Vec::size()) {
Vec in_vec = Vec::loadu(input_ptr + d);
in_vec.store(result_ptr + d);
}
#if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE)
#pragma unroll
#endif
for (; d < local_inner; d++) {
result_ptr[d] = input_ptr[d];
}
result_ptr += local_inner;
}
}
}
// Type-dispatching entry point for the serial cat kernel.
//
// Selects the element type from result.scalar_type() and forwards `result`,
// `tensors` and `dim` unchanged to cat_serial_kernel_impl<scalar_t>.
// The dispatch list below covers AT_FLOATING_TYPES, kBFloat16, kHalf and
// AT_FLOAT8_TYPES.
// NOTE(review): dtypes outside that list are presumably rejected by the
// dispatch macro itself — confirm in ATen/Dispatch_v2.h.
void cat_serial_kernel(
const Tensor& result,
const MaterializedITensorListRef& tensors,
int64_t dim) {
// `scalar_t` is only available inside the lambda wrapped by AT_WRAP, so the
// call into the templated implementation has to stay inside the macro.
AT_DISPATCH_V2(
result.scalar_type(),
"cat_serial_kernel",
AT_WRAP(
[&]() { cat_serial_kernel_impl<scalar_t>(result, tensors, dim); }),
AT_EXPAND(AT_FLOATING_TYPES),
kBFloat16,
kHalf,
AT_EXPAND(AT_FLOAT8_TYPES));
}
} // anonymous namespace
// Associates cat_serial_kernel with cat_serial_stub. The kernel lives in the
// anonymous namespace above, so this registration is the only way it is
// reachable from outside this translation unit.
// NOTE(review): cat_serial_stub is presumably declared in
// ATen/native/cpu/CatKernel.h — confirm.
REGISTER_DISPATCH(cat_serial_stub, &cat_serial_kernel)
} // namespace at::native