Merge pull request #2 from sifive/myeh/invscalv_scrub2

Restore changes from sifive-blis-private#28

myeh01 authored and Aaron-Hutchinson committed Oct 12, 2023
2 parents efd65d3 + a0ec386 commit 8663e95
Showing 6 changed files with 75 additions and 137 deletions.
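The diffs shown below make two mechanical changes across the sifive_x280 level-1 kernels: the copyv, invertv, and invscalv kernels gain an early return for inputs that need no work (for invscalv the guard also skips alpha == 0 and alpha == 1), and in all four files the pointer-bump arithmetic is folded directly into the inline-asm input operand instead of going through a named `inc_t` temporary. As a minimal sketch of the strip-mined loop shape these kernels share (illustrative only: the function name is made up, the vector body is elided, and `dim_t`/`inc_t` are assumed to be BLIS's integer typedefs):

    // Sketch of the strip-mined RVV loop pattern from the diffs below.
    static void sketch_kernel(dim_t n, float* restrict x, inc_t incx) {
        if (n <= 0)                 // early return added by this commit
            return;
        incx *= sizeof(float);      // element stride -> byte stride
        size_t avl = n;             // elements still to be processed
        while (avl) {
            size_t vl;              // elements the hardware takes this pass
            __asm__ volatile("vsetvli %0, %1, e32, m8, ta, ma"
                             : "=r"(vl) : "r"(avl));
            // ... vector load / compute / store on vl elements ...
            // The byte offset is computed in C and passed straight in as an
            // operand, replacing the old `inc_t tmp = vl * incx;` temporary.
            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
            avl -= vl;
        }
    }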
12 changes: 4 additions & 8 deletions kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
@@ -85,8 +85,7 @@ void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -147,8 +146,7 @@ void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -214,8 +212,7 @@ void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -278,8 +275,7 @@ void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
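For reference, ?amaxv returns the index of the element with the largest absolute value, which is why the loops above carry an `offset`: each strip-mined chunk produces local element indices (v24) that are shifted by the number of elements already consumed before being merged into the running candidates (v16). A scalar sketch of the real-domain semantics, assuming `<math.h>` is included and the caller guarantees n > 0 (the vector kernel's NaN and complex-magnitude handling is not reproduced here):

    // Scalar sketch of samaxv: index of the first element with maximal |x[i]|.
    static dim_t sketch_samaxv(dim_t n, const float* x, inc_t incx) {
        dim_t best_i = 0;
        float best = fabsf(x[0]);
        for (dim_t i = 1; i < n; ++i) {
            float v = fabsf(x[i * incx]);
            if (v > best) {        // strict '>' keeps the earliest maximum
                best = v;
                best_i = i;
            }
        }
        return best_i;
    }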
77 changes: 19 additions & 58 deletions kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
@@ -50,6 +50,8 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const float* restrict x = x_;
     float* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -69,10 +71,8 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -93,8 +93,11 @@ void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
 void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
                                 void * restrict y_, inc_t incy, const cntx_t *cntx) {
     (void)conjx;
+    (void)cntx;
     const double* restrict x = x_;
     double* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -114,10 +117,8 @@ void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -144,6 +145,8 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const scomplex* restrict x = x_;
     scomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -164,10 +167,8 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
     } else {
@@ -189,50 +190,10 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
-        /*
-        // After some benchmarks, it looks like using vl(s)e and vs(s)e with
-        // masked instructions for conjugation is faster than using segment
-        // loads and stores. We'll use the segment load/store version for now,
-        // but I'd like to leave this code here (but commented out) for
-        // possible future use.
-        size_t avl = n;
-        // 0xA = 0b1010
-        // this masks off the real parts, so only the imaginary parts are
-        // negated
-        // this mask is large enough only for vl <= 64
-        uint64_t mask[1] = {0xAAAAAAAAAAAAAAAA};
-        __asm__("vsetivli zero, 1, e64, m1, ta, ma");
-        __asm__("vle64.v v0, (%0)" : : "r"(mask));
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));
-            if (incx == 8)
-                __asm__("vle64.v v4, (%0)" : : "r"(x));
-            else
-                __asm__("vlse64.v v4, (%0), %1" : : "r"(x), "r"(incx));
-            // set vl = VLMAX
-            __asm__ volatile("vsetvli t0, zero, e32, m4, ta, ma");
-            __asm__("vfneg.v v4, v4, v0.t");
-            __asm__ volatile ("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(avl));
-            if (incy == 8)
-                __asm__("vse64.v v4, (%0)" : : "r"(y));
-            else
-                __asm__("vsse64.v v4, (%0), %1" : : "r"(y), "r"(incy));
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
-            avl -= vl;
-        }
-        */
     }
     return;
 }
@@ -263,6 +224,8 @@ void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const dcomplex* restrict x = x_;
     dcomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -300,10 +263,8 @@ void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
     }
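The deleted comment block above documents an alternative ccopyv implementation: load the complex data as plain 64-bit units, then negate only the imaginary lanes under a 0b1010... mask with vfneg.v, instead of deinterleaving with segment loads and stores. Either way the operation is the same; a scalar sketch, assuming BLIS's scomplex struct (real/imag fields) and conj_t constants:

    // Scalar sketch of ccopyv: y := x, conjugated when requested.
    static void sketch_ccopyv(conj_t conjx, dim_t n,
                              const scomplex* x, inc_t incx,
                              scomplex* y, inc_t incy) {
        for (dim_t i = 0; i < n; ++i) {
            y[i * incy].real = x[i * incx].real;
            y[i * incy].imag = (conjx == BLIS_NO_CONJUGATE)
                                   ? x[i * incx].imag
                                   : -x[i * incx].imag;
        }
    }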
20 changes: 12 additions & 8 deletions kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
@@ -49,6 +49,8 @@ void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     float* restrict x = x_;
+    if (n <= 0)
+        return;
 
     float one = 1.f;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -68,8 +70,7 @@ void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfrdiv.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -93,6 +94,8 @@ void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     double* restrict x = x_;
+    if (n <= 0)
+        return;
 
     double one = 1.;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -112,8 +115,7 @@ void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfrdiv.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -136,6 +138,8 @@ void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     scomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     size_t avl = n;
@@ -161,8 +165,7 @@ void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfdiv.vv v4, v4, v8");
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -184,6 +187,8 @@ void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     dcomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     size_t avl = n;
@@ -209,8 +214,7 @@ void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfdiv.vv v4, v4, v8");
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
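The complex invertv variants compute an elementwise reciprocal via 1/(a + bi) = (a - bi) / (a^2 + b^2); the segment loads split x into real and imaginary vectors so the vfdiv.vv instructions can divide each component by the squared magnitude. A scalar sketch of that arithmetic (illustrative only; no scaling is done here to guard against overflow in a^2 + b^2):

    // Scalar sketch of cinvertv: x[i] = 1 / x[i], elementwise.
    static void sketch_cinvertv(dim_t n, scomplex* x, inc_t incx) {
        for (dim_t i = 0; i < n; ++i) {
            float a = x[i * incx].real;
            float b = x[i * incx].imag;
            float d = a * a + b * b;    // squared magnitude |x[i]|^2
            x[i * incx].real =  a / d;
            x[i * incx].imag = -b / d;
        }
    }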
20 changes: 12 additions & 8 deletions kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
@@ -53,6 +53,8 @@ void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const float* restrict alpha = alpha_;
     float* restrict x = x_;
+    if (n <= 0 || *alpha == 0.f || *alpha == 1.f)
+        return;
 
     float one = 1.f;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -74,8 +76,7 @@ void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmul.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -104,6 +105,8 @@ void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const double* restrict alpha = alpha_;
     double* restrict x = x_;
+    if (n <= 0 || *alpha == 0. || *alpha == 1.)
+        return;
 
     double one = 1.;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -125,8 +128,7 @@ void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmul.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -157,6 +159,8 @@ void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const scomplex* restrict alpha = alpha_;
     scomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
+        return;
 
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
     __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
@@ -188,8 +192,7 @@ void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmacc.vf v12, f1, v0");
             __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -223,6 +226,8 @@ void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const dcomplex* restrict alpha = alpha_;
     dcomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
+        return;
 
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
     __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
@@ -254,8 +259,7 @@ void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmacc.vf v12, f1, v0");
             __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
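invscalv divides every element of x by alpha, which explains the guards added above: there is nothing to do when n <= 0 or alpha == 1, and the reciprocal is undefined when alpha == 0, so the kernel returns with x untouched. For complex alpha, dividing by alpha is multiplying by conj(alpha) / |alpha|^2. A scalar sketch under those semantics (conjalpha handling omitted; dcomplex is BLIS's double-precision complex struct; the kernel's exact register-level scheme is not fully visible in this diff):

    // Scalar sketch of zinvscalv: x[i] /= alpha via one shared reciprocal.
    static void sketch_zinvscalv(dim_t n, const dcomplex* alpha,
                                 dcomplex* x, inc_t incx) {
        if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) ||
            (alpha->real == 1. && alpha->imag == 0.))
            return;                     // mirrors the guards in the diff
        double d  = alpha->real * alpha->real + alpha->imag * alpha->imag;
        double ar =  alpha->real / d;   // real part of 1/alpha
        double ai = -alpha->imag / d;   // imag part of 1/alpha
        for (dim_t i = 0; i < n; ++i) {
            double xr = x[i * incx].real;
            double xi = x[i * incx].imag;
            x[i * incx].real = xr * ar - xi * ai;
            x[i * incx].imag = xr * ai + xi * ar;
        }
    }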