# Element-wise add of two 32-bit vectors: out[i] = x[i] + y[i]
# a0 - x_ptr
# a1 - y_ptr
# a2 - out_ptr
# a3 - N
# NOTE(review): single strip only — vsetvli clamps vl to min(N, VLMAX), so this
# handles at most VLMAX elements; larger N needs a strip-mining loop (as in the
# compiler-generated add<> loops pasted further down). TODO confirm intent.
vsetvli zero, a3, e32, ta, ma
vle32.v v0, (a0)
vle32.v v1, (a1)
vadd.vv v2, v0, v1      # integer add (for f32 data this would be vfadd.vv)
vse32.v v2, (a2)
# axpy with a scalar alpha: out[i] = alpha * x[i] + y[i]
# fa0 - alpha
# a0 - x_ptr
# a1 - y_ptr
# a3 - out_ptr
# a4 - N
# NOTE(review): single strip only — assumes N <= VLMAX for e32/m1.
vsetvli zero, a4, e32, ta, ma
vle32.v v0, (a0)          # v0 = x
vle32.v v1, (a1)          # v1 = y
# Fused multiply-accumulate replaces the vfmul.vf + vfadd.vv pair: one
# instruction, one fewer vector register, and a single rounding step — this
# matches the vfmadd.vv the compiler emits for the C++ axpy further down.
vfmacc.vf v1, fa0, v0     # v1 = fa0 * v0 + v1
vse32.v v1, (a3)
# ReLU: out[i] = max(in[i], 0)
# a0 - in_ptr
# a1 - out_ptr
# a2 - N
# NOTE(review): vmax.vx compares elements as *signed* integers; if the data
# here is f32 (as in the other snippets) this should be vfmax with a float
# zero instead — confirm the intended element type.
# Single strip only — assumes N <= VLMAX for e32/m1.
vsetvli zero, a2, e32, ta, ma
vle32.v v0, (a0)
vmax.vx v1, v0, zero    # signed max against x0 (scalar 0)
vse32.v v1, (a1)
# 3D vector magnitude over an interleaved (x,y,z) f32 stream:
# out[i] = sqrt(x*x + y*y + z*z)
# a0 - in_ptr   (AoS: x,y,z,x,y,z,...)
# a1 - out_ptr
# a2 - N        (number of triples)
# NOTE(review): single strip only — assumes N <= VLMAX for e32/m1.
vsetvli t0, a2, e32, ta, ma  # rd must be a GPR — "vsetvli vl, ..." is invalid, vl is a CSR
vlseg3e32.v v0, (a0)         # segment load into v0/v1/v2 (x/y/z); was (a1), the out ptr
vfmul.vv v3, v0, v0          # acc  = x * x
vfmacc.vv v3, v1, v1         # acc += y * y
vfmacc.vv v3, v2, v2         # acc += z * z
vfsqrt.v v3, v3              # acc  = sqrt(acc)
vse32.v v3, (a1)             # store to out_ptr; was (a2), which holds N
l = [x, y, z, x, y, z, x, y, z, ...]
d = seg_stride3f32(l)
d.0 = [x, x, x, ...]
d.1 = [y, y, y, ...]
d.2 = [z, z, z, ...]
Image is stored in SOA format
# Per-channel scale of an SoA RGB image, in place: c[i] *= m[channel]
# a0 - r_ptr
# a1 - g_ptr
# a2 - b_ptr
# a3 - m_ptr   (3 consecutive f32 multipliers: r, g, b)
# a4 - N
# NOTE(review): single strip only — assumes N <= VLMAX for e32/m1.
vsetvli t0, a4, e32, ta, ma  # rd must be a GPR — "vsetvli vl, ..." is invalid, vl is a CSR
flw ft0, 0(a3)               # scalar multipliers; in a strip-mined loop these
flw ft1, 4(a3)               # loads would be hoisted out of the loop
flw ft2, 8(a3)
vle32.v v0, (a0)
vle32.v v1, (a1)
vle32.v v2, (a2)
vfmul.vf v0, v0, ft0
vfmul.vf v1, v1, ft1
vfmul.vf v2, v2, ft2
vse32.v v0, (a0)             # in-place update of each channel
vse32.v v1, (a1)
vse32.v v2, (a2)
What would be a good scheduling of these instructions (e.g., interleaving the loads with the multiplies to hide latency)?
#include <cmath>
#include <cstdint>
// newer compilers need std::size_t...
using namespace std;
// Element-wise add: out[i] = x[i] + y[i] for i in [0, size).
//
// Fix: `x` now carries __restrict__ like `y` and `out` (and like every
// pointer in axpy) — the buffers must not overlap for the no-alias promise
// to hold, and the full set of restrict qualifiers is what lets the
// auto-vectorizer emit the clean RVV loop in the disassembly below.
// Top-level restrict does not change the function type seen by callers.
template <typename T>
void add(T *__restrict__ x, T *__restrict__ y, T *__restrict__ out,
         size_t size) {
  for (size_t i = 0; i < size; i++) {
    out[i] = x[i] + y[i];
  }
}
template void add<uint32_t>(uint32_t *, uint32_t *, uint32_t *, size_t);
template void add<float>(float *, float *, float *, size_t);
template void add<double>(double *, double *, double *, size_t);
// Element-wise scale-and-add: out[i] = alpha[i] * x[i] + y[i].
// All four buffers are declared non-aliasing (__restrict__) so the compiler
// is free to vectorize; the multiply-add is kept as a single expression so
// FP contraction into an FMA remains possible, as in the original.
template <typename T>
void axpy(T *__restrict__ alpha, T *__restrict__ x, T *__restrict__ y,
          T *__restrict__ out, size_t size) {
  T *dst = out;
  const T *const stop = x + size;
  while (x != stop) {
    *dst++ = *alpha++ * *x++ + *y++;
  }
}
template void axpy<uint32_t>(uint32_t *, uint32_t *, uint32_t *, uint32_t *,
                             size_t);
template void axpy<float>(float *, float *, float *, float *, size_t);
template void axpy<double>(double *, double *, double *, double *, size_t);
// ReLU: out[i] = max(x[i], 0) for i in [0, size).
// Input and output buffers must not overlap (__restrict__), which enables
// auto-vectorization. Note the disassembly below shows only the unsigned and
// float instantiations get an RVV loop on this compiler.
template <typename T>
void relu(T *__restrict__ x, T *__restrict__ out, size_t size) {
  constexpr T kZero{};
  for (size_t n = 0; n < size; ++n) {
    out[n] = std::max(x[n], kZero);
  }
}
template void relu<uint32_t>(uint32_t *, uint32_t *, size_t);
template void relu<float>(float *, float *, size_t);
template void relu<double>(double *, double *, size_t);
riscv64-unknown-linux-gnu-g++ -std=c++20 -O2 -march=rv64gcv -c elementwise.cpp -o elementwise.o
riscv64-unknown-linux-gnu-objdump -d -C elementwise.o

elementwise.o: file format elf64-littleriscv
Disassembly of section .text._Z3addIjEvPT_S1_S1_m:
0000000000000000 <void add<unsigned int>(unsigned int*, unsigned int*, unsigned int*, unsigned long)>:
0: c295 beqz a3,24 <.L11>
0000000000000002 <.L3>:
2: 0d06f7d7 vsetvli a5,a3,e32,m1,ta,ma
6: 02056087 vle32.v v1,(a0)
a: 0205e107 vle32.v v2,(a1)
e: 00279713 slli a4,a5,0x2
12: 8e9d sub a3,a3,a5
14: 953a add a0,a0,a4
16: 95ba add a1,a1,a4
18: 021100d7 vadd.vv v1,v1,v2
1c: 020660a7 vse32.v v1,(a2)
20: 963a add a2,a2,a4
22: f2e5 bnez a3,2 <.L3>
0000000000000024 <.L11>:
24: 8082 ret
Disassembly of section .text._Z3addIfEvPT_S1_S1_m:
0000000000000000 <void add<float>(float*, float*, float*, unsigned long)>:
0: c295 beqz a3,24 <.L21>
0000000000000002 <.L14>:
2: 0d06f7d7 vsetvli a5,a3,e32,m1,ta,ma
6: 02056087 vle32.v v1,(a0)
a: 0205e107 vle32.v v2,(a1)
e: 00279713 slli a4,a5,0x2
12: 8e9d sub a3,a3,a5
14: 953a add a0,a0,a4
16: 95ba add a1,a1,a4
18: 021110d7 vfadd.vv v1,v1,v2
1c: 020660a7 vse32.v v1,(a2)
20: 963a add a2,a2,a4
22: f2e5 bnez a3,2 <.L14>
0000000000000024 <.L21>:
24: 8082 ret
Disassembly of section .text._Z3addIdEvPT_S1_S1_m:
0000000000000000 <void add<double>(double*, double*, double*, unsigned long)>:
0: c295 beqz a3,24 <.L31>
0000000000000002 <.L24>:
2: 0d86f7d7 vsetvli a5,a3,e64,m1,ta,ma
6: 02057087 vle64.v v1,(a0)
a: 0205f107 vle64.v v2,(a1)
e: 00379713 slli a4,a5,0x3
12: 8e9d sub a3,a3,a5
14: 953a add a0,a0,a4
16: 95ba add a1,a1,a4
18: 021110d7 vfadd.vv v1,v1,v2
1c: 020670a7 vse64.v v1,(a2)
20: 963a add a2,a2,a4
22: f2e5 bnez a3,2 <.L24>
0000000000000024 <.L31>:
24: 8082 ret
Disassembly of section .text._Z4axpyIjEvPT_S1_S1_S1_m:
0000000000000000 <void axpy<unsigned int>(unsigned int*, unsigned int*, unsigned int*, unsigned int*, unsigned long)>:
0: c70d beqz a4,2a <.L41>
0000000000000002 <.L34>:
2: 0d0777d7 vsetvli a5,a4,e32,m1,ta,ma
6: 02056187 vle32.v v3,(a0)
a: 0205e087 vle32.v v1,(a1)
e: 02066107 vle32.v v2,(a2)
12: 00279813 slli a6,a5,0x2
16: 8f1d sub a4,a4,a5
18: 9542 add a0,a0,a6
1a: 95c2 add a1,a1,a6
1c: 9642 add a2,a2,a6
1e: a621a0d7 vmadd.vv v1,v3,v2
22: 0206e0a7 vse32.v v1,(a3)
26: 96c2 add a3,a3,a6
28: ff69 bnez a4,2 <.L34>
000000000000002a <.L41>:
2a: 8082 ret
Disassembly of section .text._Z4axpyIfEvPT_S1_S1_S1_m:
0000000000000000 <void axpy<float>(float*, float*, float*, float*, unsigned long)>:
0: c70d beqz a4,2a <.L51>
0000000000000002 <.L44>:
2: 0d0777d7 vsetvli a5,a4,e32,m1,ta,ma
6: 02056187 vle32.v v3,(a0)
a: 0205e087 vle32.v v1,(a1)
e: 02066107 vle32.v v2,(a2)
12: 00279813 slli a6,a5,0x2
16: 8f1d sub a4,a4,a5
18: 9542 add a0,a0,a6
1a: 95c2 add a1,a1,a6
1c: 9642 add a2,a2,a6
1e: a22190d7 vfmadd.vv v1,v3,v2
22: 0206e0a7 vse32.v v1,(a3)
26: 96c2 add a3,a3,a6
28: ff69 bnez a4,2 <.L44>
000000000000002a <.L51>:
2a: 8082 ret
Disassembly of section .text._Z4axpyIdEvPT_S1_S1_S1_m:
0000000000000000 <void axpy<double>(double*, double*, double*, double*, unsigned long)>:
0: c70d beqz a4,2a <.L61>
0000000000000002 <.L54>:
2: 0d8777d7 vsetvli a5,a4,e64,m1,ta,ma
6: 02057187 vle64.v v3,(a0)
a: 0205f087 vle64.v v1,(a1)
e: 02067107 vle64.v v2,(a2)
12: 00379813 slli a6,a5,0x3
16: 8f1d sub a4,a4,a5
18: 9542 add a0,a0,a6
1a: 95c2 add a1,a1,a6
1c: 9642 add a2,a2,a6
1e: a22190d7 vfmadd.vv v1,v3,v2
22: 0206f0a7 vse64.v v1,(a3)
26: 96c2 add a3,a3,a6
28: ff69 bnez a4,2 <.L54>
000000000000002a <.L61>:
2a: 8082 ret
Disassembly of section .text._Z4reluIjEvPT_S1_m:
0000000000000000 <void relu<unsigned int>(unsigned int*, unsigned int*, unsigned long)>:
0: ce01 beqz a2,18 <.L69>
2: 060a slli a2,a2,0x2
0000000000000004 <.L64>:
4: 0c3677d7 vsetvli a5,a2,e8,m8,ta,ma
8: 02050407 vle8.v v8,(a0)
c: 8e1d sub a2,a2,a5
e: 953e add a0,a0,a5
10: 02058427 vse8.v v8,(a1)
14: 95be add a1,a1,a5
16: f67d bnez a2,4 <.L64>
0000000000000018 <.L69>:
18: 8082 ret
Disassembly of section .text._Z4reluIfEvPT_S1_m:
0000000000000000 <void relu<float>(float*, float*, unsigned long)>:
0: c61d beqz a2,2e <.L77>
2: 0d0077d7 vsetvli a5,zero,e32,m1,ta,ma
6: 5e0031d7 vmv.v.i v3,0
000000000000000a <.L72>:
a: 0d0677d7 vsetvli a5,a2,e32,m1,ta,ma
e: 02056087 vle32.v v1,(a0)
12: 5e003157 vmv.v.i v2,0
16: 00279713 slli a4,a5,0x2
1a: 8e1d sub a2,a2,a5
1c: 953a add a0,a0,a4
1e: 6e119057 vmflt.vv v0,v1,v3
22: 5c1100d7 vmerge.vvm v1,v1,v2,v0
26: 0205e0a7 vse32.v v1,(a1)
2a: 95ba add a1,a1,a4
2c: fe79 bnez a2,a <.L72>
000000000000002e <.L77>:
2e: 8082 ret
Disassembly of section .text._Z4reluIdEvPT_S1_m:
0000000000000000 <void relu<double>(double*, double*, unsigned long)>:
0: c20d beqz a2,22 <.L87>
2: f2000753 fmv.d.x fa4,zero
6: 060e slli a2,a2,0x3
8: 00c50733 add a4,a0,a2
000000000000000c <.L83>:
c: 211c fld fa5,0(a0)
e: 0521 addi a0,a0,8
10: a2e797d3 flt.d a5,fa5,fa4
14: c399 beqz a5,1a <.L82>
16: f20007d3 fmv.d.x fa5,zero
000000000000001a <.L82>:
1a: a19c fsd fa5,0(a1)
1c: 05a1 addi a1,a1,8
1e: fea717e3 bne a4,a0,c <.L83>
0000000000000022 <.L87>:
22: 8082 ret
It's curious that relu for double didn't get vectorized... I suppose the compiler's cost model determined that the scalar path gives better performance there.
Add with carry
vadd.vv v2, v2, v3 # v2 = v2 + v3
vmadc.vv v0, v1, v2 # v0 = carry-out mask of (v1 + v2) — NOTE(review): comment said carry(v2 + v3), but the operands read v1 and the *already overwritten* v2; the carry of the original v2 + v3 would need vmadc before the vadd (or the original operands) — confirm intent
vadc.vvm v2, v2, v4, v0 # v2 = v2 + v4 + carry-in bit from mask v0
Ignore a bit the above example...
Hmm, if I were to do multi-word arithmetic, it would be...
128 bit on 64 bit system
Rlow = Low0 + Low1
carry = C(Low0 + Low1)
Rhigh = High0 + High1 + carry
Then the memory layout could be...
- an array of lows and an array of highs... Efficient to work with
or
- an array of numbers where the stride per element is 2 * word_size
I must pay a cost in memory layout to get to a vectorizable form...
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: sending inst to dispatch: uid: 2 RENAMED 0 pid: 2 uopid: 0 'vsadd.vv 12,4,8'
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [4] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [4,8] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup destination register bit mask [32] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: sending inst to dispatch: uid: 2 RENAMED 0 pid: 2 uopid: 1 'vsadd.vv 13,5,9'
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [5] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [5,9] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup destination register bit mask [33] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: sending inst to dispatch: uid: 2 RENAMED 0 pid: 2 uopid: 2 'vsadd.vv 14,6,10'



