# Element-wise add of two 32-bit vectors: out[i] = x[i] + y[i]
# a0 - x_ptr
# a1 - y_ptr
# a2 - out_ptr
# a3 - N
# NOTE(review): single strip only — vsetvli clamps vl to min(N, VLMAX), so this
# handles at most VLMAX elements; larger N needs a strip-mining loop (as in the
# compiler-generated add<> loops pasted further down). TODO confirm intent.
vsetvli zero, a3, e32, ta, ma
vle32.v v0, (a0)
vle32.v v1, (a1)
vadd.vv v2, v0, v1      # integer add (for f32 data this would be vfadd.vv)
vse32.v v2, (a2)
# axpy with a scalar alpha: out[i] = alpha * x[i] + y[i]
# fa0 - alpha
# a0 - x_ptr
# a1 - y_ptr
# a3 - out_ptr
# a4 - N
# NOTE(review): single strip only — assumes N <= VLMAX for e32/m1.
vsetvli zero, a4, e32, ta, ma
vle32.v v0, (a0)          # v0 = x
vle32.v v1, (a1)          # v1 = y
# Fused multiply-accumulate replaces the vfmul.vf + vfadd.vv pair: one
# instruction, one fewer vector register, and a single rounding step — this
# matches the vfmadd.vv the compiler emits for the C++ axpy further down.
vfmacc.vf v1, fa0, v0     # v1 = fa0 * v0 + v1
vse32.v v1, (a3)
# ReLU: out[i] = max(in[i], 0)
# a0 - in_ptr
# a1 - out_ptr
# a2 - N
# NOTE(review): vmax.vx compares elements as *signed* integers; if the data
# here is f32 (as in the other snippets) this should be vfmax with a float
# zero instead — confirm the intended element type.
# Single strip only — assumes N <= VLMAX for e32/m1.
vsetvli zero, a2, e32, ta, ma
vle32.v v0, (a0)
vmax.vx v1, v0, zero    # signed max against x0 (scalar 0)
vse32.v v1, (a1)
# 3D vector magnitude over an interleaved (x,y,z) f32 stream:
# out[i] = sqrt(x*x + y*y + z*z)
# a0 - in_ptr   (AoS: x,y,z,x,y,z,...)
# a1 - out_ptr
# a2 - N        (number of triples)
# NOTE(review): single strip only — assumes N <= VLMAX for e32/m1.
vsetvli t0, a2, e32, ta, ma  # rd must be a GPR — "vsetvli vl, ..." is invalid, vl is a CSR
vlseg3e32.v v0, (a0)         # segment load into v0/v1/v2 (x/y/z); was (a1), the out ptr
vfmul.vv v3, v0, v0          # acc  = x * x
vfmacc.vv v3, v1, v1         # acc += y * y
vfmacc.vv v3, v2, v2         # acc += z * z
vfsqrt.v v3, v3              # acc  = sqrt(acc)
vse32.v v3, (a1)             # store to out_ptr; was (a2), which holds N
l = [x, y, z, x, y, z, x, y, z, ...]
d = seg_stride3f32(l)
d.0 = [x, x, x, ...]
d.1 = [y, y, y, ...]
d.2 = [z, z, z, ...]
Image is stored in SOA format
# Per-channel scale of an SoA RGB image, in place: c[i] *= m[channel]
# a0 - r_ptr
# a1 - g_ptr
# a2 - b_ptr
# a3 - m_ptr   (3 consecutive f32 multipliers: r, g, b)
# a4 - N
# NOTE(review): single strip only — assumes N <= VLMAX for e32/m1.
vsetvli t0, a4, e32, ta, ma  # rd must be a GPR — "vsetvli vl, ..." is invalid, vl is a CSR
flw ft0, 0(a3)               # scalar multipliers; in a strip-mined loop these
flw ft1, 4(a3)               # loads would be hoisted out of the loop
flw ft2, 8(a3)
vle32.v v0, (a0)
vle32.v v1, (a1)
vle32.v v2, (a2)
vfmul.vf v0, v0, ft0
vfmul.vf v1, v1, ft1
vfmul.vf v2, v2, ft2
vse32.v v0, (a0)             # in-place update of each channel
vse32.v v1, (a1)
vse32.v v2, (a2)
What would be a good scheduling of these instructions (e.g., interleaving the loads with the multiplies to hide latency)?
#include <cmath>
#include <cstdint>
// newer compilers need std::size_t...
using namespace std;
// Element-wise add: out[i] = x[i] + y[i] for i in [0, size).
//
// Fix: `x` now carries __restrict__ like `y` and `out` (and like every
// pointer in axpy) — the buffers must not overlap for the no-alias promise
// to hold, and the full set of restrict qualifiers is what lets the
// auto-vectorizer emit the clean RVV loop in the disassembly below.
// Top-level restrict does not change the function type seen by callers.
template <typename T>
void add(T *__restrict__ x, T *__restrict__ y, T *__restrict__ out,
         size_t size) {
  for (size_t i = 0; i < size; i++) {
    out[i] = x[i] + y[i];
  }
}
template void add<uint32_t>(uint32_t *, uint32_t *, uint32_t *, size_t);
template void add<float>(float *, float *, float *, size_t);
template void add<double>(double *, double *, double *, size_t);
// Element-wise scale-and-add: out[i] = alpha[i] * x[i] + y[i].
// All four buffers are declared non-aliasing (__restrict__) so the compiler
// is free to vectorize; the multiply-add is kept as a single expression so
// FP contraction into an FMA remains possible, as in the original.
template <typename T>
void axpy(T *__restrict__ alpha, T *__restrict__ x, T *__restrict__ y,
          T *__restrict__ out, size_t size) {
  T *dst = out;
  const T *const stop = x + size;
  while (x != stop) {
    *dst++ = *alpha++ * *x++ + *y++;
  }
}
template void axpy<uint32_t>(uint32_t *, uint32_t *, uint32_t *, uint32_t *,
                             size_t);
template void axpy<float>(float *, float *, float *, float *, size_t);
template void axpy<double>(double *, double *, double *, double *, size_t);
// ReLU: out[i] = max(x[i], 0) for i in [0, size).
// Input and output buffers must not overlap (__restrict__), which enables
// auto-vectorization. Note the disassembly below shows only the unsigned and
// float instantiations get an RVV loop on this compiler.
template <typename T>
void relu(T *__restrict__ x, T *__restrict__ out, size_t size) {
  constexpr T kZero{};
  for (size_t n = 0; n < size; ++n) {
    out[n] = std::max(x[n], kZero);
  }
}
template void relu<uint32_t>(uint32_t *, uint32_t *, size_t);
template void relu<float>(float *, float *, size_t);
template void relu<double>(double *, double *, size_t);
riscv64-unknown-linux-gnu-g++ -std=c++20 -O2 -march=rv64gcv -c elementwise.cpp -o elementwise.o
riscv64-unknown-linux-gnu-objdump -d -C elementwise.o

elementwise.o: file format elf64-littleriscv
Disassembly of section .text._Z3addIjEvPT_S1_S1_m:
0000000000000000 <void add<unsigned int>(unsigned int*, unsigned int*, unsigned int*, unsigned long)>:
0: c295 beqz a3,24 <.L11>
0000000000000002 <.L3>:
2: 0d06f7d7 vsetvli a5,a3,e32,m1,ta,ma
6: 02056087 vle32.v v1,(a0)
a: 0205e107 vle32.v v2,(a1)
e: 00279713 slli a4,a5,0x2
12: 8e9d sub a3,a3,a5
14: 953a add a0,a0,a4
16: 95ba add a1,a1,a4
18: 021100d7 vadd.vv v1,v1,v2
1c: 020660a7 vse32.v v1,(a2)
20: 963a add a2,a2,a4
22: f2e5 bnez a3,2 <.L3>
0000000000000024 <.L11>:
24: 8082 ret
Disassembly of section .text._Z3addIfEvPT_S1_S1_m:
0000000000000000 <void add<float>(float*, float*, float*, unsigned long)>:
0: c295 beqz a3,24 <.L21>
0000000000000002 <.L14>:
2: 0d06f7d7 vsetvli a5,a3,e32,m1,ta,ma
6: 02056087 vle32.v v1,(a0)
a: 0205e107 vle32.v v2,(a1)
e: 00279713 slli a4,a5,0x2
12: 8e9d sub a3,a3,a5
14: 953a add a0,a0,a4
16: 95ba add a1,a1,a4
18: 021110d7 vfadd.vv v1,v1,v2
1c: 020660a7 vse32.v v1,(a2)
20: 963a add a2,a2,a4
22: f2e5 bnez a3,2 <.L14>
0000000000000024 <.L21>:
24: 8082 ret
Disassembly of section .text._Z3addIdEvPT_S1_S1_m:
0000000000000000 <void add<double>(double*, double*, double*, unsigned long)>:
0: c295 beqz a3,24 <.L31>
0000000000000002 <.L24>:
2: 0d86f7d7 vsetvli a5,a3,e64,m1,ta,ma
6: 02057087 vle64.v v1,(a0)
a: 0205f107 vle64.v v2,(a1)
e: 00379713 slli a4,a5,0x3
12: 8e9d sub a3,a3,a5
14: 953a add a0,a0,a4
16: 95ba add a1,a1,a4
18: 021110d7 vfadd.vv v1,v1,v2
1c: 020670a7 vse64.v v1,(a2)
20: 963a add a2,a2,a4
22: f2e5 bnez a3,2 <.L24>
0000000000000024 <.L31>:
24: 8082 ret
Disassembly of section .text._Z4axpyIjEvPT_S1_S1_S1_m:
0000000000000000 <void axpy<unsigned int>(unsigned int*, unsigned int*, unsigned int*, unsigned int*, unsigned long)>:
0: c70d beqz a4,2a <.L41>
0000000000000002 <.L34>:
2: 0d0777d7 vsetvli a5,a4,e32,m1,ta,ma
6: 02056187 vle32.v v3,(a0)
a: 0205e087 vle32.v v1,(a1)
e: 02066107 vle32.v v2,(a2)
12: 00279813 slli a6,a5,0x2
16: 8f1d sub a4,a4,a5
18: 9542 add a0,a0,a6
1a: 95c2 add a1,a1,a6
1c: 9642 add a2,a2,a6
1e: a621a0d7 vmadd.vv v1,v3,v2
22: 0206e0a7 vse32.v v1,(a3)
26: 96c2 add a3,a3,a6
28: ff69 bnez a4,2 <.L34>
000000000000002a <.L41>:
2a: 8082 ret
Disassembly of section .text._Z4axpyIfEvPT_S1_S1_S1_m:
0000000000000000 <void axpy<float>(float*, float*, float*, float*, unsigned long)>:
0: c70d beqz a4,2a <.L51>
0000000000000002 <.L44>:
2: 0d0777d7 vsetvli a5,a4,e32,m1,ta,ma
6: 02056187 vle32.v v3,(a0)
a: 0205e087 vle32.v v1,(a1)
e: 02066107 vle32.v v2,(a2)
12: 00279813 slli a6,a5,0x2
16: 8f1d sub a4,a4,a5
18: 9542 add a0,a0,a6
1a: 95c2 add a1,a1,a6
1c: 9642 add a2,a2,a6
1e: a22190d7 vfmadd.vv v1,v3,v2
22: 0206e0a7 vse32.v v1,(a3)
26: 96c2 add a3,a3,a6
28: ff69 bnez a4,2 <.L44>
000000000000002a <.L51>:
2a: 8082 ret
Disassembly of section .text._Z4axpyIdEvPT_S1_S1_S1_m:
0000000000000000 <void axpy<double>(double*, double*, double*, double*, unsigned long)>:
0: c70d beqz a4,2a <.L61>
0000000000000002 <.L54>:
2: 0d8777d7 vsetvli a5,a4,e64,m1,ta,ma
6: 02057187 vle64.v v3,(a0)
a: 0205f087 vle64.v v1,(a1)
e: 02067107 vle64.v v2,(a2)
12: 00379813 slli a6,a5,0x3
16: 8f1d sub a4,a4,a5
18: 9542 add a0,a0,a6
1a: 95c2 add a1,a1,a6
1c: 9642 add a2,a2,a6
1e: a22190d7 vfmadd.vv v1,v3,v2
22: 0206f0a7 vse64.v v1,(a3)
26: 96c2 add a3,a3,a6
28: ff69 bnez a4,2 <.L54>
000000000000002a <.L61>:
2a: 8082 ret
Disassembly of section .text._Z4reluIjEvPT_S1_m:
0000000000000000 <void relu<unsigned int>(unsigned int*, unsigned int*, unsigned long)>:
0: ce01 beqz a2,18 <.L69>
2: 060a slli a2,a2,0x2
0000000000000004 <.L64>:
4: 0c3677d7 vsetvli a5,a2,e8,m8,ta,ma
8: 02050407 vle8.v v8,(a0)
c: 8e1d sub a2,a2,a5
e: 953e add a0,a0,a5
10: 02058427 vse8.v v8,(a1)
14: 95be add a1,a1,a5
16: f67d bnez a2,4 <.L64>
0000000000000018 <.L69>:
18: 8082 ret
Disassembly of section .text._Z4reluIfEvPT_S1_m:
0000000000000000 <void relu<float>(float*, float*, unsigned long)>:
0: c61d beqz a2,2e <.L77>
2: 0d0077d7 vsetvli a5,zero,e32,m1,ta,ma
6: 5e0031d7 vmv.v.i v3,0
000000000000000a <.L72>:
a: 0d0677d7 vsetvli a5,a2,e32,m1,ta,ma
e: 02056087 vle32.v v1,(a0)
12: 5e003157 vmv.v.i v2,0
16: 00279713 slli a4,a5,0x2
1a: 8e1d sub a2,a2,a5
1c: 953a add a0,a0,a4
1e: 6e119057 vmflt.vv v0,v1,v3
22: 5c1100d7 vmerge.vvm v1,v1,v2,v0
26: 0205e0a7 vse32.v v1,(a1)
2a: 95ba add a1,a1,a4
2c: fe79 bnez a2,a <.L72>
000000000000002e <.L77>:
2e: 8082 ret
Disassembly of section .text._Z4reluIdEvPT_S1_m:
0000000000000000 <void relu<double>(double*, double*, unsigned long)>:
0: c20d beqz a2,22 <.L87>
2: f2000753 fmv.d.x fa4,zero
6: 060e slli a2,a2,0x3
8: 00c50733 add a4,a0,a2
000000000000000c <.L83>:
c: 211c fld fa5,0(a0)
e: 0521 addi a0,a0,8
10: a2e797d3 flt.d a5,fa5,fa4
14: c399 beqz a5,1a <.L82>
16: f20007d3 fmv.d.x fa5,zero
000000000000001a <.L82>:
1a: a19c fsd fa5,0(a1)
1c: 05a1 addi a1,a1,8
1e: fea717e3 bne a4,a0,c <.L83>
0000000000000022 <.L87>:
22: 8082 ret
It's curious that relu for double didn't get vectorized... I suppose the compiler's cost model determined that the scalar path gives better performance there.
Add with carry
vadd.vv v2, v2, v3 # v2 = v2 + v3
vmadc.vv v0, v1, v2 # v0 = carry-out mask of (v1 + v2) — NOTE(review): comment said carry(v2 + v3), but the operands read v1 and the *already overwritten* v2; the carry of the original v2 + v3 would need vmadc before the vadd (or the original operands) — confirm intent
vadc.vvm v2, v2, v4, v0 # v2 = v2 + v4 + carry-in bit from mask v0
Ignore a bit the above example...
Hmm, if I were to do multi-word arithmetic, it would be...
128 bit on 64 bit system
Rlow = Low0 + Low1
carry = C(Low0 + Low1)
Rhigh = High0 + High1 + carry
Then the memory layout could be...
- an array of lows and an array of highs... Efficient to work with
or
- an array of numbers where the stride per element is 2 * word_size
I must pay a cost in memory layout to get to a vectorizable form...
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: sending inst to dispatch: uid: 2 RENAMED 0 pid: 2 uopid: 0 'vsadd.vv 12,4,8'
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [4] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [4,8] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup destination register bit mask [32] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: sending inst to dispatch: uid: 2 RENAMED 0 pid: 2 uopid: 1 'vsadd.vv 13,5,9'
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [5] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup source register bit mask [5,9] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: setup destination register bit mask [33] for 'vector' scoreboard
{0000000008 00000008 top.cpu.core0.rename info} renameInstructions_: sending inst to dispatch: uid: 2 RENAMED 0 pid: 2 uopid: 2 'vsadd.vv 14,6,10'



