I threw together a quick risc-v vectorized implementation:<p><pre><code> size_t run(char *str) {
  /*
   * Scan the NUL-terminated string `str` with RVV intrinsics and return
   * (number of 's' bytes) - (number of 'p' bytes) seen before the terminator.
   *
   * The balance is kept in a size_t, so when 'p' outnumbers 's' the result
   * wraps modulo SIZE_MAX + 1 (well-defined unsigned arithmetic); callers
   * wanting a signed balance can convert the result to a signed type.
   */
  const uint8_t *p = (const uint8_t *)str;
  size_t res = 0;

  for (;;) {
    /*
     * Request a full e8/m8 vector. The fault-only-first load reports, via
     * &vl, how many elements it actually loaded, so reading past the
     * terminator cannot trap on an unmapped page beyond the string.
     */
    size_t vl = __riscv_vsetvlmax_e8m8();
    vuint8m8_t v = __riscv_vle8ff_v_u8m8(p, &vl, vl);

    /* Index of the first NUL byte in this chunk, or -1 when there is none. */
    long end = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, '\0', vl), vl);

    /*
     * In the final chunk only the bytes in front of the terminator count.
     * They are already in v, so count them from the register instead of
     * reloading them from memory.
     */
    size_t n = (end < 0) ? vl : (size_t)end;

    res += __riscv_vcpop_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 's', n), n);
    res -= __riscv_vcpop_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 'p', n), n);

    if (end >= 0) {
      return res;
    }
    p += vl;
  }
}
</code></pre>
Here are the results for the above, the switch, and the table C implementations, run on my MangoPi MQ Pro (C906, in-order RV64GC with RVV 0.7.1 and a 128-bit vector length):<p><pre><code> switch: 0.19 Bytes/Cycle
tbl: 0.17 Bytes/Cycle
rvv: 1.57 Bytes/Cycle (dips down to 1.35 after ~30 KiB)
</code></pre>
Edit: you can reach 2/1.7 Bytes/Cycle if you make sure the pointer is page-aligned (and vl isn't larger than the page size); see comments.