Open
Description
Looks like Clang is about 1400% (?!) behind GCC on the TSVC kernel s231.
Compile this code with `-O3 -mcpu=neoverse-v2 -ffast-math`:
/* Global work arrays for the reproducer: five 1-D arrays of 32000 floats and
 * four 256x256 float matrices, declared with a 64-byte alignment attribute.
 * Only aa and bb are touched by the kernel loop in s231; a..e and cc are only
 * passed through to dummy(), and tt is not referenced by the code shown here. */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
aa[256][256],bb[256][256],cc[256][256],tt[256][256];
/* Declared but not defined in this snippet. Presumably an opaque call that
 * keeps the compiler from collapsing the repeated kernel runs -- TODO confirm. */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);
/*
 * Kernel s231 of the reproducer: for every column i of the 256x256 matrix aa,
 * compute a running sum down the rows, aa[j][i] = aa[j-1][i] + bb[j][i].
 *
 * The recurrence is carried along j (each row depends on the row above it),
 * while different columns i are independent of each other.
 *
 * The whole 256x256 sweep is repeated 100*(100000/256) times; dummy() is
 * called after each sweep with the global arrays.
 *
 * Returns 0.0f. The value is a placeholder: it exists only so that control
 * never reaches the end of this non-void function without a return statement.
 */
float s231()
{
    for (int nl = 0; nl < 100*(100000/256); nl++) {
        for (int i = 0; i < 256; ++i) {
            /* j starts at 1: row 0 is the seed of the running sum. */
            for (int j = 1; j < 256; j++) {
                aa[j][i] = aa[j - 1][i] + bb[j][i];
            }
        }
        /* Last parameter of dummy() is float, so pass a float literal. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.0f);
    }
    return 0.0f;
}
Clang's codegen:
.LBB44_3: // Parent Loop BB44_1 Depth=1
// Parent Loop BB44_2 Depth=2
// => This Inner Loop Header: Depth=3
// Scalar loop, unrolled 5x: five load / fadd / store groups per iteration.
// Consecutive accesses are 1024 bytes apart, i.e. one 256-float row of a
// float[256][256] matrix, so this loop walks down a column (the j loop).
// s0 carries the running sum from one fadd to the next, so all the fadds
// form a single serial dependency chain. s0 is initialised outside this
// fragment.
// Presumably x21 = base of bb and x20 = base of aa -- TODO confirm against
// the full listing.
add x12, x21, x10               // x12 = load base for this iteration
add x13, x20, x10               // x13 = store base for this iteration
subs x11, x11, #5               // count down by 5 (one per unrolled group); sets flags for b.ne
add x10, x10, x19               // advance byte offset; x19 presumably 5 * 1024 -- TODO confirm
ldr s1, [x12, #1024]
fadd s0, s1, s0                 // running sum += loaded element
ldr s1, [x12, #2048]
str s0, [x13, #1024]
fadd s0, s1, s0
ldr s1, [x12, #3072]
str s0, [x13, #2048]
fadd s0, s1, s0
ldr s1, [x12, #4096]
str s0, [x13, #3072]
fadd s0, s1, s0
ldr s1, [x12, #5120]
str s0, [x13, #4096]
fadd s0, s1, s0
str s0, [x13, #5120]
b.ne .LBB44_3                   // loop until the x11 counter reaches zero
vs. GCC's codegen:
.L521:
// Vector loop: each iteration handles 16 bytes (4 floats) at byte offset x0
// and stops at offset 1024, i.e. after one contiguous 256-float row.
// Both fadd inputs are reloaded from memory every iteration, so there is no
// floating-point dependency carried between iterations of this loop.
// Presumably x8 and x2 point at the aa[j-1] and bb[j] rows (order not
// determinable from this fragment) and x1 at the aa[j] row -- TODO confirm.
ldr q0, [x8, x0]                // 128-bit load of first input
ldr q1, [x2, x0]                // 128-bit load of second input
fadd v0.4s, v0.4s, v1.4s        // four single-precision adds at once
str q0, [x1, x0]                // store 4 results
add x0, x0, 16                  // next 4 floats
cmp x0, 1024                    // end of the 1024-byte row?
bne .L521
See also:
https://godbolt.org/z/jr9WKW95v
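TODO: root-cause analysis.
Starting point from the listings above: GCC's inner loop steps 16 bytes at a time along a 1024-byte row and is vectorized, whereas Clang's inner loop steps 1024 bytes at a time (down a column) and stays scalar. This suggests GCC iterates over `i` in its innermost loop while Clang keeps `j` innermost; to be confirmed.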