Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/armv8a/Makefile b/kernel/armv8a/Makefile
new file mode 100644
index 0000000..75e1faf
--- /dev/null
+++ b/kernel/armv8a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_16x4_lib4.o kernel_sgemm_12x4_lib4.o kernel_sgemm_8x8_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/armv8a/kernel_dgemm_4x4_lib4.S b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..2d43b10
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ add sp, sp, #-(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- B
+//
+// output arguments:
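+// (the 4x4 result is accumulated in v0-v7: column j of the tile lives in the
+// register pair v(2j)/v(2j+1), holding rows 0-1 and 2-3 respectively)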
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x10, #0]
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // preload
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x10, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ fmla v0.2d, v24.2d, v28.2d[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.2d, v25.2d, v28.2d[0]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.2d, v24.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x10, #64]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.2d, v26.2d, v30.2d[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.2d, v27.2d, v30.2d[0]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.2d, v26.2d, v30.2d[1]
+ fmla v3.2d, v27.2d, v30.2d[1]
+ fmla v4.2d, v26.2d, v31.2d[0]
+ fmla v5.2d, v27.2d, v31.2d[0]
+ fmla v6.2d, v26.2d, v31.2d[1]
+ fmla v7.2d, v27.2d, v31.2d[1]
+
+ // unroll 2
+ fmla v0.2d, v24.2d, v28.2d[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.2d, v25.2d, v28.2d[0]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.2d, v24.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x10, #64]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 3
+ fmla v0.2d, v26.2d, v30.2d[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.2d, v27.2d, v30.2d[0]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.2d, v26.2d, v30.2d[1]
+ fmla v3.2d, v27.2d, v30.2d[1]
+ fmla v4.2d, v26.2d, v31.2d[0]
+ fmla v5.2d, v27.2d, v31.2d[0]
+ fmla v6.2d, v26.2d, v31.2d[1]
+ fmla v7.2d, v27.2d, v31.2d[1]
+
+ cmp w8, #4
+ bgt 1b
+
+ sub x9, x9, #32
+ sub x10, x10, #32
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 1
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 2
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 3
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ sub w8, w8, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+//
+// output arguments:
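+// (v0-v7 are updated in place: accumulator <- alpha*accumulator + beta*C, with
+// C read column by column in the 4-wide panel layout)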
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.2d, v0.2d, v28.2d[0]
+ fmul v1.2d, v1.2d, v28.2d[0]
+ fmul v2.2d, v2.2d, v28.2d[0]
+ fmul v3.2d, v3.2d, v28.2d[0]
+ fmul v4.2d, v4.2d, v28.2d[0]
+ fmul v5.2d, v5.2d, v28.2d[0]
+ fmul v6.2d, v6.2d, v28.2d[0]
+ fmul v7.2d, v7.2d, v28.2d[0]
+
+ ld1 {v28.2d}, [x9]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v26.2d, v28.2d[0]
+ fmla v3.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v4.2d, v24.2d, v28.2d[0]
+ fmla v5.2d, v25.2d, v28.2d[0]
+ fmla v6.2d, v26.2d, v28.2d[0]
+ fmla v7.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .align 4
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 x3 x4 x5 x6
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
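+//
+// illustrative call, not part of the original sources (assumes A, B, C, D are
+// stored in the 4-wide panel-major "lib4" format, with A and B 4xk panels):
+//   double alpha = 1.0, beta = 1.0;
+//   kernel_dgemm_nt_4x4_lib4(k, &alpha, A, B, &beta, C, D);
+// which computes the 4x4 tile D = alpha * A * B^T + beta * C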
+
+ .align 4
+ .global kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+
+
+
+ // call inner kernel dgemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov x10, x3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x4 // beta
+ mov x10, x5 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+ bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+ // store n
+ mov x8, x6
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+ bl inner_store_4x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
diff --git a/kernel/armv8a/kernel_dgemm_8x4_lib4.S b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..314489d
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,575 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- 4*sda*sizeof(double)
+// x11 <- B
+//
+// output arguments:
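+// (the 8x4 result is accumulated as two 4x4 tiles: v0-v7 for the A panel at
+// x9, v8-v15 for the next panel at x9+x10; within each tile, column j occupies
+// a consecutive register pair holding rows 0-1 and 2-3)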
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+
+ // preload
+ ldp d24, d25, [x11], #16
+ ldp d26, d27, [x11], #16
+ ldp q16, q17, [x9], #32
+ ldp q20, q21, [x12], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 1
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ sub w8, w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ // unroll 2
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 3
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ cmp w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+
+ // unroll 0
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 1
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ sub w8, w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ // unroll 2
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 3
+// ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+// ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+// ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+// ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+// ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+// ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+// cmp w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x11, x11, #32
+ sub x12, x12, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v20.2d, v21.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x11], #32
+ fmla v0.2d, v20.2d, v28.2d[0]
+ fmla v1.2d, v21.2d, v28.2d[0]
+ fmla v2.2d, v20.2d, v28.2d[1]
+ fmla v3.2d, v21.2d, v28.2d[1]
+ fmla v4.2d, v20.2d, v29.2d[0]
+ fmla v5.2d, v21.2d, v29.2d[0]
+ fmla v6.2d, v20.2d, v29.2d[1]
+ fmla v7.2d, v21.2d, v29.2d[1]
+ ld1 {v22.2d, v23.2d}, [x12], #32
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ fmla v10.2d, v22.2d, v28.2d[1]
+ fmla v11.2d, v23.2d, v28.2d[1]
+ fmla v12.2d, v22.2d, v29.2d[0]
+ fmla v13.2d, v23.2d, v29.2d[0]
+ fmla v14.2d, v22.2d, v29.2d[1]
+ fmla v15.2d, v23.2d, v29.2d[1]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- 4*sdc*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.2d, v0.2d, v28.2d[0]
+ fmul v1.2d, v1.2d, v28.2d[0]
+ fmul v2.2d, v2.2d, v28.2d[0]
+ fmul v3.2d, v3.2d, v28.2d[0]
+ fmul v4.2d, v4.2d, v28.2d[0]
+ fmul v5.2d, v5.2d, v28.2d[0]
+ fmul v6.2d, v6.2d, v28.2d[0]
+ fmul v7.2d, v7.2d, v28.2d[0]
+ fmul v8.2d, v8.2d, v28.2d[0]
+ fmul v9.2d, v9.2d, v28.2d[0]
+ fmul v10.2d, v10.2d, v28.2d[0]
+ fmul v11.2d, v11.2d, v28.2d[0]
+ fmul v12.2d, v12.2d, v28.2d[0]
+ fmul v13.2d, v13.2d, v28.2d[0]
+ fmul v14.2d, v14.2d, v28.2d[0]
+ fmul v15.2d, v15.2d, v28.2d[0]
+
+ ld1 {v28.2d}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v26.2d, v28.2d[0]
+ fmla v3.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v4.2d, v24.2d, v28.2d[0]
+ fmla v5.2d, v25.2d, v28.2d[0]
+ fmla v6.2d, v26.2d, v28.2d[0]
+ fmla v7.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+ fmla v8.2d, v24.2d, v28.2d[0]
+ fmla v9.2d, v25.2d, v28.2d[0]
+ fmla v10.2d, v26.2d, v28.2d[0]
+ fmla v11.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+ fmla v12.2d, v24.2d, v28.2d[0]
+ fmla v13.2d, v25.2d, v28.2d[0]
+ fmla v14.2d, v26.2d, v28.2d[0]
+ fmla v15.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- 4*sdd*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .align 4
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+ st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x10], #64
+ st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_dgemm_nt_8x4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
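+//
+// note: the last two arguments (D and sdd) are passed on the stack and are
+// reloaded below at sp + STACKSIZE + 0/8 after the prologue; the panel strides
+// sda/sdc/sdd are turned into byte offsets with lsl #5 (a 4-double panel
+// column is 32 bytes)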
+
+ .align 4
+ .global kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, %function
+kernel_dgemm_nt_8x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #5 // 32*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #5 // 32*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+ bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #5 // 32*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+ bl inner_store_8x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_12x4_lib4.S b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..ab66cad
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- 4*sda*sizeof(float)
+// x11 <- B
+//
+// output arguments:
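+// (the 12x4 result is accumulated as three 4x4 tiles, one register per result
+// column: v0-v3 for the A panel at x9, v4-v7 at x9+x10, v8-v11 at x9+2*x10)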
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+ add x13, x12, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+ prfm PLDL1KEEP, [x13, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x12], #32
+ ld1 {v16.4s, v17.4s}, [x13], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+ prfm PLDL1KEEP, [x13, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x12], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ ld1 {v23.4s}, [x12], #16
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ // unroll 1
+ ld1 {v18.4s}, [x13], #16
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ ld1 {v19.4s}, [x13], #16
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v8.4s, v17.4s, v29.4s[0]
+ fmla v9.4s, v17.4s, v29.4s[1]
+ sub w8, w8, #4
+ fmla v10.4s, v17.4s, v29.4s[2]
+ fmla v11.4s, v17.4s, v29.4s[3]
+
+ // unroll 2
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+ ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v8.4s, v18.4s, v30.4s[0]
+ fmla v9.4s, v18.4s, v30.4s[1]
+ ld1 {v21.4s}, [x12], #16
+ fmla v10.4s, v18.4s, v30.4s[2]
+ fmla v11.4s, v18.4s, v30.4s[3]
+
+ // unroll 3
+ ld1 {v16.4s}, [x13], #16
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ ld1 {v17.4s}, [x13], #16
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ cmp w8, #4
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v19.4s, v31.4s[0]
+ fmla v9.4s, v19.4s, v31.4s[1]
+ fmla v10.4s, v19.4s, v31.4s[2]
+ fmla v11.4s, v19.4s, v31.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x12], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ ld1 {v23.4s}, [x12], #16
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ // unroll 1
+ ld1 {v18.4s}, [x13], #16
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ ld1 {v19.4s}, [x13], #16
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v8.4s, v17.4s, v29.4s[0]
+ fmla v9.4s, v17.4s, v29.4s[1]
+ sub w8, w8, #4
+ fmla v10.4s, v17.4s, v29.4s[2]
+ fmla v11.4s, v17.4s, v29.4s[3]
+
+ // unroll 2
+// ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+// ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+// ld1 {v20.4s}, [x12], #16
+ fmla v8.4s, v18.4s, v30.4s[0]
+ fmla v9.4s, v18.4s, v30.4s[1]
+// ld1 {v21.4s}, [x12], #16
+ fmla v10.4s, v18.4s, v30.4s[2]
+ fmla v11.4s, v18.4s, v30.4s[3]
+
+ // unroll 3
+// ld1 {v16.4s}, [x13], #16
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+// ld1 {v17.4s}, [x13], #16
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ cmp w8, #4
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v19.4s, v31.4s[0]
+ fmla v9.4s, v19.4s, v31.4s[1]
+ fmla v10.4s, v19.4s, v31.4s[2]
+ fmla v11.4s, v19.4s, v31.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x12, x12, #32
+ sub x11, x11, #32
+ sub x13, x13, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v16.4s}, [x13], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+ add x13, x12, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_12X4_LIB4
+#else
+ .align 4
+ .type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#endif
+
+ add x10, x8, x9
+ add x11, x10, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+
+ .align 4
+ .global kernel_sgemm_nt_12x4_lib4
+ .type kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+ bl inner_scale_ab_12x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+ bl inner_store_12x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_16x4_lib4.S b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
new file mode 100644
index 0000000..edc06ac
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
@@ -0,0 +1,600 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- 4*sda*sizeof(float)
+// x11 <- B
+//
+// output arguments:
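+// (the 16x4 result is accumulated as four 4x4 tiles, one register per result
+// column: v0-v3, v4-v7, v8-v11 and v12-v15 for the A panels at x9, x9+x10,
+// x9+2*x10 and x9+3*x10 respectively)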
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_16x4_lib4, %function
+inner_kernel_gemm_add_nt_16x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+ add x13, x12, x10
+ add x14, x13, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+ prfm PLDL1KEEP, [x13, #0]
+ prfm PLDL1KEEP, [x14, #0]
+
+ // preload
+ ldp s24, s25, [x11], #8
+ ldp s26, s27, [x11], #8
+ ldr q16, [x9], #16
+ ldr q17, [x12], #16
+ ldr q18, [x13], #16
+ ldr q19, [x14], #16
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+ prfm PLDL1KEEP, [x13, #32]
+ prfm PLDL1KEEP, [x14, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 1
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+ prfm PLDL1KEEP, [x13, #32]
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+ prfm PLDL1KEEP, [x14, #32]
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ // unroll 2
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 3
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+ sub w8, w8, #4
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+ cmp w8, #4
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+
+ // unroll 0
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 1
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+// prfm PLDL1KEEP, [x13, #64]
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ // unroll 2
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+// prfm PLDL1KEEP, [x14, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 3
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+// sub w8, w8, #4
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+// cmp w8, #4
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #16
+ sub x11, x11, #16
+ sub x12, x12, #16
+ sub x13, x13, #16
+ sub x14, x14, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ // TODO
+ ldp s24, s25, [x11], #8
+ ldr q16, [x9], #16
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q17, [x12], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ ldr q19, [x14], #16
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_16x4_lib4, .-inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_16X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_16x4_lib4, %function
+inner_scale_ab_16x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+ fmul v12.4s, v12.4s, v28.4s[0]
+ fmul v13.4s, v13.4s, v28.4s[0]
+ fmul v14.4s, v14.4s, v28.4s[0]
+ fmul v15.4s, v15.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+ add x13, x12, x11
+ add x14, x13, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x14], #64
+ fmla v12.4s, v24.4s, v28.4s[0]
+ fmla v13.4s, v25.4s, v28.4s[0]
+ fmla v14.4s, v26.4s, v28.4s[0]
+ fmla v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_16x4_lib4, .-inner_scale_ab_16x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_16X4_LIB4
+#else
+ .align 4
+ .type inner_store_16x4_lib4, %function
+inner_store_16x4_lib4:
+#endif
+
+ add x10, x8, x9
+ add x11, x10, x9
+ add x12, x11, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x12], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_16x4_lib4, .-inner_store_16x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_16x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+
+ .align 4
+ .global kernel_sgemm_nt_16x4_lib4
+ .type kernel_sgemm_nt_16x4_lib4, %function
+kernel_sgemm_nt_16x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB4
+#else
+ bl inner_scale_ab_16x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB4
+#else
+ bl inner_store_16x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_4x4_lib4.S b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..6d3850d
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,354 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ add sp, sp, #-(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- B
+//
+// output arguments:
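+// (the 4x4 single-precision result is accumulated one register per column:
+// columns 0..3 of the tile in v0..v3)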
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x10, #0]
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // preload
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x10, #32]
+
+ // main loop
+1:
+
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ prfm PLDL1KEEP, [x10, #64]
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ sub w8, w8, #4
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+
+ cmp w8, #4
+ bgt 1b
+
+ sub x9, x9, #32
+ sub x10, x10, #32
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ // unroll 2
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ sub w8, w8, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v24.2d}, [x9], #16
+ ld1 {v28.2d}, [x10], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+
+ ld1 {v28.2d}, [x9]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .align 4
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 x3 x4 x5 x6
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
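+//
+// illustrative call sketch (not from the library sources; k and the matrix
+// pointers are caller-supplied, all matrices packed in the 4-row panel-major
+// lib4 layout):
+//   float alpha = 1.0f, beta = 1.0f;
+//   kernel_sgemm_nt_4x4_lib4(k, &alpha, A, B, &beta, C, D);
+// computes D(0:4,0:4) = alpha * A * B^T + beta * C over k packed columns.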
+
+ .align 4
+ .global kernel_sgemm_nt_4x4_lib4
+ .type kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov x10, x3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x4 // beta
+ mov x10, x5 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+ bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+ // store n
+ mov x8, x6
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+ bl inner_store_4x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
diff --git a/kernel/armv8a/kernel_sgemm_8x4_lib4.S b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..016af72
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,433 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
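+
+// (PROLOGUE/EPILOGUE spill and restore the callee-saved SIMD registers d8-d15
+// and the general-purpose registers x18-x30, 11*16 = 176 bytes of stack)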
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+//
+// output arguments:
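+//
+// (x10 is the byte offset to the second 4-row panel of A, set to 16*sda by the
+// caller; v0-v3 accumulate the top 4x4 block and v4-v7 the bottom 4x4 block of
+// the 8x4 result)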
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x12], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.4s, v27.4s}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.4s, v31.4s}, [x11], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ ld1 {v22.4s, v23.4s}, [x12], #32
+ fmla v3.4s, v24.4s, v28.4s[3]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v20.4s, v28.4s[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ cmp w8, #4
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+ ld1 {v20.4s, v21.4s}, [x12], #32
+ fmla v3.4s, v26.4s, v30.4s[3]
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.4s, v27.4s}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.4s, v31.4s}, [x11], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ ld1 {v22.4s, v23.4s}, [x12], #32
+ fmla v3.4s, v24.4s, v28.4s[3]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v20.4s, v28.4s[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v5.4s, v20.4s, v28.4s[1]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+// cmp w8, #4
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+// ld1 {v24.4s, v25.4s}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v28.4s, v29.4s}, [x11], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+// ld1 {v20.4s, v21.4s}, [x12], #32
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v16.4s, v17.4s}, [x13], #32
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x12, x12, #32
+ sub x11, x11, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
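+//
+// (v0-v7 = alpha*v0-v7 + beta*C, the two 4-row panels of C being read from x10
+// and x10 + x11, with x11 set to 16*sdc by the caller)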
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .align 4
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
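+//
+// (sda, sdc and sdd are panel strides; the lsl #4 below converts each of them to
+// the byte distance between consecutive 4-row float panels, i.e. 16*sd bytes)
+//
+// illustrative call sketch (not from the library sources; arguments are
+// caller-supplied, matrices packed in the lib4 panel-major layout):
+//   kernel_sgemm_nt_8x4_lib4(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);
+// computes D(0:8,0:4) = alpha * A * B^T + beta * C.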
+
+ .align 4
+ .global kernel_sgemm_nt_8x4_lib4
+ .type kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+ mov w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+ bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+ bl inner_store_8x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_8x8_lib4.S b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
new file mode 100644
index 0000000..6c8c090
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
@@ -0,0 +1,565 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+// x12 <- sdb
+//
+// output arguments:
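+//
+// (v0-v7 accumulate the 8x4 block against the first 4-row panel of B at x11,
+// v8-v15 the 8x4 block against the second panel of B at x11 + x12)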
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x8_lib4, %function
+inner_kernel_gemm_add_nt_8x8_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x13, x9, x10
+ add x14, x11, x12
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x13, #0]
+ prfm PLDL1KEEP, [x14, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x13], #32
+ ld1 {v16.4s, v17.4s}, [x14], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x13, #32]
+ prfm PLDL1KEEP, [x14, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x13], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ ld1 {v23.4s}, [x13], #16
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ ld1 {v18.4s}, [x14], #16
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ ld1 {v19.4s}, [x14], #16
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ // unroll 1
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x13, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x14, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ sub w8, w8, #4
+ fmla v8.4s, v25.4s, v17.4s[0]
+ fmla v9.4s, v25.4s, v17.4s[1]
+ fmla v10.4s, v25.4s, v17.4s[2]
+ fmla v11.4s, v25.4s, v17.4s[3]
+ fmla v12.4s, v21.4s, v17.4s[0]
+ fmla v13.4s, v21.4s, v17.4s[1]
+ cmp w8, #4
+ fmla v14.4s, v21.4s, v17.4s[2]
+ fmla v15.4s, v21.4s, v17.4s[3]
+
+ // unroll 2
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+ ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+ ld1 {v20.4s}, [x13], #16
+ fmla v8.4s, v26.4s, v18.4s[0]
+ fmla v9.4s, v26.4s, v18.4s[1]
+ ld1 {v21.4s}, [x13], #16
+ fmla v10.4s, v26.4s, v18.4s[2]
+ fmla v11.4s, v26.4s, v18.4s[3]
+ ld1 {v16.4s}, [x14], #16
+ fmla v12.4s, v22.4s, v18.4s[0]
+ fmla v13.4s, v22.4s, v18.4s[1]
+ ld1 {v17.4s}, [x14], #16
+ fmla v14.4s, v22.4s, v18.4s[2]
+ fmla v15.4s, v22.4s, v18.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v27.4s, v19.4s[0]
+ fmla v9.4s, v27.4s, v19.4s[1]
+ fmla v10.4s, v27.4s, v19.4s[2]
+ fmla v11.4s, v27.4s, v19.4s[3]
+ fmla v12.4s, v23.4s, v19.4s[0]
+ fmla v13.4s, v23.4s, v19.4s[1]
+ fmla v14.4s, v23.4s, v19.4s[2]
+ fmla v15.4s, v23.4s, v19.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x13], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ ld1 {v23.4s}, [x13], #16
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ ld1 {v18.4s}, [x14], #16
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ ld1 {v19.4s}, [x14], #16
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ // unroll 1
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x13, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x14, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ sub w8, w8, #4
+ fmla v8.4s, v25.4s, v17.4s[0]
+ fmla v9.4s, v25.4s, v17.4s[1]
+ fmla v10.4s, v25.4s, v17.4s[2]
+ fmla v11.4s, v25.4s, v17.4s[3]
+ fmla v12.4s, v21.4s, v17.4s[0]
+ fmla v13.4s, v21.4s, v17.4s[1]
+ cmp w8, #4
+ fmla v14.4s, v21.4s, v17.4s[2]
+ fmla v15.4s, v21.4s, v17.4s[3]
+
+ // unroll 2
+// ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+// ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+// ld1 {v20.4s}, [x13], #16
+ fmla v8.4s, v26.4s, v18.4s[0]
+ fmla v9.4s, v26.4s, v18.4s[1]
+// ld1 {v21.4s}, [x13], #16
+ fmla v10.4s, v26.4s, v18.4s[2]
+ fmla v11.4s, v26.4s, v18.4s[3]
+// ld1 {v16.4s}, [x14], #16
+ fmla v12.4s, v22.4s, v18.4s[0]
+ fmla v13.4s, v22.4s, v18.4s[1]
+// ld1 {v17.4s}, [x14], #16
+ fmla v14.4s, v22.4s, v18.4s[2]
+ fmla v15.4s, v22.4s, v18.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v27.4s, v19.4s[0]
+ fmla v9.4s, v27.4s, v19.4s[1]
+ fmla v10.4s, v27.4s, v19.4s[2]
+ fmla v11.4s, v27.4s, v19.4s[3]
+ fmla v12.4s, v23.4s, v19.4s[0]
+ fmla v13.4s, v23.4s, v19.4s[1]
+ fmla v14.4s, v23.4s, v19.4s[2]
+ fmla v15.4s, v23.4s, v19.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x13, x13, #32
+ sub x11, x11, #32
+ sub x14, x14, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x13], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v16.4s}, [x14], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x8_lib4, %function
+inner_scale_ab_8x8_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+ fmul v12.4s, v12.4s, v28.4s[0]
+ fmul v13.4s, v13.4s, v28.4s[0]
+ fmul v14.4s, v14.4s, v28.4s[0]
+ fmul v15.4s, v15.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v12.4s, v24.4s, v28.4s[0]
+ fmla v13.4s, v25.4s, v28.4s[0]
+ fmla v14.4s, v26.4s, v28.4s[0]
+ fmla v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
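+//
+// (the 8x8 result is written as two 4-row panels, interleaving column blocks:
+// v0-v3 then v8-v11 go to the panel at x8, v4-v7 then v12-v15 to x8 + x9)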
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_LIB4
+#else
+ .align 4
+ .type inner_store_8x8_lib4, %function
+inner_store_8x8_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+ .size inner_store_8x8_lib4, .-inner_store_8x8_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 w5 x6 x7 sp+0 sp+8 sp+16
+// void kernel_sgemm_nt_8x8_lib4(int kmax, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd)
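+//
+// illustrative call sketch (not from the library sources; arguments are
+// caller-supplied, matrices packed in the lib4 panel-major layout):
+//   kernel_sgemm_nt_8x8_lib4(k, &alpha, A, sda, B, sdb, &beta, C, sdc, D, sdd);
+// computes D(0:8,0:8) = alpha * A * B^T + beta * C.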
+
+ .align 4
+ .global kernel_sgemm_nt_8x8_lib4
+ .type kernel_sgemm_nt_8x8_lib4, %function
+kernel_sgemm_nt_8x8_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+ mov w12, w5 // sdb
+ lsl w12, w12, #4 // 16*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x6 // beta
+ mov x10, x7 // C
+ ldr w11, [sp, #(STACKSIZE + 0)] // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+ bl inner_scale_ab_8x8_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 8)] // D
+ ldr w9, [sp, #(STACKSIZE + 16)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB4
+#else
+ bl inner_store_8x8_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
+