/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* BLASFEO is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* BLASFEO is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with BLASFEO; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
#define STACKSIZE 11*16
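// PROLOGUE/EPILOGUE save and restore the AAPCS64 callee-saved registers
// (d8-d15, x19-x28, frame pointer x29, link register x30) plus the platform register x18.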
#define PROLOGUE \
sub sp, sp, #(11 * 16); \
stp d8, d9, [sp, #(0 * 16)]; \
stp d10, d11, [sp, #(1 * 16)]; \
stp d12, d13, [sp, #(2 * 16)]; \
stp d14, d15, [sp, #(3 * 16)]; \
stp x18, x19, [sp, #(4 * 16)]; \
stp x20, x21, [sp, #(5 * 16)]; \
stp x22, x23, [sp, #(6 * 16)]; \
stp x24, x25, [sp, #(7 * 16)]; \
stp x26, x27, [sp, #(8 * 16)]; \
stp x28, x29, [sp, #(9 * 16)]; \
str x30, [sp, #(10 * 16)];
#define EPILOGUE \
ldp d8, d9, [sp, #(0 * 16)]; \
ldp d10, d11, [sp, #(1 * 16)]; \
ldp d12, d13, [sp, #(2 * 16)]; \
ldp d14, d15, [sp, #(3 * 16)]; \
ldp x18, x19, [sp, #(4 * 16)]; \
ldp x20, x21, [sp, #(5 * 16)]; \
ldp x22, x23, [sp, #(6 * 16)]; \
ldp x24, x25, [sp, #(7 * 16)]; \
ldp x26, x27, [sp, #(8 * 16)]; \
ldp x28, x29, [sp, #(9 * 16)]; \
ldr x30, [sp, #(10 * 16)]; \
add sp, sp, #(11 * 16);
.text
// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- 4*sda*sizeof(float)
// x11 <- B
// x12 <- 4*sdb*sizeof(float)
//
// output arguments:
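//
// note: accumulates the 8x8 product A[0:8,0:k] * B[0:8,0:k]^T into v0-v15, with A and B
// stored in 4-row panels (lib4 layout); the second panel of A is at x9 + x10, the
// second panel of B at x11 + x12. Accumulator layout (one 4-float column per register):
// v0-v3   <- rows 0-3, cols 0-3
// v4-v7   <- rows 4-7, cols 0-3
// v8-v11  <- rows 0-3, cols 4-7
// v12-v15 <- rows 4-7, cols 4-7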
#if MACRO_LEVEL>=2
.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
#else
.align 4
.type inner_kernel_gemm_add_nt_8x8_lib4, %function
inner_kernel_gemm_add_nt_8x8_lib4:
#endif
// early return
cmp w8, #0
ble 2f // return
add x13, x9, x10
add x14, x11, x12
// prefetch
prfm PLDL1KEEP, [x11, #0]
prfm PLDL1KEEP, [x9, #0]
prfm PLDL1KEEP, [x13, #0]
prfm PLDL1KEEP, [x14, #0]
// preload
ld1 {v24.4s, v25.4s}, [x9], #32
ld1 {v28.4s, v29.4s}, [x11], #32
ld1 {v20.4s, v21.4s}, [x13], #32
ld1 {v16.4s, v17.4s}, [x14], #32
cmp w8, #4
ble 0f // consider clean up loop
// prefetch
prfm PLDL1KEEP, [x11, #32]
prfm PLDL1KEEP, [x9, #32]
prfm PLDL1KEEP, [x13, #32]
prfm PLDL1KEEP, [x14, #32]
// main loop
1:
// unroll 0
ld1 {v26.4s}, [x9], #16
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v24.4s, v28.4s[1]
ld1 {v27.4s}, [x9], #16
fmla v2.4s, v24.4s, v28.4s[2]
fmla v3.4s, v24.4s, v28.4s[3]
ld1 {v30.4s}, [x11], #16
fmla v4.4s, v20.4s, v28.4s[0]
fmla v5.4s, v20.4s, v28.4s[1]
ld1 {v31.4s}, [x11], #16
fmla v6.4s, v20.4s, v28.4s[2]
fmla v7.4s, v20.4s, v28.4s[3]
ld1 {v22.4s}, [x13], #16
fmla v8.4s, v24.4s, v16.4s[0]
fmla v9.4s, v24.4s, v16.4s[1]
ld1 {v23.4s}, [x13], #16
fmla v10.4s, v24.4s, v16.4s[2]
fmla v11.4s, v24.4s, v16.4s[3]
ld1 {v18.4s}, [x14], #16
fmla v12.4s, v20.4s, v16.4s[0]
fmla v13.4s, v20.4s, v16.4s[1]
ld1 {v19.4s}, [x14], #16
fmla v14.4s, v20.4s, v16.4s[2]
fmla v15.4s, v20.4s, v16.4s[3]
// unroll 1
prfm PLDL1KEEP, [x11, #64]
fmla v0.4s, v25.4s, v29.4s[0]
fmla v1.4s, v25.4s, v29.4s[1]
prfm PLDL1KEEP, [x9, #64]
fmla v2.4s, v25.4s, v29.4s[2]
fmla v3.4s, v25.4s, v29.4s[3]
prfm PLDL1KEEP, [x13, #64]
fmla v4.4s, v21.4s, v29.4s[0]
fmla v5.4s, v21.4s, v29.4s[1]
prfm PLDL1KEEP, [x14, #64]
fmla v6.4s, v21.4s, v29.4s[2]
fmla v7.4s, v21.4s, v29.4s[3]
sub w8, w8, #4
fmla v8.4s, v25.4s, v17.4s[0]
fmla v9.4s, v25.4s, v17.4s[1]
fmla v10.4s, v25.4s, v17.4s[2]
fmla v11.4s, v25.4s, v17.4s[3]
fmla v12.4s, v21.4s, v17.4s[0]
fmla v13.4s, v21.4s, v17.4s[1]
cmp w8, #4
fmla v14.4s, v21.4s, v17.4s[2]
fmla v15.4s, v21.4s, v17.4s[3]
// unroll 2
ld1 {v24.4s}, [x9], #16
fmla v0.4s, v26.4s, v30.4s[0]
fmla v1.4s, v26.4s, v30.4s[1]
ld1 {v25.4s}, [x9], #16
fmla v2.4s, v26.4s, v30.4s[2]
fmla v3.4s, v26.4s, v30.4s[3]
ld1 {v28.4s}, [x11], #16
fmla v4.4s, v22.4s, v30.4s[0]
fmla v5.4s, v22.4s, v30.4s[1]
ld1 {v29.4s}, [x11], #16
fmla v6.4s, v22.4s, v30.4s[2]
fmla v7.4s, v22.4s, v30.4s[3]
ld1 {v20.4s}, [x13], #16
fmla v8.4s, v26.4s, v18.4s[0]
fmla v9.4s, v26.4s, v18.4s[1]
ld1 {v21.4s}, [x13], #16
fmla v10.4s, v26.4s, v18.4s[2]
fmla v11.4s, v26.4s, v18.4s[3]
ld1 {v16.4s}, [x14], #16
fmla v12.4s, v22.4s, v18.4s[0]
fmla v13.4s, v22.4s, v18.4s[1]
ld1 {v17.4s}, [x14], #16
fmla v14.4s, v22.4s, v18.4s[2]
fmla v15.4s, v22.4s, v18.4s[3]
// unroll 3
fmla v0.4s, v27.4s, v31.4s[0]
fmla v1.4s, v27.4s, v31.4s[1]
fmla v2.4s, v27.4s, v31.4s[2]
fmla v3.4s, v27.4s, v31.4s[3]
fmla v4.4s, v23.4s, v31.4s[0]
fmla v5.4s, v23.4s, v31.4s[1]
fmla v6.4s, v23.4s, v31.4s[2]
fmla v7.4s, v23.4s, v31.4s[3]
fmla v8.4s, v27.4s, v19.4s[0]
fmla v9.4s, v27.4s, v19.4s[1]
fmla v10.4s, v27.4s, v19.4s[2]
fmla v11.4s, v27.4s, v19.4s[3]
fmla v12.4s, v23.4s, v19.4s[0]
fmla v13.4s, v23.4s, v19.4s[1]
fmla v14.4s, v23.4s, v19.4s[2]
fmla v15.4s, v23.4s, v19.4s[3]
bgt 1b
0:
cmp w8, #3
ble 4f
// unroll 0
ld1 {v26.4s}, [x9], #16
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v24.4s, v28.4s[1]
ld1 {v27.4s}, [x9], #16
fmla v2.4s, v24.4s, v28.4s[2]
fmla v3.4s, v24.4s, v28.4s[3]
ld1 {v30.4s}, [x11], #16
fmla v4.4s, v20.4s, v28.4s[0]
fmla v5.4s, v20.4s, v28.4s[1]
ld1 {v31.4s}, [x11], #16
fmla v6.4s, v20.4s, v28.4s[2]
fmla v7.4s, v20.4s, v28.4s[3]
ld1 {v22.4s}, [x13], #16
fmla v8.4s, v24.4s, v16.4s[0]
fmla v9.4s, v24.4s, v16.4s[1]
ld1 {v23.4s}, [x13], #16
fmla v10.4s, v24.4s, v16.4s[2]
fmla v11.4s, v24.4s, v16.4s[3]
ld1 {v18.4s}, [x14], #16
fmla v12.4s, v20.4s, v16.4s[0]
fmla v13.4s, v20.4s, v16.4s[1]
ld1 {v19.4s}, [x14], #16
fmla v14.4s, v20.4s, v16.4s[2]
fmla v15.4s, v20.4s, v16.4s[3]
// unroll 1
// prfm PLDL1KEEP, [x11, #64]
fmla v0.4s, v25.4s, v29.4s[0]
fmla v1.4s, v25.4s, v29.4s[1]
// prfm PLDL1KEEP, [x9, #64]
fmla v2.4s, v25.4s, v29.4s[2]
fmla v3.4s, v25.4s, v29.4s[3]
// prfm PLDL1KEEP, [x13, #64]
fmla v4.4s, v21.4s, v29.4s[0]
fmla v5.4s, v21.4s, v29.4s[1]
// prfm PLDL1KEEP, [x14, #64]
fmla v6.4s, v21.4s, v29.4s[2]
fmla v7.4s, v21.4s, v29.4s[3]
sub w8, w8, #4
fmla v8.4s, v25.4s, v17.4s[0]
fmla v9.4s, v25.4s, v17.4s[1]
fmla v10.4s, v25.4s, v17.4s[2]
fmla v11.4s, v25.4s, v17.4s[3]
fmla v12.4s, v21.4s, v17.4s[0]
fmla v13.4s, v21.4s, v17.4s[1]
cmp w8, #4
fmla v14.4s, v21.4s, v17.4s[2]
fmla v15.4s, v21.4s, v17.4s[3]
// unroll 2
// ld1 {v24.4s}, [x9], #16
fmla v0.4s, v26.4s, v30.4s[0]
fmla v1.4s, v26.4s, v30.4s[1]
// ld1 {v25.4s}, [x9], #16
fmla v2.4s, v26.4s, v30.4s[2]
fmla v3.4s, v26.4s, v30.4s[3]
// ld1 {v28.4s}, [x11], #16
fmla v4.4s, v22.4s, v30.4s[0]
fmla v5.4s, v22.4s, v30.4s[1]
// ld1 {v29.4s}, [x11], #16
fmla v6.4s, v22.4s, v30.4s[2]
fmla v7.4s, v22.4s, v30.4s[3]
// ld1 {v20.4s}, [x13], #16
fmla v8.4s, v26.4s, v18.4s[0]
fmla v9.4s, v26.4s, v18.4s[1]
// ld1 {v21.4s}, [x13], #16
fmla v10.4s, v26.4s, v18.4s[2]
fmla v11.4s, v26.4s, v18.4s[3]
// ld1 {v16.4s}, [x14], #16
fmla v12.4s, v22.4s, v18.4s[0]
fmla v13.4s, v22.4s, v18.4s[1]
// ld1 {v17.4s}, [x14], #16
fmla v14.4s, v22.4s, v18.4s[2]
fmla v15.4s, v22.4s, v18.4s[3]
// unroll 3
fmla v0.4s, v27.4s, v31.4s[0]
fmla v1.4s, v27.4s, v31.4s[1]
fmla v2.4s, v27.4s, v31.4s[2]
fmla v3.4s, v27.4s, v31.4s[3]
fmla v4.4s, v23.4s, v31.4s[0]
fmla v5.4s, v23.4s, v31.4s[1]
fmla v6.4s, v23.4s, v31.4s[2]
fmla v7.4s, v23.4s, v31.4s[3]
fmla v8.4s, v27.4s, v19.4s[0]
fmla v9.4s, v27.4s, v19.4s[1]
fmla v10.4s, v27.4s, v19.4s[2]
fmla v11.4s, v27.4s, v19.4s[3]
fmla v12.4s, v23.4s, v19.4s[0]
fmla v13.4s, v23.4s, v19.4s[1]
fmla v14.4s, v23.4s, v19.4s[2]
fmla v15.4s, v23.4s, v19.4s[3]
b 2f // return
4: // consider clean1-up loop
cmp w8, #0
ble 2f // return
sub x9, x9, #32
sub x13, x13, #32
sub x11, x11, #32
sub x14, x14, #32
3: // clean1-up loop
// unroll 0
ld1 {v28.4s}, [x11], #16
ld1 {v24.4s}, [x9], #16
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v24.4s, v28.4s[1]
fmla v2.4s, v24.4s, v28.4s[2]
fmla v3.4s, v24.4s, v28.4s[3]
ld1 {v20.4s}, [x13], #16
fmla v4.4s, v20.4s, v28.4s[0]
fmla v5.4s, v20.4s, v28.4s[1]
fmla v6.4s, v20.4s, v28.4s[2]
fmla v7.4s, v20.4s, v28.4s[3]
ld1 {v16.4s}, [x14], #16
fmla v8.4s, v24.4s, v16.4s[0]
fmla v9.4s, v24.4s, v16.4s[1]
fmla v10.4s, v24.4s, v16.4s[2]
fmla v11.4s, v24.4s, v16.4s[3]
fmla v12.4s, v20.4s, v16.4s[0]
fmla v13.4s, v20.4s, v16.4s[1]
fmla v14.4s, v20.4s, v16.4s[2]
fmla v15.4s, v20.4s, v16.4s[3]
sub w8, w8, #1
cmp w8, #0
bgt 3b
2: // return
#if MACRO_LEVEL>=2
.endm
#else
ret
.size inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
#endif
// subroutine
//
// input arguments:
// x8 <- alpha
// x9 <- beta
// x10 <- C
// x11 <- 4*sdc*sizeof(float)
//
// output arguments:
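//
// note: computes acc <- alpha*acc + beta*C, with C stored in two 4-row panels
// (second panel at x10 + x11); alpha and beta are passed by address and only
// lane 0 of the loaded vectors is used.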
#if MACRO_LEVEL>=1
.macro INNER_SCALE_AB_8X8_LIB4
#else
.align 4
.type inner_scale_ab_8x8_lib4, %function
inner_scale_ab_8x8_lib4:
#endif
ld1 {v28.4s}, [x8]
fmul v0.4s, v0.4s, v28.4s[0]
fmul v1.4s, v1.4s, v28.4s[0]
fmul v2.4s, v2.4s, v28.4s[0]
fmul v3.4s, v3.4s, v28.4s[0]
fmul v4.4s, v4.4s, v28.4s[0]
fmul v5.4s, v5.4s, v28.4s[0]
fmul v6.4s, v6.4s, v28.4s[0]
fmul v7.4s, v7.4s, v28.4s[0]
fmul v8.4s, v8.4s, v28.4s[0]
fmul v9.4s, v9.4s, v28.4s[0]
fmul v10.4s, v10.4s, v28.4s[0]
fmul v11.4s, v11.4s, v28.4s[0]
fmul v12.4s, v12.4s, v28.4s[0]
fmul v13.4s, v13.4s, v28.4s[0]
fmul v14.4s, v14.4s, v28.4s[0]
fmul v15.4s, v15.4s, v28.4s[0]
ld1 {v28.4s}, [x9]
add x12, x10, x11
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v25.4s, v28.4s[0]
fmla v2.4s, v26.4s, v28.4s[0]
fmla v3.4s, v27.4s, v28.4s[0]
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
fmla v4.4s, v24.4s, v28.4s[0]
fmla v5.4s, v25.4s, v28.4s[0]
fmla v6.4s, v26.4s, v28.4s[0]
fmla v7.4s, v27.4s, v28.4s[0]
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
fmla v8.4s, v24.4s, v28.4s[0]
fmla v9.4s, v25.4s, v28.4s[0]
fmla v10.4s, v26.4s, v28.4s[0]
fmla v11.4s, v27.4s, v28.4s[0]
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
fmla v12.4s, v24.4s, v28.4s[0]
fmla v13.4s, v25.4s, v28.4s[0]
fmla v14.4s, v26.4s, v28.4s[0]
fmla v15.4s, v27.4s, v28.4s[0]
#if MACRO_LEVEL>=1
.endm
#else
ret
.size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
#endif
// subroutine
//
// input arguments:
// x8 <- D
// x9 <- 4*sdd*sizeof(float)
//
// output arguments:
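//
// note: stores the 8x8 accumulator to D in lib4 panel-major form: v0-v3 and
// v8-v11 go to the first 4-row panel at x8, v4-v7 and v12-v15 to the second
// panel at x8 + x9.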
#if MACRO_LEVEL>=1
.macro INNER_STORE_8X8_LIB4
#else
.align 4
.type inner_store_8x8_lib4, %function
inner_store_8x8_lib4:
#endif
add x10, x8, x9
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
#if MACRO_LEVEL>=1
.endm
#else
ret
.size inner_store_8x8_lib4, .-inner_store_8x8_lib4
#endif
// w0 x1 x2 w3 x4 w5 x6 x7 sp+0 sp+8 sp+16
// void kernel_sgemm_nt_8x8_lib4(int kmax, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd)
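//
// illustrative usage sketch (C, assumption: standard BLASFEO lib4 panel-major storage,
// with sda/sdb/sdc/sdd the panel leading dimensions; not part of this file):
//
//   // D[0:8,0:8] = alpha * A[0:8,0:kmax] * B[0:8,0:kmax]^T + beta * C[0:8,0:8]
//   float alpha = 1.0f, beta = 1.0f;
//   kernel_sgemm_nt_8x8_lib4(kmax, &alpha, A, sda, B, sdb, &beta, C, sdc, D, sdd);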
.align 4
.global kernel_sgemm_nt_8x8_lib4
.type kernel_sgemm_nt_8x8_lib4, %function
kernel_sgemm_nt_8x8_lib4:
PROLOGUE
// TODO zero the entire 128-bit register ???
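// note: on AArch64 a write to a D register zeroes the upper 64 bits of the
// corresponding V register, so these fmov instructions already clear the full
// 128-bit accumulators v0-v15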
fmov d0, xzr
fmov d1, d0
fmov d2, d0
fmov d3, d0
fmov d4, d0
fmov d5, d0
fmov d6, d0
fmov d7, d0
fmov d8, d0
fmov d9, d0
fmov d10, d0
fmov d11, d0
fmov d12, d0
fmov d13, d0
fmov d14, d0
fmov d15, d0
// call inner kernel gemm nt
mov w8, w0 // kmax
mov x9, x2 // A
mov w10, w3 // sda
lsl w10, w10, #4 // 16*sda
mov x11, x4 // B
mov w12, w5 // sdb
lsl w12, w12, #4 // 16*sdb
#if MACRO_LEVEL>=2
INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
#else
bl inner_kernel_gemm_add_nt_8x8_lib4
#endif
// call inner scale for generic alpha and beta
mov x8, x1 // alpha
mov x9, x6 // beta
mov x10, x7 // C
ldr w11, [sp, #(STACKSIZE + 0)] // sdc
lsl w11, w11, #4 // 16*sdc
#if MACRO_LEVEL>=1
INNER_SCALE_AB_8X8_LIB4
#else
bl inner_scale_ab_8x8_lib4
#endif
// store n
ldr x8, [sp, #(STACKSIZE + 8)] // D
ldr w9, [sp, #(STACKSIZE + 16)] // sdd
lsl w9, w9, #4 // 16*sdd
#if MACRO_LEVEL>=1
INNER_STORE_8X8_LIB4
#else
bl inner_store_8x8_lib4
#endif
EPILOGUE
mov x0, #0
ret