Blame - kernel/armv8a/kernel_sgemm_8x4_lib4.S - RealtimeRoboticsGroup/test

blob: 016af72e725bdc3db18202bed92c014d04414f3d [file] [log] [blame]

Austin Schuh	9a24b37	2018-01-28 16:12:29 -0800	[diff] [blame^]	1	/**************************************************************************************************
				2	* *
				3	* This file is part of BLASFEO. *
				4	* *
				5	* BLASFEO -- BLAS For Embedded Optimization. *
				6	* Copyright (C) 2016-2017 by Gianluca Frison. *
				7	* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
				8	* All rights reserved. *
				9	* *
				10	* HPMPC is free software; you can redistribute it and/or *
				11	* modify it under the terms of the GNU Lesser General Public *
				12	* License as published by the Free Software Foundation; either *
				13	* version 2.1 of the License, or (at your option) any later version. *
				14	* *
				15	* HPMPC is distributed in the hope that it will be useful, *
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
				18	* See the GNU Lesser General Public License for more details. *
				19	* *
				20	* You should have received a copy of the GNU Lesser General Public *
				21	* License along with HPMPC; if not, write to the Free Software *
				22	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
				23	* *
				24	* Author: Gianluca Frison, giaf (at) dtu.dk *
				25	* gianluca.frison (at) imtek.uni-freiburg.de *
				26	* *
				27	**************************************************************************************************/
				28
				29	#define STACKSIZE 11*16
				30	#define PROLOGUE \
				31	sub sp, sp, #(11 * 16); \
				32	stp d8, d9, [sp, #(0 * 16)]; \
				33	stp d10, d11, [sp, #(1 * 16)]; \
				34	stp d12, d13, [sp, #(2 * 16)]; \
				35	stp d14, d15, [sp, #(3 * 16)]; \
				36	stp x18, x19, [sp, #(4 * 16)]; \
				37	stp x20, x21, [sp, #(5 * 16)]; \
				38	stp x22, x23, [sp, #(6 * 16)]; \
				39	stp x24, x25, [sp, #(7 * 16)]; \
				40	stp x26, x27, [sp, #(8 * 16)]; \
				41	stp x28, x29, [sp, #(9 * 16)]; \
				42	str x30, [sp, #(10 * 16)];
				43	#define EPILOGUE \
				44	ldp d8, d9, [sp, #(0 * 16)]; \
				45	ldp d10, d11, [sp, #(1 * 16)]; \
				46	ldp d12, d13, [sp, #(2 * 16)]; \
				47	ldp d14, d15, [sp, #(3 * 16)]; \
				48	ldp x18, x19, [sp, #(4 * 16)]; \
				49	ldp x20, x21, [sp, #(5 * 16)]; \
				50	ldp x22, x23, [sp, #(6 * 16)]; \
				51	ldp x24, x25, [sp, #(7 * 16)]; \
				52	ldp x26, x27, [sp, #(8 * 16)]; \
				53	ldp x28, x29, [sp, #(9 * 16)]; \
				54	ldr x30, [sp, #(10 * 16)]; \
				55	add sp, sp, #(11 * 16);
				56
				57
				58
				59
				60
				61	.text
				62
				63
				64
				65
				66
				67	// subroutine
				68	//
				69	// input arguments:
				70	// w8 <- k
				71	// x9 <- A
				72	// x10 <- sda
				73	// x11 <- B
				74	//
				75	// output arguments:
				76
				77	#if MACRO_LEVEL>=2
				78	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
				79	#else
				80	.align 4
				81	.type inner_kernel_gemm_add_nt_8x4_lib4, %function
				82	inner_kernel_gemm_add_nt_8x4_lib4:
				83	#endif
				84
				85	// early return
				86	cmp w8, #0
				87	ble 2f // return
				88
				89	add x12, x9, x10
				90
				91	// prefetch
				92	prfm PLDL1KEEP, [x11, #0]
				93	prfm PLDL1KEEP, [x9, #0]
				94	prfm PLDL1KEEP, [x12, #0]
				95
				96	// preload
				97	ld1 {v24.4s, v25.4s}, [x9], #32
				98	ld1 {v28.4s, v29.4s}, [x11], #32
				99	ld1 {v20.4s, v21.4s}, [x12], #32
				100
				101	cmp w8, #4
				102	ble 0f // consider clean up loop
				103
				104	// prefetch
				105	prfm PLDL1KEEP, [x11, #32]
				106	prfm PLDL1KEEP, [x9, #32]
				107	prfm PLDL1KEEP, [x12, #32]
				108
				109	// main loop
				110	1:
				111
				112	// unroll 0
				113	fmla v0.4s, v24.4s, v28.4s[0]
				114	ld1 {v26.4s, v27.4s}, [x9], #32
				115	fmla v1.4s, v24.4s, v28.4s[1]
				116	ld1 {v30.4s, v31.4s}, [x11], #32
				117	fmla v2.4s, v24.4s, v28.4s[2]
				118	ld1 {v22.4s, v23.4s}, [x12], #32
				119	fmla v3.4s, v24.4s, v28.4s[3]
				120	prfm PLDL1KEEP, [x11, #64]
				121	fmla v4.4s, v20.4s, v28.4s[0]
				122	prfm PLDL1KEEP, [x9, #64]
				123	fmla v5.4s, v20.4s, v28.4s[1]
				124	prfm PLDL1KEEP, [x12, #64]
				125	fmla v6.4s, v20.4s, v28.4s[2]
				126	fmla v7.4s, v20.4s, v28.4s[3]
				127	sub w8, w8, #4
				128
				129	// unroll 1
				130	fmla v0.4s, v25.4s, v29.4s[0]
				131	fmla v1.4s, v25.4s, v29.4s[1]
				132	fmla v2.4s, v25.4s, v29.4s[2]
				133	fmla v3.4s, v25.4s, v29.4s[3]
				134	fmla v4.4s, v21.4s, v29.4s[0]
				135	fmla v5.4s, v21.4s, v29.4s[1]
				136	fmla v6.4s, v21.4s, v29.4s[2]
				137	fmla v7.4s, v21.4s, v29.4s[3]
				138	cmp w8, #4
				139
				140	// unroll 2
				141	fmla v0.4s, v26.4s, v30.4s[0]
				142	ld1 {v24.4s, v25.4s}, [x9], #32
				143	fmla v1.4s, v26.4s, v30.4s[1]
				144	ld1 {v28.4s, v29.4s}, [x11], #32
				145	fmla v2.4s, v26.4s, v30.4s[2]
				146	ld1 {v20.4s, v21.4s}, [x12], #32
				147	fmla v3.4s, v26.4s, v30.4s[3]
				148	fmla v4.4s, v22.4s, v30.4s[0]
				149	fmla v5.4s, v22.4s, v30.4s[1]
				150	fmla v6.4s, v22.4s, v30.4s[2]
				151	fmla v7.4s, v22.4s, v30.4s[3]
				152
				153	// unroll 3
				154	fmla v0.4s, v27.4s, v31.4s[0]
				155	fmla v1.4s, v27.4s, v31.4s[1]
				156	fmla v2.4s, v27.4s, v31.4s[2]
				157	fmla v3.4s, v27.4s, v31.4s[3]
				158	fmla v4.4s, v23.4s, v31.4s[0]
				159	fmla v5.4s, v23.4s, v31.4s[1]
				160	fmla v6.4s, v23.4s, v31.4s[2]
				161	fmla v7.4s, v23.4s, v31.4s[3]
				162
				163	bgt 1b
				164
				165	0:
				166
				167	cmp w8, #3
				168	ble 4f
				169
				170	// unroll 0
				171	fmla v0.4s, v24.4s, v28.4s[0]
				172	ld1 {v26.4s, v27.4s}, [x9], #32
				173	fmla v1.4s, v24.4s, v28.4s[1]
				174	ld1 {v30.4s, v31.4s}, [x11], #32
				175	fmla v2.4s, v24.4s, v28.4s[2]
				176	ld1 {v22.4s, v23.4s}, [x12], #32
				177	fmla v3.4s, v24.4s, v28.4s[3]
				178	// prfm PLDL1KEEP, [x11, #64]
				179	fmla v4.4s, v20.4s, v28.4s[0]
				180	// prfm PLDL1KEEP, [x9, #64]
				181	fmla v5.4s, v20.4s, v28.4s[1]
				182	// prfm PLDL1KEEP, [x12, #64]
				183	fmla v6.4s, v20.4s, v28.4s[2]
				184	fmla v7.4s, v20.4s, v28.4s[3]
				185	sub w8, w8, #4
				186
				187	// unroll 1
				188	fmla v0.4s, v25.4s, v29.4s[0]
				189	fmla v1.4s, v25.4s, v29.4s[1]
				190	fmla v2.4s, v25.4s, v29.4s[2]
				191	fmla v3.4s, v25.4s, v29.4s[3]
				192	fmla v4.4s, v21.4s, v29.4s[0]
				193	fmla v5.4s, v21.4s, v29.4s[1]
				194	fmla v6.4s, v21.4s, v29.4s[2]
				195	fmla v7.4s, v21.4s, v29.4s[3]
				196	// cmp w8, #4
				197
				198	// unroll 2
				199	fmla v0.4s, v26.4s, v30.4s[0]
				200	// ld1 {v24.4s, v25.4s}, [x9], #32
				201	fmla v1.4s, v26.4s, v30.4s[1]
				202	// ld1 {v28.4s, v29.4s}, [x11], #32
				203	fmla v2.4s, v26.4s, v30.4s[2]
				204	// ld1 {v20.4s, v21.4s}, [x12], #32
				205	fmla v3.4s, v26.4s, v30.4s[3]
				206	// ld1 {v16.4s, v17.4s}, [x13], #32
				207	fmla v4.4s, v22.4s, v30.4s[0]
				208	fmla v5.4s, v22.4s, v30.4s[1]
				209	fmla v6.4s, v22.4s, v30.4s[2]
				210	fmla v7.4s, v22.4s, v30.4s[3]
				211
				212	// unroll 3
				213	fmla v0.4s, v27.4s, v31.4s[0]
				214	fmla v1.4s, v27.4s, v31.4s[1]
				215	fmla v2.4s, v27.4s, v31.4s[2]
				216	fmla v3.4s, v27.4s, v31.4s[3]
				217	fmla v4.4s, v23.4s, v31.4s[0]
				218	fmla v5.4s, v23.4s, v31.4s[1]
				219	fmla v6.4s, v23.4s, v31.4s[2]
				220	fmla v7.4s, v23.4s, v31.4s[3]
				221
				222	b 2f // return
				223
				224	4: // consider clean1-up loop
				225
				226	cmp w8, #0
				227	ble 2f // return
				228
				229	sub x9, x9, #32
				230	sub x12, x12, #32
				231	sub x11, x11, #32
				232
				233	3: // clean1-up loop
				234
				235	// unroll 0
				236
				237	ld1 {v28.4s}, [x11], #16
				238	ld1 {v24.4s}, [x9], #16
				239	fmla v0.4s, v24.4s, v28.4s[0]
				240	fmla v1.4s, v24.4s, v28.4s[1]
				241	fmla v2.4s, v24.4s, v28.4s[2]
				242	fmla v3.4s, v24.4s, v28.4s[3]
				243	ld1 {v20.4s}, [x12], #16
				244	fmla v4.4s, v20.4s, v28.4s[0]
				245	fmla v5.4s, v20.4s, v28.4s[1]
				246	fmla v6.4s, v20.4s, v28.4s[2]
				247	fmla v7.4s, v20.4s, v28.4s[3]
				248
				249	sub w8, w8, #1
				250	cmp w8, #0
				251	bgt 3b
				252
				253	2: // return
				254
				255	#if MACRO_LEVEL>=2
				256	.endm
				257	#else
				258	ret
				259
				260	.size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
				261	#endif
				262
				263
				264
				265
				266
				267	// subroutine
				268	//
				269	// input arguments:
				270	// x8 <- alpha
				271	// x9 <- beta
				272	// x10 <- C
				273	// x11 <- sdc
				274	//
				275	// output arguments:
				276
				277	#if MACRO_LEVEL>=2
				278	.macro INNER_SCALE_AB_8X4_LIB4
				279	#else
				280	.align 4
				281	.type inner_scale_ab_8x4_lib4, %function
				282	inner_scale_ab_8x4_lib4:
				283	#endif
				284
				285	ld1 {v28.4s}, [x8]
				286
				287	fmul v0.4s, v0.4s, v28.4s[0]
				288	fmul v1.4s, v1.4s, v28.4s[0]
				289	fmul v2.4s, v2.4s, v28.4s[0]
				290	fmul v3.4s, v3.4s, v28.4s[0]
				291	fmul v4.4s, v4.4s, v28.4s[0]
				292	fmul v5.4s, v5.4s, v28.4s[0]
				293	fmul v6.4s, v6.4s, v28.4s[0]
				294	fmul v7.4s, v7.4s, v28.4s[0]
				295
				296	ld1 {v28.4s}, [x9]
				297
				298	add x12, x10, x11
				299
				300	ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
				301	fmla v0.4s, v24.4s, v28.4s[0]
				302	fmla v1.4s, v25.4s, v28.4s[0]
				303	fmla v2.4s, v26.4s, v28.4s[0]
				304	fmla v3.4s, v27.4s, v28.4s[0]
				305
				306	ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
				307	fmla v4.4s, v24.4s, v28.4s[0]
				308	fmla v5.4s, v25.4s, v28.4s[0]
				309	fmla v6.4s, v26.4s, v28.4s[0]
				310	fmla v7.4s, v27.4s, v28.4s[0]
				311
				312	#if MACRO_LEVEL>=2
				313	.endm
				314	#else
				315	ret
				316
				317	.size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
				318	#endif
				319
				320
				321
				322
				323
				324	// subroutine
				325	//
				326	// input arguments:
				327	// x8 <- D
				328	// x9 <- sdd
				329	//
				330	// output arguments:
				331
				332	#if MACRO_LEVEL>=2
				333	.macro INNER_STORE_8X4_LIB4
				334	#else
				335	.align 4
				336	.type inner_store_8x4_lib4, %function
				337	inner_store_8x4_lib4:
				338	#endif
				339
				340	add x10, x8, x9
				341
				342	st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
				343	st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
				344
				345	#if MACRO_LEVEL>=2
				346	.endm
				347	#else
				348	ret
				349
				350	.size inner_store_8x4_lib4, .-inner_store_8x4_lib4
				351	#endif
				352
				353
				354
				355
				356
				357	// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
				358	// void kernel_sgemm_nt_8x4_lib4(int kmax, double alpha, double A, int sda, double B, double beta, double C, int sdc, double D, int sdd)
				359
				360	.align 4
				361	.global kernel_sgemm_nt_8x4_lib4
				362	.type kernel_sgemm_nt_8x4_lib4, %function
				363	kernel_sgemm_nt_8x4_lib4:
				364
				365
				366
				367	PROLOGUE
				368
				369
				370
				371	// TODO zero the entire 128-bit register ???
				372	fmov d0, xzr
				373	fmov d1, d0
				374	fmov d2, d0
				375	fmov d3, d0
				376	fmov d4, d0
				377	fmov d5, d0
				378	fmov d6, d0
				379	fmov d7, d0
				380
				381
				382
				383	// call inner kernel gemm nt
				384	mov w8, w0 // kmax
				385	mov x9, x2 // A
				386	mov w10, w3 // sda
				387	lsl w10, w10, #4 // 16*sda
				388	mov x11, x4 // B
				389
				390	#if MACRO_LEVEL>=2
				391	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
				392	#else
				393	bl inner_kernel_gemm_add_nt_8x4_lib4
				394	#endif
				395
				396
				397
				398	// call inner blend for generic alpha and beta
				399	mov x8, x1 // alpha
				400	mov x9, x5 // beta
				401	mov x10, x6 // C
				402	mov w11, w7 // C
				403	lsl w11, w11, #4 // 16*sdc
				404
				405	#if MACRO_LEVEL>=1
				406	INNER_SCALE_AB_8X4_LIB4
				407	#else
				408	bl inner_scale_ab_8x4_lib4
				409	#endif
				410
				411
				412
				413	// store n
				414	ldr x8, [sp, #(STACKSIZE + 0)] // D
				415	ldr w9, [sp, #(STACKSIZE + 8)] // sdd
				416	lsl w9, w9, #4 // 16*sdd
				417
				418	#if MACRO_LEVEL>=1
				419	INNER_STORE_8X4_LIB4
				420	#else
				421	bl inner_store_8x4_lib4
				422	#endif
				423
				424
				425
				426	EPILOGUE
				427
				428	mov x0, #0
				429
				430	ret
				431
				432
				433