/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* BLASFEO is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* BLASFEO is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with BLASFEO; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
#define STACKSIZE 11*16
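// PROLOGUE/EPILOGUE save and restore the AAPCS64 callee-saved registers
// (d8-d15, x19-x28, frame pointer x29, link register x30) plus the platform register x18.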
#define PROLOGUE \
sub sp, sp, #(11 * 16); \
stp d8, d9, [sp, #(0 * 16)]; \
stp d10, d11, [sp, #(1 * 16)]; \
stp d12, d13, [sp, #(2 * 16)]; \
stp d14, d15, [sp, #(3 * 16)]; \
stp x18, x19, [sp, #(4 * 16)]; \
stp x20, x21, [sp, #(5 * 16)]; \
stp x22, x23, [sp, #(6 * 16)]; \
stp x24, x25, [sp, #(7 * 16)]; \
stp x26, x27, [sp, #(8 * 16)]; \
stp x28, x29, [sp, #(9 * 16)]; \
str x30, [sp, #(10 * 16)];
#define EPILOGUE \
ldp d8, d9, [sp, #(0 * 16)]; \
ldp d10, d11, [sp, #(1 * 16)]; \
ldp d12, d13, [sp, #(2 * 16)]; \
ldp d14, d15, [sp, #(3 * 16)]; \
ldp x18, x19, [sp, #(4 * 16)]; \
ldp x20, x21, [sp, #(5 * 16)]; \
ldp x22, x23, [sp, #(6 * 16)]; \
ldp x24, x25, [sp, #(7 * 16)]; \
ldp x26, x27, [sp, #(8 * 16)]; \
ldp x28, x29, [sp, #(9 * 16)]; \
ldr x30, [sp, #(10 * 16)]; \
add sp, sp, #(11 * 16);
.text
// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- 4*sda*sizeof(float)
// x11 <- B
// x12 <- 4*sdb*sizeof(float)
//
// output arguments:
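//
// note: accumulates the 8x8 product A[0:8,0:k] * B[0:8,0:k]^T into v0-v15, with A and B
// stored in 4-row panels (lib4 layout); the second panel of A is at x9 + x10, the
// second panel of B at x11 + x12. Accumulator layout (one 4-float column per register):
// v0-v3   <- rows 0-3, cols 0-3
// v4-v7   <- rows 4-7, cols 0-3
// v8-v11  <- rows 0-3, cols 4-7
// v12-v15 <- rows 4-7, cols 4-7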
#if MACRO_LEVEL>=2
.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
#else
.align 4
.type inner_kernel_gemm_add_nt_8x8_lib4, %function
inner_kernel_gemm_add_nt_8x8_lib4:
#endif
// early return
cmp w8, #0
ble 2f // return
add x13, x9, x10
add x14, x11, x12
// prefetch
prfm PLDL1KEEP, [x11, #0]
prfm PLDL1KEEP, [x9, #0]
prfm PLDL1KEEP, [x13, #0]
prfm PLDL1KEEP, [x14, #0]
// preload
ld1 {v24.4s, v25.4s}, [x9], #32
ld1 {v28.4s, v29.4s}, [x11], #32
ld1 {v20.4s, v21.4s}, [x13], #32
ld1 {v16.4s, v17.4s}, [x14], #32
cmp w8, #4
ble 0f // consider clean up loop
// prefetch
prfm PLDL1KEEP, [x11, #32]
prfm PLDL1KEEP, [x9, #32]
prfm PLDL1KEEP, [x13, #32]
prfm PLDL1KEEP, [x14, #32]
// main loop
1:
// unroll 0
ld1 {v26.4s}, [x9], #16
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v24.4s, v28.4s[1]
ld1 {v27.4s}, [x9], #16
fmla v2.4s, v24.4s, v28.4s[2]
fmla v3.4s, v24.4s, v28.4s[3]
ld1 {v30.4s}, [x11], #16
fmla v4.4s, v20.4s, v28.4s[0]
fmla v5.4s, v20.4s, v28.4s[1]
ld1 {v31.4s}, [x11], #16
fmla v6.4s, v20.4s, v28.4s[2]
fmla v7.4s, v20.4s, v28.4s[3]
ld1 {v22.4s}, [x13], #16
fmla v8.4s, v24.4s, v16.4s[0]
fmla v9.4s, v24.4s, v16.4s[1]
ld1 {v23.4s}, [x13], #16
fmla v10.4s, v24.4s, v16.4s[2]
fmla v11.4s, v24.4s, v16.4s[3]
ld1 {v18.4s}, [x14], #16
fmla v12.4s, v20.4s, v16.4s[0]
fmla v13.4s, v20.4s, v16.4s[1]
ld1 {v19.4s}, [x14], #16
fmla v14.4s, v20.4s, v16.4s[2]
fmla v15.4s, v20.4s, v16.4s[3]
// unroll 1
prfm PLDL1KEEP, [x11, #64]
fmla v0.4s, v25.4s, v29.4s[0]
fmla v1.4s, v25.4s, v29.4s[1]
prfm PLDL1KEEP, [x9, #64]
fmla v2.4s, v25.4s, v29.4s[2]
fmla v3.4s, v25.4s, v29.4s[3]
prfm PLDL1KEEP, [x13, #64]
fmla v4.4s, v21.4s, v29.4s[0]
fmla v5.4s, v21.4s, v29.4s[1]
prfm PLDL1KEEP, [x14, #64]
fmla v6.4s, v21.4s, v29.4s[2]
fmla v7.4s, v21.4s, v29.4s[3]
sub w8, w8, #4
fmla v8.4s, v25.4s, v17.4s[0]
fmla v9.4s, v25.4s, v17.4s[1]
fmla v10.4s, v25.4s, v17.4s[2]
fmla v11.4s, v25.4s, v17.4s[3]
fmla v12.4s, v21.4s, v17.4s[0]
fmla v13.4s, v21.4s, v17.4s[1]
cmp w8, #4
fmla v14.4s, v21.4s, v17.4s[2]
fmla v15.4s, v21.4s, v17.4s[3]
// unroll 2
ld1 {v24.4s}, [x9], #16
fmla v0.4s, v26.4s, v30.4s[0]
fmla v1.4s, v26.4s, v30.4s[1]
ld1 {v25.4s}, [x9], #16
fmla v2.4s, v26.4s, v30.4s[2]
fmla v3.4s, v26.4s, v30.4s[3]
ld1 {v28.4s}, [x11], #16
fmla v4.4s, v22.4s, v30.4s[0]
fmla v5.4s, v22.4s, v30.4s[1]
ld1 {v29.4s}, [x11], #16
fmla v6.4s, v22.4s, v30.4s[2]
fmla v7.4s, v22.4s, v30.4s[3]
ld1 {v20.4s}, [x13], #16
fmla v8.4s, v26.4s, v18.4s[0]
fmla v9.4s, v26.4s, v18.4s[1]
ld1 {v21.4s}, [x13], #16
fmla v10.4s, v26.4s, v18.4s[2]
fmla v11.4s, v26.4s, v18.4s[3]
ld1 {v16.4s}, [x14], #16
fmla v12.4s, v22.4s, v18.4s[0]
fmla v13.4s, v22.4s, v18.4s[1]
ld1 {v17.4s}, [x14], #16
fmla v14.4s, v22.4s, v18.4s[2]
fmla v15.4s, v22.4s, v18.4s[3]
// unroll 3
fmla v0.4s, v27.4s, v31.4s[0]
fmla v1.4s, v27.4s, v31.4s[1]
fmla v2.4s, v27.4s, v31.4s[2]
fmla v3.4s, v27.4s, v31.4s[3]
fmla v4.4s, v23.4s, v31.4s[0]
fmla v5.4s, v23.4s, v31.4s[1]
fmla v6.4s, v23.4s, v31.4s[2]
fmla v7.4s, v23.4s, v31.4s[3]
fmla v8.4s, v27.4s, v19.4s[0]
fmla v9.4s, v27.4s, v19.4s[1]
fmla v10.4s, v27.4s, v19.4s[2]
fmla v11.4s, v27.4s, v19.4s[3]
fmla v12.4s, v23.4s, v19.4s[0]
fmla v13.4s, v23.4s, v19.4s[1]
fmla v14.4s, v23.4s, v19.4s[2]
fmla v15.4s, v23.4s, v19.4s[3]
bgt 1b
0:
cmp w8, #3
ble 4f
// unroll 0
ld1 {v26.4s}, [x9], #16
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v24.4s, v28.4s[1]
ld1 {v27.4s}, [x9], #16
fmla v2.4s, v24.4s, v28.4s[2]
fmla v3.4s, v24.4s, v28.4s[3]
ld1 {v30.4s}, [x11], #16
fmla v4.4s, v20.4s, v28.4s[0]
fmla v5.4s, v20.4s, v28.4s[1]
ld1 {v31.4s}, [x11], #16
fmla v6.4s, v20.4s, v28.4s[2]
fmla v7.4s, v20.4s, v28.4s[3]
ld1 {v22.4s}, [x13], #16
fmla v8.4s, v24.4s, v16.4s[0]
fmla v9.4s, v24.4s, v16.4s[1]
ld1 {v23.4s}, [x13], #16
fmla v10.4s, v24.4s, v16.4s[2]
fmla v11.4s, v24.4s, v16.4s[3]
ld1 {v18.4s}, [x14], #16
fmla v12.4s, v20.4s, v16.4s[0]
fmla v13.4s, v20.4s, v16.4s[1]
ld1 {v19.4s}, [x14], #16
fmla v14.4s, v20.4s, v16.4s[2]
fmla v15.4s, v20.4s, v16.4s[3]
// unroll 1
// prfm PLDL1KEEP, [x11, #64]
fmla v0.4s, v25.4s, v29.4s[0]
fmla v1.4s, v25.4s, v29.4s[1]
// prfm PLDL1KEEP, [x9, #64]
fmla v2.4s, v25.4s, v29.4s[2]
fmla v3.4s, v25.4s, v29.4s[3]
// prfm PLDL1KEEP, [x13, #64]
fmla v4.4s, v21.4s, v29.4s[0]
fmla v5.4s, v21.4s, v29.4s[1]
// prfm PLDL1KEEP, [x14, #64]
fmla v6.4s, v21.4s, v29.4s[2]
fmla v7.4s, v21.4s, v29.4s[3]
sub w8, w8, #4
fmla v8.4s, v25.4s, v17.4s[0]
fmla v9.4s, v25.4s, v17.4s[1]
fmla v10.4s, v25.4s, v17.4s[2]
fmla v11.4s, v25.4s, v17.4s[3]
fmla v12.4s, v21.4s, v17.4s[0]
fmla v13.4s, v21.4s, v17.4s[1]
cmp w8, #4
fmla v14.4s, v21.4s, v17.4s[2]
fmla v15.4s, v21.4s, v17.4s[3]
// unroll 2
// ld1 {v24.4s}, [x9], #16
fmla v0.4s, v26.4s, v30.4s[0]
fmla v1.4s, v26.4s, v30.4s[1]
// ld1 {v25.4s}, [x9], #16
fmla v2.4s, v26.4s, v30.4s[2]
fmla v3.4s, v26.4s, v30.4s[3]
// ld1 {v28.4s}, [x11], #16
fmla v4.4s, v22.4s, v30.4s[0]
fmla v5.4s, v22.4s, v30.4s[1]
// ld1 {v29.4s}, [x11], #16
fmla v6.4s, v22.4s, v30.4s[2]
fmla v7.4s, v22.4s, v30.4s[3]
// ld1 {v20.4s}, [x13], #16
fmla v8.4s, v26.4s, v18.4s[0]
fmla v9.4s, v26.4s, v18.4s[1]
// ld1 {v21.4s}, [x13], #16
fmla v10.4s, v26.4s, v18.4s[2]
fmla v11.4s, v26.4s, v18.4s[3]
// ld1 {v16.4s}, [x14], #16
fmla v12.4s, v22.4s, v18.4s[0]
fmla v13.4s, v22.4s, v18.4s[1]
// ld1 {v17.4s}, [x14], #16
fmla v14.4s, v22.4s, v18.4s[2]
fmla v15.4s, v22.4s, v18.4s[3]
// unroll 3
fmla v0.4s, v27.4s, v31.4s[0]
fmla v1.4s, v27.4s, v31.4s[1]
fmla v2.4s, v27.4s, v31.4s[2]
fmla v3.4s, v27.4s, v31.4s[3]
fmla v4.4s, v23.4s, v31.4s[0]
fmla v5.4s, v23.4s, v31.4s[1]
fmla v6.4s, v23.4s, v31.4s[2]
fmla v7.4s, v23.4s, v31.4s[3]
fmla v8.4s, v27.4s, v19.4s[0]
fmla v9.4s, v27.4s, v19.4s[1]
fmla v10.4s, v27.4s, v19.4s[2]
fmla v11.4s, v27.4s, v19.4s[3]
fmla v12.4s, v23.4s, v19.4s[0]
fmla v13.4s, v23.4s, v19.4s[1]
fmla v14.4s, v23.4s, v19.4s[2]
fmla v15.4s, v23.4s, v19.4s[3]
b 2f // return
4: // consider clean1-up loop
cmp w8, #0
ble 2f // return
sub x9, x9, #32
sub x13, x13, #32
sub x11, x11, #32
sub x14, x14, #32
3: // clean1-up loop
// unroll 0
ld1 {v28.4s}, [x11], #16
ld1 {v24.4s}, [x9], #16
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v24.4s, v28.4s[1]
fmla v2.4s, v24.4s, v28.4s[2]
fmla v3.4s, v24.4s, v28.4s[3]
ld1 {v20.4s}, [x13], #16
fmla v4.4s, v20.4s, v28.4s[0]
fmla v5.4s, v20.4s, v28.4s[1]
fmla v6.4s, v20.4s, v28.4s[2]
fmla v7.4s, v20.4s, v28.4s[3]
ld1 {v16.4s}, [x14], #16
fmla v8.4s, v24.4s, v16.4s[0]
fmla v9.4s, v24.4s, v16.4s[1]
fmla v10.4s, v24.4s, v16.4s[2]
fmla v11.4s, v24.4s, v16.4s[3]
fmla v12.4s, v20.4s, v16.4s[0]
fmla v13.4s, v20.4s, v16.4s[1]
fmla v14.4s, v20.4s, v16.4s[2]
fmla v15.4s, v20.4s, v16.4s[3]
sub w8, w8, #1
cmp w8, #0
bgt 3b
2: // return
#if MACRO_LEVEL>=2
.endm
#else
ret
.size inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
#endif
// subroutine
//
// input arguments:
// x8 <- alpha
// x9 <- beta
// x10 <- C
// x11 <- 4*sdc*sizeof(float)
//
// output arguments:
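//
// note: computes acc <- alpha*acc + beta*C, with C stored in two 4-row panels
// (second panel at x10 + x11); alpha and beta are passed by address and only
// lane 0 of the loaded vectors is used.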
#if MACRO_LEVEL>=1
.macro INNER_SCALE_AB_8X8_LIB4
#else
.align 4
.type inner_scale_ab_8x8_lib4, %function
inner_scale_ab_8x8_lib4:
#endif
ld1 {v28.4s}, [x8]
fmul v0.4s, v0.4s, v28.4s[0]
fmul v1.4s, v1.4s, v28.4s[0]
fmul v2.4s, v2.4s, v28.4s[0]
fmul v3.4s, v3.4s, v28.4s[0]
fmul v4.4s, v4.4s, v28.4s[0]
fmul v5.4s, v5.4s, v28.4s[0]
fmul v6.4s, v6.4s, v28.4s[0]
fmul v7.4s, v7.4s, v28.4s[0]
fmul v8.4s, v8.4s, v28.4s[0]
fmul v9.4s, v9.4s, v28.4s[0]
fmul v10.4s, v10.4s, v28.4s[0]
fmul v11.4s, v11.4s, v28.4s[0]
fmul v12.4s, v12.4s, v28.4s[0]
fmul v13.4s, v13.4s, v28.4s[0]
fmul v14.4s, v14.4s, v28.4s[0]
fmul v15.4s, v15.4s, v28.4s[0]
ld1 {v28.4s}, [x9]
add x12, x10, x11
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
fmla v0.4s, v24.4s, v28.4s[0]
fmla v1.4s, v25.4s, v28.4s[0]
fmla v2.4s, v26.4s, v28.4s[0]
fmla v3.4s, v27.4s, v28.4s[0]
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
fmla v4.4s, v24.4s, v28.4s[0]
fmla v5.4s, v25.4s, v28.4s[0]
fmla v6.4s, v26.4s, v28.4s[0]
fmla v7.4s, v27.4s, v28.4s[0]
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
fmla v8.4s, v24.4s, v28.4s[0]
fmla v9.4s, v25.4s, v28.4s[0]
fmla v10.4s, v26.4s, v28.4s[0]
fmla v11.4s, v27.4s, v28.4s[0]
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
fmla v12.4s, v24.4s, v28.4s[0]
fmla v13.4s, v25.4s, v28.4s[0]
fmla v14.4s, v26.4s, v28.4s[0]
fmla v15.4s, v27.4s, v28.4s[0]
#if MACRO_LEVEL>=1
.endm
#else
ret
.size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
#endif
// subroutine
//
// input arguments:
// x8 <- D
// x9 <- 4*sdd*sizeof(float)
//
// output arguments:
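//
// note: stores the 8x8 accumulator to D in lib4 panel-major form: v0-v3 and
// v8-v11 go to the first 4-row panel at x8, v4-v7 and v12-v15 to the second
// panel at x8 + x9.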
#if MACRO_LEVEL>=1
.macro INNER_STORE_8X8_LIB4
#else
.align 4
.type inner_store_8x8_lib4, %function
inner_store_8x8_lib4:
#endif
add x10, x8, x9
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
#if MACRO_LEVEL>=1
.endm
#else
ret
.size inner_store_8x8_lib4, .-inner_store_8x8_lib4
#endif
// w0 x1 x2 w3 x4 w5 x6 x7 sp+0 sp+8 sp+16
// void kernel_sgemm_nt_8x8_lib4(int kmax, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd)
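//
// illustrative usage sketch (C, assumption: standard BLASFEO lib4 panel-major storage,
// with sda/sdb/sdc/sdd the panel leading dimensions; not part of this file):
//
//   // D[0:8,0:8] = alpha * A[0:8,0:kmax] * B[0:8,0:kmax]^T + beta * C[0:8,0:8]
//   float alpha = 1.0f, beta = 1.0f;
//   kernel_sgemm_nt_8x8_lib4(kmax, &alpha, A, sda, B, sdb, &beta, C, sdc, D, sdd);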
.align 4
.global kernel_sgemm_nt_8x8_lib4
.type kernel_sgemm_nt_8x8_lib4, %function
kernel_sgemm_nt_8x8_lib4:
PROLOGUE
// TODO zero the entire 128-bit register ???
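// note: on AArch64 a write to a D register zeroes the upper 64 bits of the
// corresponding V register, so these fmov instructions already clear the full
// 128-bit accumulators v0-v15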
fmov d0, xzr
fmov d1, d0
fmov d2, d0
fmov d3, d0
fmov d4, d0
fmov d5, d0
fmov d6, d0
fmov d7, d0
fmov d8, d0
fmov d9, d0
fmov d10, d0
fmov d11, d0
fmov d12, d0
fmov d13, d0
fmov d14, d0
fmov d15, d0
// call inner kernel gemm nt
mov w8, w0 // kmax
mov x9, x2 // A
mov w10, w3 // sda
lsl w10, w10, #4 // 16*sda
mov x11, x4 // B
mov w12, w5 // sdb
lsl w12, w12, #4 // 16*sdb
#if MACRO_LEVEL>=2
INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
#else
bl inner_kernel_gemm_add_nt_8x8_lib4
#endif
// call inner scale for generic alpha and beta
mov x8, x1 // alpha
mov x9, x6 // beta
mov x10, x7 // C
ldr w11, [sp, #(STACKSIZE + 0)] // sdc
lsl w11, w11, #4 // 16*sdc
#if MACRO_LEVEL>=1
INNER_SCALE_AB_8X8_LIB4
#else
bl inner_scale_ab_8x8_lib4
#endif
// store n
ldr x8, [sp, #(STACKSIZE + 8)] // D
ldr w9, [sp, #(STACKSIZE + 16)] // sdd
lsl w9, w9, #4 // 16*sdd
#if MACRO_LEVEL>=1
INNER_STORE_8X8_LIB4
#else
bl inner_store_8x8_lib4
#endif
EPILOGUE
mov x0, #0
ret