blob: f6b0354b4fb52e042e3228e600f32b993ff867f9 [file] [log] [blame]
dnl  x86 mpn_copyi -- copy limb vector, incrementing.
2
3dnl Copyright 1999-2002 Free Software Foundation, Inc.
4
5dnl This file is part of the GNU MP Library.
6dnl
7dnl The GNU MP Library is free software; you can redistribute it and/or modify
8dnl it under the terms of either:
9dnl
10dnl * the GNU Lesser General Public License as published by the Free
11dnl Software Foundation; either version 3 of the License, or (at your
12dnl option) any later version.
13dnl
14dnl or
15dnl
16dnl * the GNU General Public License as published by the Free Software
17dnl Foundation; either version 2 of the License, or (at your option) any
18dnl later version.
19dnl
20dnl or both in parallel, as here.
21dnl
22dnl The GNU MP Library is distributed in the hope that it will be useful, but
23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25dnl for more details.
26dnl
27dnl You should have received copies of the GNU General Public License and the
28dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29dnl see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C cycles/limb startup (approx)
35C P5 1.0 35
36C P6 0.75 45
37C K6 1.0 30
38C K7 1.3 65
39C P4 1.0 120
40C
41C (Startup time includes some function call overheads.)
42
43
44C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
45C
46C Copy src,size to dst,size, working from low to high addresses.
47C
48C The code here is very generic and can be expected to be reasonable on all
49C the x86 family.
50C
51C P6 - An MMX based copy was tried, but was found to be slower than a rep
52C movs in all cases. The fastest MMX found was 0.8 cycles/limb (when
53C fully aligned). A rep movs seems to have a startup time of about 15
54C cycles, but doing something special for small sizes could lead to a
55C branch misprediction that would destroy any saving. For now a plain
56C rep movs seems ok.
57C
58C K62 - We used to have a big chunk of code doing an MMX copy at 0.56 c/l if
59C aligned or a 1.0 rep movs if not. But that seemed excessive since
60C it only got an advantage half the time, and even then only showed it
61C above 50 limbs or so.
62
C Offsets of the three arguments, listed in prototype order (dst, src, size).
defframe(PARAM_DST,  4)
defframe(PARAM_SRC,  8)
defframe(PARAM_SIZE,12)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)

	C Register roles:
	C   eax   keeps the incoming esi while the copy runs
	C   edx   keeps the incoming edi while the copy runs
	C   ecx   limb counter consumed by rep
	C   esi   source pointer
	C   edi   destination pointer
	C   ebx   not used
	C   ebp   not used

PROLOGUE(mpn_copyi)

	C Park esi and edi in eax and edx instead of pushing them.
	movl	%esi, %eax
	movl	%edi, %edx

	C Load the arguments into the registers that rep movsl operates on.
	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi
	movl	PARAM_SIZE, %ecx

	cld			C better safe than sorry, see mpn/x86/README

	C Copy ecx 32-bit words from (esi) to (edi), low to high addresses.
	rep
	movsl

	C Put back the esi and edi values that were parked on entry.
	movl	%edx, %edi
	movl	%eax, %esi

	ret

EPILOGUE()