blob: f23bacb1ef0e4ce1da2f31ed86253fab19a04332 [file] [log] [blame]
Austin Schuhdace2a62020-08-18 10:56:48 -07001/* Header for speed and threshold things.
2
3Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software
4Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21or both in parallel, as here.
22
23The GNU MP Library is distributed in the hope that it will be useful, but
24WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26for more details.
27
28You should have received copies of the GNU General Public License and the
29GNU Lesser General Public License along with the GNU MP Library. If not,
30see https://www.gnu.org/licenses/. */
31
32#ifndef __SPEED_H__
33#define __SPEED_H__
34
35
36/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
37 newsize long. */
38#define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \
39 do { \
40 ASSERT ((newsize) >= (oldsize)); \
41 MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \
42 } while (0)
43
44/* A mask of the least significant n bits. Note 1<<32 doesn't give zero on
45 x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */
46#define MP_LIMB_T_LOWBITMASK(n) \
47 ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
48
49
50/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
51
52#define TMP_ALLOC_ALIGNED(bytes, align) \
53 align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
54#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
55 ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
56
57/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
58 limit on what s->align_xp etc and then request for off-alignment. Maybe
59 this should be an option of some sort, but in any case here are some line
60 sizes,
61
62 bytes
63 32 pentium
64 64 athlon
65 64 itanium-2 L1
66 128 itanium-2 L2
67*/
68#define CACHE_LINE_SIZE 64 /* bytes */
69
70#define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)
71
72/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
73 alignment. */
74#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align) \
75 do { \
76 mp_ptr __ptr; \
77 mp_size_t __ptr_align, __ptr_add; \
78 \
79 ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0); \
80 __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK); \
81 __ptr_align = (__ptr - (mp_ptr) NULL); \
82 __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK; \
83 (ptr) = __ptr + __ptr_add; \
84 } while (0)
85
86
87/* This is the size for s->xp_block and s->yp_block, used in certain
88 routines that want to run across many different data values and use
89 s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
90
91 512 means 2kbytes of data for each of xp_block and yp_block, making 4k
92 total, which should fit easily in any L1 data cache. */
93
94#define SPEED_BLOCK_SIZE 512 /* limbs */
95
96
97extern double speed_unittime;
98extern double speed_cycletime;
99extern int speed_precision;
100extern char speed_time_string[];
101void speed_time_init (void);
102void speed_cycletime_fail (const char *str);
103void speed_cycletime_init (void);
104void speed_cycletime_need_cycles (void);
105void speed_cycletime_need_seconds (void);
106void speed_starttime (void);
107double speed_endtime (void);
108
109
110struct speed_params {
111 unsigned reps; /* how many times to run the routine */
112 mp_ptr xp; /* first argument */
113 mp_ptr yp; /* second argument */
114 mp_size_t size; /* size of both arguments */
115 mp_limb_t r; /* user supplied parameter */
116 mp_size_t align_xp; /* alignment of xp */
117 mp_size_t align_yp; /* alignment of yp */
118 mp_size_t align_wp; /* intended alignment of wp */
119 mp_size_t align_wp2; /* intended alignment of wp2 */
120 mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
121 mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
122
123 double time_divisor; /* optionally set by the speed routine */
124
125 /* used by the cache priming things */
126 int cache;
127 unsigned src_num, dst_num;
128 struct {
129 mp_ptr ptr;
130 mp_size_t size;
131 } src[5], dst[4];
132};
133
134typedef double (*speed_function_t) (struct speed_params *);
135
136double speed_measure (speed_function_t fun, struct speed_params *);
137
138/* Prototypes for speed measuring routines */
139
140double speed_back_to_back (struct speed_params *);
141double speed_count_leading_zeros (struct speed_params *);
142double speed_count_trailing_zeros (struct speed_params *);
143double speed_find_a (struct speed_params *);
144double speed_gmp_allocate_free (struct speed_params *);
145double speed_gmp_allocate_reallocate_free (struct speed_params *);
146double speed_invert_limb (struct speed_params *);
147double speed_malloc_free (struct speed_params *);
148double speed_malloc_realloc_free (struct speed_params *);
149double speed_memcpy (struct speed_params *);
150double speed_binvert_limb (struct speed_params *);
151double speed_binvert_limb_mul1 (struct speed_params *);
152double speed_binvert_limb_loop (struct speed_params *);
153double speed_binvert_limb_cond (struct speed_params *);
154double speed_binvert_limb_arith (struct speed_params *);
155
156double speed_mpf_init_clear (struct speed_params *);
157
158double speed_mpn_add_n (struct speed_params *);
159double speed_mpn_add_1 (struct speed_params *);
160double speed_mpn_add_1_inplace (struct speed_params *);
161double speed_mpn_add_err1_n (struct speed_params *);
162double speed_mpn_add_err2_n (struct speed_params *);
163double speed_mpn_add_err3_n (struct speed_params *);
164double speed_mpn_addlsh_n (struct speed_params *);
165double speed_mpn_addlsh1_n (struct speed_params *);
166double speed_mpn_addlsh2_n (struct speed_params *);
167double speed_mpn_addlsh_n_ip1 (struct speed_params *);
168double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
169double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
170double speed_mpn_addlsh_n_ip2 (struct speed_params *);
171double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
172double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
173double speed_mpn_add_n_sub_n (struct speed_params *);
174double speed_mpn_and_n (struct speed_params *);
175double speed_mpn_andn_n (struct speed_params *);
176double speed_mpn_addmul_1 (struct speed_params *);
177double speed_mpn_addmul_2 (struct speed_params *);
178double speed_mpn_addmul_3 (struct speed_params *);
179double speed_mpn_addmul_4 (struct speed_params *);
180double speed_mpn_addmul_5 (struct speed_params *);
181double speed_mpn_addmul_6 (struct speed_params *);
182double speed_mpn_addmul_7 (struct speed_params *);
183double speed_mpn_addmul_8 (struct speed_params *);
184double speed_mpn_cnd_add_n (struct speed_params *);
185double speed_mpn_cnd_sub_n (struct speed_params *);
186double speed_mpn_com (struct speed_params *);
187double speed_mpn_neg (struct speed_params *);
188double speed_mpn_copyd (struct speed_params *);
189double speed_mpn_copyi (struct speed_params *);
190double speed_MPN_COPY (struct speed_params *);
191double speed_MPN_COPY_DECR (struct speed_params *);
192double speed_MPN_COPY_INCR (struct speed_params *);
193double speed_mpn_sec_tabselect (struct speed_params *);
194double speed_mpn_divexact_1 (struct speed_params *);
195double speed_mpn_divexact_by3 (struct speed_params *);
196double speed_mpn_bdiv_q_1 (struct speed_params *);
197double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
198double speed_mpn_bdiv_dbm1c (struct speed_params *);
199double speed_mpn_divrem_1 (struct speed_params *);
200double speed_mpn_divrem_1f (struct speed_params *);
201double speed_mpn_divrem_1c (struct speed_params *);
202double speed_mpn_divrem_1cf (struct speed_params *);
203double speed_mpn_divrem_1_div (struct speed_params *);
204double speed_mpn_divrem_1f_div (struct speed_params *);
205double speed_mpn_divrem_1_inv (struct speed_params *);
206double speed_mpn_divrem_1f_inv (struct speed_params *);
207double speed_mpn_divrem_2 (struct speed_params *);
208double speed_mpn_divrem_2_div (struct speed_params *);
209double speed_mpn_divrem_2_inv (struct speed_params *);
210double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
211double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
212double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
213double speed_mpn_div_qr_1 (struct speed_params *);
214double speed_mpn_div_qr_2n (struct speed_params *);
215double speed_mpn_div_qr_2u (struct speed_params *);
216double speed_mpn_fib2_ui (struct speed_params *);
217double speed_mpn_matrix22_mul (struct speed_params *);
218double speed_mpn_hgcd2 (struct speed_params *);
219double speed_mpn_hgcd2_1 (struct speed_params *);
220double speed_mpn_hgcd2_2 (struct speed_params *);
221double speed_mpn_hgcd2_3 (struct speed_params *);
222double speed_mpn_hgcd2_4 (struct speed_params *);
223double speed_mpn_hgcd2_5 (struct speed_params *);
224double speed_mpn_hgcd (struct speed_params *);
225double speed_mpn_hgcd_lehmer (struct speed_params *);
226double speed_mpn_hgcd_appr (struct speed_params *);
227double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
228double speed_mpn_hgcd_reduce (struct speed_params *);
229double speed_mpn_hgcd_reduce_1 (struct speed_params *);
230double speed_mpn_hgcd_reduce_2 (struct speed_params *);
231double speed_mpn_gcd (struct speed_params *);
232double speed_mpn_gcd_1 (struct speed_params *);
233double speed_mpn_gcd_11 (struct speed_params *);
234double speed_mpn_gcd_1N (struct speed_params *);
235double speed_mpn_gcd_22 (struct speed_params *);
236double speed_mpn_gcdext (struct speed_params *);
237double speed_mpn_gcdext_double (struct speed_params *);
238double speed_mpn_gcdext_one_double (struct speed_params *);
239double speed_mpn_gcdext_one_single (struct speed_params *);
240double speed_mpn_gcdext_single (struct speed_params *);
241double speed_mpn_get_str (struct speed_params *);
242double speed_mpn_hamdist (struct speed_params *);
243double speed_mpn_ior_n (struct speed_params *);
244double speed_mpn_iorn_n (struct speed_params *);
245double speed_mpn_jacobi_base (struct speed_params *);
246double speed_mpn_jacobi_base_1 (struct speed_params *);
247double speed_mpn_jacobi_base_2 (struct speed_params *);
248double speed_mpn_jacobi_base_3 (struct speed_params *);
249double speed_mpn_jacobi_base_4 (struct speed_params *);
250double speed_mpn_lshift (struct speed_params *);
251double speed_mpn_lshiftc (struct speed_params *);
252double speed_mpn_mod_1 (struct speed_params *);
253double speed_mpn_mod_1c (struct speed_params *);
254double speed_mpn_mod_1_div (struct speed_params *);
255double speed_mpn_mod_1_inv (struct speed_params *);
256double speed_mpn_mod_1_1 (struct speed_params *);
257double speed_mpn_mod_1_1_1 (struct speed_params *);
258double speed_mpn_mod_1_1_2 (struct speed_params *);
259double speed_mpn_mod_1_2 (struct speed_params *);
260double speed_mpn_mod_1_3 (struct speed_params *);
261double speed_mpn_mod_1_4 (struct speed_params *);
262double speed_mpn_mod_34lsub1 (struct speed_params *);
263double speed_mpn_modexact_1_odd (struct speed_params *);
264double speed_mpn_modexact_1c_odd (struct speed_params *);
265double speed_mpn_mul_1 (struct speed_params *);
266double speed_mpn_mul_1_inplace (struct speed_params *);
267double speed_mpn_mul_2 (struct speed_params *);
268double speed_mpn_mul_3 (struct speed_params *);
269double speed_mpn_mul_4 (struct speed_params *);
270double speed_mpn_mul_5 (struct speed_params *);
271double speed_mpn_mul_6 (struct speed_params *);
272double speed_mpn_mul (struct speed_params *);
273double speed_mpn_mul_basecase (struct speed_params *);
274double speed_mpn_mulmid (struct speed_params *);
275double speed_mpn_mulmid_basecase (struct speed_params *);
276double speed_mpn_mul_fft (struct speed_params *);
277double speed_mpn_mul_fft_sqr (struct speed_params *);
278double speed_mpn_fft_mul (struct speed_params *);
279double speed_mpn_fft_sqr (struct speed_params *);
280#if WANT_OLD_FFT_FULL
281double speed_mpn_mul_fft_full (struct speed_params *);
282double speed_mpn_mul_fft_full_sqr (struct speed_params *);
283#endif
284double speed_mpn_nussbaumer_mul (struct speed_params *);
285double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
286double speed_mpn_mul_n (struct speed_params *);
287double speed_mpn_mul_n_sqr (struct speed_params *);
288double speed_mpn_mulmid_n (struct speed_params *);
289double speed_mpn_sqrlo (struct speed_params *);
290double speed_mpn_sqrlo_basecase (struct speed_params *);
291double speed_mpn_mullo_n (struct speed_params *);
292double speed_mpn_mullo_basecase (struct speed_params *);
293double speed_mpn_nand_n (struct speed_params *);
294double speed_mpn_nior_n (struct speed_params *);
295double speed_mpn_popcount (struct speed_params *);
296double speed_mpn_preinv_divrem_1 (struct speed_params *);
297double speed_mpn_preinv_divrem_1f (struct speed_params *);
298double speed_mpn_preinv_mod_1 (struct speed_params *);
299double speed_mpn_sbpi1_div_qr (struct speed_params *);
300double speed_mpn_dcpi1_div_qr (struct speed_params *);
301double speed_mpn_sbpi1_divappr_q (struct speed_params *);
302double speed_mpn_dcpi1_divappr_q (struct speed_params *);
303double speed_mpn_mu_div_qr (struct speed_params *);
304double speed_mpn_mu_divappr_q (struct speed_params *);
305double speed_mpn_mupi_div_qr (struct speed_params *);
306double speed_mpn_mu_div_q (struct speed_params *);
307double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
308double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
309double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
310double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
311double speed_mpn_sbpi1_bdiv_r (struct speed_params *);
312double speed_mpn_mu_bdiv_q (struct speed_params *);
313double speed_mpn_mu_bdiv_qr (struct speed_params *);
314double speed_mpn_broot (struct speed_params *);
315double speed_mpn_broot_invm1 (struct speed_params *);
316double speed_mpn_brootinv (struct speed_params *);
317double speed_mpn_invert (struct speed_params *);
318double speed_mpn_invertappr (struct speed_params *);
319double speed_mpn_ni_invertappr (struct speed_params *);
320double speed_mpn_sec_invert (struct speed_params *s);
321double speed_mpn_binvert (struct speed_params *);
322double speed_mpn_redc_1 (struct speed_params *);
323double speed_mpn_redc_2 (struct speed_params *);
324double speed_mpn_redc_n (struct speed_params *);
325double speed_mpn_rsblsh_n (struct speed_params *);
326double speed_mpn_rsblsh1_n (struct speed_params *);
327double speed_mpn_rsblsh2_n (struct speed_params *);
328double speed_mpn_rsh1add_n (struct speed_params *);
329double speed_mpn_rsh1sub_n (struct speed_params *);
330double speed_mpn_rshift (struct speed_params *);
331double speed_mpn_sb_divrem_m3 (struct speed_params *);
332double speed_mpn_sb_divrem_m3_div (struct speed_params *);
333double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
334double speed_mpn_set_str (struct speed_params *);
335double speed_mpn_bc_set_str (struct speed_params *);
336double speed_mpn_dc_set_str (struct speed_params *);
337double speed_mpn_set_str_pre (struct speed_params *);
338double speed_mpn_sqr_basecase (struct speed_params *);
339double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
340double speed_mpn_sqr_diagonal (struct speed_params *);
341double speed_mpn_sqr (struct speed_params *);
342double speed_mpn_sqrtrem (struct speed_params *);
343double speed_mpn_rootrem (struct speed_params *);
344double speed_mpn_sqrt (struct speed_params *);
345double speed_mpn_root (struct speed_params *);
346double speed_mpn_perfect_power_p (struct speed_params *);
347double speed_mpn_perfect_square_p (struct speed_params *);
348double speed_mpn_sub_n (struct speed_params *);
349double speed_mpn_sub_1 (struct speed_params *);
350double speed_mpn_sub_1_inplace (struct speed_params *);
351double speed_mpn_sub_err1_n (struct speed_params *);
352double speed_mpn_sub_err2_n (struct speed_params *);
353double speed_mpn_sub_err3_n (struct speed_params *);
354double speed_mpn_sublsh_n (struct speed_params *);
355double speed_mpn_sublsh1_n (struct speed_params *);
356double speed_mpn_sublsh2_n (struct speed_params *);
357double speed_mpn_sublsh_n_ip1 (struct speed_params *);
358double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
359double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
360double speed_mpn_submul_1 (struct speed_params *);
361double speed_mpn_toom2_sqr (struct speed_params *);
362double speed_mpn_toom3_sqr (struct speed_params *);
363double speed_mpn_toom4_sqr (struct speed_params *);
364double speed_mpn_toom6_sqr (struct speed_params *);
365double speed_mpn_toom8_sqr (struct speed_params *);
366double speed_mpn_toom22_mul (struct speed_params *);
367double speed_mpn_toom33_mul (struct speed_params *);
368double speed_mpn_toom44_mul (struct speed_params *);
369double speed_mpn_toom6h_mul (struct speed_params *);
370double speed_mpn_toom8h_mul (struct speed_params *);
371double speed_mpn_toom32_mul (struct speed_params *);
372double speed_mpn_toom42_mul (struct speed_params *);
373double speed_mpn_toom43_mul (struct speed_params *);
374double speed_mpn_toom63_mul (struct speed_params *);
375double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
376double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
377double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
378double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
379double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
380double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
381double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
382double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
383double speed_mpn_toom42_mulmid (struct speed_params *);
384double speed_mpn_mulmod_bnm1 (struct speed_params *);
385double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
386double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
387double speed_mpn_sqrmod_bnm1 (struct speed_params *);
388double speed_mpn_udiv_qrnnd (struct speed_params *);
389double speed_mpn_udiv_qrnnd_r (struct speed_params *);
390double speed_mpn_umul_ppmm (struct speed_params *);
391double speed_mpn_umul_ppmm_r (struct speed_params *);
392double speed_mpn_xnor_n (struct speed_params *);
393double speed_mpn_xor_n (struct speed_params *);
394double speed_MPN_ZERO (struct speed_params *);
395
396double speed_mpq_init_clear (struct speed_params *);
397
398double speed_mpz_add (struct speed_params *);
399double speed_mpz_invert (struct speed_params *);
400double speed_mpz_bin_uiui (struct speed_params *);
401double speed_mpz_bin_ui (struct speed_params *);
402double speed_mpz_fac_ui (struct speed_params *);
403double speed_mpz_2fac_ui (struct speed_params *);
404double speed_mpz_mfac_uiui (struct speed_params *);
405double speed_mpz_primorial_ui (struct speed_params *);
406double speed_mpz_fib_ui (struct speed_params *);
407double speed_mpz_fib2_ui (struct speed_params *);
408double speed_mpz_init_clear (struct speed_params *);
409double speed_mpz_init_realloc_clear (struct speed_params *);
410double speed_mpz_nextprime (struct speed_params *);
411double speed_mpz_jacobi (struct speed_params *);
412double speed_mpz_lucnum_ui (struct speed_params *);
413double speed_mpz_lucnum2_ui (struct speed_params *);
414double speed_mpz_mod (struct speed_params *);
415double speed_mpz_powm (struct speed_params *);
416double speed_mpz_powm_mod (struct speed_params *);
417double speed_mpz_powm_redc (struct speed_params *);
418double speed_mpz_powm_sec (struct speed_params *);
419double speed_mpz_powm_ui (struct speed_params *);
420double speed_mpz_urandomb (struct speed_params *);
421
422double speed_gmp_randseed (struct speed_params *);
423double speed_gmp_randseed_ui (struct speed_params *);
424
425double speed_noop (struct speed_params *);
426double speed_noop_wxs (struct speed_params *);
427double speed_noop_wxys (struct speed_params *);
428
429double speed_operator_div (struct speed_params *);
430double speed_operator_mod (struct speed_params *);
431
432double speed_udiv_qrnnd (struct speed_params *);
433double speed_udiv_qrnnd_preinv1 (struct speed_params *);
434double speed_udiv_qrnnd_preinv2 (struct speed_params *);
435double speed_udiv_qrnnd_preinv3 (struct speed_params *);
436double speed_udiv_qrnnd_c (struct speed_params *);
437double speed_umul_ppmm (struct speed_params *);
438
439/* Prototypes for other routines */
440
441#if defined (__cplusplus)
442extern "C" {
443#endif
444
445/* low 32-bits in p[0], high 32-bits in p[1] */
446void speed_cyclecounter (unsigned p[2]);
447
448#if defined (__cplusplus)
449}
450#endif
451
452void mftb_function (unsigned p[2]);
453
454double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
455int gettimeofday_microseconds_p (void);
456int getrusage_microseconds_p (void);
457int cycles_works_p (void);
458long clk_tck (void);
459double freq_measure (const char *, double (*)(void));
460
461int double_cmp_ptr (const double *, const double *);
462void pentium_wbinvd (void);
463typedef int (*qsort_function_t) (const void *, const void *);
464
465void noop (void);
466void noop_1 (mp_limb_t);
467void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
468void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
469void mpn_cache_fill (mp_srcptr, mp_size_t);
470void mpn_cache_fill_dummy (mp_limb_t);
471void speed_cache_fill (struct speed_params *);
472void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
473void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
474
475extern int speed_option_addrs;
476extern int speed_option_verbose;
477extern int speed_option_cycles_broken;
478void speed_option_set (const char *);
479
480mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
481mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
482
483mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
484mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
485mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
486mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
487
488int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
489int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
490int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
491int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
492
493int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
494int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
495int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
496int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
497int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
498
499mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
500mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
501
502mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
503mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
504
505void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
506void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
507
508mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
509mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
510mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
511mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
512mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
513mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
514
515mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
516mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
517
518mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
519mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
520
521mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
522mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
523
524mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
525mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
526
527mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
528void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
529
530void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
531void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
532
533int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
534
535
536/* "get" is called repeatedly until it ticks over, just in case on a fast
537 processor it takes less than a microsecond, though this is probably
538 unlikely if it's a system call.
539
540 speed_cyclecounter is called on the same side of the "get" for the start
541 and end measurements. It doesn't matter how long it takes from the "get"
542 sample to the cycles sample, since that period will cancel out in the
543 difference calculation (assuming it's the same each time).
544
545 Letting the test run for more than a process time slice is probably only
546 going to reduce accuracy, especially for getrusage when the cycle counter
547 is real time, or for gettimeofday if the cycle counter is in fact process
548 time. Use CLK_TCK/2 as a reasonable stop.
549
550 It'd be desirable to be quite accurate here. The default speed_precision
551 for a cycle counter is 10000 cycles, so to mix that with getrusage or
552 gettimeofday the frequency should be at least that accurate. But running
553 measurements for 10000 microseconds (or more) is too long. Be satisfied
554 with just a half clock tick (5000 microseconds usually). */
555
556#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \
557 do { \
558 type st1, st, et1, et; \
559 unsigned sc[2], ec[2]; \
560 long dt, half_tick; \
561 double dc, cyc; \
562 \
563 half_tick = (1000000L / clk_tck()) / 2; \
564 \
565 get (st1); \
566 do { \
567 get (st); \
568 } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \
569 \
570 getc (sc); \
571 \
572 for (;;) \
573 { \
574 get (et1); \
575 do { \
576 get (et); \
577 } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \
578 \
579 getc (ec); \
580 \
581 dc = speed_cyclecounter_diff (ec, sc); \
582 \
583 /* allow secs to cancel before multiplying */ \
584 dt = sec(et) - sec(st); \
585 dt = dt * 1000000L + (usec(et) - usec(st)); \
586 \
587 if (dt >= half_tick) \
588 break; \
589 } \
590 \
591 cyc = dt * 1e-6 / dc; \
592 \
593 if (speed_option_verbose >= 2) \
594 printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \
595 name, dc, dt, cyc); \
596 \
597 return dt * 1e-6 / dc; \
598 \
599 } while (0)
600
601
602
603
604/* The measuring routines use these big macros to save duplication for
605 similar forms. They also get used for some automatically generated
606 measuring of new implementations of functions.
607
608 Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
609 function pointer is considered undesirable since it's not the way a
610 normal application will be calling, and some processors might do
611 different things with an indirect call, like not branch predicting, or
612 doing a full pipe flush. At least some of the "functions" measured are
613 actually macros too.
614
615 The net effect is to bloat the object code, possibly in a big way, but
616 only what's being measured is being run, so that doesn't matter.
617
618 The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
619 ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure
620 function pointer doesn't work in gcc 3.2. Using an actual non-pure
621 function pointer variable works, but stands a real risk of a
622 non-optimizing compiler generating unnecessary overheads in the call.
623 Currently the best idea is not to use those attributes for a timing
624 program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
625 gmp-impl.h to omit them from routines there. */
626
627#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
628
629/* For mpn_copy or similar. */
630#define SPEED_ROUTINE_MPN_COPY_CALL(call) \
631 { \
632 mp_ptr wp; \
633 unsigned i; \
634 double t; \
635 TMP_DECL; \
636 \
637 SPEED_RESTRICT_COND (s->size >= 0); \
638 \
639 TMP_MARK; \
640 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
641 \
642 speed_operand_src (s, s->xp, s->size); \
643 speed_operand_dst (s, wp, s->size); \
644 speed_cache_fill (s); \
645 \
646 speed_starttime (); \
647 i = s->reps; \
648 do \
649 call; \
650 while (--i != 0); \
651 t = speed_endtime (); \
652 \
653 TMP_FREE; \
654 return t; \
655 }
656#define SPEED_ROUTINE_MPN_COPY(function) \
657 SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
658
659#define SPEED_ROUTINE_MPN_TABSELECT(function) \
660 { \
661 mp_ptr xp, wp; \
662 unsigned i; \
663 double t; \
664 TMP_DECL; \
665 \
666 SPEED_RESTRICT_COND (s->size >= 0); \
667 \
668 if (s->r == 0) \
669 s->r = s->size; /* default to a quadratic shape */ \
670 \
671 TMP_MARK; \
672 SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp); \
673 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
674 \
675 speed_operand_src (s, xp, s->size * s->r); \
676 speed_operand_dst (s, wp, s->size); \
677 speed_cache_fill (s); \
678 \
679 speed_starttime (); \
680 i = s->reps; \
681 do \
682 function (wp, xp, s->size, s->r, (s->r) / 2); \
683 while (--i != 0); \
684 t = speed_endtime () / s->r; \
685 \
686 TMP_FREE; \
687 return t; \
688 }
689
690
691#define SPEED_ROUTINE_MPN_COPYC(function) \
692 { \
693 mp_ptr wp; \
694 unsigned i; \
695 double t; \
696 TMP_DECL; \
697 \
698 SPEED_RESTRICT_COND (s->size >= 0); \
699 \
700 TMP_MARK; \
701 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
702 \
703 speed_operand_src (s, s->xp, s->size); \
704 speed_operand_dst (s, wp, s->size); \
705 speed_cache_fill (s); \
706 \
707 speed_starttime (); \
708 i = s->reps; \
709 do \
710 function (wp, s->xp, s->size, 0); \
711 while (--i != 0); \
712 t = speed_endtime (); \
713 \
714 TMP_FREE; \
715 return t; \
716 }
717
718/* s->size is still in limbs, and it's limbs which are copied, but
719 "function" takes a size in bytes not limbs. */
720#define SPEED_ROUTINE_MPN_COPY_BYTES(function) \
721 { \
722 mp_ptr wp; \
723 unsigned i; \
724 double t; \
725 TMP_DECL; \
726 \
727 SPEED_RESTRICT_COND (s->size >= 0); \
728 \
729 TMP_MARK; \
730 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
731 \
732 speed_operand_src (s, s->xp, s->size); \
733 speed_operand_dst (s, wp, s->size); \
734 speed_cache_fill (s); \
735 \
736 speed_starttime (); \
737 i = s->reps; \
738 do \
739 function (wp, s->xp, s->size * GMP_LIMB_BYTES); \
740 while (--i != 0); \
741 t = speed_endtime (); \
742 \
743 TMP_FREE; \
744 return t; \
745 }
746
747
748/* For mpn_add_n, mpn_sub_n, or similar. */
749#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
750 { \
751 mp_ptr wp; \
752 mp_ptr xp, yp; \
753 unsigned i; \
754 double t; \
755 TMP_DECL; \
756 \
757 SPEED_RESTRICT_COND (s->size >= 1); \
758 \
759 TMP_MARK; \
760 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
761 \
762 xp = s->xp; \
763 yp = s->yp; \
764 \
765 if (s->r == 0) ; \
766 else if (s->r == 1) { xp = wp; } \
767 else if (s->r == 2) { yp = wp; } \
768 else if (s->r == 3) { xp = wp; yp = wp; } \
769 else if (s->r == 4) { yp = xp; } \
770 else { \
771 TMP_FREE; \
772 return -1.0; \
773 } \
774 \
775 /* initialize wp if operand overlap */ \
776 if (xp == wp || yp == wp) \
777 MPN_COPY (wp, s->xp, s->size); \
778 \
779 speed_operand_src (s, xp, s->size); \
780 speed_operand_src (s, yp, s->size); \
781 speed_operand_dst (s, wp, s->size); \
782 speed_cache_fill (s); \
783 \
784 speed_starttime (); \
785 i = s->reps; \
786 do \
787 call; \
788 while (--i != 0); \
789 t = speed_endtime (); \
790 \
791 TMP_FREE; \
792 return t; \
793 }
794
795
796/* For mpn_aors_errK_n, where 1 <= K <= 3. */
797#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \
798 { \
799 mp_ptr wp; \
800 mp_ptr xp, yp; \
801 mp_ptr zp[K]; \
802 mp_limb_t ep[2*K]; \
803 unsigned i; \
804 double t; \
805 TMP_DECL; \
806 \
807 SPEED_RESTRICT_COND (s->size >= 1); \
808 \
809 TMP_MARK; \
810 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
811 \
812 /* (don't have a mechanism to specify zp alignments) */ \
813 for (i = 0; i < K; i++) \
814 SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \
815 \
816 xp = s->xp; \
817 yp = s->yp; \
818 \
819 if (s->r == 0) ; \
820 else if (s->r == 1) { xp = wp; } \
821 else if (s->r == 2) { yp = wp; } \
822 else if (s->r == 3) { xp = wp; yp = wp; } \
823 else if (s->r == 4) { yp = xp; } \
824 else { \
825 TMP_FREE; \
826 return -1.0; \
827 } \
828 \
829 /* initialize wp if operand overlap */ \
830 if (xp == wp || yp == wp) \
831 MPN_COPY (wp, s->xp, s->size); \
832 \
833 speed_operand_src (s, xp, s->size); \
834 speed_operand_src (s, yp, s->size); \
835 for (i = 0; i < K; i++) \
836 speed_operand_src (s, zp[i], s->size); \
837 speed_operand_dst (s, wp, s->size); \
838 speed_cache_fill (s); \
839 \
840 speed_starttime (); \
841 i = s->reps; \
842 do \
843 call; \
844 while (--i != 0); \
845 t = speed_endtime (); \
846 \
847 TMP_FREE; \
848 return t; \
849 }
850
851#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \
852 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
853
854#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \
855 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
856
857#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \
858 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
859
860
861/* For mpn_add_n, mpn_sub_n, or similar. */
862#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \
863 { \
864 mp_ptr ap, sp; \
865 mp_ptr xp, yp; \
866 unsigned i; \
867 double t; \
868 TMP_DECL; \
869 \
870 SPEED_RESTRICT_COND (s->size >= 1); \
871 \
872 TMP_MARK; \
873 SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \
874 SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \
875 \
876 xp = s->xp; \
877 yp = s->yp; \
878 \
879 if ((s->r & 1) != 0) { xp = ap; } \
880 if ((s->r & 2) != 0) { yp = ap; } \
881 if ((s->r & 4) != 0) { xp = sp; } \
882 if ((s->r & 8) != 0) { yp = sp; } \
883 if ((s->r & 3) == 3 || (s->r & 12) == 12) \
884 { \
885 TMP_FREE; \
886 return -1.0; \
887 } \
888 \
889 /* initialize ap if operand overlap */ \
890 if (xp == ap || yp == ap) \
891 MPN_COPY (ap, s->xp, s->size); \
892 /* initialize sp if operand overlap */ \
893 if (xp == sp || yp == sp) \
894 MPN_COPY (sp, s->xp, s->size); \
895 \
896 speed_operand_src (s, xp, s->size); \
897 speed_operand_src (s, yp, s->size); \
898 speed_operand_dst (s, ap, s->size); \
899 speed_operand_dst (s, sp, s->size); \
900 speed_cache_fill (s); \
901 \
902 speed_starttime (); \
903 i = s->reps; \
904 do \
905 call; \
906 while (--i != 0); \
907 t = speed_endtime (); \
908 \
909 TMP_FREE; \
910 return t; \
911 }
912
913#define SPEED_ROUTINE_MPN_BINARY_N(function) \
914 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
915
916#define SPEED_ROUTINE_MPN_BINARY_NC(function) \
917 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
918
919
920/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
921#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
922 { \
923 mp_ptr wp; \
924 unsigned i; \
925 double t; \
926 TMP_DECL; \
927 \
928 SPEED_RESTRICT_COND (s->size >= 1); \
929 \
930 TMP_MARK; \
931 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
932 \
933 speed_operand_src (s, s->xp, s->size); \
934 speed_operand_dst (s, wp, s->size); \
935 speed_cache_fill (s); \
936 \
937 speed_starttime (); \
938 i = s->reps; \
939 do \
940 call; \
941 while (--i != 0); \
942 t = speed_endtime (); \
943 \
944 TMP_FREE; \
945 return t; \
946 }
947
948#define SPEED_ROUTINE_MPN_UNARY_1(function) \
949 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
950
951#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
952 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
953
954/* FIXME: wp is uninitialized here, should start it off from xp */
955#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
956 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
957
958#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
959 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
960
961#define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \
962 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
963
964#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \
965 { \
966 unsigned shift; \
967 mp_limb_t dinv; \
968 \
969 SPEED_RESTRICT_COND (s->size > 0); \
970 SPEED_RESTRICT_COND (s->r != 0); \
971 \
972 count_trailing_zeros (shift, s->r); \
973 binvert_limb (dinv, s->r >> shift); \
974 \
975 SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
976 }
977#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \
978 SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL \
979 ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
980
981#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \
982 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
983
984#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
985 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
986
987#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
988 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
989
990#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
991 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
992
993#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
994 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
995
996
997#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
998 { \
999 unsigned shift; \
1000 mp_limb_t dinv; \
1001 \
1002 SPEED_RESTRICT_COND (s->size >= 0); \
1003 SPEED_RESTRICT_COND (s->r != 0); \
1004 \
1005 count_leading_zeros (shift, s->r); \
1006 invert_limb (dinv, s->r << shift); \
1007 \
1008 SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
1009 } \
1010
1011#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
1012 SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
1013 ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
1014
1015/* s->size limbs worth of fraction part */
1016#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
1017 SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
1018 ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
1019
1020
1021/* s->r is duplicated to form the multiplier, defaulting to
1022 MP_BASES_BIG_BASE_10. Not sure if that's particularly useful, but at
1023 least it provides some control. */
1024#define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
1025 { \
1026 mp_ptr wp; \
1027 mp_size_t wn; \
1028 unsigned i; \
1029 double t; \
1030 mp_limb_t yp[N]; \
1031 TMP_DECL; \
1032 \
1033 SPEED_RESTRICT_COND (s->size >= N); \
1034 \
1035 TMP_MARK; \
1036 wn = s->size + N-1; \
1037 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
1038 for (i = 0; i < N; i++) \
1039 yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10); \
1040 \
1041 speed_operand_src (s, s->xp, s->size); \
1042 speed_operand_src (s, yp, (mp_size_t) N); \
1043 speed_operand_dst (s, wp, wn); \
1044 speed_cache_fill (s); \
1045 \
1046 speed_starttime (); \
1047 i = s->reps; \
1048 do \
1049 function (wp, s->xp, s->size, yp); \
1050 while (--i != 0); \
1051 t = speed_endtime (); \
1052 \
1053 TMP_FREE; \
1054 return t; \
1055 }
1056
1057#define SPEED_ROUTINE_MPN_UNARY_2(function) \
1058 SPEED_ROUTINE_MPN_UNARY_N (function, 2)
1059#define SPEED_ROUTINE_MPN_UNARY_3(function) \
1060 SPEED_ROUTINE_MPN_UNARY_N (function, 3)
1061#define SPEED_ROUTINE_MPN_UNARY_4(function) \
1062 SPEED_ROUTINE_MPN_UNARY_N (function, 4)
1063#define SPEED_ROUTINE_MPN_UNARY_5(function) \
1064 SPEED_ROUTINE_MPN_UNARY_N (function, 5)
1065#define SPEED_ROUTINE_MPN_UNARY_6(function) \
1066 SPEED_ROUTINE_MPN_UNARY_N (function, 6)
1067#define SPEED_ROUTINE_MPN_UNARY_7(function) \
1068 SPEED_ROUTINE_MPN_UNARY_N (function, 7)
1069#define SPEED_ROUTINE_MPN_UNARY_8(function) \
1070 SPEED_ROUTINE_MPN_UNARY_N (function, 8)
1071
1072
1073/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
1074#define SPEED_ROUTINE_MPN_MUL(function) \
1075 { \
1076 mp_ptr wp; \
1077 mp_size_t size1; \
1078 unsigned i; \
1079 double t; \
1080 TMP_DECL; \
1081 \
1082 size1 = (s->r == 0 ? s->size : s->r); \
1083 if (size1 < 0) size1 = -size1 - s->size; \
1084 \
1085 SPEED_RESTRICT_COND (size1 >= 1); \
1086 SPEED_RESTRICT_COND (s->size >= size1); \
1087 \
1088 TMP_MARK; \
1089 SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \
1090 \
1091 speed_operand_src (s, s->xp, s->size); \
1092 speed_operand_src (s, s->yp, size1); \
1093 speed_operand_dst (s, wp, size1 + s->size); \
1094 speed_cache_fill (s); \
1095 \
1096 speed_starttime (); \
1097 i = s->reps; \
1098 do \
1099 function (wp, s->xp, s->size, s->yp, size1); \
1100 while (--i != 0); \
1101 t = speed_endtime (); \
1102 \
1103 TMP_FREE; \
1104 return t; \
1105 }
1106
1107
1108#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
1109 { \
1110 mp_ptr wp; \
1111 unsigned i; \
1112 double t; \
1113 TMP_DECL; \
1114 \
1115 SPEED_RESTRICT_COND (s->size >= 1); \
1116 \
1117 TMP_MARK; \
1118 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1119 \
1120 speed_operand_src (s, s->xp, s->size); \
1121 speed_operand_src (s, s->yp, s->size); \
1122 speed_operand_dst (s, wp, 2*s->size); \
1123 speed_cache_fill (s); \
1124 \
1125 speed_starttime (); \
1126 i = s->reps; \
1127 do \
1128 call; \
1129 while (--i != 0); \
1130 t = speed_endtime (); \
1131 \
1132 TMP_FREE; \
1133 return t; \
1134 }
1135
1136#define SPEED_ROUTINE_MPN_MUL_N(function) \
1137 SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));
1138
1139#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \
1140 { \
1141 mp_ptr wp; \
1142 unsigned i; \
1143 double t; \
1144 TMP_DECL; \
1145 \
1146 SPEED_RESTRICT_COND (s->size >= 1); \
1147 \
1148 TMP_MARK; \
1149 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
1150 \
1151 speed_operand_src (s, s->xp, s->size); \
1152 speed_operand_src (s, s->yp, s->size); \
1153 speed_operand_dst (s, wp, s->size); \
1154 speed_cache_fill (s); \
1155 \
1156 speed_starttime (); \
1157 i = s->reps; \
1158 do \
1159 call; \
1160 while (--i != 0); \
1161 t = speed_endtime (); \
1162 \
1163 TMP_FREE; \
1164 return t; \
1165 }
1166
1167#define SPEED_ROUTINE_MPN_MULLO_N(function) \
1168 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
1169
1170#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \
1171 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
1172
1173#define SPEED_ROUTINE_MPN_SQRLO(function) \
1174 { \
1175 mp_ptr wp; \
1176 unsigned i; \
1177 double t; \
1178 TMP_DECL; \
1179 \
1180 SPEED_RESTRICT_COND (s->size >= 1); \
1181 \
1182 TMP_MARK; \
1183 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
1184 \
1185 speed_operand_src (s, s->xp, s->size); \
1186 speed_operand_dst (s, wp, s->size); \
1187 speed_cache_fill (s); \
1188 \
1189 speed_starttime (); \
1190 i = s->reps; \
1191 do \
1192 function (wp, s->xp, s->size); \
1193 while (--i != 0); \
1194 t = speed_endtime (); \
1195 \
1196 TMP_FREE; \
1197 return t; \
1198 }
1199
1200/* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
1201#define SPEED_ROUTINE_MPN_MULMID(function) \
1202 { \
1203 mp_ptr wp, xp; \
1204 mp_size_t size1; \
1205 unsigned i; \
1206 double t; \
1207 TMP_DECL; \
1208 \
1209 size1 = (s->r == 0 ? (2 * s->size - 1) : s->r); \
1210 \
1211 SPEED_RESTRICT_COND (s->size >= 1); \
1212 SPEED_RESTRICT_COND (size1 >= s->size); \
1213 \
1214 TMP_MARK; \
1215 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \
1216 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
1217 \
1218 speed_operand_src (s, xp, size1); \
1219 speed_operand_src (s, s->yp, s->size); \
1220 speed_operand_dst (s, wp, size1 - s->size + 3); \
1221 speed_cache_fill (s); \
1222 \
1223 speed_starttime (); \
1224 i = s->reps; \
1225 do \
1226 function (wp, xp, size1, s->yp, s->size); \
1227 while (--i != 0); \
1228 t = speed_endtime (); \
1229 \
1230 TMP_FREE; \
1231 return t; \
1232 }
1233
1234#define SPEED_ROUTINE_MPN_MULMID_N(function) \
1235 { \
1236 mp_ptr wp, xp; \
1237 mp_size_t size1; \
1238 unsigned i; \
1239 double t; \
1240 TMP_DECL; \
1241 \
1242 size1 = 2 * s->size - 1; \
1243 \
1244 SPEED_RESTRICT_COND (s->size >= 1); \
1245 \
1246 TMP_MARK; \
1247 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \
1248 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
1249 \
1250 speed_operand_src (s, xp, size1); \
1251 speed_operand_src (s, s->yp, s->size); \
1252 speed_operand_dst (s, wp, size1 - s->size + 3); \
1253 speed_cache_fill (s); \
1254 \
1255 speed_starttime (); \
1256 i = s->reps; \
1257 do \
1258 function (wp, xp, s->yp, s->size); \
1259 while (--i != 0); \
1260 t = speed_endtime (); \
1261 \
1262 TMP_FREE; \
1263 return t; \
1264 }
1265
1266#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \
1267 { \
1268 mp_ptr wp, xp, scratch; \
1269 mp_size_t size1, scratch_size; \
1270 unsigned i; \
1271 double t; \
1272 TMP_DECL; \
1273 \
1274 size1 = 2 * s->size - 1; \
1275 \
1276 SPEED_RESTRICT_COND (s->size >= 1); \
1277 \
1278 TMP_MARK; \
1279 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \
1280 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
1281 scratch_size = mpn_toom42_mulmid_itch (s->size); \
1282 SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \
1283 \
1284 speed_operand_src (s, xp, size1); \
1285 speed_operand_src (s, s->yp, s->size); \
1286 speed_operand_dst (s, wp, size1 - s->size + 3); \
1287 speed_cache_fill (s); \
1288 \
1289 speed_starttime (); \
1290 i = s->reps; \
1291 do \
1292 function (wp, xp, s->yp, s->size, scratch); \
1293 while (--i != 0); \
1294 t = speed_endtime (); \
1295 \
1296 TMP_FREE; \
1297 return t; \
1298 }
1299
1300#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \
1301 { \
1302 mp_ptr wp, tp; \
1303 unsigned i; \
1304 double t; \
1305 mp_size_t itch; \
1306 TMP_DECL; \
1307 \
1308 SPEED_RESTRICT_COND (s->size >= 1); \
1309 \
1310 itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \
1311 \
1312 TMP_MARK; \
1313 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \
1314 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
1315 \
1316 speed_operand_src (s, s->xp, s->size); \
1317 speed_operand_src (s, s->yp, s->size); \
1318 speed_operand_dst (s, wp, 2 * s->size); \
1319 speed_operand_dst (s, tp, itch); \
1320 speed_cache_fill (s); \
1321 \
1322 speed_starttime (); \
1323 i = s->reps; \
1324 do \
1325 call; \
1326 while (--i != 0); \
1327 t = speed_endtime (); \
1328 \
1329 TMP_FREE; \
1330 return t; \
1331 }
1332#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \
1333 { \
1334 mp_ptr wp, tp; \
1335 unsigned i; \
1336 double t; \
1337 mp_size_t size, itch; \
1338 TMP_DECL; \
1339 \
1340 SPEED_RESTRICT_COND (s->size >= 1); \
1341 \
1342 size = mpn_mulmod_bnm1_next_size (s->size); \
1343 itch = mpn_mulmod_bnm1_itch (size, size, size); \
1344 \
1345 TMP_MARK; \
1346 SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \
1347 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
1348 \
1349 speed_operand_src (s, s->xp, s->size); \
1350 speed_operand_src (s, s->yp, s->size); \
1351 speed_operand_dst (s, wp, size); \
1352 speed_operand_dst (s, tp, itch); \
1353 speed_cache_fill (s); \
1354 \
1355 speed_starttime (); \
1356 i = s->reps; \
1357 do \
1358 function (wp, size, s->xp, s->size, s->yp, s->size, tp); \
1359 while (--i != 0); \
1360 t = speed_endtime (); \
1361 \
1362 TMP_FREE; \
1363 return t; \
1364 }
1365
1366#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
1367 { \
1368 mp_ptr wp, tspace; \
1369 unsigned i; \
1370 double t; \
1371 TMP_DECL; \
1372 \
1373 SPEED_RESTRICT_COND (s->size >= minsize); \
1374 \
1375 TMP_MARK; \
1376 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1377 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
1378 \
1379 speed_operand_src (s, s->xp, s->size); \
1380 speed_operand_src (s, s->yp, s->size); \
1381 speed_operand_dst (s, wp, 2*s->size); \
1382 speed_operand_dst (s, tspace, tsize); \
1383 speed_cache_fill (s); \
1384 \
1385 speed_starttime (); \
1386 i = s->reps; \
1387 do \
1388 call; \
1389 while (--i != 0); \
1390 t = speed_endtime (); \
1391 \
1392 TMP_FREE; \
1393 return t; \
1394 }
1395
1396#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \
1397 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1398 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1399 mpn_toom22_mul_itch (s->size, s->size), \
1400 MPN_TOOM22_MUL_MINSIZE)
1401
1402#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \
1403 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1404 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1405 mpn_toom33_mul_itch (s->size, s->size), \
1406 MPN_TOOM33_MUL_MINSIZE)
1407
1408#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \
1409 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1410 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1411 mpn_toom44_mul_itch (s->size, s->size), \
1412 MPN_TOOM44_MUL_MINSIZE)
1413
1414#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \
1415 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1416 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1417 mpn_toom6h_mul_itch (s->size, s->size), \
1418 MPN_TOOM6H_MUL_MINSIZE)
1419
1420#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \
1421 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1422 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1423 mpn_toom8h_mul_itch (s->size, s->size), \
1424 MPN_TOOM8H_MUL_MINSIZE)
1425
1426#define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \
1427 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1428 (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \
1429 mpn_toom32_mul_itch (s->size, 2*s->size/3), \
1430 MPN_TOOM32_MUL_MINSIZE)
1431
1432#define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \
1433 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1434 (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
1435 mpn_toom42_mul_itch (s->size, s->size/2), \
1436 MPN_TOOM42_MUL_MINSIZE)
1437
1438#define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \
1439 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1440 (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \
1441 mpn_toom43_mul_itch (s->size, s->size*3/4), \
1442 MPN_TOOM43_MUL_MINSIZE)
1443
1444#define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \
1445 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1446 (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
1447 mpn_toom63_mul_itch (s->size, s->size/2), \
1448 MPN_TOOM63_MUL_MINSIZE)
1449
1450#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \
1451 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1452 (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
1453 mpn_toom32_mul_itch (s->size, 17*s->size/24), \
1454 MPN_TOOM32_MUL_MINSIZE)
1455#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \
1456 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1457 (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
1458 mpn_toom43_mul_itch (s->size, 17*s->size/24), \
1459 MPN_TOOM43_MUL_MINSIZE)
1460
1461#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \
1462 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1463 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
1464 mpn_toom32_mul_itch (s->size, 19*s->size/30), \
1465 MPN_TOOM32_MUL_MINSIZE)
1466#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \
1467 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1468 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
1469 mpn_toom53_mul_itch (s->size, 19*s->size/30), \
1470 MPN_TOOM53_MUL_MINSIZE)
1471
1472#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \
1473 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1474 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \
1475 mpn_toom42_mul_itch (s->size, 11*s->size/20), \
1476 MPN_TOOM42_MUL_MINSIZE)
1477#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \
1478 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1479 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \
1480 mpn_toom53_mul_itch (s->size, 11*s->size/20), \
1481 MPN_TOOM53_MUL_MINSIZE)
1482
1483#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function) \
1484 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1485 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \
1486 mpn_toom42_mul_itch (s->size, 5*s->size/6), \
1487 MPN_TOOM54_MUL_MINSIZE)
1488#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \
1489 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1490 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \
1491 mpn_toom54_mul_itch (s->size, 5*s->size/6), \
1492 MPN_TOOM54_MUL_MINSIZE)
1493
1494
1495
1496#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
1497 { \
1498 mp_ptr wp; \
1499 unsigned i; \
1500 double t; \
1501 TMP_DECL; \
1502 \
1503 SPEED_RESTRICT_COND (s->size >= 1); \
1504 \
1505 TMP_MARK; \
1506 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1507 \
1508 speed_operand_src (s, s->xp, s->size); \
1509 speed_operand_dst (s, wp, 2*s->size); \
1510 speed_cache_fill (s); \
1511 \
1512 speed_starttime (); \
1513 i = s->reps; \
1514 do \
1515 call; \
1516 while (--i != 0); \
1517 t = speed_endtime (); \
1518 \
1519 TMP_FREE; \
1520 return t; \
1521 }
1522
1523#define SPEED_ROUTINE_MPN_SQR(function) \
1524 SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
1525
1526#define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call) \
1527 { \
1528 mp_ptr wp, tp; \
1529 unsigned i; \
1530 double t; \
1531 TMP_DECL; \
1532 \
1533 SPEED_RESTRICT_COND (s->size >= 2); \
1534 \
1535 TMP_MARK; \
1536 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp); \
1537 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \
1538 \
1539 speed_operand_src (s, s->xp, s->size); \
1540 speed_operand_src (s, tp, 2 * s->size); \
1541 speed_operand_dst (s, wp, 2 * s->size); \
1542 speed_cache_fill (s); \
1543 \
1544 speed_starttime (); \
1545 i = s->reps; \
1546 do \
1547 call; \
1548 while (--i != 0); \
1549 t = speed_endtime () / 2; \
1550 \
1551 TMP_FREE; \
1552 return t; \
1553 }
1554
1555#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \
1556 { \
1557 mp_ptr wp, tspace; \
1558 unsigned i; \
1559 double t; \
1560 TMP_DECL; \
1561 \
1562 SPEED_RESTRICT_COND (s->size >= minsize); \
1563 \
1564 TMP_MARK; \
1565 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1566 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
1567 \
1568 speed_operand_src (s, s->xp, s->size); \
1569 speed_operand_dst (s, wp, 2*s->size); \
1570 speed_operand_dst (s, tspace, tsize); \
1571 speed_cache_fill (s); \
1572 \
1573 speed_starttime (); \
1574 i = s->reps; \
1575 do \
1576 call; \
1577 while (--i != 0); \
1578 t = speed_endtime (); \
1579 \
1580 TMP_FREE; \
1581 return t; \
1582 }
1583
1584#define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \
1585 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1586 mpn_toom2_sqr_itch (s->size), \
1587 MPN_TOOM2_SQR_MINSIZE)
1588
1589#define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \
1590 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1591 mpn_toom3_sqr_itch (s->size), \
1592 MPN_TOOM3_SQR_MINSIZE)
1593
1594
1595#define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \
1596 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1597 mpn_toom4_sqr_itch (s->size), \
1598 MPN_TOOM4_SQR_MINSIZE)
1599
1600#define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \
1601 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1602 mpn_toom6_sqr_itch (s->size), \
1603 MPN_TOOM6_SQR_MINSIZE)
1604
1605#define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \
1606 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1607 mpn_toom8_sqr_itch (s->size), \
1608 MPN_TOOM8_SQR_MINSIZE)
1609
1610#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
1611 { \
1612 unsigned i; \
1613 \
1614 SPEED_RESTRICT_COND (s->size >= 0); \
1615 \
1616 speed_operand_src (s, s->xp, s->size); \
1617 speed_cache_fill (s); \
1618 \
1619 speed_starttime (); \
1620 i = s->reps; \
1621 do \
1622 call; \
1623 while (--i != 0); \
1624 \
1625 return speed_endtime (); \
1626 }
1627
1628#define SPEED_ROUTINE_MPN_MOD_1(function) \
1629 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
1630
1631#define SPEED_ROUTINE_MPN_MOD_1C(function) \
1632 SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
1633
1634#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
1635 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));
1636
1637#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
1638 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));
1639
1640#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
1641 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
1642
1643#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
1644 { \
1645 unsigned i; \
1646 mp_limb_t inv; \
1647 \
1648 SPEED_RESTRICT_COND (s->size >= 0); \
1649 SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
1650 \
1651 invert_limb (inv, s->r); \
1652 speed_operand_src (s, s->xp, s->size); \
1653 speed_cache_fill (s); \
1654 \
1655 speed_starttime (); \
1656 i = s->reps; \
1657 do \
1658 (*function) (s->xp, s->size, s->r, inv); \
1659 while (--i != 0); \
1660 \
1661 return speed_endtime (); \
1662 }
1663
1664#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \
1665 { \
1666 unsigned i; \
1667 mp_limb_t inv[4]; \
1668 \
1669 SPEED_RESTRICT_COND (s->size >= 2); \
1670 \
1671 mpn_mod_1_1p_cps (inv, s->r); \
1672 speed_operand_src (s, s->xp, s->size); \
1673 speed_cache_fill (s); \
1674 \
1675 speed_starttime (); \
1676 i = s->reps; \
1677 do { \
1678 pfunc (inv, s->r); \
1679 function (s->xp, s->size, s->r << inv[1], inv); \
1680 } while (--i != 0); \
1681 \
1682 return speed_endtime (); \
1683 }
1684#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \
1685 { \
1686 unsigned i; \
1687 mp_limb_t inv[N+3]; \
1688 \
1689 SPEED_RESTRICT_COND (s->size >= 1); \
1690 SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \
1691 \
1692 speed_operand_src (s, s->xp, s->size); \
1693 speed_cache_fill (s); \
1694 \
1695 speed_starttime (); \
1696 i = s->reps; \
1697 do { \
1698 pfunc (inv, s->r); \
1699 function (s->xp, s->size, s->r, inv); \
1700 } while (--i != 0); \
1701 \
1702 return speed_endtime (); \
1703 }
1704
1705
1706/* A division of 2*s->size by s->size limbs */
1707
1708#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
1709 { \
1710 unsigned i; \
1711 mp_ptr a, d, q, r; \
1712 double t; \
1713 gmp_pi1_t dinv; \
1714 TMP_DECL; \
1715 \
1716 SPEED_RESTRICT_COND (s->size >= 1); \
1717 \
1718 TMP_MARK; \
1719 SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
1720 SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
1721 SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
1722 SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \
1723 \
1724 MPN_COPY (a, s->xp, s->size); \
1725 MPN_COPY (a+s->size, s->xp, s->size); \
1726 \
1727 MPN_COPY (d, s->yp, s->size); \
1728 \
1729 /* normalize the data */ \
1730 d[s->size-1] |= GMP_NUMB_HIGHBIT; \
1731 a[2*s->size-1] = d[s->size-1] - 1; \
1732 \
1733 invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \
1734 \
1735 speed_operand_src (s, a, 2*s->size); \
1736 speed_operand_src (s, d, s->size); \
1737 speed_operand_dst (s, q, s->size+1); \
1738 speed_operand_dst (s, r, s->size); \
1739 speed_cache_fill (s); \
1740 \
1741 speed_starttime (); \
1742 i = s->reps; \
1743 do \
1744 call; \
1745 while (--i != 0); \
1746 t = speed_endtime (); \
1747 \
1748 TMP_FREE; \
1749 return t; \
1750 }
1751
1752
1753/* A remainder 2*s->size by s->size limbs */
1754
1755#define SPEED_ROUTINE_MPZ_MOD(function) \
1756 { \
1757 unsigned i; \
1758 mpz_t a, d, r; \
1759 \
1760 SPEED_RESTRICT_COND (s->size >= 1); \
1761 \
1762 mpz_init_set_n (d, s->yp, s->size); \
1763 \
1764 /* high part less than d, low part a duplicate copied in */ \
1765 mpz_init_set_n (a, s->xp, s->size); \
1766 mpz_mod (a, a, d); \
1767 mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \
1768 MPN_COPY (PTR(a), s->xp, s->size); \
1769 \
1770 mpz_init (r); \
1771 \
1772 speed_operand_src (s, PTR(a), SIZ(a)); \
1773 speed_operand_src (s, PTR(d), SIZ(d)); \
1774 speed_cache_fill (s); \
1775 \
1776 speed_starttime (); \
1777 i = s->reps; \
1778 do \
1779 function (r, a, d); \
1780 while (--i != 0); \
1781 return speed_endtime (); \
1782 }
1783
1784#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \
1785 { \
1786 unsigned i; \
1787 mp_ptr dp, tp, ap, qp; \
1788 gmp_pi1_t inv; \
1789 double t; \
1790 mp_size_t size1; \
1791 TMP_DECL; \
1792 \
1793 size1 = (s->r == 0 ? 2 * s->size : s->r); \
1794 \
1795 SPEED_RESTRICT_COND (s->size >= DMIN); \
1796 SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \
1797 \
1798 TMP_MARK; \
1799 SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \
1800 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1801 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
1802 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \
1803 \
1804 /* we don't fill in dividend completely when size1 > s->size */ \
1805 MPN_COPY (ap, s->xp, s->size); \
1806 MPN_COPY (ap + size1 - s->size, s->xp, s->size); \
1807 \
1808 MPN_COPY (dp, s->yp, s->size); \
1809 \
1810 /* normalize the data */ \
1811 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1812 ap[size1 - 1] = dp[s->size - 1] - 1; \
1813 \
1814 invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \
1815 \
1816 speed_operand_src (s, ap, size1); \
1817 speed_operand_dst (s, tp, size1); \
1818 speed_operand_src (s, dp, s->size); \
1819 speed_operand_dst (s, qp, size1 - s->size); \
1820 speed_cache_fill (s); \
1821 \
1822 speed_starttime (); \
1823 i = s->reps; \
1824 do { \
1825 MPN_COPY (tp, ap, size1); \
1826 function (qp, tp, size1, dp, s->size, INV); \
1827 } while (--i != 0); \
1828 t = speed_endtime (); \
1829 \
1830 TMP_FREE; \
1831 return t; \
1832 }
1833#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \
1834 { \
1835 unsigned i; \
1836 mp_ptr dp, tp, qp, scratch; \
1837 double t; \
1838 mp_size_t itch; \
1839 TMP_DECL; \
1840 \
1841 SPEED_RESTRICT_COND (s->size >= 2); \
1842 \
1843 itch = itchfn (2 * s->size, s->size, 0); \
1844 TMP_MARK; \
1845 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1846 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1847 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
1848 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1849 \
1850 MPN_COPY (tp, s->xp, s->size); \
1851 MPN_COPY (tp+s->size, s->xp, s->size); \
1852 \
1853 /* normalize the data */ \
1854 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1855 tp[2*s->size-1] = dp[s->size-1] - 1; \
1856 \
1857 speed_operand_dst (s, qp, s->size); \
1858 speed_operand_src (s, tp, 2 * s->size); \
1859 speed_operand_src (s, dp, s->size); \
1860 speed_operand_dst (s, scratch, itch); \
1861 speed_cache_fill (s); \
1862 \
1863 speed_starttime (); \
1864 i = s->reps; \
1865 do { \
1866 function (qp, tp, 2 * s->size, dp, s->size, scratch); \
1867 } while (--i != 0); \
1868 t = speed_endtime (); \
1869 \
1870 TMP_FREE; \
1871 return t; \
1872 }
1873#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \
1874 { \
1875 unsigned i; \
1876 mp_ptr dp, tp, qp, rp, scratch; \
1877 double t; \
1878 mp_size_t size1, itch; \
1879 TMP_DECL; \
1880 \
1881 size1 = (s->r == 0 ? 2 * s->size : s->r); \
1882 \
1883 SPEED_RESTRICT_COND (s->size >= 2); \
1884 SPEED_RESTRICT_COND (size1 >= s->size); \
1885 \
1886 itch = itchfn (size1, s->size, 0); \
1887 TMP_MARK; \
1888 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1889 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
1890 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
1891 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1892 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1893 \
1894 /* we don't fill in dividend completely when size1 > s->size */ \
1895 MPN_COPY (tp, s->xp, s->size); \
1896 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
1897 \
1898 MPN_COPY (dp, s->yp, s->size); \
1899 \
1900 /* normalize the data */ \
1901 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1902 tp[size1 - 1] = dp[s->size - 1] - 1; \
1903 \
1904 speed_operand_dst (s, qp, size1 - s->size); \
1905 speed_operand_dst (s, rp, s->size); \
1906 speed_operand_src (s, tp, size1); \
1907 speed_operand_src (s, dp, s->size); \
1908 speed_operand_dst (s, scratch, itch); \
1909 speed_cache_fill (s); \
1910 \
1911 speed_starttime (); \
1912 i = s->reps; \
1913 do { \
1914 function (qp, rp, tp, size1, dp, s->size, scratch); \
1915 } while (--i != 0); \
1916 t = speed_endtime (); \
1917 \
1918 TMP_FREE; \
1919 return t; \
1920 }
1921#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \
1922 { \
1923 unsigned i; \
1924 mp_ptr dp, tp, qp, rp, ip, scratch, tmp; \
1925 double t; \
1926 mp_size_t size1, itch; \
1927 TMP_DECL; \
1928 \
1929 size1 = (s->r == 0 ? 2 * s->size : s->r); \
1930 \
1931 SPEED_RESTRICT_COND (s->size >= 2); \
1932 SPEED_RESTRICT_COND (size1 >= s->size); \
1933 \
1934 itch = itchfn (size1, s->size, s->size); \
1935 TMP_MARK; \
1936 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1937 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
1938 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
1939 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1940 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1941 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */ \
1942 \
1943 /* we don't fill in dividend completely when size1 > s->size */ \
1944 MPN_COPY (tp, s->xp, s->size); \
1945 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
1946 \
1947 MPN_COPY (dp, s->yp, s->size); \
1948 \
1949 /* normalize the data */ \
1950 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1951 tp[size1 - 1] = dp[s->size-1] - 1; \
1952 \
1953 tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \
1954 mpn_invert (ip, dp, s->size, tmp); \
1955 \
1956 speed_operand_dst (s, qp, size1 - s->size); \
1957 speed_operand_dst (s, rp, s->size); \
1958 speed_operand_src (s, tp, size1); \
1959 speed_operand_src (s, dp, s->size); \
1960 speed_operand_src (s, ip, s->size); \
1961 speed_operand_dst (s, scratch, itch); \
1962 speed_cache_fill (s); \
1963 \
1964 speed_starttime (); \
1965 i = s->reps; \
1966 do { \
1967 function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \
1968 } while (--i != 0); \
1969 t = speed_endtime (); \
1970 \
1971 TMP_FREE; \
1972 return t; \
1973 }
1974
1975#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \
1976 { \
1977 unsigned i; \
1978 mp_ptr dp, tp, ap, qp; \
1979 mp_limb_t inv; \
1980 double t; \
1981 TMP_DECL; \
1982 \
1983 SPEED_RESTRICT_COND (s->size >= 1); \
1984 \
1985 TMP_MARK; \
1986 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \
1987 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1988 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1989 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \
1990 \
1991 MPN_COPY (ap, s->xp, s->size); \
1992 MPN_COPY (ap+s->size, s->xp, s->size); \
1993 \
1994 /* divisor must be odd */ \
1995 MPN_COPY (dp, s->yp, s->size); \
1996 dp[0] |= 1; \
1997 binvert_limb (inv, dp[0]); \
1998 inv = -inv; \
1999 \
2000 speed_operand_src (s, ap, 2*s->size); \
2001 speed_operand_dst (s, tp, 2*s->size); \
2002 speed_operand_src (s, dp, s->size); \
2003 speed_operand_dst (s, qp, s->size); \
2004 speed_cache_fill (s); \
2005 \
2006 speed_starttime (); \
2007 i = s->reps; \
2008 do { \
2009 MPN_COPY (tp, ap, 2*s->size); \
2010 function (qp, tp, 2*s->size, dp, s->size, inv); \
2011 } while (--i != 0); \
2012 t = speed_endtime (); \
2013 \
2014 TMP_FREE; \
2015 return t; \
2016 }
2017#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \
2018 { \
2019 unsigned i; \
2020 mp_ptr dp, tp, qp; \
2021 mp_limb_t inv; \
2022 double t; \
2023 TMP_DECL; \
2024 \
2025 SPEED_RESTRICT_COND (s->size >= 1); \
2026 \
2027 TMP_MARK; \
2028 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
2029 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
2030 SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \
2031 \
2032 /* divisor must be odd */ \
2033 MPN_COPY (dp, s->yp, s->size); \
2034 dp[0] |= 1; \
2035 binvert_limb (inv, dp[0]); \
2036 inv = -inv; \
2037 \
2038 speed_operand_src (s, s->xp, s->size); \
2039 speed_operand_dst (s, tp, s->size); \
2040 speed_operand_src (s, dp, s->size); \
2041 speed_operand_dst (s, qp, s->size); \
2042 speed_cache_fill (s); \
2043 \
2044 speed_starttime (); \
2045 i = s->reps; \
2046 do { \
2047 MPN_COPY (tp, s->xp, s->size); \
2048 function (qp, tp, s->size, dp, s->size, inv); \
2049 } while (--i != 0); \
2050 t = speed_endtime (); \
2051 \
2052 TMP_FREE; \
2053 return t; \
2054 }
2055#define SPEED_ROUTINE_MPN_PI1_BDIV_R(function) \
2056 { \
2057 unsigned i; \
2058 mp_ptr dp, tp, ap; \
2059 mp_limb_t inv; \
2060 double t; \
2061 TMP_DECL; \
2062 \
2063 SPEED_RESTRICT_COND (s->size >= 1); \
2064 \
2065 TMP_MARK; \
2066 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \
2067 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
2068 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \
2069 \
2070 MPN_COPY (ap, s->xp, s->size); \
2071 MPN_COPY (ap+s->size, s->xp, s->size); \
2072 \
2073 /* divisor must be odd */ \
2074 MPN_COPY (dp, s->yp, s->size); \
2075 dp[0] |= 1; \
2076 binvert_limb (inv, dp[0]); \
2077 inv = -inv; \
2078 \
2079 speed_operand_src (s, ap, 2*s->size); \
2080 speed_operand_dst (s, tp, 2*s->size); \
2081 speed_operand_src (s, dp, s->size); \
2082 speed_cache_fill (s); \
2083 \
2084 speed_starttime (); \
2085 i = s->reps; \
2086 do { \
2087 MPN_COPY (tp, ap, 2*s->size); \
2088 function (tp, 2*s->size, dp, s->size, inv); \
2089 } while (--i != 0); \
2090 t = speed_endtime (); \
2091 \
2092 TMP_FREE; \
2093 return t; \
2094 }
2095#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \
2096 { \
2097 unsigned i; \
2098 mp_ptr dp, qp, scratch; \
2099 double t; \
2100 mp_size_t itch; \
2101 TMP_DECL; \
2102 \
2103 SPEED_RESTRICT_COND (s->size >= 2); \
2104 \
2105 itch = itchfn (s->size, s->size); \
2106 TMP_MARK; \
2107 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
2108 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
2109 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
2110 \
2111 /* divisor must be odd */ \
2112 MPN_COPY (dp, s->yp, s->size); \
2113 dp[0] |= 1; \
2114 \
2115 speed_operand_dst (s, qp, s->size); \
2116 speed_operand_src (s, s->xp, s->size); \
2117 speed_operand_src (s, dp, s->size); \
2118 speed_operand_dst (s, scratch, itch); \
2119 speed_cache_fill (s); \
2120 \
2121 speed_starttime (); \
2122 i = s->reps; \
2123 do { \
2124 function (qp, s->xp, s->size, dp, s->size, scratch); \
2125 } while (--i != 0); \
2126 t = speed_endtime (); \
2127 \
2128 TMP_FREE; \
2129 return t; \
2130 }
2131#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \
2132 { \
2133 unsigned i; \
2134 mp_ptr dp, tp, qp, rp, scratch; \
2135 double t; \
2136 mp_size_t itch; \
2137 TMP_DECL; \
2138 \
2139 SPEED_RESTRICT_COND (s->size >= 2); \
2140 \
2141 itch = itchfn (2 * s->size, s->size); \
2142 TMP_MARK; \
2143 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
2144 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
2145 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
2146 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
2147 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
2148 \
2149 MPN_COPY (tp, s->xp, s->size); \
2150 MPN_COPY (tp+s->size, s->xp, s->size); \
2151 \
2152 /* divisor must be odd */ \
2153 MPN_COPY (dp, s->yp, s->size); \
2154 dp[0] |= 1; \
2155 \
2156 speed_operand_dst (s, qp, s->size); \
2157 speed_operand_dst (s, rp, s->size); \
2158 speed_operand_src (s, tp, 2 * s->size); \
2159 speed_operand_src (s, dp, s->size); \
2160 speed_operand_dst (s, scratch, itch); \
2161 speed_cache_fill (s); \
2162 \
2163 speed_starttime (); \
2164 i = s->reps; \
2165 do { \
2166 function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \
2167 } while (--i != 0); \
2168 t = speed_endtime (); \
2169 \
2170 TMP_FREE; \
2171 return t; \
2172 }
2173
2174#define SPEED_ROUTINE_MPN_BROOT(function) \
2175 { \
2176 SPEED_RESTRICT_COND (s->r & 1); \
2177 s->xp[0] |= 1; \
2178 SPEED_ROUTINE_MPN_UNARY_1_CALL \
2179 ((*function) (wp, s->xp, s->size, s->r)); \
2180 }
2181
2182#define SPEED_ROUTINE_MPN_BROOTINV(function, itch) \
2183 { \
2184 mp_ptr wp, tp; \
2185 unsigned i; \
2186 double t; \
2187 TMP_DECL; \
2188 TMP_MARK; \
2189 SPEED_RESTRICT_COND (s->size >= 1); \
2190 SPEED_RESTRICT_COND (s->r & 1); \
2191 wp = TMP_ALLOC_LIMBS (s->size); \
2192 tp = TMP_ALLOC_LIMBS ( (itch)); \
2193 s->xp[0] |= 1; \
2194 \
2195 speed_operand_src (s, s->xp, s->size); \
2196 speed_operand_dst (s, wp, s->size); \
2197 speed_cache_fill (s); \
2198 \
2199 speed_starttime (); \
2200 i = s->reps; \
2201 do \
2202 (*function) (wp, s->xp, s->size, s->r, tp); \
2203 while (--i != 0); \
2204 t = speed_endtime (); \
2205 \
2206 TMP_FREE; \
2207 return t; \
2208 }
2209
2210#define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \
2211 { \
2212 long i; \
2213 mp_ptr up, tp, ip; \
2214 double t; \
2215 TMP_DECL; \
2216 \
2217 SPEED_RESTRICT_COND (s->size >= 1); \
2218 \
2219 TMP_MARK; \
2220 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
2221 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
2222 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
2223 \
2224 MPN_COPY (up, s->xp, s->size); \
2225 \
2226 /* normalize the data */ \
2227 up[s->size-1] |= GMP_NUMB_HIGHBIT; \
2228 \
2229 speed_operand_src (s, up, s->size); \
2230 speed_operand_dst (s, tp, s->size); \
2231 speed_operand_dst (s, ip, s->size); \
2232 speed_cache_fill (s); \
2233 \
2234 speed_starttime (); \
2235 i = s->reps; \
2236 do \
2237 function (ip, up, s->size, tp); \
2238 while (--i != 0); \
2239 t = speed_endtime (); \
2240 \
2241 TMP_FREE; \
2242 return t; \
2243 }
2244
2245#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \
2246 { \
2247 long i; \
2248 mp_ptr up, tp, ip; \
2249 double t; \
2250 TMP_DECL; \
2251 \
2252 SPEED_RESTRICT_COND (s->size >= 1); \
2253 \
2254 TMP_MARK; \
2255 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
2256 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
2257 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
2258 \
2259 MPN_COPY (up, s->xp, s->size); \
2260 \
2261 /* normalize the data */ \
2262 up[s->size-1] |= GMP_NUMB_HIGHBIT; \
2263 \
2264 speed_operand_src (s, up, s->size); \
2265 speed_operand_dst (s, tp, s->size); \
2266 speed_operand_dst (s, ip, s->size); \
2267 speed_cache_fill (s); \
2268 \
2269 speed_starttime (); \
2270 i = s->reps; \
2271 do \
2272 function (ip, up, s->size, tp); \
2273 while (--i != 0); \
2274 t = speed_endtime (); \
2275 \
2276 TMP_FREE; \
2277 return t; \
2278 }
2279
2280#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \
2281 { \
2282 long i; \
2283 mp_ptr up, tp, ip; \
2284 double t; \
2285 TMP_DECL; \
2286 \
2287 SPEED_RESTRICT_COND (s->size >= 3); \
2288 \
2289 TMP_MARK; \
2290 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
2291 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
2292 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
2293 \
2294 MPN_COPY (up, s->xp, s->size); \
2295 \
2296 /* normalize the data */ \
2297 up[s->size-1] |= GMP_NUMB_HIGHBIT; \
2298 \
2299 speed_operand_src (s, up, s->size); \
2300 speed_operand_dst (s, tp, s->size); \
2301 speed_operand_dst (s, ip, s->size); \
2302 speed_cache_fill (s); \
2303 \
2304 speed_starttime (); \
2305 i = s->reps; \
2306 do \
2307 function (ip, up, s->size, tp); \
2308 while (--i != 0); \
2309 t = speed_endtime (); \
2310 \
2311 TMP_FREE; \
2312 return t; \
2313 }
2314
2315#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \
2316 { \
2317 long i; \
2318 mp_ptr up, tp, ip; \
2319 double t; \
2320 TMP_DECL; \
2321 \
2322 SPEED_RESTRICT_COND (s->size >= 1); \
2323 \
2324 TMP_MARK; \
2325 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
2326 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
2327 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
2328 \
2329 MPN_COPY (up, s->xp, s->size); \
2330 \
2331 /* normalize the data */ \
2332 up[0] |= 1; \
2333 \
2334 speed_operand_src (s, up, s->size); \
2335 speed_operand_dst (s, tp, s->size); \
2336 speed_operand_dst (s, ip, s->size); \
2337 speed_cache_fill (s); \
2338 \
2339 speed_starttime (); \
2340 i = s->reps; \
2341 do \
2342 function (ip, up, s->size, tp); \
2343 while (--i != 0); \
2344 t = speed_endtime (); \
2345 \
2346 TMP_FREE; \
2347 return t; \
2348 }
2349
2350#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn) \
2351 { \
2352 long i; \
2353 mp_ptr up, mp, tp, ip; \
2354 double t; \
2355 TMP_DECL; \
2356 \
2357 SPEED_RESTRICT_COND (s->size >= 1); \
2358 \
2359 TMP_MARK; \
2360 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
2361 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
2362 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
2363 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
2364 \
2365 speed_operand_src (s, up, s->size); \
2366 speed_operand_dst (s, tp, s->size); \
2367 speed_operand_dst (s, ip, s->size); \
2368 speed_cache_fill (s); \
2369 \
2370 MPN_COPY (mp, s->yp, s->size); \
2371 /* Must be odd */ \
2372 mp[0] |= 1; \
2373 speed_starttime (); \
2374 i = s->reps; \
2375 do \
2376 { \
2377 MPN_COPY (up, s->xp, s->size); \
2378 function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp); \
2379 } \
2380 while (--i != 0); \
2381 t = speed_endtime (); \
2382 \
2383 TMP_FREE; \
2384 return t; \
2385 }
2386
2387#define SPEED_ROUTINE_REDC_1(function) \
2388 { \
2389 unsigned i; \
2390 mp_ptr cp, mp, tp, ap; \
2391 mp_limb_t inv; \
2392 double t; \
2393 TMP_DECL; \
2394 \
2395 SPEED_RESTRICT_COND (s->size >= 1); \
2396 \
2397 TMP_MARK; \
2398 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
2399 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
2400 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
2401 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
2402 \
2403 MPN_COPY (ap, s->xp, s->size); \
2404 MPN_COPY (ap+s->size, s->xp, s->size); \
2405 \
2406 /* modulus must be odd */ \
2407 MPN_COPY (mp, s->yp, s->size); \
2408 mp[0] |= 1; \
2409 binvert_limb (inv, mp[0]); \
2410 inv = -inv; \
2411 \
2412 speed_operand_src (s, ap, 2*s->size+1); \
2413 speed_operand_dst (s, tp, 2*s->size+1); \
2414 speed_operand_src (s, mp, s->size); \
2415 speed_operand_dst (s, cp, s->size); \
2416 speed_cache_fill (s); \
2417 \
2418 speed_starttime (); \
2419 i = s->reps; \
2420 do { \
2421 MPN_COPY (tp, ap, 2*s->size); \
2422 function (cp, tp, mp, s->size, inv); \
2423 } while (--i != 0); \
2424 t = speed_endtime (); \
2425 \
2426 TMP_FREE; \
2427 return t; \
2428 }
2429#define SPEED_ROUTINE_REDC_2(function) \
2430 { \
2431 unsigned i; \
2432 mp_ptr cp, mp, tp, ap; \
2433 mp_limb_t invp[2]; \
2434 double t; \
2435 TMP_DECL; \
2436 \
2437 SPEED_RESTRICT_COND (s->size >= 1); \
2438 \
2439 TMP_MARK; \
2440 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
2441 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
2442 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
2443 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
2444 \
2445 MPN_COPY (ap, s->xp, s->size); \
2446 MPN_COPY (ap+s->size, s->xp, s->size); \
2447 \
2448 /* modulus must be odd */ \
2449 MPN_COPY (mp, s->yp, s->size); \
2450 mp[0] |= 1; \
2451 mpn_binvert (invp, mp, 2, tp); \
2452 invp[0] = -invp[0]; invp[1] = ~invp[1]; \
2453 \
2454 speed_operand_src (s, ap, 2*s->size+1); \
2455 speed_operand_dst (s, tp, 2*s->size+1); \
2456 speed_operand_src (s, mp, s->size); \
2457 speed_operand_dst (s, cp, s->size); \
2458 speed_cache_fill (s); \
2459 \
2460 speed_starttime (); \
2461 i = s->reps; \
2462 do { \
2463 MPN_COPY (tp, ap, 2*s->size); \
2464 function (cp, tp, mp, s->size, invp); \
2465 } while (--i != 0); \
2466 t = speed_endtime (); \
2467 \
2468 TMP_FREE; \
2469 return t; \
2470 }
2471#define SPEED_ROUTINE_REDC_N(function) \
2472 { \
2473 unsigned i; \
2474 mp_ptr cp, mp, tp, ap, invp; \
2475 double t; \
2476 TMP_DECL; \
2477 \
2478 SPEED_RESTRICT_COND (s->size > 8); \
2479 \
2480 TMP_MARK; \
2481 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
2482 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
2483 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
2484 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
2485 SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? */ \
2486 \
2487 MPN_COPY (ap, s->xp, s->size); \
2488 MPN_COPY (ap+s->size, s->xp, s->size); \
2489 \
2490 /* modulus must be odd */ \
2491 MPN_COPY (mp, s->yp, s->size); \
2492 mp[0] |= 1; \
2493 mpn_binvert (invp, mp, s->size, tp); \
2494 \
2495 speed_operand_src (s, ap, 2*s->size+1); \
2496 speed_operand_dst (s, tp, 2*s->size+1); \
2497 speed_operand_src (s, mp, s->size); \
2498 speed_operand_dst (s, cp, s->size); \
2499 speed_cache_fill (s); \
2500 \
2501 speed_starttime (); \
2502 i = s->reps; \
2503 do { \
2504 MPN_COPY (tp, ap, 2*s->size); \
2505 function (cp, tp, mp, s->size, invp); \
2506 } while (--i != 0); \
2507 t = speed_endtime (); \
2508 \
2509 TMP_FREE; \
2510 return t; \
2511 }
2512
2513
2514#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
2515 { \
2516 unsigned i; \
2517 \
2518 SPEED_RESTRICT_COND (s->size >= 1); \
2519 \
2520 speed_operand_src (s, s->xp, s->size); \
2521 speed_cache_fill (s); \
2522 \
2523 speed_starttime (); \
2524 i = s->reps; \
2525 do \
2526 function (s->xp, s->size); \
2527 while (--i != 0); \
2528 \
2529 return speed_endtime (); \
2530 }
2531
2532#define SPEED_ROUTINE_MPN_HAMDIST(function) \
2533 { \
2534 unsigned i; \
2535 \
2536 SPEED_RESTRICT_COND (s->size >= 1); \
2537 \
2538 speed_operand_src (s, s->xp, s->size); \
2539 speed_operand_src (s, s->yp, s->size); \
2540 speed_cache_fill (s); \
2541 \
2542 speed_starttime (); \
2543 i = s->reps; \
2544 do \
2545 function (s->xp, s->yp, s->size); \
2546 while (--i != 0); \
2547 \
2548 return speed_endtime (); \
2549 }
2550
2551
2552#define SPEED_ROUTINE_MPZ_UI(function) \
2553 { \
2554 mpz_t z; \
2555 unsigned i; \
2556 double t; \
2557 \
2558 SPEED_RESTRICT_COND (s->size >= 0); \
2559 \
2560 mpz_init (z); \
2561 \
2562 speed_starttime (); \
2563 i = s->reps; \
2564 do \
2565 function (z, s->size); \
2566 while (--i != 0); \
2567 t = speed_endtime (); \
2568 \
2569 mpz_clear (z); \
2570 return t; \
2571 }
2572
2573#define SPEED_ROUTINE_MPZ_FAC_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2574#define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2575#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2576
2577
2578#define SPEED_ROUTINE_MPZ_2_UI(function) \
2579 { \
2580 mpz_t z, z2; \
2581 unsigned i; \
2582 double t; \
2583 \
2584 SPEED_RESTRICT_COND (s->size >= 0); \
2585 \
2586 mpz_init (z); \
2587 mpz_init (z2); \
2588 \
2589 speed_starttime (); \
2590 i = s->reps; \
2591 do \
2592 function (z, z2, s->size); \
2593 while (--i != 0); \
2594 t = speed_endtime (); \
2595 \
2596 mpz_clear (z); \
2597 mpz_clear (z2); \
2598 return t; \
2599 }
2600
2601#define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
2602#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
2603
2604
2605#define SPEED_ROUTINE_MPN_FIB2_UI(function) \
2606 { \
2607 mp_ptr fp, f1p; \
2608 mp_size_t alloc; \
2609 unsigned i; \
2610 double t; \
2611 TMP_DECL; \
2612 \
2613 SPEED_RESTRICT_COND (s->size >= 0); \
2614 \
2615 TMP_MARK; \
2616 alloc = MPN_FIB2_SIZE (s->size); \
2617 SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \
2618 SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \
2619 \
2620 speed_starttime (); \
2621 i = s->reps; \
2622 do \
2623 function (fp, f1p, s->size); \
2624 while (--i != 0); \
2625 t = speed_endtime (); \
2626 \
2627 TMP_FREE; \
2628 return t; \
2629 }
2630
2631
2632
2633/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
2634 limbs. m is forced to odd so that redc can be used. e is limited in
2635 size so the calculation doesn't take too long. */
2636#define SPEED_ROUTINE_MPZ_POWM(function) \
2637 { \
2638 mpz_t r, b, e, m; \
2639 unsigned i; \
2640 double t; \
2641 \
2642 SPEED_RESTRICT_COND (s->size >= 1); \
2643 \
2644 mpz_init (r); \
2645 if (s->r < 2) \
2646 mpz_init_set_n (b, s->xp, s->size); \
2647 else \
2648 mpz_init_set_ui (b, s->r); \
2649 mpz_init_set_n (m, s->yp, s->size); \
2650 mpz_setbit (m, 0); /* force m to odd */ \
2651 mpz_init_set_n (e, s->xp_block, 6); \
2652 \
2653 speed_starttime (); \
2654 i = s->reps; \
2655 do \
2656 function (r, b, e, m); \
2657 while (--i != 0); \
2658 t = speed_endtime (); \
2659 \
2660 mpz_clear (r); \
2661 mpz_clear (b); \
2662 mpz_clear (e); \
2663 mpz_clear (m); \
2664 return t; \
2665 }
2666
2667/* (m-2)^0xAAAAAAAA mod m */
2668#define SPEED_ROUTINE_MPZ_POWM_UI(function) \
2669 { \
2670 mpz_t r, b, m; \
2671 unsigned long e; \
2672 unsigned i; \
2673 double t; \
2674 \
2675 SPEED_RESTRICT_COND (s->size >= 1); \
2676 \
2677 mpz_init (r); \
2678 \
2679 /* force m to odd */ \
2680 mpz_init (m); \
2681 mpz_set_n (m, s->xp, s->size); \
2682 PTR(m)[0] |= 1; \
2683 \
2684 e = (~ (unsigned long) 0) / 3; \
2685 if (s->r != 0) \
2686 e = s->r; \
2687 \
2688 mpz_init_set (b, m); \
2689 mpz_sub_ui (b, b, 2); \
2690/* printf ("%X\n", mpz_get_ui(m)); */ \
2691 i = s->reps; \
2692 speed_starttime (); \
2693 do \
2694 function (r, b, e, m); \
2695 while (--i != 0); \
2696 t = speed_endtime (); \
2697 \
2698 mpz_clear (r); \
2699 mpz_clear (b); \
2700 mpz_clear (m); \
2701 return t; \
2702 }
2703
2704
2705#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
2706 { \
2707 mp_ptr wp, wp2, xp, yp; \
2708 unsigned i; \
2709 double t; \
2710 TMP_DECL; \
2711 \
2712 SPEED_RESTRICT_COND (s->size >= 0); \
2713 \
2714 TMP_MARK; \
2715 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
2716 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
2717 xp = s->xp; \
2718 yp = s->yp; \
2719 \
2720 if (s->r == 0) ; \
2721 else if (s->r == 1) { xp = wp; } \
2722 else if (s->r == 2) { yp = wp2; } \
2723 else if (s->r == 3) { xp = wp; yp = wp2; } \
2724 else if (s->r == 4) { xp = wp2; yp = wp; } \
2725 else { \
2726 TMP_FREE; \
2727 return -1.0; \
2728 } \
2729 if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \
2730 if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \
2731 \
2732 speed_operand_src (s, xp, s->size); \
2733 speed_operand_src (s, yp, s->size); \
2734 speed_operand_dst (s, wp, s->size); \
2735 speed_operand_dst (s, wp2, s->size); \
2736 speed_cache_fill (s); \
2737 \
2738 speed_starttime (); \
2739 i = s->reps; \
2740 do \
2741 call; \
2742 while (--i != 0); \
2743 t = speed_endtime (); \
2744 \
2745 TMP_FREE; \
2746 return t; \
2747 }
2748
2749#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
2750 SPEED_ROUTINE_MPN_ADDSUB_CALL \
2751 (function (wp, wp2, xp, yp, s->size));
2752
2753#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
2754 SPEED_ROUTINE_MPN_ADDSUB_CALL \
2755 (function (wp, wp2, xp, yp, s->size, 0));
2756
2757
2758/* Doing an Nx1 gcd with the given r. */
2759#define SPEED_ROUTINE_MPN_GCD_1N(function) \
2760 { \
2761 mp_ptr xp; \
2762 unsigned i; \
2763 double t; \
2764 TMP_DECL; \
2765 \
2766 SPEED_RESTRICT_COND (s->size >= 1); \
2767 SPEED_RESTRICT_COND (s->r != 0); \
2768 \
2769 TMP_MARK; \
2770 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
2771 MPN_COPY (xp, s->xp, s->size); \
2772 xp[0] |= refmpn_zero_p (xp, s->size); \
2773 \
2774 speed_operand_src (s, s->xp, s->size); \
2775 speed_cache_fill (s); \
2776 \
2777 speed_starttime (); \
2778 i = s->reps; \
2779 do \
2780 function (xp, s->size, s->r); \
2781 while (--i != 0); \
2782 t = speed_endtime (); \
2783 \
2784 TMP_FREE; \
2785 return t; \
2786 }
2787
2788
2789/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. */
2790
2791#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
2792 { \
2793 unsigned i, j; \
2794 mp_ptr px, py; \
2795 mp_limb_t x_mask, y_mask; \
2796 double t; \
2797 TMP_DECL; \
2798 \
2799 SPEED_RESTRICT_COND (s->size >= 1); \
2800 SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
2801 \
2802 TMP_MARK; \
2803 SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \
2804 SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \
2805 MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
2806 MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
2807 \
2808 x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
2809 y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
2810 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
2811 { \
2812 px[i] &= x_mask; px[i] += (px[i] == 0); \
2813 py[i] &= y_mask; py[i] += (py[i] == 0); \
2814 setup; \
2815 } \
2816 \
2817 speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
2818 speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
2819 speed_cache_fill (s); \
2820 \
2821 speed_starttime (); \
2822 i = s->reps; \
2823 do \
2824 { \
2825 j = SPEED_BLOCK_SIZE; \
2826 do \
2827 { \
2828 call; \
2829 } \
2830 while (--j != 0); \
2831 } \
2832 while (--i != 0); \
2833 t = speed_endtime (); \
2834 \
2835 TMP_FREE; \
2836 \
2837 s->time_divisor = SPEED_BLOCK_SIZE; \
2838 return t; \
2839 }
2840
2841#define SPEED_ROUTINE_MPN_GCD_1(function) \
2842 SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
2843
2844#define SPEED_ROUTINE_MPN_GCD_11(function) \
2845 SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1), \
2846 function (px[j-1], py[j-1]))
2847
2848/* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so
2849 we can measure gcd_22 loop only, without gcd_11. */
2850#define SPEED_ROUTINE_MPN_GCD_22(function) \
2851 SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1), \
2852 function (px[j-1], px[j-1], py[j-1], py[j-1]))
2853
2854#define SPEED_ROUTINE_MPN_JACBASE(function) \
2855 SPEED_ROUTINE_MPN_GCD_1_CALL \
2856 ({ \
2857 /* require x<y, y odd, y!=1 */ \
2858 px[i] %= py[i]; \
2859 px[i] |= 1; \
2860 py[i] |= 1; \
2861 if (py[i]==1) py[i]=3; \
2862 }, \
2863 function (px[j-1], py[j-1], 0))
2864
2865#define SPEED_ROUTINE_MPN_HGCD2(function) \
2866 { \
2867 unsigned i, j; \
2868 struct hgcd_matrix1 m = {{{0,0},{0,0}}}; \
2869 double t; \
2870 \
2871 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
2872 speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE); \
2873 speed_cache_fill (s); \
2874 \
2875 speed_starttime (); \
2876 i = s->reps; \
2877 mp_limb_t chain = 0; \
2878 do \
2879 { \
2880 for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2) \
2881 { \
2882 /* randomized but successively dependent */ \
2883 function (s->xp_block[j] | GMP_NUMB_HIGHBIT, \
2884 s->xp_block[j+1] + chain, \
2885 s->yp_block[j] | GMP_NUMB_HIGHBIT, \
2886 s->yp_block[j+1], &m); \
2887 chain += m.u[0][0]; \
2888 } \
2889 } \
2890 while (--i != 0); \
2891 t = speed_endtime (); \
2892 \
2893 /* make sure the compiler won't optimize away chain */ \
2894 noop_1 (chain); \
2895 \
2896 s->time_divisor = SPEED_BLOCK_SIZE / 2; \
2897 return t; \
2898 }
2899
2900#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc) \
2901 { \
2902 mp_size_t hgcd_init_itch, hgcd_itch; \
2903 mp_ptr ap, bp, wp, tmp1; \
2904 struct hgcd_matrix hgcd; \
2905 int res; \
2906 unsigned i; \
2907 double t; \
2908 TMP_DECL; \
2909 \
2910 if (s->size < 2) \
2911 return -1; \
2912 \
2913 TMP_MARK; \
2914 \
2915 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \
2916 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \
2917 \
2918 s->xp[s->size - 1] |= 1; \
2919 s->yp[s->size - 1] |= 1; \
2920 \
2921 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \
2922 hgcd_itch = itchfunc (s->size); \
2923 \
2924 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \
2925 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp); \
2926 \
2927 speed_operand_src (s, s->xp, s->size); \
2928 speed_operand_src (s, s->yp, s->size); \
2929 speed_operand_dst (s, ap, s->size + 1); \
2930 speed_operand_dst (s, bp, s->size + 1); \
2931 speed_operand_dst (s, wp, hgcd_itch); \
2932 speed_operand_dst (s, tmp1, hgcd_init_itch); \
2933 speed_cache_fill (s); \
2934 \
2935 speed_starttime (); \
2936 i = s->reps; \
2937 do \
2938 { \
2939 MPN_COPY (ap, s->xp, s->size); \
2940 MPN_COPY (bp, s->yp, s->size); \
2941 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \
2942 res = func (ap, bp, s->size, &hgcd, wp); \
2943 } \
2944 while (--i != 0); \
2945 t = speed_endtime (); \
2946 TMP_FREE; \
2947 return t; \
2948 }
2949
2950#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \
2951 { \
2952 mp_size_t hgcd_init_itch, hgcd_step_itch; \
2953 mp_ptr ap, bp, wp, tmp1; \
2954 struct hgcd_matrix hgcd; \
2955 mp_size_t p = s->size/2; \
2956 int res; \
2957 unsigned i; \
2958 double t; \
2959 TMP_DECL; \
2960 \
2961 if (s->size < 2) \
2962 return -1; \
2963 \
2964 TMP_MARK; \
2965 \
2966 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \
2967 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \
2968 \
2969 s->xp[s->size - 1] |= 1; \
2970 s->yp[s->size - 1] |= 1; \
2971 \
2972 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \
2973 hgcd_step_itch = itchfunc (s->size, p); \
2974 \
2975 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \
2976 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \
2977 \
2978 speed_operand_src (s, s->xp, s->size); \
2979 speed_operand_src (s, s->yp, s->size); \
2980 speed_operand_dst (s, ap, s->size + 1); \
2981 speed_operand_dst (s, bp, s->size + 1); \
2982 speed_operand_dst (s, wp, hgcd_step_itch); \
2983 speed_operand_dst (s, tmp1, hgcd_init_itch); \
2984 speed_cache_fill (s); \
2985 \
2986 speed_starttime (); \
2987 i = s->reps; \
2988 do \
2989 { \
2990 MPN_COPY (ap, s->xp, s->size); \
2991 MPN_COPY (bp, s->yp, s->size); \
2992 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \
2993 res = func (&hgcd, ap, bp, s->size, p, wp); \
2994 } \
2995 while (--i != 0); \
2996 t = speed_endtime (); \
2997 TMP_FREE; \
2998 return t; \
2999 }
3000
3001/* Run some GCDs of s->size limbs each. The number of different data values
3002 is decreased as s->size**2, since GCD is a quadratic algorithm.
3003 SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
3004 though, because the plain gcd is about twice as fast as gcdext. */
3005
3006#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
3007 { \
3008 unsigned i; \
3009 mp_size_t j, pieces, psize; \
3010 mp_ptr wp, wp2, xtmp, ytmp, px, py; \
3011 double t; \
3012 TMP_DECL; \
3013 \
3014 SPEED_RESTRICT_COND (s->size >= 1); \
3015 \
3016 TMP_MARK; \
3017 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
3018 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
3019 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \
3020 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \
3021 \
3022 pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
3023 pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
3024 pieces = MAX (pieces, 1); \
3025 \
3026 psize = pieces * s->size; \
3027 px = TMP_ALLOC_LIMBS (psize); \
3028 py = TMP_ALLOC_LIMBS (psize); \
3029 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
3030 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
3031 \
3032 /* Requirements: x >= y, y must be odd, high limbs != 0. \
3033 No need to ensure random numbers are really great. */ \
3034 for (j = 0; j < pieces; j++) \
3035 { \
3036 mp_ptr x = px + j * s->size; \
3037 mp_ptr y = py + j * s->size; \
3038 if (x[s->size - 1] == 0) x[s->size - 1] = 1; \
3039 if (y[s->size - 1] == 0) y[s->size - 1] = 1; \
3040 \
3041 if (x[s->size - 1] < y[s->size - 1]) \
3042 MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \
3043 else if (x[s->size - 1] == y[s->size - 1]) \
3044 { \
3045 x[s->size - 1] = 2; \
3046 y[s->size - 1] = 1; \
3047 } \
3048 y[0] |= 1; \
3049 } \
3050 \
3051 speed_operand_src (s, px, psize); \
3052 speed_operand_src (s, py, psize); \
3053 speed_operand_dst (s, xtmp, s->size); \
3054 speed_operand_dst (s, ytmp, s->size); \
3055 speed_operand_dst (s, wp, s->size); \
3056 speed_cache_fill (s); \
3057 \
3058 speed_starttime (); \
3059 i = s->reps; \
3060 do \
3061 { \
3062 j = pieces; \
3063 do \
3064 { \
3065 MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \
3066 MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \
3067 call; \
3068 } \
3069 while (--j != 0); \
3070 } \
3071 while (--i != 0); \
3072 t = speed_endtime (); \
3073 \
3074 TMP_FREE; \
3075 \
3076 s->time_divisor = pieces; \
3077 return t; \
3078 }
3079
3080#define SPEED_ROUTINE_MPN_GCD(function) \
3081 SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
3082
3083#define SPEED_ROUTINE_MPN_GCDEXT(function) \
3084 SPEED_ROUTINE_MPN_GCD_CALL \
3085 (4, { mp_size_t wp2size; \
3086 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
3087
3088
3089#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \
3090 { \
3091 unsigned i; \
3092 mp_size_t j, pieces, psize, wp2size; \
3093 mp_ptr wp, wp2, xtmp, ytmp, px, py; \
3094 double t; \
3095 TMP_DECL; \
3096 \
3097 SPEED_RESTRICT_COND (s->size >= 1); \
3098 \
3099 TMP_MARK; \
3100 \
3101 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
3102 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
3103 MPN_COPY (xtmp, s->xp, s->size); \
3104 MPN_COPY (ytmp, s->yp, s->size); \
3105 \
3106 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \
3107 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \
3108 \
3109 pieces = SPEED_BLOCK_SIZE / 3; \
3110 psize = 3 * pieces; \
3111 px = TMP_ALLOC_LIMBS (psize); \
3112 py = TMP_ALLOC_LIMBS (psize); \
3113 MPN_COPY (px, s->xp_block, psize); \
3114 MPN_COPY (py, s->yp_block, psize); \
3115 \
3116 /* x must have at least as many bits as y, \
3117 high limbs must be non-zero */ \
3118 for (j = 0; j < pieces; j++) \
3119 { \
3120 mp_ptr x = px+3*j; \
3121 mp_ptr y = py+3*j; \
3122 x[2] += (x[2] == 0); \
3123 y[2] += (y[2] == 0); \
3124 if (x[2] < y[2]) \
3125 MP_LIMB_T_SWAP (x[2], y[2]); \
3126 } \
3127 \
3128 speed_operand_src (s, px, psize); \
3129 speed_operand_src (s, py, psize); \
3130 speed_operand_dst (s, xtmp, s->size); \
3131 speed_operand_dst (s, ytmp, s->size); \
3132 speed_operand_dst (s, wp, s->size); \
3133 speed_cache_fill (s); \
3134 \
3135 speed_starttime (); \
3136 i = s->reps; \
3137 do \
3138 { \
3139 mp_ptr x = px; \
3140 mp_ptr y = py; \
3141 mp_ptr xth = &xtmp[s->size-3]; \
3142 mp_ptr yth = &ytmp[s->size-3]; \
3143 j = pieces; \
3144 do \
3145 { \
3146 xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \
3147 yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \
3148 \
3149 ytmp[0] |= 1; /* y must be odd, */ \
3150 \
3151 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
3152 \
3153 x += 3; \
3154 y += 3; \
3155 } \
3156 while (--j != 0); \
3157 } \
3158 while (--i != 0); \
3159 t = speed_endtime (); \
3160 \
3161 TMP_FREE; \
3162 \
3163 s->time_divisor = pieces; \
3164 return t; \
3165 }
3166
3167/* Calculate nextprime(n) for random n of s->size bits (not limbs). */
3168#define SPEED_ROUTINE_MPZ_NEXTPRIME(function) \
3169 { \
3170 unsigned i, j; \
3171 mpz_t wp, n; \
3172 double t; \
3173 \
3174 SPEED_RESTRICT_COND (s->size >= 10); \
3175 \
3176 mpz_init (wp); \
3177 mpz_init_set_n (n, s->xp, s->size); \
3178 /* limit to s->size bits, as this function is very slow */ \
3179 mpz_tdiv_r_2exp (n, n, s->size); \
3180 /* set high bits so operand and result are genaral s->size bits */ \
3181 mpz_setbit (n, s->size - 1); \
3182 mpz_clrbit (n, s->size - 2); \
3183 \
3184 speed_starttime (); \
3185 i = s->reps; \
3186 do \
3187 { \
3188 /* nextprime timing is variable, so average over many calls */ \
3189 j = SPEED_BLOCK_SIZE - 1; \
3190 /* starts on random, after measures prime to next prime */ \
3191 function (wp, n); \
3192 do \
3193 { \
3194 function (wp, wp); \
3195 } \
3196 while (--j != 0); \
3197 } \
3198 while (--i != 0); \
3199 t = speed_endtime (); \
3200 \
3201 mpz_clear (wp); \
3202 mpz_clear (n); \
3203 \
3204 s->time_divisor = SPEED_BLOCK_SIZE; \
3205 return t; \
3206 }
3207
3208#define SPEED_ROUTINE_MPZ_JACOBI(function) \
3209 { \
3210 mpz_t a, b; \
3211 unsigned i; \
3212 mp_size_t j, pieces, psize; \
3213 mp_ptr px, py; \
3214 double t; \
3215 TMP_DECL; \
3216 \
3217 TMP_MARK; \
3218 pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \
3219 pieces = MAX (pieces, 1); \
3220 s->time_divisor = pieces; \
3221 \
3222 psize = pieces * s->size; \
3223 px = TMP_ALLOC_LIMBS (psize); \
3224 py = TMP_ALLOC_LIMBS (psize); \
3225 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
3226 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
3227 \
3228 for (j = 0; j < pieces; j++) \
3229 { \
3230 mp_ptr x = px+j*s->size; \
3231 mp_ptr y = py+j*s->size; \
3232 \
3233 /* y odd */ \
3234 y[0] |= 1; \
3235 \
3236 /* high limbs non-zero */ \
3237 if (x[s->size-1] == 0) x[s->size-1] = 1; \
3238 if (y[s->size-1] == 0) y[s->size-1] = 1; \
3239 } \
3240 \
3241 SIZ(a) = s->size; \
3242 SIZ(b) = s->size; \
3243 \
3244 speed_operand_src (s, px, psize); \
3245 speed_operand_src (s, py, psize); \
3246 speed_cache_fill (s); \
3247 \
3248 speed_starttime (); \
3249 i = s->reps; \
3250 do \
3251 { \
3252 j = pieces; \
3253 do \
3254 { \
3255 PTR(a) = px+(j-1)*s->size; \
3256 PTR(b) = py+(j-1)*s->size; \
3257 function (a, b); \
3258 } \
3259 while (--j != 0); \
3260 } \
3261 while (--i != 0); \
3262 t = speed_endtime (); \
3263 \
3264 TMP_FREE; \
3265 return t; \
3266 }
3267
3268#define SPEED_ROUTINE_MPN_DIVREM_2(function) \
3269 { \
3270 mp_ptr wp, xp; \
3271 mp_limb_t yp[2]; \
3272 unsigned i; \
3273 double t; \
3274 TMP_DECL; \
3275 \
3276 SPEED_RESTRICT_COND (s->size >= 2); \
3277 \
3278 TMP_MARK; \
3279 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
3280 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3281 \
3282 /* source is destroyed */ \
3283 MPN_COPY (xp, s->xp, s->size); \
3284 \
3285 /* divisor must be normalized */ \
3286 MPN_COPY (yp, s->yp_block, 2); \
3287 yp[1] |= GMP_NUMB_HIGHBIT; \
3288 \
3289 speed_operand_src (s, xp, s->size); \
3290 speed_operand_src (s, yp, 2); \
3291 speed_operand_dst (s, wp, s->size); \
3292 speed_cache_fill (s); \
3293 \
3294 speed_starttime (); \
3295 i = s->reps; \
3296 do \
3297 function (wp, 0, xp, s->size, yp); \
3298 while (--i != 0); \
3299 t = speed_endtime (); \
3300 \
3301 TMP_FREE; \
3302 return t; \
3303 }
3304
3305#define SPEED_ROUTINE_MPN_DIV_QR_1(function) \
3306 { \
3307 mp_ptr wp, xp; \
3308 mp_limb_t d; \
3309 mp_limb_t r; \
3310 unsigned i; \
3311 double t; \
3312 TMP_DECL; \
3313 \
3314 SPEED_RESTRICT_COND (s->size >= 1); \
3315 \
3316 TMP_MARK; \
3317 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3318 \
3319 d = s->r; \
3320 if (d == 0) \
3321 d = 1; \
3322 speed_operand_src (s, s->xp, s->size); \
3323 speed_operand_dst (s, wp, s->size); \
3324 speed_cache_fill (s); \
3325 \
3326 speed_starttime (); \
3327 i = s->reps; \
3328 do \
3329 r = function (wp, wp+s->size-1, s->xp, s->size, d); \
3330 while (--i != 0); \
3331 t = speed_endtime (); \
3332 \
3333 TMP_FREE; \
3334 return t; \
3335 }
3336
3337#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function) \
3338 { \
3339 mp_ptr wp, xp; \
3340 mp_limb_t d, dinv; \
3341 mp_limb_t r; \
3342 unsigned i; \
3343 double t; \
3344 TMP_DECL; \
3345 \
3346 SPEED_RESTRICT_COND (s->size >= 1); \
3347 \
3348 TMP_MARK; \
3349 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3350 \
3351 d = s->r; \
3352 /* divisor must be normalized */ \
3353 SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT); \
3354 invert_limb (dinv, d); \
3355 speed_operand_src (s, s->xp, s->size); \
3356 speed_operand_dst (s, wp, s->size); \
3357 speed_cache_fill (s); \
3358 \
3359 speed_starttime (); \
3360 i = s->reps; \
3361 do \
3362 r = function (wp, s->xp, s->size, 0, d, dinv); \
3363 while (--i != 0); \
3364 t = speed_endtime (); \
3365 \
3366 TMP_FREE; \
3367 return t; \
3368 }
3369
3370#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \
3371 { \
3372 mp_ptr wp, xp; \
3373 mp_limb_t yp[2]; \
3374 mp_limb_t rp[2]; \
3375 unsigned i; \
3376 double t; \
3377 TMP_DECL; \
3378 \
3379 SPEED_RESTRICT_COND (s->size >= 2); \
3380 \
3381 TMP_MARK; \
3382 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3383 \
3384 /* divisor must be normalized */ \
3385 MPN_COPY (yp, s->yp_block, 2); \
3386 if (norm) \
3387 yp[1] |= GMP_NUMB_HIGHBIT; \
3388 else \
3389 { \
3390 yp[1] &= ~GMP_NUMB_HIGHBIT; \
3391 if (yp[1] == 0) \
3392 yp[1] = 1; \
3393 } \
3394 speed_operand_src (s, s->xp, s->size); \
3395 speed_operand_src (s, yp, 2); \
3396 speed_operand_dst (s, wp, s->size); \
3397 speed_operand_dst (s, rp, 2); \
3398 speed_cache_fill (s); \
3399 \
3400 speed_starttime (); \
3401 i = s->reps; \
3402 do \
3403 function (wp, rp, s->xp, s->size, yp); \
3404 while (--i != 0); \
3405 t = speed_endtime (); \
3406 \
3407 TMP_FREE; \
3408 return t; \
3409 }
3410
3411#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
3412 { \
3413 unsigned i, j; \
3414 mp_ptr xp; \
3415 mp_limb_t n = 1; \
3416 double t; \
3417 \
3418 xp = s->xp_block-1; \
3419 \
3420 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
3421 speed_cache_fill (s); \
3422 \
3423 speed_starttime (); \
3424 i = s->reps; \
3425 do \
3426 { \
3427 j = SPEED_BLOCK_SIZE; \
3428 do \
3429 { \
3430 /* randomized but successively dependent */ \
3431 n += (xp[j] << 1); \
3432 \
3433 function (n, n); \
3434 } \
3435 while (--j != 0); \
3436 } \
3437 while (--i != 0); \
3438 t = speed_endtime (); \
3439 \
3440 /* make sure the compiler won't optimize away n */ \
3441 noop_1 (n); \
3442 \
3443 s->time_divisor = SPEED_BLOCK_SIZE; \
3444 return t; \
3445 }
3446
3447
3448#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \
3449 { \
3450 mp_ptr wp, wp2; \
3451 unsigned i; \
3452 double t; \
3453 TMP_DECL; \
3454 \
3455 SPEED_RESTRICT_COND (s->size >= 1); \
3456 \
3457 TMP_MARK; \
3458 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3459 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
3460 \
3461 speed_operand_src (s, s->xp, s->size); \
3462 speed_operand_dst (s, wp, s->size); \
3463 speed_operand_dst (s, wp2, s->size); \
3464 speed_cache_fill (s); \
3465 \
3466 speed_starttime (); \
3467 i = s->reps; \
3468 do \
3469 call; \
3470 while (--i != 0); \
3471 t = speed_endtime (); \
3472 \
3473 TMP_FREE; \
3474 return t; \
3475 }
3476
3477
3478/* Calculate worst case for perfect_power
3479 Worst case is multiple prime factors larger than trial div limit. */
3480#define SPEED_ROUTINE_MPN_PERFECT_POWER(function) \
3481 { \
3482 mpz_t r; \
3483 unsigned i, power; \
3484 double t; \
3485 \
3486 SPEED_RESTRICT_COND (s->size >= 10); \
3487 \
3488 mpz_init (r); \
3489 power = s->size * GMP_NUMB_BITS / 17; \
3490 mpz_ui_pow_ui(r, (1 << 17) - 1, power - 1); \
3491 mpz_mul_ui(r, r, (1 << 16) + 1); /* larger than 1000th prime */ \
3492 \
3493 speed_starttime (); \
3494 i = s->reps; \
3495 do \
3496 function (PTR(r), SIZ(r)); \
3497 while (--i != 0); \
3498 t = speed_endtime (); \
3499 \
3500 mpz_clear (r); \
3501 return t; \
3502 }
3503
3504/* Calculate worst case (larger prime) for perfect_square */
3505#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function) \
3506 { \
3507 mpz_t r; \
3508 unsigned i; \
3509 double t; \
3510 \
3511 SPEED_RESTRICT_COND (s->size >= 2); \
3512 mpz_init_set_n (r, s->xp, s->size / 2); \
3513 mpz_setbit (r, s->size * GMP_NUMB_BITS / 2 - 1); \
3514 mpz_mul (r, r, r); \
3515 \
3516 speed_starttime (); \
3517 i = s->reps; \
3518 do \
3519 function (PTR(r), SIZ(r)); \
3520 while (--i != 0); \
3521 t = speed_endtime (); \
3522 \
3523 mpz_clear (r); \
3524 return t; \
3525 }
3526
3527
3528/* s->size controls the number of limbs in the input, s->r is the base, or
3529 decimal by default. */
3530#define SPEED_ROUTINE_MPN_GET_STR(function) \
3531 { \
3532 unsigned char *wp; \
3533 mp_size_t wn; \
3534 mp_ptr xp; \
3535 int base; \
3536 unsigned i; \
3537 double t; \
3538 TMP_DECL; \
3539 \
3540 SPEED_RESTRICT_COND (s->size >= 1); \
3541 \
3542 base = s->r == 0 ? 10 : s->r; \
3543 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
3544 \
3545 TMP_MARK; \
3546 SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \
3547 \
3548 MPN_SIZEINBASE (wn, s->xp, s->size, base); \
3549 wp = (unsigned char *) TMP_ALLOC (wn); \
3550 \
3551 /* use this during development to guard against overflowing wp */ \
3552 /* \
3553 MPN_COPY (xp, s->xp, s->size); \
3554 ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \
3555 */ \
3556 \
3557 speed_operand_src (s, s->xp, s->size); \
3558 speed_operand_dst (s, xp, s->size); \
3559 speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES); \
3560 speed_cache_fill (s); \
3561 \
3562 speed_starttime (); \
3563 i = s->reps; \
3564 do \
3565 { \
3566 MPN_COPY (xp, s->xp, s->size); \
3567 function (wp, base, xp, s->size); \
3568 } \
3569 while (--i != 0); \
3570 t = speed_endtime (); \
3571 \
3572 TMP_FREE; \
3573 return t; \
3574 }
3575
3576/* s->size controls the number of digits in the input, s->r is the base, or
3577 decimal by default. */
3578#define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \
3579 { \
3580 unsigned char *xp; \
3581 mp_ptr wp; \
3582 mp_size_t wn; \
3583 unsigned i; \
3584 int base; \
3585 double t; \
3586 TMP_DECL; \
3587 \
3588 SPEED_RESTRICT_COND (s->size >= 1); \
3589 \
3590 base = s->r == 0 ? 10 : s->r; \
3591 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
3592 \
3593 TMP_MARK; \
3594 \
3595 xp = (unsigned char *) TMP_ALLOC (s->size); \
3596 for (i = 0; i < s->size; i++) \
3597 xp[i] = s->xp[i] % base; \
3598 \
3599 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \
3600 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
3601 \
3602 /* use this during development to check wn is big enough */ \
3603 /* \
3604 ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \
3605 */ \
3606 \
3607 speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \
3608 speed_operand_dst (s, wp, wn); \
3609 speed_cache_fill (s); \
3610 \
3611 speed_starttime (); \
3612 i = s->reps; \
3613 do \
3614 call; \
3615 while (--i != 0); \
3616 t = speed_endtime (); \
3617 \
3618 TMP_FREE; \
3619 return t; \
3620 }
3621
3622
3623/* Run an accel gcd find_a() function over various data values. A set of
3624 values is used in case some run particularly fast or slow. The size
3625 parameter is ignored, the amount of data tested is fixed. */
3626
3627#define SPEED_ROUTINE_MPN_GCD_FINDA(function) \
3628 { \
3629 unsigned i, j; \
3630 mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \
3631 double t; \
3632 TMP_DECL; \
3633 \
3634 TMP_MARK; \
3635 \
3636 /* low must be odd, high must be non-zero */ \
3637 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
3638 { \
3639 cp[i][0] = s->xp_block[i] | 1; \
3640 cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \
3641 } \
3642 \
3643 speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \
3644 speed_cache_fill (s); \
3645 \
3646 speed_starttime (); \
3647 i = s->reps; \
3648 do \
3649 { \
3650 j = SPEED_BLOCK_SIZE; \
3651 do \
3652 { \
3653 function (cp[j-1]); \
3654 } \
3655 while (--j != 0); \
3656 } \
3657 while (--i != 0); \
3658 t = speed_endtime (); \
3659 \
3660 TMP_FREE; \
3661 \
3662 s->time_divisor = SPEED_BLOCK_SIZE; \
3663 return t; \
3664 }
3665
3666
3667/* "call" should do "count_foo_zeros(c,n)".
3668 Give leading=1 if foo is leading zeros, leading=0 for trailing.
3669 Give zero=1 if n=0 is allowed in the call, zero=0 if not. */
3670
3671#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \
3672 { \
3673 mp_ptr xp; \
3674 int i, c; \
3675 unsigned j; \
3676 mp_limb_t n; \
3677 double t; \
3678 TMP_DECL; \
3679 \
3680 TMP_MARK; \
3681 SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \
3682 \
3683 if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \
3684 return -1.0; \
3685 speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \
3686 speed_cache_fill (s); \
3687 \
3688 c = 0; \
3689 speed_starttime (); \
3690 j = s->reps; \
3691 do { \
3692 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
3693 { \
3694 n = xp[i]; \
3695 n ^= c; \
3696
3697#define SPEED_ROUTINE_COUNT_ZEROS_B() \
3698 } \
3699 } while (--j != 0); \
3700 t = speed_endtime (); \
3701 \
3702 /* don't let c go dead */ \
3703 noop_1 (c); \
3704 \
3705 s->time_divisor = SPEED_BLOCK_SIZE; \
3706 \
3707 TMP_FREE; \
3708 return t; \
3709 } \
3710
3711#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \
3712 do { \
3713 SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \
3714 call; \
3715 SPEED_ROUTINE_COUNT_ZEROS_B (); \
3716 } while (0) \
3717
3718#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \
3719 SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
3720#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \
3721 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
3722
3723#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \
3724 SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
3725#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call) \
3726 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
3727
3728
3729#define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \
3730 { \
3731 unsigned i, j; \
3732 mp_limb_t d, dinv=0; \
3733 mp_ptr xp = s->xp_block - 1; \
3734 \
3735 s->time_divisor = SPEED_BLOCK_SIZE; \
3736 \
3737 speed_starttime (); \
3738 i = s->reps; \
3739 do \
3740 { \
3741 j = SPEED_BLOCK_SIZE; \
3742 do \
3743 { \
3744 d = dinv ^ xp[j]; \
3745 d |= GMP_LIMB_HIGHBIT; \
3746 do { call; } while (0); \
3747 } \
3748 while (--j != 0); \
3749 } \
3750 while (--i != 0); \
3751 \
3752 /* don't let the compiler optimize everything away */ \
3753 noop_1 (dinv); \
3754 \
3755 return speed_endtime(); \
3756 }
3757
3758
3759#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \
3760 { \
3761 unsigned i; \
3762 speed_starttime (); \
3763 i = s->reps; \
3764 do \
3765 function (); \
3766 while (--i != 0); \
3767 return speed_endtime (); \
3768 }
3769
3770
3771#define SPEED_ROUTINE_MPN_ZERO_CALL(call) \
3772 { \
3773 mp_ptr wp; \
3774 unsigned i; \
3775 double t; \
3776 TMP_DECL; \
3777 \
3778 SPEED_RESTRICT_COND (s->size >= 0); \
3779 \
3780 TMP_MARK; \
3781 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3782 speed_operand_dst (s, wp, s->size); \
3783 speed_cache_fill (s); \
3784 \
3785 speed_starttime (); \
3786 i = s->reps; \
3787 do \
3788 call; \
3789 while (--i != 0); \
3790 t = speed_endtime (); \
3791 \
3792 TMP_FREE; \
3793 return t; \
3794 }
3795
3796#define SPEED_ROUTINE_MPN_ZERO(function) \
3797 SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
3798
3799
3800#endif