blob: 86ba5edf67bdb1c8dabba791f49d7728789dc4cf [file] [log] [blame]
Austin Schuhdace2a62020-08-18 10:56:48 -07001/* Create tuned thresholds for various algorithms.
2
3Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of either:
9
10 * the GNU Lesser General Public License as published by the Free
11 Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13
14or
15
16 * the GNU General Public License as published by the Free Software
17 Foundation; either version 2 of the License, or (at your option) any
18 later version.
19
20or both in parallel, as here.
21
22The GNU MP Library is distributed in the hope that it will be useful, but
23WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25for more details.
26
27You should have received copies of the GNU General Public License and the
28GNU Lesser General Public License along with the GNU MP Library. If not,
29see https://www.gnu.org/licenses/. */
30
31
32/* Usage: tuneup [-t] [-t] [-p precision]
33
34 -t turns on some diagnostic traces, a second -t turns on more traces.
35
36 Notes:
37
38 The code here isn't a vision of loveliness, mainly because it's subject
39 to ongoing changes according to new things wanting to be tuned, and
40 practical requirements of systems tested.
41
42 Sometimes running the program twice produces slightly different results.
43 This is probably because there's so little separating algorithms near
44 their crossover, and on that basis it should make little or no difference
45 to the final speed of the relevant routines, but nothing has been done to
46 check that carefully.
47
48 Algorithm:
49
50 The thresholds are determined as follows. A crossover may not be a
51 single size but rather a range where it oscillates between method A or
52 method B faster. If the threshold is set making B used where A is faster
53 (or vice versa) that's bad. Badness is the percentage time lost and
54 total badness is the sum of this over all sizes measured. The threshold
55 is set to minimize total badness.
56
57 Suppose, as sizes increase, method B becomes faster than method A. The
58 effect of the rule is that, as you look at increasing sizes, isolated
59 points where B is faster are ignored, but when it's consistently faster,
60 or faster on balance, then the threshold is set there. The same result
61 is obtained thinking in the other direction of A becoming faster at
62 smaller sizes.
63
64 In practice the thresholds tend to be chosen to bring on the next
65 algorithm fairly quickly.
66
67 This rule is attractive because it's got a basis in reason and is fairly
68 easy to implement, but no work has been done to actually compare it in
69 absolute terms to other possibilities.
70
71 Implementation:
72
73 In a normal library build the thresholds are constants. To tune them
74 selected objects are recompiled with the thresholds as global variables
75 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
76 the end of gmp-impl.h, and rules in tune/Makefile.am.
77
78 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The
79 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
80 level, but recurse into the basecase.
81
82 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
83 Other routines in turn will make use of both of those. Naturally the
84 dependants must be tuned first.
85
86 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
87 just a threshold based on comparing two routines (mpn_divrem_1 and
88 mpn_divexact_1), and no further use of the value determined.
89
90 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
91 just comparisons between certain routines on representative data.
92
93 Shortcuts are applied when native (assembler) versions of routines exist.
94 For instance a native mpn_sqr_basecase is assumed to be always faster
95 than mpn_mul_basecase, with no measuring.
96
97 No attempt is made to tune within assembler routines, for instance
98 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
99 written and tuned all by hand. Assembler routines that might have hard
100 limits are recompiled though, to make them accept a bigger range of sizes
101 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
102
103 Limitations:
104
105 The FFTs aren't subject to the same badness rule as the other thresholds,
106 so each k is probably being brought on a touch early. This isn't likely
107 to make a difference, and the simpler probing means fewer tests.
108
109*/
110
111#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
112
113#include "config.h"
114
115#include <math.h>
116#include <stdio.h>
117#include <stdlib.h>
118#include <time.h>
119#if HAVE_UNISTD_H
120#include <unistd.h>
121#endif
122
123#include "gmp-impl.h"
124#include "longlong.h"
125
126#include "tests.h"
127#include "speed.h"
128
129#if !HAVE_DECL_OPTARG
130extern char *optarg;
131extern int optind, opterr;
132#endif
133
134
135#define DEFAULT_MAX_SIZE 1000 /* limbs */
136
137#if WANT_FFT
138mp_size_t option_fft_max_size = 50000; /* limbs */
139#else
140mp_size_t option_fft_max_size = 0;
141#endif
142int option_trace = 0;
143int option_fft_trace = 0;
144struct speed_params s;
145
146struct dat_t {
147 mp_size_t size;
148 double d;
149} *dat = NULL;
150int ndat = 0;
151int allocdat = 0;
152
153/* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
154 case use zero here, which for params.max_size means no limit. */
155#ifndef TUNE_SQR_TOOM2_MAX
156#define TUNE_SQR_TOOM2_MAX 0
157#endif
158
159mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX;
160mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT;
161mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT;
162mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT;
163mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
164mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
165mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
166mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
167mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
168mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
169mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
170mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
171mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
172mp_size_t sqr_toom2_threshold
173 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
174mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
175mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
176mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
177mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
178mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
179mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
180mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX;
181mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX;
182mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX;
183mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX;
184mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX;
185mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX;
186mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX;
187mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX;
188mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX;
189mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX;
190mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
191mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
192mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX;
193mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX;
194mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX;
195mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX;
196mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
197mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
198mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX;
199mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX;
200mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX;
201mp_size_t inv_newton_threshold = MP_SIZE_T_MAX;
202mp_size_t inv_appr_threshold = MP_SIZE_T_MAX;
203mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
204mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
205mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
206mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
207mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
208mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
209mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
210mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
211mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
212mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
213int div_qr_1n_pi1_method = 0;
214mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX;
215mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX;
216mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
217mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
218mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
219mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
220int mod_1_1p_method = 0;
221mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
222mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
223mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
224mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
225mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
226mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
227mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
228mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
229mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
230mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
231mp_size_t fac_odd_threshold = 0;
232mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT;
233
234mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
235mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
236
237struct param_t {
238 const char *name;
239 speed_function_t function;
240 speed_function_t function2;
241 double step_factor; /* how much to step relatively */
242 int step; /* how much to step absolutely */
243 double function_fudge; /* multiplier for "function" speeds */
244 int stop_since_change;
245 double stop_factor;
246 mp_size_t min_size;
247 int min_is_always;
248 mp_size_t max_size;
249 mp_size_t check_size;
250 mp_size_t size_extra;
251
252#define DATA_HIGH_LT_R 1
253#define DATA_HIGH_GE_R 2
254 int data_high;
255
256 int noprint;
257};
258
259
260/* These are normally undefined when false, which suits "#if" fine.
261 But give them zero values so they can be used in plain C "if"s. */
262#ifndef UDIV_PREINV_ALWAYS
263#define UDIV_PREINV_ALWAYS 0
264#endif
265#ifndef HAVE_NATIVE_mpn_divexact_1
266#define HAVE_NATIVE_mpn_divexact_1 0
267#endif
268#ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
269#define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
270#endif
271#ifndef HAVE_NATIVE_mpn_divrem_1
272#define HAVE_NATIVE_mpn_divrem_1 0
273#endif
274#ifndef HAVE_NATIVE_mpn_divrem_2
275#define HAVE_NATIVE_mpn_divrem_2 0
276#endif
277#ifndef HAVE_NATIVE_mpn_mod_1
278#define HAVE_NATIVE_mpn_mod_1 0
279#endif
280#ifndef HAVE_NATIVE_mpn_mod_1_1p
281#define HAVE_NATIVE_mpn_mod_1_1p 0
282#endif
283#ifndef HAVE_NATIVE_mpn_modexact_1_odd
284#define HAVE_NATIVE_mpn_modexact_1_odd 0
285#endif
286#ifndef HAVE_NATIVE_mpn_preinv_divrem_1
287#define HAVE_NATIVE_mpn_preinv_divrem_1 0
288#endif
289#ifndef HAVE_NATIVE_mpn_preinv_mod_1
290#define HAVE_NATIVE_mpn_preinv_mod_1 0
291#endif
292#ifndef HAVE_NATIVE_mpn_sqr_basecase
293#define HAVE_NATIVE_mpn_sqr_basecase 0
294#endif
295
296
297#define MAX3(a,b,c) MAX (MAX (a, b), c)
298
299mp_limb_t
300randlimb_norm (void)
301{
302 mp_limb_t n;
303 mpn_random (&n, 1);
304 n |= GMP_NUMB_HIGHBIT;
305 return n;
306}
307
308#define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
309
310mp_limb_t
311randlimb_half (void)
312{
313 mp_limb_t n;
314 mpn_random (&n, 1);
315 n &= GMP_NUMB_HALFMASK;
316 n += (n==0);
317 return n;
318}
319
320
321/* Add an entry to the end of the dat[] array, reallocing to make it bigger
322 if necessary. */
323void
324add_dat (mp_size_t size, double d)
325{
326#define ALLOCDAT_STEP 500
327
328 ASSERT_ALWAYS (ndat <= allocdat);
329
330 if (ndat == allocdat)
331 {
332 dat = (struct dat_t *) __gmp_allocate_or_reallocate
333 (dat, allocdat * sizeof(dat[0]),
334 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
335 allocdat += ALLOCDAT_STEP;
336 }
337
338 dat[ndat].size = size;
339 dat[ndat].d = d;
340 ndat++;
341}
342
343
344/* Return the threshold size based on the data accumulated. */
345mp_size_t
346analyze_dat (int final)
347{
348 double x, min_x;
349 int j, min_j;
350
351 /* If the threshold is set at dat[0].size, any positive values are bad. */
352 x = 0.0;
353 for (j = 0; j < ndat; j++)
354 if (dat[j].d > 0.0)
355 x += dat[j].d;
356
357 if (option_trace >= 2 && final)
358 {
359 printf ("\n");
360 printf ("x is the sum of the badness from setting thresh at given size\n");
361 printf (" (minimum x is sought)\n");
362 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
363 }
364
365 min_x = x;
366 min_j = 0;
367
368
369 /* When stepping to the next dat[j].size, positive values are no longer
370 bad (so subtracted), negative values become bad (so add the absolute
371 value, meaning subtract). */
372 for (j = 0; j < ndat; x -= dat[j].d, j++)
373 {
374 if (option_trace >= 2 && final)
375 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
376
377 if (x < min_x)
378 {
379 min_x = x;
380 min_j = j;
381 }
382 }
383
384 return min_j;
385}
386
387
388/* Measuring for recompiled mpn/generic/div_qr_1.c,
389 * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
390
391mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
392
393#if defined (__cplusplus)
394extern "C" {
395#endif
396
397mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
398mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
399void mpz_fac_ui_tune (mpz_ptr, unsigned long);
400
401#if defined (__cplusplus)
402}
403#endif
404
405double
406speed_mpn_mod_1_tune (struct speed_params *s)
407{
408 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
409}
410double
411speed_mpn_divrem_1_tune (struct speed_params *s)
412{
413 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
414}
415double
416speed_mpz_fac_ui_tune (struct speed_params *s)
417{
418 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
419}
420double
421speed_mpn_div_qr_1_tune (struct speed_params *s)
422{
423 SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
424}
425
426double
427tuneup_measure (speed_function_t fun,
428 const struct param_t *param,
429 struct speed_params *s)
430{
431 static struct param_t dummy;
432 double t;
433 TMP_DECL;
434
435 if (! param)
436 param = &dummy;
437
438 s->size += param->size_extra;
439
440 TMP_MARK;
441 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
442 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
443
444 mpn_random (s->xp, s->size);
445 mpn_random (s->yp, s->size);
446
447 switch (param->data_high) {
448 case DATA_HIGH_LT_R:
449 s->xp[s->size-1] %= s->r;
450 s->yp[s->size-1] %= s->r;
451 break;
452 case DATA_HIGH_GE_R:
453 s->xp[s->size-1] |= s->r;
454 s->yp[s->size-1] |= s->r;
455 break;
456 }
457
458 t = speed_measure (fun, s);
459
460 s->size -= param->size_extra;
461
462 TMP_FREE;
463 return t;
464}
465
466
467#define PRINT_WIDTH 31
468
469void
470print_define_start (const char *name)
471{
472 printf ("#define %-*s ", PRINT_WIDTH, name);
473 if (option_trace)
474 printf ("...\n");
475}
476
477void
478print_define_end_remark (const char *name, mp_size_t value, const char *remark)
479{
480 if (option_trace)
481 printf ("#define %-*s ", PRINT_WIDTH, name);
482
483 if (value == MP_SIZE_T_MAX)
484 printf ("MP_SIZE_T_MAX");
485 else
486 printf ("%5ld", (long) value);
487
488 if (remark != NULL)
489 printf (" /* %s */", remark);
490 printf ("\n");
491 fflush (stdout);
492}
493
494void
495print_define_end (const char *name, mp_size_t value)
496{
497 const char *remark;
498 if (value == MP_SIZE_T_MAX)
499 remark = "never";
500 else if (value == 0)
501 remark = "always";
502 else
503 remark = NULL;
504 print_define_end_remark (name, value, remark);
505}
506
507void
508print_define (const char *name, mp_size_t value)
509{
510 print_define_start (name);
511 print_define_end (name, value);
512}
513
514void
515print_define_remark (const char *name, mp_size_t value, const char *remark)
516{
517 print_define_start (name);
518 print_define_end_remark (name, value, remark);
519}
520
521void
522print_define_with_speedup (const char *name, mp_size_t value,
523 mp_size_t runner_up, double speedup)
524{
525 char buf[100];
526 snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
527 100.0 * (speedup - 1), runner_up);
528 print_define_remark (name, value, buf);
529}
530
531void
532one (mp_size_t *threshold, struct param_t *param)
533{
534 int since_positive, since_thresh_change;
535 int thresh_idx, new_thresh_idx;
536
537#define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
538
539 DEFAULT (param->function_fudge, 1.0);
540 DEFAULT (param->function2, param->function);
541 DEFAULT (param->step_factor, 0.01); /* small steps by default */
542 DEFAULT (param->step, 1); /* small steps by default */
543 DEFAULT (param->stop_since_change, 80);
544 DEFAULT (param->stop_factor, 1.2);
545 DEFAULT (param->min_size, 10);
546 DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
547
548 if (param->check_size != 0)
549 {
550 double t1, t2;
551 s.size = param->check_size;
552
553 *threshold = s.size+1;
554 t1 = tuneup_measure (param->function, param, &s);
555
556 *threshold = s.size;
557 t2 = tuneup_measure (param->function2, param, &s);
558 if (t1 == -1.0 || t2 == -1.0)
559 {
560 printf ("Oops, can't run both functions at size %ld\n",
561 (long) s.size);
562 abort ();
563 }
564 t1 *= param->function_fudge;
565
566 /* ask that t2 is at least 4% below t1 */
567 if (t1 < t2*1.04)
568 {
569 if (option_trace)
570 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
571 *threshold = MP_SIZE_T_MAX;
572 if (! param->noprint)
573 print_define (param->name, *threshold);
574 return;
575 }
576
577 if (option_trace >= 2)
578 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
579 (long) s.size, t1, t2);
580 }
581
582 if (! param->noprint || option_trace)
583 print_define_start (param->name);
584
585 ndat = 0;
586 since_positive = 0;
587 since_thresh_change = 0;
588 thresh_idx = 0;
589
590 if (option_trace >= 2)
591 {
592 printf (" algorithm-A algorithm-B ratio possible\n");
593 printf (" (seconds) (seconds) diff thresh\n");
594 }
595
596 for (s.size = param->min_size;
597 s.size < param->max_size;
598 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
599 {
600 double ti, tiplus1, d;
601
602 /*
603 FIXME: check minimum size requirements are met, possibly by just
604 checking for the -1 returns from the speed functions.
605 */
606
607 /* using method A at this size */
608 *threshold = s.size+1;
609 ti = tuneup_measure (param->function, param, &s);
610 if (ti == -1.0)
611 abort ();
612 ti *= param->function_fudge;
613
614 /* using method B at this size */
615 *threshold = s.size;
616 tiplus1 = tuneup_measure (param->function2, param, &s);
617 if (tiplus1 == -1.0)
618 abort ();
619
620 /* Calculate the fraction by which the one or the other routine is
621 slower. */
622 if (tiplus1 >= ti)
623 d = (tiplus1 - ti) / tiplus1; /* negative */
624 else
625 d = (tiplus1 - ti) / ti; /* positive */
626
627 add_dat (s.size, d);
628
629 new_thresh_idx = analyze_dat (0);
630
631 if (option_trace >= 2)
632 printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
633 (long) s.size, ti, tiplus1, d,
634 ti > tiplus1 ? '#' : ' ',
635 (long) dat[new_thresh_idx].size);
636
637 /* Stop if the last time method i was faster was more than a
638 certain number of measurements ago. */
639#define STOP_SINCE_POSITIVE 200
640 if (d >= 0)
641 since_positive = 0;
642 else
643 if (++since_positive > STOP_SINCE_POSITIVE)
644 {
645 if (option_trace >= 1)
646 printf ("stopped due to since_positive (%d)\n",
647 STOP_SINCE_POSITIVE);
648 break;
649 }
650
651 /* Stop if method A has become slower by a certain factor. */
652 if (ti >= tiplus1 * param->stop_factor)
653 {
654 if (option_trace >= 1)
655 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
656 param->stop_factor);
657 break;
658 }
659
660 /* Stop if the threshold implied hasn't changed in a certain
661 number of measurements. (It's this condition that usually
662 stops the loop.) */
663 if (thresh_idx != new_thresh_idx)
664 since_thresh_change = 0, thresh_idx = new_thresh_idx;
665 else
666 if (++since_thresh_change > param->stop_since_change)
667 {
668 if (option_trace >= 1)
669 printf ("stopped due to since_thresh_change (%d)\n",
670 param->stop_since_change);
671 break;
672 }
673
674 /* Stop if the threshold implied is more than a certain number of
675 measurements ago. */
676#define STOP_SINCE_AFTER 500
677 if (ndat - thresh_idx > STOP_SINCE_AFTER)
678 {
679 if (option_trace >= 1)
680 printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
681 STOP_SINCE_AFTER);
682 break;
683 }
684
685 /* Stop when the size limit is reached before the end of the
686 crossover, but only show this as an error for >= the default max
687 size. FIXME: Maybe should make it a param choice whether this is
688 an error. */
689 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
690 {
691 fprintf (stderr, "%s\n", param->name);
692 fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
693 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
694 fprintf (stderr, " max size reached before end of crossover\n");
695 break;
696 }
697 }
698
699 if (option_trace >= 1)
700 printf ("sizes %ld to %ld total %d measurements\n",
701 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
702
703 *threshold = dat[analyze_dat (1)].size;
704
705 if (param->min_is_always)
706 {
707 if (*threshold == param->min_size)
708 *threshold = 0;
709 }
710
711 if (! param->noprint || option_trace)
712 print_define_end (param->name, *threshold);
713}
714
715/* Time N different FUNCTIONS with the same parameters and size, to
716 select the fastest. Since *_METHOD defines start numbering from
717 one, if functions[i] is fastest, the value of the define is i+1.
718 Also output a comment with speedup compared to the next fastest
719 function. The NAME argument is used only for trace output.
720
721 Returns the index of the fastest function.
722*/
723int
724one_method (int n, speed_function_t *functions,
725 const char *name, const char *define,
726 const struct param_t *param)
727{
728 double *t;
729 int i;
730 int method;
731 int method_runner_up;
732
733 TMP_DECL;
734 TMP_MARK;
735 t = (double*) TMP_ALLOC (n * sizeof (*t));
736
737 for (i = 0; i < n; i++)
738 {
739 t[i] = tuneup_measure (functions[i], param, &s);
740 if (option_trace >= 1)
741 printf ("size=%ld, %s, method %d %.9f\n",
742 (long) s.size, name, i + 1, t[i]);
743 if (t[i] == -1.0)
744 {
745 printf ("Oops, can't measure all %s methods\n", name);
746 abort ();
747 }
748 }
749 method = 0;
750 for (i = 1; i < n; i++)
751 if (t[i] < t[method])
752 method = i;
753
754 method_runner_up = (method == 0);
755 for (i = 0; i < n; i++)
756 if (i != method && t[i] < t[method_runner_up])
757 method_runner_up = i;
758
759 print_define_with_speedup (define, method + 1, method_runner_up + 1,
760 t[method_runner_up] / t[method]);
761
762 TMP_FREE;
763 return method;
764}
765
766
767/* Special probing for the fft thresholds. The size restrictions on the
768 FFTs mean the graph of time vs size has a step effect. See this for
769 example using
770
771 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
772 gnuplot foo.gnuplot
773
774 The current approach is to compare routines at the midpoint of relevant
775 steps. Arguably a more sophisticated system of threshold data is wanted
776 if this step effect remains. */
777
778struct fft_param_t {
779 const char *table_name;
780 const char *threshold_name;
781 const char *modf_threshold_name;
782 mp_size_t *p_threshold;
783 mp_size_t *p_modf_threshold;
784 mp_size_t first_size;
785 mp_size_t max_size;
786 speed_function_t function;
787 speed_function_t mul_modf_function;
788 speed_function_t mul_function;
789 mp_size_t sqr;
790};
791
792
793/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
794 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
795 of 2^(k-1) bits. */
796
797mp_size_t
798fft_step_size (int k)
799{
800 mp_size_t step;
801
802 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
803 step *= (mp_size_t) 1 << k;
804
805 if (step <= 0)
806 {
807 printf ("Can't handle k=%d\n", k);
808 abort ();
809 }
810
811 return step;
812}
813
814mp_size_t
815fft_next_size (mp_size_t pl, int k)
816{
817 mp_size_t m = fft_step_size (k);
818
819/* printf ("[k=%d %ld] %ld ->", k, m, pl); */
820
821 if (pl == 0 || (pl & (m-1)) != 0)
822 pl = (pl | (m-1)) + 1;
823
824/* printf (" %ld\n", pl); */
825 return pl;
826}
827
828#define NMAX_DEFAULT 1000000
829#define MAX_REPS 25
830#define MIN_REPS 5
831
832static inline size_t
833mpn_mul_fft_lcm (size_t a, unsigned int k)
834{
835 unsigned int l = k;
836
837 while (a % 2 == 0 && k > 0)
838 {
839 a >>= 1;
840 k--;
841 }
842 return a << l;
843}
844
845mp_size_t
846fftfill (mp_size_t pl, int k, int sqr)
847{
848 mp_size_t maxLK;
849 mp_bitcnt_t N, Nprime, nprime, M;
850
851 N = pl * GMP_NUMB_BITS;
852 M = N >> k;
853
854 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
855
856 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
857 nprime = Nprime / GMP_NUMB_BITS;
858 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
859 {
860 size_t K2;
861 for (;;)
862 {
863 K2 = 1L << mpn_fft_best_k (nprime, sqr);
864 if ((nprime & (K2 - 1)) == 0)
865 break;
866 nprime = (nprime + K2 - 1) & -K2;
867 Nprime = nprime * GMP_LIMB_BITS;
868 }
869 }
870 ASSERT_ALWAYS (nprime < pl);
871
872 return Nprime;
873}
874
875static int
876compare_double (const void *ap, const void *bp)
877{
878 double a = * (const double *) ap;
879 double b = * (const double *) bp;
880
881 if (a < b)
882 return -1;
883 else if (a > b)
884 return 1;
885 else
886 return 0;
887}
888
889double
890median (double *times, int n)
891{
892 qsort (times, n, sizeof (double), compare_double);
893 return times[n/2];
894}
895
896#define FFT_CACHE_SIZE 25
897typedef struct fft_cache
898{
899 mp_size_t n;
900 double time;
901} fft_cache_t;
902
903fft_cache_t fft_cache[FFT_CACHE_SIZE];
904
905double
906cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
907 int n_measurements)
908{
909 int i;
910 double t, ttab[MAX_REPS];
911
912 if (fft_cache[k].n == n)
913 return fft_cache[k].time;
914
915 for (i = 0; i < n_measurements; i++)
916 {
917 speed_starttime ();
918 mpn_mul_fft (rp, n, ap, n, bp, n, k);
919 ttab[i] = speed_endtime ();
920 }
921
922 t = median (ttab, n_measurements);
923 fft_cache[k].n = n;
924 fft_cache[k].time = t;
925 return t;
926}
927
928#define INSERT_FFTTAB(idx, nval, kval) \
929 do { \
930 fft_tab[idx].n = nval; \
931 fft_tab[idx].k = kval; \
932 fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \
933 fft_tab[idx+1].k = (1 << 5) - 1; \
934 } while (0)
935
936int
937fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
938{
939 mp_size_t n, n1, prev_n1;
940 int k, best_k, last_best_k, kmax;
941 int eff, prev_eff;
942 double t0, t1;
943 int n_measurements;
944 mp_limb_t *ap, *bp, *rp;
945 mp_size_t alloc;
946 struct fft_table_nk *fft_tab;
947
948 fft_tab = mpn_fft_table3[p->sqr];
949
950 for (k = 0; k < FFT_CACHE_SIZE; k++)
951 fft_cache[k].n = 0;
952
953 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
954 {
955 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
956 }
957
958 if (print)
959 printf ("#define %s%*s", p->table_name, 38, "");
960
961 if (idx == 0)
962 {
963 INSERT_FFTTAB (0, nmin, initial_k);
964
965 if (print)
966 {
967 printf ("\\\n { ");
968 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
969 }
970
971 idx = 1;
972 }
973
974 ap = (mp_ptr) malloc (sizeof (mp_limb_t));
975 if (p->sqr)
976 bp = ap;
977 else
978 bp = (mp_ptr) malloc (sizeof (mp_limb_t));
979 rp = (mp_ptr) malloc (sizeof (mp_limb_t));
980 alloc = 1;
981
982 /* Round n to comply to initial k value */
983 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
984
985 n_measurements = (18 - initial_k) | 1;
986 n_measurements = MAX (n_measurements, MIN_REPS);
987 n_measurements = MIN (n_measurements, MAX_REPS);
988
989 last_best_k = initial_k;
990 best_k = initial_k;
991
992 while (n < nmax)
993 {
994 int start_k, end_k;
995
996 /* Assume the current best k is best until we hit its next FFT step. */
997 t0 = 99999;
998
999 prev_n1 = n + 1;
1000
1001 start_k = MAX (4, best_k - 4);
1002 end_k = MIN (24, best_k + 4);
1003 for (k = start_k; k <= end_k; k++)
1004 {
1005 n1 = mpn_fft_next_size (prev_n1, k);
1006
1007 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
1008
1009 if (eff < 70) /* avoid measuring too slow fft:s */
1010 continue;
1011
1012 if (n1 > alloc)
1013 {
1014 alloc = n1;
1015 if (p->sqr)
1016 {
1017 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1018 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1019 ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1020 mpn_random (ap, alloc);
1021 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1022 }
1023 else
1024 {
1025 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
1026 bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
1027 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
1028 ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
1029 mpn_random (ap, alloc);
1030 bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
1031 mpn_random (bp, alloc);
1032 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
1033 }
1034 }
1035
1036 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
1037
1038 if (t1 * n_measurements > 0.3)
1039 n_measurements -= 2;
1040 n_measurements = MAX (n_measurements, MIN_REPS);
1041
1042 if (t1 < t0)
1043 {
1044 best_k = k;
1045 t0 = t1;
1046 }
1047 }
1048
1049 n1 = mpn_fft_next_size (prev_n1, best_k);
1050
1051 if (last_best_k != best_k)
1052 {
1053 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
1054
1055 if (idx >= FFT_TABLE3_SIZE)
1056 {
1057 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1058 abort ();
1059 }
1060 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
1061
1062 if (print)
1063 {
1064 printf (", ");
1065 if (idx % 4 == 0)
1066 printf ("\\\n ");
1067 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1068 }
1069
1070 if (option_trace >= 2)
1071 {
1072 printf ("{%lu,%u}\n", prev_n1, best_k);
1073 fflush (stdout);
1074 }
1075
1076 last_best_k = best_k;
1077 idx++;
1078 }
1079
1080 for (;;)
1081 {
1082 prev_n1 = n1;
1083 prev_eff = fftfill (prev_n1, best_k, p->sqr);
1084 n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
1085 eff = fftfill (n1, best_k, p->sqr);
1086
1087 if (eff != prev_eff)
1088 break;
1089 }
1090
1091 n = prev_n1;
1092 }
1093
1094 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */
1095 kmax = MIN (kmax, 25-1);
1096 for (k = last_best_k + 1; k <= kmax; k++)
1097 {
1098 if (idx >= FFT_TABLE3_SIZE)
1099 {
1100 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1101 abort ();
1102 }
1103 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1104
1105 if (print)
1106 {
1107 printf (", ");
1108 if (idx % 4 == 0)
1109 printf ("\\\n ");
1110 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1111 }
1112
1113 idx++;
1114 }
1115
1116 if (print)
1117 printf (" }\n");
1118
1119 free (ap);
1120 if (! p->sqr)
1121 free (bp);
1122 free (rp);
1123
1124 return idx;
1125}
1126
1127void
1128fft (struct fft_param_t *p)
1129{
1130 mp_size_t size;
1131 int k, idx, initial_k;
1132
1133 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1134
1135#if 1
1136 {
1137 /* Use plain one() mechanism, for some reasonable initial values of k. The
1138 advantage is that we don't depend on mpn_fft_table3, which can therefore
1139 leave it completely uninitialized. */
1140
1141 static struct param_t param;
1142 mp_size_t thres, best_thres;
1143 int best_k;
1144 char buf[20];
1145
1146 best_thres = MP_SIZE_T_MAX;
1147 best_k = -1;
1148
1149 for (k = 5; k <= 7; k++)
1150 {
1151 param.name = p->modf_threshold_name;
1152 param.min_size = 100;
1153 param.max_size = 2000;
1154 param.function = p->mul_function;
1155 param.step_factor = 0.0;
1156 param.step = 4;
1157 param.function2 = p->mul_modf_function;
1158 param.noprint = 1;
1159 s.r = k;
1160 one (&thres, &param);
1161 if (thres < best_thres)
1162 {
1163 best_thres = thres;
1164 best_k = k;
1165 }
1166 }
1167
1168 *(p->p_modf_threshold) = best_thres;
1169 sprintf (buf, "k = %d", best_k);
1170 print_define_remark (p->modf_threshold_name, best_thres, buf);
1171 initial_k = best_k;
1172 }
1173#else
1174 size = p->first_size;
1175 for (;;)
1176 {
1177 double tk, tm;
1178
1179 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1180 k = mpn_fft_best_k (size, p->sqr);
1181
1182 if (size >= p->max_size)
1183 break;
1184
1185 s.size = size + fft_step_size (k) / 2;
1186 s.r = k;
1187 tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1188 if (tk == -1.0)
1189 abort ();
1190
1191 tm = tuneup_measure (p->mul_function, NULL, &s);
1192 if (tm == -1.0)
1193 abort ();
1194
1195 if (option_trace >= 2)
1196 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n",
1197 (long) size,
1198 (long) size + fft_step_size (k) / 2, k, tk,
1199 (long) s.size, tm);
1200
1201 if (tk < tm)
1202 {
1203 *p->p_modf_threshold = s.size;
1204 print_define (p->modf_threshold_name, *p->p_modf_threshold);
1205 break;
1206 }
1207 }
1208 initial_k = ?;
1209#endif
1210
1211 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1212
1213 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1214 printf ("#define %s_SIZE %d\n", p->table_name, idx);
1215
1216 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1217
1218 size = 2 * *p->p_modf_threshold; /* OK? */
1219 for (;;)
1220 {
1221 double tk, tm;
1222 mp_size_t mulmod_size, mul_size;;
1223
1224 if (size >= p->max_size)
1225 break;
1226
1227 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1228 mul_size = (size + mulmod_size) / 2; /* middle of step */
1229
1230 s.size = mulmod_size;
1231 tk = tuneup_measure (p->function, NULL, &s);
1232 if (tk == -1.0)
1233 abort ();
1234
1235 s.size = mul_size;
1236 tm = tuneup_measure (p->mul_function, NULL, &s);
1237 if (tm == -1.0)
1238 abort ();
1239
1240 if (option_trace >= 2)
1241 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n",
1242 (long) size,
1243 (long) mulmod_size, tk,
1244 (long) mul_size, tm);
1245
1246 size = mulmod_size;
1247
1248 if (tk < tm)
1249 {
1250 *p->p_threshold = s.size;
1251 print_define (p->threshold_name, *p->p_threshold);
1252 break;
1253 }
1254 }
1255}
1256
1257/* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This
1258 is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
1259 This is used in get_str and set_str. */
1260void
1261relspeed_div_1_vs_mul_1 (void)
1262{
1263 const size_t max_opsize = 100;
1264 mp_size_t n;
1265 long j;
1266 mp_limb_t rp[max_opsize];
1267 mp_limb_t ap[max_opsize];
1268 double multime, divtime;
1269
1270 mpn_random (ap, max_opsize);
1271
1272 multime = 0;
1273 for (n = max_opsize; n > 1; n--)
1274 {
1275 mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1276 speed_starttime ();
1277 for (j = speed_precision; j != 0 ; j--)
1278 mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1279 multime += speed_endtime () / n;
1280 }
1281
1282 divtime = 0;
1283 for (n = max_opsize; n > 1; n--)
1284 {
1285 /* Make input divisible for good measure. */
1286 ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
1287
1288#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1289 mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1290 MP_BASES_BIG_BASE_BINVERTED_10,
1291 MP_BASES_BIG_BASE_CTZ_10);
1292#else
1293 mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1294#endif
1295 speed_starttime ();
1296 for (j = speed_precision; j != 0 ; j--)
1297 {
1298#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
1299 mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
1300 MP_BASES_BIG_BASE_BINVERTED_10,
1301 MP_BASES_BIG_BASE_CTZ_10);
1302#else
1303 mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
1304#endif
1305 }
1306 divtime += speed_endtime () / n;
1307 }
1308
1309 print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
1310}
1311
1312
1313/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1314 giving wrong results. */
1315void
1316tune_mul_n (void)
1317{
1318 static struct param_t param;
1319 mp_size_t next_toom_start;
1320 int something_changed;
1321
1322 param.function = speed_mpn_mul_n;
1323
1324 param.name = "MUL_TOOM22_THRESHOLD";
1325 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1326 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1327 one (&mul_toom22_threshold, &param);
1328
1329 param.noprint = 1;
1330
1331 /* Threshold sequence loop. Disable functions that would be used in a very
1332 narrow range, re-measuring things when that happens. */
1333 something_changed = 1;
1334 while (something_changed)
1335 {
1336 something_changed = 0;
1337
1338 next_toom_start = mul_toom22_threshold;
1339
1340 if (mul_toom33_threshold != 0)
1341 {
1342 param.name = "MUL_TOOM33_THRESHOLD";
1343 param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
1344 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1345 one (&mul_toom33_threshold, &param);
1346
1347 if (next_toom_start * 1.05 >= mul_toom33_threshold)
1348 {
1349 mul_toom33_threshold = 0;
1350 something_changed = 1;
1351 }
1352 }
1353
1354 next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
1355
1356 if (mul_toom44_threshold != 0)
1357 {
1358 param.name = "MUL_TOOM44_THRESHOLD";
1359 param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
1360 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1361 one (&mul_toom44_threshold, &param);
1362
1363 if (next_toom_start * 1.05 >= mul_toom44_threshold)
1364 {
1365 mul_toom44_threshold = 0;
1366 something_changed = 1;
1367 }
1368 }
1369
1370 next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
1371
1372 if (mul_toom6h_threshold != 0)
1373 {
1374 param.name = "MUL_TOOM6H_THRESHOLD";
1375 param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
1376 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1377 one (&mul_toom6h_threshold, &param);
1378
1379 if (next_toom_start * 1.05 >= mul_toom6h_threshold)
1380 {
1381 mul_toom6h_threshold = 0;
1382 something_changed = 1;
1383 }
1384 }
1385
1386 next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
1387
1388 if (mul_toom8h_threshold != 0)
1389 {
1390 param.name = "MUL_TOOM8H_THRESHOLD";
1391 param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
1392 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1393 one (&mul_toom8h_threshold, &param);
1394
1395 if (next_toom_start * 1.05 >= mul_toom8h_threshold)
1396 {
1397 mul_toom8h_threshold = 0;
1398 something_changed = 1;
1399 }
1400 }
1401 }
1402
1403 print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
1404 print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
1405 print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
1406 print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
1407
1408 /* disabled until tuned */
1409 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1410}
1411
1412void
1413tune_mul (void)
1414{
1415 static struct param_t param;
1416 mp_size_t thres;
1417
1418 param.noprint = 1;
1419
1420 param.function = speed_mpn_toom32_for_toom43_mul;
1421 param.function2 = speed_mpn_toom43_for_toom32_mul;
1422 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1423 param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
1424 one (&thres, &param);
1425 mul_toom32_to_toom43_threshold = thres * 17 / 24;
1426 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1427
1428 param.function = speed_mpn_toom32_for_toom53_mul;
1429 param.function2 = speed_mpn_toom53_for_toom32_mul;
1430 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1431 param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
1432 one (&thres, &param);
1433 mul_toom32_to_toom53_threshold = thres * 19 / 30;
1434 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1435
1436 param.function = speed_mpn_toom42_for_toom53_mul;
1437 param.function2 = speed_mpn_toom53_for_toom42_mul;
1438 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1439 param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
1440 one (&thres, &param);
1441 mul_toom42_to_toom53_threshold = thres * 11 / 20;
1442 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1443
1444 param.function = speed_mpn_toom42_mul;
1445 param.function2 = speed_mpn_toom63_mul;
1446 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1447 param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
1448 one (&thres, &param);
1449 mul_toom42_to_toom63_threshold = thres / 2;
1450 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1451
1452 /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
1453 param.function = speed_mpn_toom43_for_toom54_mul;
1454 param.function2 = speed_mpn_toom54_for_toom43_mul;
1455 param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
1456 param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
1457 one (&thres, &param);
1458 mul_toom43_to_toom54_threshold = thres * 5 / 6;
1459 print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
1460}
1461
1462
1463void
1464tune_mullo (void)
1465{
1466 static struct param_t param;
1467
1468 param.function = speed_mpn_mullo_n;
1469
1470 param.name = "MULLO_BASECASE_THRESHOLD";
1471 param.min_size = 2;
1472 param.min_is_always = 1;
1473 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1474 param.stop_factor = 1.5;
1475 param.noprint = 1;
1476 one (&mullo_basecase_threshold, &param);
1477
1478 param.name = "MULLO_DC_THRESHOLD";
1479 param.min_size = 8;
1480 param.min_is_always = 0;
1481 param.max_size = 1000;
1482 one (&mullo_dc_threshold, &param);
1483
1484 if (mullo_basecase_threshold >= mullo_dc_threshold)
1485 {
1486 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1487 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1488 }
1489 else
1490 {
1491 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1492 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1493 }
1494
1495 if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
1496 {
1497 param.name = "MULLO_MUL_N_THRESHOLD";
1498 param.min_size = mullo_dc_threshold;
1499 param.max_size = 2 * mul_fft_threshold;
1500 param.noprint = 0;
1501 param.step_factor = 0.03;
1502 one (&mullo_mul_n_threshold, &param);
1503 }
1504 else
1505 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1506 "without FFT use mullo forever");
1507}
1508
1509void
1510tune_sqrlo (void)
1511{
1512 static struct param_t param;
1513
1514 param.function = speed_mpn_sqrlo;
1515
1516 param.name = "SQRLO_BASECASE_THRESHOLD";
1517 param.min_size = 2;
1518 param.min_is_always = 1;
1519 param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
1520 param.stop_factor = 1.5;
1521 param.noprint = 1;
1522 one (&sqrlo_basecase_threshold, &param);
1523
1524 param.name = "SQRLO_DC_THRESHOLD";
1525 param.min_size = 8;
1526 param.min_is_always = 0;
1527 param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
1528 one (&sqrlo_dc_threshold, &param);
1529
1530 if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
1531 {
1532 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
1533 print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
1534 }
1535 else
1536 {
1537 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
1538 print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
1539 }
1540
1541 if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
1542 {
1543 param.name = "SQRLO_SQR_THRESHOLD";
1544 param.min_size = sqrlo_dc_threshold;
1545 param.max_size = 2 * sqr_fft_threshold;
1546 param.noprint = 0;
1547 param.step_factor = 0.03;
1548 one (&sqrlo_sqr_threshold, &param);
1549 }
1550 else
1551 print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
1552 "without FFT use sqrlo forever");
1553}
1554
1555void
1556tune_mulmid (void)
1557{
1558 static struct param_t param;
1559
1560 param.name = "MULMID_TOOM42_THRESHOLD";
1561 param.function = speed_mpn_mulmid_n;
1562 param.min_size = 4;
1563 param.max_size = 100;
1564 one (&mulmid_toom42_threshold, &param);
1565}
1566
1567void
1568tune_mulmod_bnm1 (void)
1569{
1570 static struct param_t param;
1571
1572 param.name = "MULMOD_BNM1_THRESHOLD";
1573 param.function = speed_mpn_mulmod_bnm1;
1574 param.min_size = 4;
1575 param.max_size = 100;
1576 one (&mulmod_bnm1_threshold, &param);
1577}
1578
1579void
1580tune_sqrmod_bnm1 (void)
1581{
1582 static struct param_t param;
1583
1584 param.name = "SQRMOD_BNM1_THRESHOLD";
1585 param.function = speed_mpn_sqrmod_bnm1;
1586 param.min_size = 4;
1587 param.max_size = 100;
1588 one (&sqrmod_bnm1_threshold, &param);
1589}
1590
1591
1592/* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1593 is faster only at size==2 then we don't want to bother with extra code
1594 just for that. Start karatsuba from 4 same as MUL above. */
1595
1596void
1597tune_sqr (void)
1598{
1599 /* disabled until tuned */
1600 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1601
1602 if (HAVE_NATIVE_mpn_sqr_basecase)
1603 {
1604 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1605 sqr_basecase_threshold = 0;
1606 }
1607 else
1608 {
1609 static struct param_t param;
1610 param.name = "SQR_BASECASE_THRESHOLD";
1611 param.function = speed_mpn_sqr;
1612 param.min_size = 3;
1613 param.min_is_always = 1;
1614 param.max_size = TUNE_SQR_TOOM2_MAX;
1615 param.noprint = 1;
1616 one (&sqr_basecase_threshold, &param);
1617 }
1618
1619 {
1620 static struct param_t param;
1621 param.name = "SQR_TOOM2_THRESHOLD";
1622 param.function = speed_mpn_sqr;
1623 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1624 param.max_size = TUNE_SQR_TOOM2_MAX;
1625 param.noprint = 1;
1626 one (&sqr_toom2_threshold, &param);
1627
1628 if (! HAVE_NATIVE_mpn_sqr_basecase
1629 && sqr_toom2_threshold < sqr_basecase_threshold)
1630 {
1631 /* Karatsuba becomes faster than mul_basecase before
1632 sqr_basecase does. Arrange for the expression
1633 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1634 selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1635 SQR_TOOM2_THRESHOLD to zero, making
1636 SQR_BASECASE_THRESHOLD the toom2 threshold. */
1637
1638 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1639 SQR_TOOM2_THRESHOLD = 0;
1640
1641 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1642 "toom2");
1643 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1644 "never sqr_basecase");
1645 }
1646 else
1647 {
1648 if (! HAVE_NATIVE_mpn_sqr_basecase)
1649 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1650 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1651 }
1652 }
1653
1654 {
1655 static struct param_t param;
1656 mp_size_t next_toom_start;
1657 int something_changed;
1658
1659 param.function = speed_mpn_sqr;
1660 param.noprint = 1;
1661
1662 /* Threshold sequence loop. Disable functions that would be used in a very
1663 narrow range, re-measuring things when that happens. */
1664 something_changed = 1;
1665 while (something_changed)
1666 {
1667 something_changed = 0;
1668
1669 next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1670
1671 sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
1672 param.name = "SQR_TOOM3_THRESHOLD";
1673 param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
1674 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1675 one (&sqr_toom3_threshold, &param);
1676
1677 next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
1678
1679 if (sqr_toom4_threshold != 0)
1680 {
1681 param.name = "SQR_TOOM4_THRESHOLD";
1682 sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
1683 param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
1684 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1685 one (&sqr_toom4_threshold, &param);
1686
1687 if (next_toom_start * 1.05 >= sqr_toom4_threshold)
1688 {
1689 sqr_toom4_threshold = 0;
1690 something_changed = 1;
1691 }
1692 }
1693
1694 next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
1695
1696 if (sqr_toom6_threshold != 0)
1697 {
1698 param.name = "SQR_TOOM6_THRESHOLD";
1699 sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
1700 param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
1701 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1702 one (&sqr_toom6_threshold, &param);
1703
1704 if (next_toom_start * 1.05 >= sqr_toom6_threshold)
1705 {
1706 sqr_toom6_threshold = 0;
1707 something_changed = 1;
1708 }
1709 }
1710
1711 next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
1712
1713 if (sqr_toom8_threshold != 0)
1714 {
1715 param.name = "SQR_TOOM8_THRESHOLD";
1716 sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
1717 param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
1718 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1719 one (&sqr_toom8_threshold, &param);
1720
1721 if (next_toom_start * 1.05 >= sqr_toom8_threshold)
1722 {
1723 sqr_toom8_threshold = 0;
1724 something_changed = 1;
1725 }
1726 }
1727 }
1728
1729 print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
1730 print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
1731 print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
1732 print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
1733 }
1734}
1735
1736
1737void
1738tune_dc_div (void)
1739{
1740 s.r = 0; /* clear to make speed function do 2n/n */
1741 {
1742 static struct param_t param;
1743 param.name = "DC_DIV_QR_THRESHOLD";
1744 param.function = speed_mpn_sbpi1_div_qr;
1745 param.function2 = speed_mpn_dcpi1_div_qr;
1746 param.min_size = 6;
1747 one (&dc_div_qr_threshold, &param);
1748 }
1749 {
1750 static struct param_t param;
1751 param.name = "DC_DIVAPPR_Q_THRESHOLD";
1752 param.function = speed_mpn_sbpi1_divappr_q;
1753 param.function2 = speed_mpn_dcpi1_divappr_q;
1754 param.min_size = 6;
1755 one (&dc_divappr_q_threshold, &param);
1756 }
1757}
1758
1759static double
1760speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1761{
1762 if (s->size < DC_DIV_QR_THRESHOLD)
1763 return speed_mpn_sbpi1_div_qr (s);
1764 else
1765 return speed_mpn_dcpi1_div_qr (s);
1766}
1767
1768void
1769tune_mu_div (void)
1770{
1771 s.r = 0; /* clear to make speed function do 2n/n */
1772 {
1773 static struct param_t param;
1774 param.name = "MU_DIV_QR_THRESHOLD";
1775 param.function = speed_mpn_dcpi1_div_qr;
1776 param.function2 = speed_mpn_mu_div_qr;
1777 param.min_size = mul_toom22_threshold;
1778 param.max_size = 5000;
1779 param.step_factor = 0.02;
1780 one (&mu_div_qr_threshold, &param);
1781 }
1782 {
1783 static struct param_t param;
1784 param.name = "MU_DIVAPPR_Q_THRESHOLD";
1785 param.function = speed_mpn_dcpi1_divappr_q;
1786 param.function2 = speed_mpn_mu_divappr_q;
1787 param.min_size = mul_toom22_threshold;
1788 param.max_size = 5000;
1789 param.step_factor = 0.02;
1790 one (&mu_divappr_q_threshold, &param);
1791 }
1792 {
1793 static struct param_t param;
1794 param.name = "MUPI_DIV_QR_THRESHOLD";
1795 param.function = speed_mpn_sbordcpi1_div_qr;
1796 param.function2 = speed_mpn_mupi_div_qr;
1797 param.min_size = 6;
1798 param.min_is_always = 1;
1799 param.max_size = 1000;
1800 param.step_factor = 0.02;
1801 one (&mupi_div_qr_threshold, &param);
1802 }
1803}
1804
1805void
1806tune_dc_bdiv (void)
1807{
1808 s.r = 0; /* clear to make speed function do 2n/n*/
1809 {
1810 static struct param_t param;
1811 param.name = "DC_BDIV_QR_THRESHOLD";
1812 param.function = speed_mpn_sbpi1_bdiv_qr;
1813 param.function2 = speed_mpn_dcpi1_bdiv_qr;
1814 param.min_size = 4;
1815 one (&dc_bdiv_qr_threshold, &param);
1816 }
1817 {
1818 static struct param_t param;
1819 param.name = "DC_BDIV_Q_THRESHOLD";
1820 param.function = speed_mpn_sbpi1_bdiv_q;
1821 param.function2 = speed_mpn_dcpi1_bdiv_q;
1822 param.min_size = 4;
1823 one (&dc_bdiv_q_threshold, &param);
1824 }
1825}
1826
1827void
1828tune_mu_bdiv (void)
1829{
1830 s.r = 0; /* clear to make speed function do 2n/n*/
1831 {
1832 static struct param_t param;
1833 param.name = "MU_BDIV_QR_THRESHOLD";
1834 param.function = speed_mpn_dcpi1_bdiv_qr;
1835 param.function2 = speed_mpn_mu_bdiv_qr;
1836 param.min_size = dc_bdiv_qr_threshold;
1837 param.max_size = 5000;
1838 param.step_factor = 0.02;
1839 one (&mu_bdiv_qr_threshold, &param);
1840 }
1841 {
1842 static struct param_t param;
1843 param.name = "MU_BDIV_Q_THRESHOLD";
1844 param.function = speed_mpn_dcpi1_bdiv_q;
1845 param.function2 = speed_mpn_mu_bdiv_q;
1846 param.min_size = dc_bdiv_q_threshold;
1847 param.max_size = 5000;
1848 param.step_factor = 0.02;
1849 one (&mu_bdiv_q_threshold, &param);
1850 }
1851}
1852
1853void
1854tune_invertappr (void)
1855{
1856 static struct param_t param;
1857
1858 param.function = speed_mpn_ni_invertappr;
1859 param.name = "INV_MULMOD_BNM1_THRESHOLD";
1860 param.min_size = 5;
1861 one (&inv_mulmod_bnm1_threshold, &param);
1862
1863 param.function = speed_mpn_invertappr;
1864 param.name = "INV_NEWTON_THRESHOLD";
1865 param.min_size = 5;
1866 one (&inv_newton_threshold, &param);
1867}
1868
1869void
1870tune_invert (void)
1871{
1872 static struct param_t param;
1873
1874 param.function = speed_mpn_invert;
1875 param.name = "INV_APPR_THRESHOLD";
1876 param.min_size = 5;
1877 one (&inv_appr_threshold, &param);
1878}
1879
1880void
1881tune_binvert (void)
1882{
1883 static struct param_t param;
1884
1885 param.function = speed_mpn_binvert;
1886 param.name = "BINV_NEWTON_THRESHOLD";
1887 param.min_size = 8; /* pointless with smaller operands */
1888 one (&binv_newton_threshold, &param);
1889}
1890
1891void
1892tune_redc (void)
1893{
1894#define TUNE_REDC_2_MAX 100
1895#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1896#define WANT_REDC_2 1
1897#endif
1898
1899#if WANT_REDC_2
1900 {
1901 static struct param_t param;
1902 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1903 param.function = speed_mpn_redc_1;
1904 param.function2 = speed_mpn_redc_2;
1905 param.min_size = 1;
1906 param.min_is_always = 1;
1907 param.max_size = TUNE_REDC_2_MAX;
1908 param.noprint = 1;
1909 param.stop_factor = 1.5;
1910 one (&redc_1_to_redc_2_threshold, &param);
1911 }
1912 {
1913 static struct param_t param;
1914 param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1915 param.function = speed_mpn_redc_2;
1916 param.function2 = speed_mpn_redc_n;
1917 param.min_size = 16;
1918 param.noprint = 1;
1919 one (&redc_2_to_redc_n_threshold, &param);
1920 }
1921 if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1922 {
1923 redc_2_to_redc_n_threshold = 0; /* disable redc_2 */
1924
1925 /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1926 REDC_1_TO_REDC_2_THRESHOLD. */
1927 {
1928 static struct param_t param;
1929 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1930 param.function = speed_mpn_redc_1;
1931 param.function2 = speed_mpn_redc_n;
1932 param.min_size = 16;
1933 param.noprint = 1;
1934 one (&redc_1_to_redc_2_threshold, &param);
1935 }
1936 }
1937 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1938 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1939#else
1940 {
1941 static struct param_t param;
1942 param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1943 param.function = speed_mpn_redc_1;
1944 param.function2 = speed_mpn_redc_n;
1945 param.min_size = 16;
1946 one (&redc_1_to_redc_n_threshold, &param);
1947 }
1948#endif
1949}
1950
1951void
1952tune_matrix22_mul (void)
1953{
1954 static struct param_t param;
1955 param.name = "MATRIX22_STRASSEN_THRESHOLD";
1956 param.function = speed_mpn_matrix22_mul;
1957 param.min_size = 2;
1958 one (&matrix22_strassen_threshold, &param);
1959}
1960
1961void
1962tune_hgcd2 (void)
1963{
1964 static struct param_t param;
1965 hgcd2_func_t *f[5] =
1966 { mpn_hgcd2_1,
1967 mpn_hgcd2_2,
1968 mpn_hgcd2_3,
1969 mpn_hgcd2_4,
1970 mpn_hgcd2_5 };
1971 speed_function_t speed_f[5] =
1972 { speed_mpn_hgcd2_1,
1973 speed_mpn_hgcd2_2,
1974 speed_mpn_hgcd2_3,
1975 speed_mpn_hgcd2_4,
1976 speed_mpn_hgcd2_5 };
1977 int best;
1978
1979 s.size = 1;
1980 best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
1981
1982 /* Use selected function when tuning hgcd and gcd */
1983 hgcd2_func = f[best];
1984}
1985
1986void
1987tune_hgcd (void)
1988{
1989 static struct param_t param;
1990 param.name = "HGCD_THRESHOLD";
1991 param.function = speed_mpn_hgcd;
1992 /* We seem to get strange results for small sizes */
1993 param.min_size = 30;
1994 one (&hgcd_threshold, &param);
1995}
1996
1997void
1998tune_hgcd_appr (void)
1999{
2000 static struct param_t param;
2001 param.name = "HGCD_APPR_THRESHOLD";
2002 param.function = speed_mpn_hgcd_appr;
2003 /* We seem to get strange results for small sizes */
2004 param.min_size = 50;
2005 param.stop_since_change = 150;
2006 one (&hgcd_appr_threshold, &param);
2007}
2008
2009void
2010tune_hgcd_reduce (void)
2011{
2012 static struct param_t param;
2013 param.name = "HGCD_REDUCE_THRESHOLD";
2014 param.function = speed_mpn_hgcd_reduce;
2015 param.min_size = 30;
2016 param.max_size = 7000;
2017 param.step_factor = 0.04;
2018 one (&hgcd_reduce_threshold, &param);
2019}
2020
2021void
2022tune_gcd_dc (void)
2023{
2024 static struct param_t param;
2025 param.name = "GCD_DC_THRESHOLD";
2026 param.function = speed_mpn_gcd;
2027 param.min_size = hgcd_threshold;
2028 param.max_size = 3000;
2029 param.step_factor = 0.02;
2030 one (&gcd_dc_threshold, &param);
2031}
2032
2033void
2034tune_gcdext_dc (void)
2035{
2036 static struct param_t param;
2037 param.name = "GCDEXT_DC_THRESHOLD";
2038 param.function = speed_mpn_gcdext;
2039 param.min_size = hgcd_threshold;
2040 param.max_size = 3000;
2041 param.step_factor = 0.02;
2042 one (&gcdext_dc_threshold, &param);
2043}
2044
2045/* In tune_powm_sec we compute the table used by the win_size function. The
2046 cutoff points are in exponent bits, disregarding other operand sizes. It is
2047 not possible to use the one framework since it currently uses a granularity
2048 of full limbs.
2049*/
2050
2051/* This win_size replaces the variant in the powm code, allowing us to
2052 control k in the k-ary algorithms. */
2053int winsize;
2054int
2055win_size (mp_bitcnt_t eb)
2056{
2057 return winsize;
2058}
2059
2060void
2061tune_powm_sec (void)
2062{
2063 mp_size_t n;
2064 int k, i;
2065 mp_size_t itch;
2066 mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
2067 const int n_max = 3000 / GMP_NUMB_BITS;
2068 const int n_measurements = 5;
2069 mp_ptr rp, bp, ep, mp, tp;
2070 double ttab[n_measurements], tk, tkp1;
2071 TMP_DECL;
2072 TMP_MARK;
2073
2074 possible_nbits_cutoff = 0;
2075
2076 k = 1;
2077
2078 winsize = 10; /* the itch function needs this */
2079 itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
2080
2081 rp = TMP_ALLOC_LIMBS (n_max);
2082 bp = TMP_ALLOC_LIMBS (n_max);
2083 ep = TMP_ALLOC_LIMBS (n_max);
2084 mp = TMP_ALLOC_LIMBS (n_max);
2085 tp = TMP_ALLOC_LIMBS (itch);
2086
2087 mpn_random (bp, n_max);
2088 mpn_random (mp, n_max);
2089 mp[0] |= 1;
2090
2091/* How about taking the M operand size into account?
2092
2093 An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
2094 B = O(M)).
2095
2096 Using k-ary and no sliding window, the precomputation will need time
2097 O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
2098 O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
2099
2100 An operation R=powm_sec(B,E,N) will take time like powm.
2101
2102 Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
2103 main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
2104 O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
2105 table reads, respectively. */
2106
2107 printf ("#define POWM_SEC_TABLE ");
2108
2109 /* For nbits == 1, we should always use k == 1, so no need to tune
2110 that. Starting with nbits == 2 also ensure that nbits always is
2111 larger than the windowsize k+1. */
2112 for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
2113 {
2114 n = (nbits - 1) / GMP_NUMB_BITS + 1;
2115
2116 /* Generate E such that sliding-window for k and k+1 works equally
2117 well/poorly (but sliding is not used in powm_sec, of course). */
2118 for (i = 0; i < n; i++)
2119 ep[i] = ~CNST_LIMB(0);
2120
2121 winsize = k;
2122 for (i = 0; i < n_measurements; i++)
2123 {
2124 speed_starttime ();
2125 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2126 ttab[i] = speed_endtime ();
2127 }
2128 tk = median (ttab, n_measurements);
2129
2130 winsize = k + 1;
2131 speed_starttime ();
2132 for (i = 0; i < n_measurements; i++)
2133 {
2134 speed_starttime ();
2135 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
2136 ttab[i] = speed_endtime ();
2137 }
2138 tkp1 = median (ttab, n_measurements);
2139/*
2140 printf ("testing: %ld, %d", nbits, k, ep[n-1]);
2141 printf (" %10.5f %10.5f\n", tk, tkp1);
2142*/
2143 if (tkp1 < tk)
2144 {
2145 if (possible_nbits_cutoff)
2146 {
2147 /* Two consecutive sizes indicate k increase, obey. */
2148
2149 /* Must always have x[k] >= k */
2150 ASSERT_ALWAYS (possible_nbits_cutoff >= k);
2151
2152 if (k > 1)
2153 printf (",");
2154 printf ("%ld", (long) possible_nbits_cutoff);
2155 k++;
2156 possible_nbits_cutoff = 0;
2157 }
2158 else
2159 {
2160 /* One measurement indicate k increase, save nbits for further
2161 consideration. */
2162 /* The new larger k gets used for sizes > the cutoff
2163 value, hence the cutoff should be one less than the
2164 smallest size where it gives a speedup. */
2165 possible_nbits_cutoff = nbits - 1;
2166 }
2167 }
2168 else
2169 possible_nbits_cutoff = 0;
2170
2171 nbits_next = nbits * 65 / 64;
2172 nbits = nbits_next + (nbits_next == nbits);
2173 }
2174 printf ("\n");
2175 TMP_FREE;
2176}
2177
2178
2179/* size_extra==1 reflects the fact that with high<divisor one division is
2180 always skipped. Forcing high<divisor while testing ensures consistency
2181 while stepping through sizes, ie. that size-1 divides will be done each
2182 time.
2183
2184 min_size==2 and min_is_always are used so that if plain division is only
2185 better at size==1 then don't bother including that code just for that
2186 case, instead go with preinv always and get a size saving. */
2187
2188#define DIV_1_PARAMS \
2189 param.check_size = 256; \
2190 param.min_size = 2; \
2191 param.min_is_always = 1; \
2192 param.data_high = DATA_HIGH_LT_R; \
2193 param.size_extra = 1; \
2194 param.stop_factor = 2.0;
2195
2196
2197double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
2198
2199void
2200tune_divrem_1 (void)
2201{
2202 /* plain version by default */
2203 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
2204
2205 /* No support for tuning native assembler code, do that by hand and put
2206 the results in the .asm file, there's no need for such thresholds to
2207 appear in gmp-mparam.h. */
2208 if (HAVE_NATIVE_mpn_divrem_1)
2209 return;
2210
2211 if (GMP_NAIL_BITS != 0)
2212 {
2213 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2214 "no preinv with nails");
2215 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2216 "no preinv with nails");
2217 return;
2218 }
2219
2220 if (UDIV_PREINV_ALWAYS)
2221 {
2222 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
2223 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
2224 return;
2225 }
2226
2227 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
2228
2229 /* Tune for the integer part of mpn_divrem_1. This will very possibly be
2230 a bit out for the fractional part, but that's too bad, the integer part
2231 is more important. */
2232 {
2233 static struct param_t param;
2234 param.name = "DIVREM_1_NORM_THRESHOLD";
2235 DIV_1_PARAMS;
2236 s.r = randlimb_norm ();
2237 param.function = speed_mpn_divrem_1_tune;
2238 one (&divrem_1_norm_threshold, &param);
2239 }
2240 {
2241 static struct param_t param;
2242 param.name = "DIVREM_1_UNNORM_THRESHOLD";
2243 DIV_1_PARAMS;
2244 s.r = randlimb_half ();
2245 param.function = speed_mpn_divrem_1_tune;
2246 one (&divrem_1_unnorm_threshold, &param);
2247 }
2248}
2249
2250void
2251tune_div_qr_1 (void)
2252{
2253 if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
2254 {
2255 static struct param_t param;
2256 speed_function_t f[2] =
2257 {
2258 speed_mpn_div_qr_1n_pi1_1,
2259 speed_mpn_div_qr_1n_pi1_2,
2260 };
2261
2262 s.size = 10;
2263 s.r = randlimb_norm ();
2264
2265 one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
2266 }
2267
2268 {
2269 static struct param_t param;
2270 param.name = "DIV_QR_1_NORM_THRESHOLD";
2271 DIV_1_PARAMS;
2272 param.min_size = 1;
2273 param.min_is_always = 0;
2274 s.r = randlimb_norm ();
2275 param.function = speed_mpn_div_qr_1_tune;
2276 one (&div_qr_1_norm_threshold, &param);
2277 }
2278 {
2279 static struct param_t param;
2280 param.name = "DIV_QR_1_UNNORM_THRESHOLD";
2281 DIV_1_PARAMS;
2282 param.min_size = 1;
2283 param.min_is_always = 0;
2284 s.r = randlimb_half();
2285 param.function = speed_mpn_div_qr_1_tune;
2286 one (&div_qr_1_unnorm_threshold, &param);
2287 }
2288}
2289
2290
2291void
2292tune_mod_1 (void)
2293{
2294 /* No support for tuning native assembler code, do that by hand and put
2295 the results in the .asm file, there's no need for such thresholds to
2296 appear in gmp-mparam.h. */
2297 if (HAVE_NATIVE_mpn_mod_1)
2298 return;
2299
2300 if (GMP_NAIL_BITS != 0)
2301 {
2302 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2303 "no preinv with nails");
2304 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2305 "no preinv with nails");
2306 return;
2307 }
2308
2309 if (!HAVE_NATIVE_mpn_mod_1_1p)
2310 {
2311 static struct param_t param;
2312 speed_function_t f[2] =
2313 {
2314 speed_mpn_mod_1_1_1,
2315 speed_mpn_mod_1_1_2,
2316 };
2317
2318 s.size = 10;
2319 s.r = randlimb_half ();
2320 one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
2321 }
2322
2323 if (UDIV_PREINV_ALWAYS)
2324 {
2325 print_define ("MOD_1_NORM_THRESHOLD", 0L);
2326 print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
2327 }
2328 else
2329 {
2330 {
2331 static struct param_t param;
2332 param.name = "MOD_1_NORM_THRESHOLD";
2333 DIV_1_PARAMS;
2334 s.r = randlimb_norm ();
2335 param.function = speed_mpn_mod_1_tune;
2336 one (&mod_1_norm_threshold, &param);
2337 }
2338 {
2339 static struct param_t param;
2340 param.name = "MOD_1_UNNORM_THRESHOLD";
2341 DIV_1_PARAMS;
2342 s.r = randlimb_half ();
2343 param.function = speed_mpn_mod_1_tune;
2344 one (&mod_1_unnorm_threshold, &param);
2345 }
2346 }
2347 {
2348 static struct param_t param;
2349
2350 param.check_size = 256;
2351
2352 s.r = randlimb_norm ();
2353 param.function = speed_mpn_mod_1_tune;
2354
2355 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
2356 param.min_size = 2;
2357 one (&mod_1n_to_mod_1_1_threshold, &param);
2358 }
2359
2360 {
2361 static struct param_t param;
2362
2363 param.check_size = 256;
2364 s.r = randlimb_half ();
2365 param.noprint = 1;
2366
2367 param.function = speed_mpn_mod_1_1;
2368 param.function2 = speed_mpn_mod_1_2;
2369 param.min_is_always = 1;
2370 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
2371 param.min_size = 2;
2372 one (&mod_1_1_to_mod_1_2_threshold, &param);
2373
2374 param.function = speed_mpn_mod_1_2;
2375 param.function2 = speed_mpn_mod_1_4;
2376 param.min_is_always = 1;
2377 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
2378 param.min_size = 1;
2379 one (&mod_1_2_to_mod_1_4_threshold, &param);
2380
2381 if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
2382 {
2383 /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
2384 mod_1_2_to_mod_1_4_threshold = 0;
2385
2386 param.function = speed_mpn_mod_1_1;
2387 param.function2 = speed_mpn_mod_1_4;
2388 param.min_is_always = 1;
2389 param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
2390 param.min_size = 2;
2391 one (&mod_1_1_to_mod_1_2_threshold, &param);
2392 }
2393
2394 param.function = speed_mpn_mod_1_tune;
2395 param.function2 = NULL;
2396 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
2397 param.min_size = 2;
2398 param.min_is_always = 0;
2399 one (&mod_1u_to_mod_1_1_threshold, &param);
2400
2401 if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
2402 mod_1_1_to_mod_1_2_threshold = 0;
2403 if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
2404 mod_1_2_to_mod_1_4_threshold = 0;
2405
2406 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
2407 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
2408 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
2409 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
2410 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
2411 }
2412
2413 {
2414 static struct param_t param;
2415
2416 param.check_size = 256;
2417
2418 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
2419 s.r = randlimb_norm ();
2420 param.function = speed_mpn_preinv_mod_1;
2421 param.function2 = speed_mpn_mod_1_tune;
2422 param.min_size = 1;
2423 one (&preinv_mod_1_to_mod_1_threshold, &param);
2424 }
2425}
2426
2427
2428/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
2429 imply that udiv_qrnnd_preinv is worth using, but it seems most
2430 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
2431 directly. */
2432
2433void
2434tune_preinv_divrem_1 (void)
2435{
2436 static struct param_t param;
2437 speed_function_t divrem_1;
2438 const char *divrem_1_name;
2439 double t1, t2;
2440
2441 if (GMP_NAIL_BITS != 0)
2442 {
2443 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
2444 return;
2445 }
2446
2447 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
2448 it's faster than mpn_divrem_1. */
2449 if (HAVE_NATIVE_mpn_preinv_divrem_1)
2450 {
2451 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
2452 return;
2453 }
2454
2455 /* If udiv_qrnnd_preinv is the only division method then of course
2456 mpn_preinv_divrem_1 should be used. */
2457 if (UDIV_PREINV_ALWAYS)
2458 {
2459 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
2460 return;
2461 }
2462
2463 /* If we've got an assembler version of mpn_divrem_1, then compare against
2464 that, not the mpn_divrem_1_div generic C. */
2465 if (HAVE_NATIVE_mpn_divrem_1)
2466 {
2467 divrem_1 = speed_mpn_divrem_1;
2468 divrem_1_name = "mpn_divrem_1";
2469 }
2470 else
2471 {
2472 divrem_1 = speed_mpn_divrem_1_div;
2473 divrem_1_name = "mpn_divrem_1_div";
2474 }
2475
2476 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
2477 s.size = 200; /* generous but not too big */
2478 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
2479 since in general that's probably most common, though in fact for a
2480 64-bit limb mp_bases[10].big_base is normalized. */
2481 s.r = urandom() & (GMP_NUMB_MASK >> 4);
2482 if (s.r == 0) s.r = 123;
2483
2484 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
2485 t2 = tuneup_measure (divrem_1, &param, &s);
2486 if (t1 == -1.0 || t2 == -1.0)
2487 {
2488 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
2489 divrem_1_name, (long) s.size);
2490 abort ();
2491 }
2492 if (option_trace >= 1)
2493 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
2494 (long) s.size, t1, divrem_1_name, t2);
2495
2496 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
2497}
2498
2499
2500
2501void
2502tune_divrem_2 (void)
2503{
2504 static struct param_t param;
2505
2506 /* No support for tuning native assembler code, do that by hand and put
2507 the results in the .asm file, and there's no need for such thresholds
2508 to appear in gmp-mparam.h. */
2509 if (HAVE_NATIVE_mpn_divrem_2)
2510 return;
2511
2512 if (GMP_NAIL_BITS != 0)
2513 {
2514 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
2515 "no preinv with nails");
2516 return;
2517 }
2518
2519 if (UDIV_PREINV_ALWAYS)
2520 {
2521 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
2522 return;
2523 }
2524
2525 /* Tune for the integer part of mpn_divrem_2. This will very possibly be
2526 a bit out for the fractional part, but that's too bad, the integer part
2527 is more important.
2528
2529 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
2530 code space if plain division is better only at size==2 or size==3. */
2531 param.name = "DIVREM_2_THRESHOLD";
2532 param.check_size = 256;
2533 param.min_size = 4;
2534 param.min_is_always = 1;
2535 param.size_extra = 2; /* does qsize==nsize-2 divisions */
2536 param.stop_factor = 2.0;
2537
2538 s.r = randlimb_norm ();
2539 param.function = speed_mpn_divrem_2;
2540 one (&divrem_2_threshold, &param);
2541}
2542
2543void
2544tune_div_qr_2 (void)
2545{
2546 static struct param_t param;
2547 param.name = "DIV_QR_2_PI2_THRESHOLD";
2548 param.function = speed_mpn_div_qr_2n;
2549 param.check_size = 500;
2550 param.min_size = 4;
2551 one (&div_qr_2_pi2_threshold, &param);
2552}
2553
2554/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
2555 tune for that. Its speed can differ on odd or even divisor, so take an
2556 average threshold for the two.
2557
2558 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
2559 might not vary that way, but don't test this since high<divisor isn't
2560 expected to occur often with small divisors. */
2561
2562void
2563tune_divexact_1 (void)
2564{
2565 static struct param_t param;
2566 mp_size_t thresh[2], average;
2567 int low, i;
2568
2569 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
2570 full mpn_divrem_1. */
2571 if (HAVE_NATIVE_mpn_divexact_1)
2572 {
2573 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
2574 return;
2575 }
2576
2577 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
2578
2579 param.name = "DIVEXACT_1_THRESHOLD";
2580 param.data_high = DATA_HIGH_GE_R;
2581 param.check_size = 256;
2582 param.min_size = 2;
2583 param.stop_factor = 1.5;
2584 param.function = tuned_speed_mpn_divrem_1;
2585 param.function2 = speed_mpn_divexact_1;
2586 param.noprint = 1;
2587
2588 print_define_start (param.name);
2589
2590 for (low = 0; low <= 1; low++)
2591 {
2592 s.r = randlimb_half();
2593 if (low == 0)
2594 s.r |= 1;
2595 else
2596 s.r &= ~CNST_LIMB(7);
2597
2598 one (&thresh[low], &param);
2599 if (option_trace)
2600 printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
2601
2602 if (thresh[low] == MP_SIZE_T_MAX)
2603 {
2604 average = MP_SIZE_T_MAX;
2605 goto divexact_1_done;
2606 }
2607 }
2608
2609 if (option_trace)
2610 {
2611 printf ("average of:");
2612 for (i = 0; i < numberof(thresh); i++)
2613 printf (" %ld", (long) thresh[i]);
2614 printf ("\n");
2615 }
2616
2617 average = 0;
2618 for (i = 0; i < numberof(thresh); i++)
2619 average += thresh[i];
2620 average /= numberof(thresh);
2621
2622 /* If divexact turns out to be better as early as 3 limbs, then use it
2623 always, so as to reduce code size and conditional jumps. */
2624 if (average <= 3)
2625 average = 0;
2626
2627 divexact_1_done:
2628 print_define_end (param.name, average);
2629}
2630
2631
2632/* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2633 same as mpn_mod_1, but this might not be true of an assembler
2634 implementation. The threshold used is an average based on data where a
2635 divide can be skipped and where it can't.
2636
2637 If modexact turns out to be better as early as 3 limbs, then use it
2638 always, so as to reduce code size and conditional jumps. */
2639
2640void
2641tune_modexact_1_odd (void)
2642{
2643 static struct param_t param;
2644 mp_size_t thresh_lt, thresh_ge, average;
2645
2646#if 0
2647 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2648 of a full mpn_mod_1. */
2649 if (HAVE_NATIVE_mpn_modexact_1_odd)
2650 {
2651 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2652 return;
2653 }
2654#endif
2655
2656 param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2657 param.check_size = 256;
2658 param.min_size = 2;
2659 param.stop_factor = 1.5;
2660 param.function = speed_mpn_modexact_1c_odd;
2661 param.function2 = speed_mpn_mod_1_tune;
2662 param.noprint = 1;
2663 s.r = randlimb_half () | 1;
2664
2665 print_define_start (param.name);
2666
2667 param.data_high = DATA_HIGH_LT_R;
2668 one (&thresh_lt, &param);
2669 if (option_trace)
2670 printf ("lt thresh %ld\n", (long) thresh_lt);
2671
2672 average = thresh_lt;
2673 if (thresh_lt != MP_SIZE_T_MAX)
2674 {
2675 param.data_high = DATA_HIGH_GE_R;
2676 one (&thresh_ge, &param);
2677 if (option_trace)
2678 printf ("ge thresh %ld\n", (long) thresh_ge);
2679
2680 if (thresh_ge != MP_SIZE_T_MAX)
2681 {
2682 average = (thresh_ge + thresh_lt) / 2;
2683 if (thresh_ge <= 3)
2684 average = 0;
2685 }
2686 }
2687
2688 print_define_end (param.name, average);
2689}
2690
2691
2692void
2693tune_jacobi_base (void)
2694{
2695 static struct param_t param;
2696 speed_function_t f[4] =
2697 {
2698 speed_mpn_jacobi_base_1,
2699 speed_mpn_jacobi_base_2,
2700 speed_mpn_jacobi_base_3,
2701 speed_mpn_jacobi_base_4,
2702 };
2703
2704 s.size = GMP_LIMB_BITS * 3 / 4;
2705
2706 one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
2707}
2708
2709
2710void
2711tune_get_str (void)
2712{
2713 /* Tune for decimal, it being most common. Some rough testing suggests
2714 other bases are different, but not by very much. */
2715 s.r = 10;
2716 {
2717 static struct param_t param;
2718 GET_STR_PRECOMPUTE_THRESHOLD = 0;
2719 param.name = "GET_STR_DC_THRESHOLD";
2720 param.function = speed_mpn_get_str;
2721 param.min_size = 4;
2722 param.max_size = GET_STR_THRESHOLD_LIMIT;
2723 one (&get_str_dc_threshold, &param);
2724 }
2725 {
2726 static struct param_t param;
2727 param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2728 param.function = speed_mpn_get_str;
2729 param.min_size = GET_STR_DC_THRESHOLD;
2730 param.max_size = GET_STR_THRESHOLD_LIMIT;
2731 one (&get_str_precompute_threshold, &param);
2732 }
2733}
2734
2735
2736double
2737speed_mpn_pre_set_str (struct speed_params *s)
2738{
2739 unsigned char *str;
2740 mp_ptr wp;
2741 mp_size_t wn;
2742 unsigned i;
2743 int base;
2744 double t;
2745 mp_ptr powtab_mem, tp;
2746 powers_t powtab[GMP_LIMB_BITS];
2747 mp_size_t un;
2748 int chars_per_limb;
2749 TMP_DECL;
2750
2751 SPEED_RESTRICT_COND (s->size >= 1);
2752
2753 base = s->r == 0 ? 10 : s->r;
2754 SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2755
2756 TMP_MARK;
2757
2758 str = (unsigned char *) TMP_ALLOC (s->size);
2759 for (i = 0; i < s->size; i++)
2760 str[i] = s->xp[i] % base;
2761
2762 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
2763 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2764
2765 /* use this during development to check wn is big enough */
2766 /*
2767 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2768 */
2769
2770 speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
2771 speed_operand_dst (s, wp, wn);
2772 speed_cache_fill (s);
2773
2774 chars_per_limb = mp_bases[base].chars_per_limb;
2775 un = s->size / chars_per_limb + 1;
2776 powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
2777 size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
2778 powers_t *pt = powtab + n_pows;
2779 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2780
2781 speed_starttime ();
2782 i = s->reps;
2783 do
2784 {
2785 mpn_pre_set_str (wp, str, s->size, pt, tp);
2786 }
2787 while (--i != 0);
2788 t = speed_endtime ();
2789
2790 TMP_FREE;
2791 return t;
2792}
2793
2794void
2795tune_set_str (void)
2796{
2797 s.r = 10; /* decimal */
2798 {
2799 static struct param_t param;
2800 SET_STR_PRECOMPUTE_THRESHOLD = 0;
2801 param.step_factor = 0.01;
2802 param.name = "SET_STR_DC_THRESHOLD";
2803 param.function = speed_mpn_pre_set_str;
2804 param.min_size = 100;
2805 param.max_size = 50000;
2806 one (&set_str_dc_threshold, &param);
2807 }
2808 {
2809 static struct param_t param;
2810 param.step_factor = 0.02;
2811 param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2812 param.function = speed_mpn_set_str;
2813 param.min_size = SET_STR_DC_THRESHOLD;
2814 param.max_size = 100000;
2815 one (&set_str_precompute_threshold, &param);
2816 }
2817}
2818
2819
2820void
2821tune_fft_mul (void)
2822{
2823 static struct fft_param_t param;
2824
2825 if (option_fft_max_size == 0)
2826 return;
2827
2828 param.table_name = "MUL_FFT_TABLE3";
2829 param.threshold_name = "MUL_FFT_THRESHOLD";
2830 param.p_threshold = &mul_fft_threshold;
2831 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2832 param.p_modf_threshold = &mul_fft_modf_threshold;
2833 param.first_size = MUL_TOOM33_THRESHOLD / 2;
2834 param.max_size = option_fft_max_size;
2835 param.function = speed_mpn_fft_mul;
2836 param.mul_modf_function = speed_mpn_mul_fft;
2837 param.mul_function = speed_mpn_mul_n;
2838 param.sqr = 0;
2839 fft (&param);
2840}
2841
2842
2843void
2844tune_fft_sqr (void)
2845{
2846 static struct fft_param_t param;
2847
2848 if (option_fft_max_size == 0)
2849 return;
2850
2851 param.table_name = "SQR_FFT_TABLE3";
2852 param.threshold_name = "SQR_FFT_THRESHOLD";
2853 param.p_threshold = &sqr_fft_threshold;
2854 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2855 param.p_modf_threshold = &sqr_fft_modf_threshold;
2856 param.first_size = SQR_TOOM3_THRESHOLD / 2;
2857 param.max_size = option_fft_max_size;
2858 param.function = speed_mpn_fft_sqr;
2859 param.mul_modf_function = speed_mpn_mul_fft_sqr;
2860 param.mul_function = speed_mpn_sqr;
2861 param.sqr = 1;
2862 fft (&param);
2863}
2864
2865void
2866tune_fac_ui (void)
2867{
2868 static struct param_t param;
2869
2870 param.function = speed_mpz_fac_ui_tune;
2871
2872 param.name = "FAC_DSC_THRESHOLD";
2873 param.min_size = 70;
2874 param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2875 one (&fac_dsc_threshold, &param);
2876
2877 param.name = "FAC_ODD_THRESHOLD";
2878 param.min_size = 22;
2879 param.stop_factor = 1.7;
2880 param.min_is_always = 1;
2881 one (&fac_odd_threshold, &param);
2882}
2883
2884void
2885all (void)
2886{
2887 time_t start_time, end_time;
2888 TMP_DECL;
2889
2890 TMP_MARK;
2891 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2892 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2893
2894 mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2895 mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2896
2897 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2898
2899 speed_time_init ();
2900 fprintf (stderr, "Using: %s\n", speed_time_string);
2901
2902 fprintf (stderr, "speed_precision %d", speed_precision);
2903 if (speed_unittime == 1.0)
2904 fprintf (stderr, ", speed_unittime 1 cycle");
2905 else
2906 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2907 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2908 fprintf (stderr, ", CPU freq unknown\n");
2909 else
2910 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2911
2912 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2913 DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2914 fprintf (stderr, "\n");
2915
2916 time (&start_time);
2917 {
2918 struct tm *tp;
2919 tp = localtime (&start_time);
2920 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2921 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2922
2923#ifdef __GNUC__
2924 /* gcc sub-minor version doesn't seem to come through as a define */
2925 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2926#define PRINTED_COMPILER
2927#endif
2928#if defined (__SUNPRO_C)
2929 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2930#define PRINTED_COMPILER
2931#endif
2932#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2933 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2934 printf ("MIPSpro C %d.%d.%d */\n",
2935 _COMPILER_VERSION / 100,
2936 _COMPILER_VERSION / 10 % 10,
2937 _COMPILER_VERSION % 10);
2938#define PRINTED_COMPILER
2939#endif
2940#if defined (__DECC) && defined (__DECC_VER)
2941 printf ("DEC C %d */\n", __DECC_VER);
2942#define PRINTED_COMPILER
2943#endif
2944#if ! defined (PRINTED_COMPILER)
2945 printf ("system compiler */\n");
2946#endif
2947 }
2948 printf ("\n");
2949
2950 tune_divrem_1 ();
2951 tune_mod_1 ();
2952 tune_preinv_divrem_1 ();
2953 tune_div_qr_1 ();
2954#if 0
2955 tune_divrem_2 ();
2956#endif
2957 tune_div_qr_2 ();
2958 tune_divexact_1 ();
2959 tune_modexact_1_odd ();
2960 printf("\n");
2961
2962 relspeed_div_1_vs_mul_1 ();
2963 printf("\n");
2964
2965 tune_mul_n ();
2966 printf("\n");
2967
2968 tune_mul ();
2969 printf("\n");
2970
2971 tune_sqr ();
2972 printf("\n");
2973
2974 tune_mulmid ();
2975 printf("\n");
2976
2977 tune_mulmod_bnm1 ();
2978 tune_sqrmod_bnm1 ();
2979 printf("\n");
2980
2981 tune_fft_mul ();
2982 printf("\n");
2983
2984 tune_fft_sqr ();
2985 printf ("\n");
2986
2987 tune_mullo ();
2988 tune_sqrlo ();
2989 printf("\n");
2990
2991 tune_dc_div ();
2992 tune_dc_bdiv ();
2993
2994 printf("\n");
2995 tune_invertappr ();
2996 tune_invert ();
2997 printf("\n");
2998
2999 tune_binvert ();
3000 tune_redc ();
3001 printf("\n");
3002
3003 tune_mu_div ();
3004 tune_mu_bdiv ();
3005 printf("\n");
3006
3007 tune_powm_sec ();
3008 printf("\n");
3009
3010 tune_get_str ();
3011 tune_set_str ();
3012 printf("\n");
3013
3014 tune_fac_ui ();
3015 printf("\n");
3016
3017 tune_matrix22_mul ();
3018 tune_hgcd2 ();
3019 tune_hgcd ();
3020 tune_hgcd_appr ();
3021 tune_hgcd_reduce();
3022 tune_gcd_dc ();
3023 tune_gcdext_dc ();
3024 tune_jacobi_base ();
3025 printf("\n");
3026
3027 time (&end_time);
3028 printf ("/* Tuneup completed successfully, took %ld seconds */\n",
3029 (long) (end_time - start_time));
3030
3031 TMP_FREE;
3032}
3033
3034
3035int
3036main (int argc, char *argv[])
3037{
3038 int opt;
3039
3040 /* Unbuffered so if output is redirected to a file it isn't lost if the
3041 program is killed part way through. */
3042 setbuf (stdout, NULL);
3043 setbuf (stderr, NULL);
3044
3045 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
3046 {
3047 switch (opt) {
3048 case 'f':
3049 if (optarg[0] == 't')
3050 option_fft_trace = 2;
3051 else
3052 option_fft_max_size = atol (optarg);
3053 break;
3054 case 'o':
3055 speed_option_set (optarg);
3056 break;
3057 case 'p':
3058 speed_precision = atoi (optarg);
3059 break;
3060 case 't':
3061 option_trace++;
3062 break;
3063 case '?':
3064 exit(1);
3065 }
3066 }
3067
3068 all ();
3069 exit (0);
3070}