Blame - third_party/gmp/tune/tuneup.c - RealtimeRoboticsGroup/test

blob: 86ba5edf67bdb1c8dabba791f49d7728789dc4cf [file] [log] [blame]

Austin Schuh	dace2a6	2020-08-18 10:56:48 -0700	[diff] [blame]	1	/* Create tuned thresholds for various algorithms.
				2
				3	Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
				4
				5	This file is part of the GNU MP Library.
				6
				7	The GNU MP Library is free software; you can redistribute it and/or modify
				8	it under the terms of either:
				9
				10	* the GNU Lesser General Public License as published by the Free
				11	Software Foundation; either version 3 of the License, or (at your
				12	option) any later version.
				13
				14	or
				15
				16	* the GNU General Public License as published by the Free Software
				17	Foundation; either version 2 of the License, or (at your option) any
				18	later version.
				19
				20	or both in parallel, as here.
				21
				22	The GNU MP Library is distributed in the hope that it will be useful, but
				23	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				24	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				25	for more details.
				26
				27	You should have received copies of the GNU General Public License and the
				28	GNU Lesser General Public License along with the GNU MP Library. If not,
				29	see https://www.gnu.org/licenses/. */
				30
				31
				32	/* Usage: tuneup [-t] [-t] [-p precision]
				33
				34	-t turns on some diagnostic traces, a second -t turns on more traces.
				35
				36	Notes:
				37
				38	The code here isn't a vision of loveliness, mainly because it's subject
				39	to ongoing changes according to new things wanting to be tuned, and
				40	practical requirements of systems tested.
				41
				42	Sometimes running the program twice produces slightly different results.
				43	This is probably because there's so little separating algorithms near
				44	their crossover, and on that basis it should make little or no difference
				45	to the final speed of the relevant routines, but nothing has been done to
				46	check that carefully.
				47
				48	Algorithm:
				49
				50	The thresholds are determined as follows. A crossover may not be a
				51	single size but rather a range where it oscillates between method A or
				52	method B faster. If the threshold is set making B used where A is faster
				53	(or vice versa) that's bad. Badness is the percentage time lost and
				54	total badness is the sum of this over all sizes measured. The threshold
				55	is set to minimize total badness.
				56
				57	Suppose, as sizes increase, method B becomes faster than method A. The
				58	effect of the rule is that, as you look at increasing sizes, isolated
				59	points where B is faster are ignored, but when it's consistently faster,
				60	or faster on balance, then the threshold is set there. The same result
				61	is obtained thinking in the other direction of A becoming faster at
				62	smaller sizes.
				63
				64	In practice the thresholds tend to be chosen to bring on the next
				65	algorithm fairly quickly.
				66
				67	This rule is attractive because it's got a basis in reason and is fairly
				68	easy to implement, but no work has been done to actually compare it in
				69	absolute terms to other possibilities.
				70
				71	Implementation:
				72
				73	In a normal library build the thresholds are constants. To tune them
				74	selected objects are recompiled with the thresholds as global variables
				75	instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
				76	the end of gmp-impl.h, and rules in tune/Makefile.am.
				77
				78	MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The
				79	threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
				80	level, but recurse into the basecase.
				81
				82	MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
				83	Other routines in turn will make use of both of those. Naturally the
				84	dependants must be tuned first.
				85
				86	In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
				87	just a threshold based on comparing two routines (mpn_divrem_1 and
				88	mpn_divexact_1), and no further use of the value determined.
				89
				90	Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
				91	just comparisons between certain routines on representative data.
				92
				93	Shortcuts are applied when native (assembler) versions of routines exist.
				94	For instance a native mpn_sqr_basecase is assumed to be always faster
				95	than mpn_mul_basecase, with no measuring.
				96
				97	No attempt is made to tune within assembler routines, for instance
				98	DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
				99	written and tuned all by hand. Assembler routines that might have hard
				100	limits are recompiled though, to make them accept a bigger range of sizes
				101	than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
				102
				103	Limitations:
				104
				105	The FFTs aren't subject to the same badness rule as the other thresholds,
				106	so each k is probably being brought on a touch early. This isn't likely
				107	to make a difference, and the simpler probing means fewer tests.
				108
				109	*/
				110
				111	#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
				112
				113	#include "config.h"
				114
				115	#include <math.h>
				116	#include <stdio.h>
				117	#include <stdlib.h>
				118	#include <time.h>
				119	#if HAVE_UNISTD_H
				120	#include <unistd.h>
				121	#endif
				122
				123	#include "gmp-impl.h"
				124	#include "longlong.h"
				125
				126	#include "tests.h"
				127	#include "speed.h"
				128
				129	#if !HAVE_DECL_OPTARG
				130	extern char *optarg;
				131	extern int optind, opterr;
				132	#endif
				133
				134
				135	#define DEFAULT_MAX_SIZE 1000 /* limbs */
				136
				137	#if WANT_FFT
				138	mp_size_t option_fft_max_size = 50000; /* limbs */
				139	#else
				140	mp_size_t option_fft_max_size = 0;
				141	#endif
				142	int option_trace = 0;
				143	int option_fft_trace = 0;
				144	struct speed_params s;
				145
				146	struct dat_t {
				147	mp_size_t size;
				148	double d;
				149	} *dat = NULL;
				150	int ndat = 0;
				151	int allocdat = 0;
				152
				153	/* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
				154	case use zero here, which for params.max_size means no limit. */
				155	#ifndef TUNE_SQR_TOOM2_MAX
				156	#define TUNE_SQR_TOOM2_MAX 0
				157	#endif
				158
				159	mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX;
				160	mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT;
				161	mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT;
				162	mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT;
				163	mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
				164	mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
				165	mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
				166	mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
				167	mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
				168	mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
				169	mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
				170	mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
				171	mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
				172	mp_size_t sqr_toom2_threshold
				173	= (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
				174	mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
				175	mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
				176	mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
				177	mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
				178	mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
				179	mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
				180	mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX;
				181	mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX;
				182	mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX;
				183	mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX;
				184	mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX;
				185	mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX;
				186	mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX;
				187	mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX;
				188	mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX;
				189	mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX;
				190	mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
				191	mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
				192	mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX;
				193	mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX;
				194	mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX;
				195	mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX;
				196	mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
				197	mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
				198	mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX;
				199	mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX;
				200	mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX;
				201	mp_size_t inv_newton_threshold = MP_SIZE_T_MAX;
				202	mp_size_t inv_appr_threshold = MP_SIZE_T_MAX;
				203	mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
				204	mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
				205	mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
				206	mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
				207	mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
				208	mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
				209	mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
				210	mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
				211	mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
				212	mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
				213	int div_qr_1n_pi1_method = 0;
				214	mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX;
				215	mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX;
				216	mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
				217	mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
				218	mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
				219	mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
				220	int mod_1_1p_method = 0;
				221	mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
				222	mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
				223	mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
				224	mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
				225	mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
				226	mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
				227	mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
				228	mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
				229	mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
				230	mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
				231	mp_size_t fac_odd_threshold = 0;
				232	mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT;
				233
				234	mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
				235	mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
				236
				237	struct param_t {
				238	const char *name;
				239	speed_function_t function;
				240	speed_function_t function2;
				241	double step_factor; /* how much to step relatively */
				242	int step; /* how much to step absolutely */
				243	double function_fudge; /* multiplier for "function" speeds */
				244	int stop_since_change;
				245	double stop_factor;
				246	mp_size_t min_size;
				247	int min_is_always;
				248	mp_size_t max_size;
				249	mp_size_t check_size;
				250	mp_size_t size_extra;
				251
				252	#define DATA_HIGH_LT_R 1
				253	#define DATA_HIGH_GE_R 2
				254	int data_high;
				255
				256	int noprint;
				257	};
				258
				259
				260	/* These are normally undefined when false, which suits "#if" fine.
				261	But give them zero values so they can be used in plain C "if"s. */
				262	#ifndef UDIV_PREINV_ALWAYS
				263	#define UDIV_PREINV_ALWAYS 0
				264	#endif
				265	#ifndef HAVE_NATIVE_mpn_divexact_1
				266	#define HAVE_NATIVE_mpn_divexact_1 0
				267	#endif
				268	#ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
				269	#define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
				270	#endif
				271	#ifndef HAVE_NATIVE_mpn_divrem_1
				272	#define HAVE_NATIVE_mpn_divrem_1 0
				273	#endif
				274	#ifndef HAVE_NATIVE_mpn_divrem_2
				275	#define HAVE_NATIVE_mpn_divrem_2 0
				276	#endif
				277	#ifndef HAVE_NATIVE_mpn_mod_1
				278	#define HAVE_NATIVE_mpn_mod_1 0
				279	#endif
				280	#ifndef HAVE_NATIVE_mpn_mod_1_1p
				281	#define HAVE_NATIVE_mpn_mod_1_1p 0
				282	#endif
				283	#ifndef HAVE_NATIVE_mpn_modexact_1_odd
				284	#define HAVE_NATIVE_mpn_modexact_1_odd 0
				285	#endif
				286	#ifndef HAVE_NATIVE_mpn_preinv_divrem_1
				287	#define HAVE_NATIVE_mpn_preinv_divrem_1 0
				288	#endif
				289	#ifndef HAVE_NATIVE_mpn_preinv_mod_1
				290	#define HAVE_NATIVE_mpn_preinv_mod_1 0
				291	#endif
				292	#ifndef HAVE_NATIVE_mpn_sqr_basecase
				293	#define HAVE_NATIVE_mpn_sqr_basecase 0
				294	#endif
				295
				296
				297	#define MAX3(a,b,c) MAX (MAX (a, b), c)
				298
				299	mp_limb_t
				300	randlimb_norm (void)
				301	{
				302	mp_limb_t n;
				303	mpn_random (&n, 1);
				304	n \|= GMP_NUMB_HIGHBIT;
				305	return n;
				306	}
				307
				308	#define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
				309
				310	mp_limb_t
				311	randlimb_half (void)
				312	{
				313	mp_limb_t n;
				314	mpn_random (&n, 1);
				315	n &= GMP_NUMB_HALFMASK;
				316	n += (n==0);
				317	return n;
				318	}
				319
				320
				321	/* Add an entry to the end of the dat[] array, reallocing to make it bigger
				322	if necessary. */
				323	void
				324	add_dat (mp_size_t size, double d)
				325	{
				326	#define ALLOCDAT_STEP 500
				327
				328	ASSERT_ALWAYS (ndat <= allocdat);
				329
				330	if (ndat == allocdat)
				331	{
				332	dat = (struct dat_t *) __gmp_allocate_or_reallocate
				333	(dat, allocdat * sizeof(dat[0]),
				334	(allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
				335	allocdat += ALLOCDAT_STEP;
				336	}
				337
				338	dat[ndat].size = size;
				339	dat[ndat].d = d;
				340	ndat++;
				341	}
				342
				343
				344	/* Return the threshold size based on the data accumulated. */
				345	mp_size_t
				346	analyze_dat (int final)
				347	{
				348	double x, min_x;
				349	int j, min_j;
				350
				351	/* If the threshold is set at dat[0].size, any positive values are bad. */
				352	x = 0.0;
				353	for (j = 0; j < ndat; j++)
				354	if (dat[j].d > 0.0)
				355	x += dat[j].d;
				356
				357	if (option_trace >= 2 && final)
				358	{
				359	printf ("\n");
				360	printf ("x is the sum of the badness from setting thresh at given size\n");
				361	printf (" (minimum x is sought)\n");
				362	printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
				363	}
				364
				365	min_x = x;
				366	min_j = 0;
				367
				368
				369	/* When stepping to the next dat[j].size, positive values are no longer
				370	bad (so subtracted), negative values become bad (so add the absolute
				371	value, meaning subtract). */
				372	for (j = 0; j < ndat; x -= dat[j].d, j++)
				373	{
				374	if (option_trace >= 2 && final)
				375	printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
				376
				377	if (x < min_x)
				378	{
				379	min_x = x;
				380	min_j = j;
				381	}
				382	}
				383
				384	return min_j;
				385	}
				386
				387
				388	/* Measuring for recompiled mpn/generic/div_qr_1.c,
				389	* mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
				390
				391	mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
				392
				393	#if defined (__cplusplus)
				394	extern "C" {
				395	#endif
				396
				397	mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
				398	mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
				399	void mpz_fac_ui_tune (mpz_ptr, unsigned long);
				400
				401	#if defined (__cplusplus)
				402	}
				403	#endif
				404
				405	double
				406	speed_mpn_mod_1_tune (struct speed_params *s)
				407	{
				408	SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
				409	}
				410	double
				411	speed_mpn_divrem_1_tune (struct speed_params *s)
				412	{
				413	SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
				414	}
				415	double
				416	speed_mpz_fac_ui_tune (struct speed_params *s)
				417	{
				418	SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
				419	}
				420	double
				421	speed_mpn_div_qr_1_tune (struct speed_params *s)
				422	{
				423	SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
				424	}
				425
				426	double
				427	tuneup_measure (speed_function_t fun,
				428	const struct param_t *param,
				429	struct speed_params *s)
				430	{
				431	static struct param_t dummy;
				432	double t;
				433	TMP_DECL;
				434
				435	if (! param)
				436	param = &dummy;
				437
				438	s->size += param->size_extra;
				439
				440	TMP_MARK;
				441	SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
				442	SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
				443
				444	mpn_random (s->xp, s->size);
				445	mpn_random (s->yp, s->size);
				446
				447	switch (param->data_high) {
				448	case DATA_HIGH_LT_R:
				449	s->xp[s->size-1] %= s->r;
				450	s->yp[s->size-1] %= s->r;
				451	break;
				452	case DATA_HIGH_GE_R:
				453	s->xp[s->size-1] \|= s->r;
				454	s->yp[s->size-1] \|= s->r;
				455	break;
				456	}
				457
				458	t = speed_measure (fun, s);
				459
				460	s->size -= param->size_extra;
				461
				462	TMP_FREE;
				463	return t;
				464	}
				465
				466
				467	#define PRINT_WIDTH 31
				468
				469	void
				470	print_define_start (const char *name)
				471	{
				472	printf ("#define %-*s ", PRINT_WIDTH, name);
				473	if (option_trace)
				474	printf ("...\n");
				475	}
				476
				477	void
				478	print_define_end_remark (const char name, mp_size_t value, const char remark)
				479	{
				480	if (option_trace)
				481	printf ("#define %-*s ", PRINT_WIDTH, name);
				482
				483	if (value == MP_SIZE_T_MAX)
				484	printf ("MP_SIZE_T_MAX");
				485	else
				486	printf ("%5ld", (long) value);
				487
				488	if (remark != NULL)
				489	printf (" /* %s */", remark);
				490	printf ("\n");
				491	fflush (stdout);
				492	}
				493
				494	void
				495	print_define_end (const char *name, mp_size_t value)
				496	{
				497	const char *remark;
				498	if (value == MP_SIZE_T_MAX)
				499	remark = "never";
				500	else if (value == 0)
				501	remark = "always";
				502	else
				503	remark = NULL;
				504	print_define_end_remark (name, value, remark);
				505	}
				506
				507	void
				508	print_define (const char *name, mp_size_t value)
				509	{
				510	print_define_start (name);
				511	print_define_end (name, value);
				512	}
				513
				514	void
				515	print_define_remark (const char name, mp_size_t value, const char remark)
				516	{
				517	print_define_start (name);
				518	print_define_end_remark (name, value, remark);
				519	}
				520
				521	void
				522	print_define_with_speedup (const char *name, mp_size_t value,
				523	mp_size_t runner_up, double speedup)
				524	{
				525	char buf[100];
				526	snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
				527	100.0 * (speedup - 1), runner_up);
				528	print_define_remark (name, value, buf);
				529	}
				530
				531	void
				532	one (mp_size_t threshold, struct param_t param)
				533	{
				534	int since_positive, since_thresh_change;
				535	int thresh_idx, new_thresh_idx;
				536
				537	#define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
				538
				539	DEFAULT (param->function_fudge, 1.0);
				540	DEFAULT (param->function2, param->function);
				541	DEFAULT (param->step_factor, 0.01); /* small steps by default */
				542	DEFAULT (param->step, 1); /* small steps by default */
				543	DEFAULT (param->stop_since_change, 80);
				544	DEFAULT (param->stop_factor, 1.2);
				545	DEFAULT (param->min_size, 10);
				546	DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
				547
				548	if (param->check_size != 0)
				549	{
				550	double t1, t2;
				551	s.size = param->check_size;
				552
				553	*threshold = s.size+1;
				554	t1 = tuneup_measure (param->function, param, &s);
				555
				556	*threshold = s.size;
				557	t2 = tuneup_measure (param->function2, param, &s);
				558	if (t1 == -1.0 \|\| t2 == -1.0)
				559	{
				560	printf ("Oops, can't run both functions at size %ld\n",
				561	(long) s.size);
				562	abort ();
				563	}
				564	t1 *= param->function_fudge;
				565
				566	/* ask that t2 is at least 4% below t1 */
				567	if (t1 < t2*1.04)
				568	{
				569	if (option_trace)
				570	printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
				571	*threshold = MP_SIZE_T_MAX;
				572	if (! param->noprint)
				573	print_define (param->name, *threshold);
				574	return;
				575	}
				576
				577	if (option_trace >= 2)
				578	printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
				579	(long) s.size, t1, t2);
				580	}
				581
				582	if (! param->noprint \|\| option_trace)
				583	print_define_start (param->name);
				584
				585	ndat = 0;
				586	since_positive = 0;
				587	since_thresh_change = 0;
				588	thresh_idx = 0;
				589
				590	if (option_trace >= 2)
				591	{
				592	printf (" algorithm-A algorithm-B ratio possible\n");
				593	printf (" (seconds) (seconds) diff thresh\n");
				594	}
				595
				596	for (s.size = param->min_size;
				597	s.size < param->max_size;
				598	s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
				599	{
				600	double ti, tiplus1, d;
				601
				602	/*
				603	FIXME: check minimum size requirements are met, possibly by just
				604	checking for the -1 returns from the speed functions.
				605	*/
				606
				607	/* using method A at this size */
				608	*threshold = s.size+1;
				609	ti = tuneup_measure (param->function, param, &s);
				610	if (ti == -1.0)
				611	abort ();
				612	ti *= param->function_fudge;
				613
				614	/* using method B at this size */
				615	*threshold = s.size;
				616	tiplus1 = tuneup_measure (param->function2, param, &s);
				617	if (tiplus1 == -1.0)
				618	abort ();
				619
				620	/* Calculate the fraction by which the one or the other routine is
				621	slower. */
				622	if (tiplus1 >= ti)
				623	d = (tiplus1 - ti) / tiplus1; /* negative */
				624	else
				625	d = (tiplus1 - ti) / ti; /* positive */
				626
				627	add_dat (s.size, d);
				628
				629	new_thresh_idx = analyze_dat (0);
				630
				631	if (option_trace >= 2)
				632	printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
				633	(long) s.size, ti, tiplus1, d,
				634	ti > tiplus1 ? '#' : ' ',
				635	(long) dat[new_thresh_idx].size);
				636
				637	/* Stop if the last time method i was faster was more than a
				638	certain number of measurements ago. */
				639	#define STOP_SINCE_POSITIVE 200
				640	if (d >= 0)
				641	since_positive = 0;
				642	else
				643	if (++since_positive > STOP_SINCE_POSITIVE)
				644	{
				645	if (option_trace >= 1)
				646	printf ("stopped due to since_positive (%d)\n",
				647	STOP_SINCE_POSITIVE);
				648	break;
				649	}
				650
				651	/* Stop if method A has become slower by a certain factor. */
				652	if (ti >= tiplus1 * param->stop_factor)
				653	{
				654	if (option_trace >= 1)
				655	printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
				656	param->stop_factor);
				657	break;
				658	}
				659
				660	/* Stop if the threshold implied hasn't changed in a certain
				661	number of measurements. (It's this condition that usually
				662	stops the loop.) */
				663	if (thresh_idx != new_thresh_idx)
				664	since_thresh_change = 0, thresh_idx = new_thresh_idx;
				665	else
				666	if (++since_thresh_change > param->stop_since_change)
				667	{
				668	if (option_trace >= 1)
				669	printf ("stopped due to since_thresh_change (%d)\n",
				670	param->stop_since_change);
				671	break;
				672	}
				673
				674	/* Stop if the threshold implied is more than a certain number of
				675	measurements ago. */
				676	#define STOP_SINCE_AFTER 500
				677	if (ndat - thresh_idx > STOP_SINCE_AFTER)
				678	{
				679	if (option_trace >= 1)
				680	printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
				681	STOP_SINCE_AFTER);
				682	break;
				683	}
				684
				685	/* Stop when the size limit is reached before the end of the
				686	crossover, but only show this as an error for >= the default max
				687	size. FIXME: Maybe should make it a param choice whether this is
				688	an error. */
				689	if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
				690	{
				691	fprintf (stderr, "%s\n", param->name);
				692	fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
				693	(long) dat[0].size, (long) dat[ndat-1].size, ndat);
				694	fprintf (stderr, " max size reached before end of crossover\n");
				695	break;
				696	}
				697	}
				698
				699	if (option_trace >= 1)
				700	printf ("sizes %ld to %ld total %d measurements\n",
				701	(long) dat[0].size, (long) dat[ndat-1].size, ndat);
				702
				703	*threshold = dat[analyze_dat (1)].size;
				704
				705	if (param->min_is_always)
				706	{
				707	if (*threshold == param->min_size)
				708	*threshold = 0;
				709	}
				710
				711	if (! param->noprint \|\| option_trace)
				712	print_define_end (param->name, *threshold);
				713	}
				714
				715	/* Time N different FUNCTIONS with the same parameters and size, to
				716	select the fastest. Since *_METHOD defines start numbering from
				717	one, if functions[i] is fastest, the value of the define is i+1.
				718	Also output a comment with speedup compared to the next fastest
				719	function. The NAME argument is used only for trace output.
				720
				721	Returns the index of the fastest function.
				722	*/
				723	int
				724	one_method (int n, speed_function_t *functions,
				725	const char name, const char define,
				726	const struct param_t *param)
				727	{
				728	double *t;
				729	int i;
				730	int method;
				731	int method_runner_up;
				732
				733	TMP_DECL;
				734	TMP_MARK;
				735	t = (double) TMP_ALLOC (n sizeof (*t));
				736
				737	for (i = 0; i < n; i++)
				738	{
				739	t[i] = tuneup_measure (functions[i], param, &s);
				740	if (option_trace >= 1)
				741	printf ("size=%ld, %s, method %d %.9f\n",
				742	(long) s.size, name, i + 1, t[i]);
				743	if (t[i] == -1.0)
				744	{
				745	printf ("Oops, can't measure all %s methods\n", name);
				746	abort ();
				747	}
				748	}
				749	method = 0;
				750	for (i = 1; i < n; i++)
				751	if (t[i] < t[method])
				752	method = i;
				753
				754	method_runner_up = (method == 0);
				755	for (i = 0; i < n; i++)
				756	if (i != method && t[i] < t[method_runner_up])
				757	method_runner_up = i;
				758
				759	print_define_with_speedup (define, method + 1, method_runner_up + 1,
				760	t[method_runner_up] / t[method]);
				761
				762	TMP_FREE;
				763	return method;
				764	}
				765
				766
				767	/* Special probing for the fft thresholds. The size restrictions on the
				768	FFTs mean the graph of time vs size has a step effect. See this for
				769	example using
				770
				771	./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
				772	gnuplot foo.gnuplot
				773
				774	The current approach is to compare routines at the midpoint of relevant
				775	steps. Arguably a more sophisticated system of threshold data is wanted
				776	if this step effect remains. */
				777
				778	struct fft_param_t {
				779	const char *table_name;
				780	const char *threshold_name;
				781	const char *modf_threshold_name;
				782	mp_size_t *p_threshold;
				783	mp_size_t *p_modf_threshold;
				784	mp_size_t first_size;
				785	mp_size_t max_size;
				786	speed_function_t function;
				787	speed_function_t mul_modf_function;
				788	speed_function_t mul_function;
				789	mp_size_t sqr;
				790	};
				791
				792
				793	/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
				794	N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
				795	of 2^(k-1) bits. */
				796
				797	mp_size_t
				798	fft_step_size (int k)
				799	{
				800	mp_size_t step;
				801
				802	step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
				803	step *= (mp_size_t) 1 << k;
				804
				805	if (step <= 0)
				806	{
				807	printf ("Can't handle k=%d\n", k);
				808	abort ();
				809	}
				810
				811	return step;
				812	}
				813
				814	mp_size_t
				815	fft_next_size (mp_size_t pl, int k)
				816	{
				817	mp_size_t m = fft_step_size (k);
				818
				819	/* printf ("[k=%d %ld] %ld ->", k, m, pl); */
				820
				821	if (pl == 0 \|\| (pl & (m-1)) != 0)
				822	pl = (pl \| (m-1)) + 1;
				823
				824	/* printf (" %ld\n", pl); */
				825	return pl;
				826	}
				827
				828	#define NMAX_DEFAULT 1000000
				829	#define MAX_REPS 25
				830	#define MIN_REPS 5
				831
				832	static inline size_t
				833	mpn_mul_fft_lcm (size_t a, unsigned int k)
				834	{
				835	unsigned int l = k;
				836
				837	while (a % 2 == 0 && k > 0)
				838	{
				839	a >>= 1;
				840	k--;
				841	}
				842	return a << l;
				843	}
				844
				845	mp_size_t
				846	fftfill (mp_size_t pl, int k, int sqr)
				847	{
				848	mp_size_t maxLK;
				849	mp_bitcnt_t N, Nprime, nprime, M;
				850
				851	N = pl * GMP_NUMB_BITS;
				852	M = N >> k;
				853
				854	maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
				855
				856	Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
				857	nprime = Nprime / GMP_NUMB_BITS;
				858	if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
				859	{
				860	size_t K2;
				861	for (;;)
				862	{
				863	K2 = 1L << mpn_fft_best_k (nprime, sqr);
				864	if ((nprime & (K2 - 1)) == 0)
				865	break;
				866	nprime = (nprime + K2 - 1) & -K2;
				867	Nprime = nprime * GMP_LIMB_BITS;
				868	}
				869	}
				870	ASSERT_ALWAYS (nprime < pl);
				871
				872	return Nprime;
				873	}
				874
				875	static int
				876	compare_double (const void ap, const void bp)
				877	{
				878	double a = * (const double *) ap;
				879	double b = * (const double *) bp;
				880
				881	if (a < b)
				882	return -1;
				883	else if (a > b)
				884	return 1;
				885	else
				886	return 0;
				887	}
				888
				889	double
				890	median (double *times, int n)
				891	{
				892	qsort (times, n, sizeof (double), compare_double);
				893	return times[n/2];
				894	}
				895
				896	#define FFT_CACHE_SIZE 25
				897	typedef struct fft_cache
				898	{
				899	mp_size_t n;
				900	double time;
				901	} fft_cache_t;
				902
				903	fft_cache_t fft_cache[FFT_CACHE_SIZE];
				904
				905	double
				906	cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
				907	int n_measurements)
				908	{
				909	int i;
				910	double t, ttab[MAX_REPS];
				911
				912	if (fft_cache[k].n == n)
				913	return fft_cache[k].time;
				914
				915	for (i = 0; i < n_measurements; i++)
				916	{
				917	speed_starttime ();
				918	mpn_mul_fft (rp, n, ap, n, bp, n, k);
				919	ttab[i] = speed_endtime ();
				920	}
				921
				922	t = median (ttab, n_measurements);
				923	fft_cache[k].n = n;
				924	fft_cache[k].time = t;
				925	return t;
				926	}
				927
				928	#define INSERT_FFTTAB(idx, nval, kval) \
				929	do { \
				930	fft_tab[idx].n = nval; \
				931	fft_tab[idx].k = kval; \
				932	fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \
				933	fft_tab[idx+1].k = (1 << 5) - 1; \
				934	} while (0)
				935
				936	int
				937	fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
				938	{
				939	mp_size_t n, n1, prev_n1;
				940	int k, best_k, last_best_k, kmax;
				941	int eff, prev_eff;
				942	double t0, t1;
				943	int n_measurements;
				944	mp_limb_t ap, bp, *rp;
				945	mp_size_t alloc;
				946	struct fft_table_nk *fft_tab;
				947
				948	fft_tab = mpn_fft_table3[p->sqr];
				949
				950	for (k = 0; k < FFT_CACHE_SIZE; k++)
				951	fft_cache[k].n = 0;
				952
				953	if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
				954	{
				955	nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
				956	}
				957
				958	if (print)
				959	printf ("#define %s%*s", p->table_name, 38, "");
				960
				961	if (idx == 0)
				962	{
				963	INSERT_FFTTAB (0, nmin, initial_k);
				964
				965	if (print)
				966	{
				967	printf ("\\\n { ");
				968	printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
				969	}
				970
				971	idx = 1;
				972	}
				973
				974	ap = (mp_ptr) malloc (sizeof (mp_limb_t));
				975	if (p->sqr)
				976	bp = ap;
				977	else
				978	bp = (mp_ptr) malloc (sizeof (mp_limb_t));
				979	rp = (mp_ptr) malloc (sizeof (mp_limb_t));
				980	alloc = 1;
				981
				982	/* Round n to comply to initial k value */
				983	n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
				984
				985	n_measurements = (18 - initial_k) \| 1;
				986	n_measurements = MAX (n_measurements, MIN_REPS);
				987	n_measurements = MIN (n_measurements, MAX_REPS);
				988
				989	last_best_k = initial_k;
				990	best_k = initial_k;
				991
				992	while (n < nmax)
				993	{
				994	int start_k, end_k;
				995
				996	/* Assume the current best k is best until we hit its next FFT step. */
				997	t0 = 99999;
				998
				999	prev_n1 = n + 1;
				1000
				1001	start_k = MAX (4, best_k - 4);
				1002	end_k = MIN (24, best_k + 4);
				1003	for (k = start_k; k <= end_k; k++)
				1004	{
				1005	n1 = mpn_fft_next_size (prev_n1, k);
				1006
				1007	eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
				1008
				1009	if (eff < 70) /* avoid measuring too slow fft:s */
				1010	continue;
				1011
				1012	if (n1 > alloc)
				1013	{
				1014	alloc = n1;
				1015	if (p->sqr)
				1016	{
				1017	ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
				1018	rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
				1019	ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
				1020	mpn_random (ap, alloc);
				1021	rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
				1022	}
				1023	else
				1024	{
				1025	ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
				1026	bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
				1027	rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
				1028	ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
				1029	mpn_random (ap, alloc);
				1030	bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
				1031	mpn_random (bp, alloc);
				1032	rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
				1033	}
				1034	}
				1035
				1036	t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
				1037
				1038	if (t1 * n_measurements > 0.3)
				1039	n_measurements -= 2;
				1040	n_measurements = MAX (n_measurements, MIN_REPS);
				1041
				1042	if (t1 < t0)
				1043	{
				1044	best_k = k;
				1045	t0 = t1;
				1046	}
				1047	}
				1048
				1049	n1 = mpn_fft_next_size (prev_n1, best_k);
				1050
				1051	if (last_best_k != best_k)
				1052	{
				1053	ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
				1054
				1055	if (idx >= FFT_TABLE3_SIZE)
				1056	{
				1057	printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
				1058	abort ();
				1059	}
				1060	INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
				1061
				1062	if (print)
				1063	{
				1064	printf (", ");
				1065	if (idx % 4 == 0)
				1066	printf ("\\\n ");
				1067	printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
				1068	}
				1069
				1070	if (option_trace >= 2)
				1071	{
				1072	printf ("{%lu,%u}\n", prev_n1, best_k);
				1073	fflush (stdout);
				1074	}
				1075
				1076	last_best_k = best_k;
				1077	idx++;
				1078	}
				1079
				1080	for (;;)
				1081	{
				1082	prev_n1 = n1;
				1083	prev_eff = fftfill (prev_n1, best_k, p->sqr);
				1084	n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
				1085	eff = fftfill (n1, best_k, p->sqr);
				1086
				1087	if (eff != prev_eff)
				1088	break;
				1089	}
				1090
				1091	n = prev_n1;
				1092	}
				1093
				1094	kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */
				1095	kmax = MIN (kmax, 25-1);
				1096	for (k = last_best_k + 1; k <= kmax; k++)
				1097	{
				1098	if (idx >= FFT_TABLE3_SIZE)
				1099	{
				1100	printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
				1101	abort ();
				1102	}
				1103	INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
				1104
				1105	if (print)
				1106	{
				1107	printf (", ");
				1108	if (idx % 4 == 0)
				1109	printf ("\\\n ");
				1110	printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
				1111	}
				1112
				1113	idx++;
				1114	}
				1115
				1116	if (print)
				1117	printf (" }\n");
				1118
				1119	free (ap);
				1120	if (! p->sqr)
				1121	free (bp);
				1122	free (rp);
				1123
				1124	return idx;
				1125	}
				1126
				1127	void
				1128	fft (struct fft_param_t *p)
				1129	{
				1130	mp_size_t size;
				1131	int k, idx, initial_k;
				1132
				1133	/* Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD */
				1134
				1135	#if 1
				1136	{
				1137	/* Use plain one() mechanism, for some reasonable initial values of k. The
				1138	advantage is that we don't depend on mpn_fft_table3, which can therefore
				1139	leave it completely uninitialized. */
				1140
				1141	static struct param_t param;
				1142	mp_size_t thres, best_thres;
				1143	int best_k;
				1144	char buf[20];
				1145
				1146	best_thres = MP_SIZE_T_MAX;
				1147	best_k = -1;
				1148
				1149	for (k = 5; k <= 7; k++)
				1150	{
				1151	param.name = p->modf_threshold_name;
				1152	param.min_size = 100;
				1153	param.max_size = 2000;
				1154	param.function = p->mul_function;
				1155	param.step_factor = 0.0;
				1156	param.step = 4;
				1157	param.function2 = p->mul_modf_function;
				1158	param.noprint = 1;
				1159	s.r = k;
				1160	one (&thres, &param);
				1161	if (thres < best_thres)
				1162	{
				1163	best_thres = thres;
				1164	best_k = k;
				1165	}
				1166	}
				1167
				1168	*(p->p_modf_threshold) = best_thres;
				1169	sprintf (buf, "k = %d", best_k);
				1170	print_define_remark (p->modf_threshold_name, best_thres, buf);
				1171	initial_k = best_k;
				1172	}
				1173	#else
				1174	size = p->first_size;
				1175	for (;;)
				1176	{
				1177	double tk, tm;
				1178
				1179	size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
				1180	k = mpn_fft_best_k (size, p->sqr);
				1181
				1182	if (size >= p->max_size)
				1183	break;
				1184
				1185	s.size = size + fft_step_size (k) / 2;
				1186	s.r = k;
				1187	tk = tuneup_measure (p->mul_modf_function, NULL, &s);
				1188	if (tk == -1.0)
				1189	abort ();
				1190
				1191	tm = tuneup_measure (p->mul_function, NULL, &s);
				1192	if (tm == -1.0)
				1193	abort ();
				1194
				1195	if (option_trace >= 2)
				1196	printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n",
				1197	(long) size,
				1198	(long) size + fft_step_size (k) / 2, k, tk,
				1199	(long) s.size, tm);
				1200
				1201	if (tk < tm)
				1202	{
				1203	*p->p_modf_threshold = s.size;
				1204	print_define (p->modf_threshold_name, *p->p_modf_threshold);
				1205	break;
				1206	}
				1207	}
				1208	initial_k = ?;
				1209	#endif
				1210
				1211	/* Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 */
				1212
				1213	idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
				1214	printf ("#define %s_SIZE %d\n", p->table_name, idx);
				1215
				1216	/* Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD */
				1217
				1218	size = 2 * p->p_modf_threshold; / OK? */
				1219	for (;;)
				1220	{
				1221	double tk, tm;
				1222	mp_size_t mulmod_size, mul_size;;
				1223
				1224	if (size >= p->max_size)
				1225	break;
				1226
				1227	mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
				1228	mul_size = (size + mulmod_size) / 2; /* middle of step */
				1229
				1230	s.size = mulmod_size;
				1231	tk = tuneup_measure (p->function, NULL, &s);
				1232	if (tk == -1.0)
				1233	abort ();
				1234
				1235	s.size = mul_size;
				1236	tm = tuneup_measure (p->mul_function, NULL, &s);
				1237	if (tm == -1.0)
				1238	abort ();
				1239
				1240	if (option_trace >= 2)
				1241	printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n",
				1242	(long) size,
				1243	(long) mulmod_size, tk,
				1244	(long) mul_size, tm);
				1245
				1246	size = mulmod_size;
				1247
				1248	if (tk < tm)
				1249	{
				1250	*p->p_threshold = s.size;
				1251	print_define (p->threshold_name, *p->p_threshold);
				1252	break;
				1253	}
				1254	}
				1255	}
				1256
				1257	/* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This
				1258	is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
				1259	This is used in get_str and set_str. */
				1260	void
				1261	relspeed_div_1_vs_mul_1 (void)
				1262	{
				1263	const size_t max_opsize = 100;
				1264	mp_size_t n;
				1265	long j;
				1266	mp_limb_t rp[max_opsize];
				1267	mp_limb_t ap[max_opsize];
				1268	double multime, divtime;
				1269
				1270	mpn_random (ap, max_opsize);
				1271
				1272	multime = 0;
				1273	for (n = max_opsize; n > 1; n--)
				1274	{
				1275	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
				1276	speed_starttime ();
				1277	for (j = speed_precision; j != 0 ; j--)
				1278	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
				1279	multime += speed_endtime () / n;
				1280	}
				1281
				1282	divtime = 0;
				1283	for (n = max_opsize; n > 1; n--)
				1284	{
				1285	/* Make input divisible for good measure. */
				1286	ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
				1287
				1288	#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 \|\| ! HAVE_NATIVE_mpn_divexact_1
				1289	mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
				1290	MP_BASES_BIG_BASE_BINVERTED_10,
				1291	MP_BASES_BIG_BASE_CTZ_10);
				1292	#else
				1293	mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
				1294	#endif
				1295	speed_starttime ();
				1296	for (j = speed_precision; j != 0 ; j--)
				1297	{
				1298	#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 \|\| ! HAVE_NATIVE_mpn_divexact_1
				1299	mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
				1300	MP_BASES_BIG_BASE_BINVERTED_10,
				1301	MP_BASES_BIG_BASE_CTZ_10);
				1302	#else
				1303	mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
				1304	#endif
				1305	}
				1306	divtime += speed_endtime () / n;
				1307	}
				1308
				1309	print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
				1310	}
				1311
				1312
				1313	/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
				1314	giving wrong results. */
				1315	void
				1316	tune_mul_n (void)
				1317	{
				1318	static struct param_t param;
				1319	mp_size_t next_toom_start;
				1320	int something_changed;
				1321
				1322	param.function = speed_mpn_mul_n;
				1323
				1324	param.name = "MUL_TOOM22_THRESHOLD";
				1325	param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
				1326	param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
				1327	one (&mul_toom22_threshold, &param);
				1328
				1329	param.noprint = 1;
				1330
				1331	/* Threshold sequence loop. Disable functions that would be used in a very
				1332	narrow range, re-measuring things when that happens. */
				1333	something_changed = 1;
				1334	while (something_changed)
				1335	{
				1336	something_changed = 0;
				1337
				1338	next_toom_start = mul_toom22_threshold;
				1339
				1340	if (mul_toom33_threshold != 0)
				1341	{
				1342	param.name = "MUL_TOOM33_THRESHOLD";
				1343	param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
				1344	param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
				1345	one (&mul_toom33_threshold, &param);
				1346
				1347	if (next_toom_start * 1.05 >= mul_toom33_threshold)
				1348	{
				1349	mul_toom33_threshold = 0;
				1350	something_changed = 1;
				1351	}
				1352	}
				1353
				1354	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
				1355
				1356	if (mul_toom44_threshold != 0)
				1357	{
				1358	param.name = "MUL_TOOM44_THRESHOLD";
				1359	param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
				1360	param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
				1361	one (&mul_toom44_threshold, &param);
				1362
				1363	if (next_toom_start * 1.05 >= mul_toom44_threshold)
				1364	{
				1365	mul_toom44_threshold = 0;
				1366	something_changed = 1;
				1367	}
				1368	}
				1369
				1370	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
				1371
				1372	if (mul_toom6h_threshold != 0)
				1373	{
				1374	param.name = "MUL_TOOM6H_THRESHOLD";
				1375	param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
				1376	param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
				1377	one (&mul_toom6h_threshold, &param);
				1378
				1379	if (next_toom_start * 1.05 >= mul_toom6h_threshold)
				1380	{
				1381	mul_toom6h_threshold = 0;
				1382	something_changed = 1;
				1383	}
				1384	}
				1385
				1386	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
				1387
				1388	if (mul_toom8h_threshold != 0)
				1389	{
				1390	param.name = "MUL_TOOM8H_THRESHOLD";
				1391	param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
				1392	param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
				1393	one (&mul_toom8h_threshold, &param);
				1394
				1395	if (next_toom_start * 1.05 >= mul_toom8h_threshold)
				1396	{
				1397	mul_toom8h_threshold = 0;
				1398	something_changed = 1;
				1399	}
				1400	}
				1401	}
				1402
				1403	print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
				1404	print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
				1405	print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
				1406	print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
				1407
				1408	/* disabled until tuned */
				1409	MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
				1410	}
				1411
				1412	void
				1413	tune_mul (void)
				1414	{
				1415	static struct param_t param;
				1416	mp_size_t thres;
				1417
				1418	param.noprint = 1;
				1419
				1420	param.function = speed_mpn_toom32_for_toom43_mul;
				1421	param.function2 = speed_mpn_toom43_for_toom32_mul;
				1422	param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
				1423	param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
				1424	one (&thres, &param);
				1425	mul_toom32_to_toom43_threshold = thres * 17 / 24;
				1426	print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
				1427
				1428	param.function = speed_mpn_toom32_for_toom53_mul;
				1429	param.function2 = speed_mpn_toom53_for_toom32_mul;
				1430	param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
				1431	param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
				1432	one (&thres, &param);
				1433	mul_toom32_to_toom53_threshold = thres * 19 / 30;
				1434	print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
				1435
				1436	param.function = speed_mpn_toom42_for_toom53_mul;
				1437	param.function2 = speed_mpn_toom53_for_toom42_mul;
				1438	param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
				1439	param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
				1440	one (&thres, &param);
				1441	mul_toom42_to_toom53_threshold = thres * 11 / 20;
				1442	print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
				1443
				1444	param.function = speed_mpn_toom42_mul;
				1445	param.function2 = speed_mpn_toom63_mul;
				1446	param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
				1447	param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
				1448	one (&thres, &param);
				1449	mul_toom42_to_toom63_threshold = thres / 2;
				1450	print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
				1451
				1452	/* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
				1453	param.function = speed_mpn_toom43_for_toom54_mul;
				1454	param.function2 = speed_mpn_toom54_for_toom43_mul;
				1455	param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
				1456	param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
				1457	one (&thres, &param);
				1458	mul_toom43_to_toom54_threshold = thres * 5 / 6;
				1459	print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
				1460	}
				1461
				1462
				1463	void
				1464	tune_mullo (void)
				1465	{
				1466	static struct param_t param;
				1467
				1468	param.function = speed_mpn_mullo_n;
				1469
				1470	param.name = "MULLO_BASECASE_THRESHOLD";
				1471	param.min_size = 2;
				1472	param.min_is_always = 1;
				1473	param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
				1474	param.stop_factor = 1.5;
				1475	param.noprint = 1;
				1476	one (&mullo_basecase_threshold, &param);
				1477
				1478	param.name = "MULLO_DC_THRESHOLD";
				1479	param.min_size = 8;
				1480	param.min_is_always = 0;
				1481	param.max_size = 1000;
				1482	one (&mullo_dc_threshold, &param);
				1483
				1484	if (mullo_basecase_threshold >= mullo_dc_threshold)
				1485	{
				1486	print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
				1487	print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
				1488	}
				1489	else
				1490	{
				1491	print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
				1492	print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
				1493	}
				1494
				1495	if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
				1496	{
				1497	param.name = "MULLO_MUL_N_THRESHOLD";
				1498	param.min_size = mullo_dc_threshold;
				1499	param.max_size = 2 * mul_fft_threshold;
				1500	param.noprint = 0;
				1501	param.step_factor = 0.03;
				1502	one (&mullo_mul_n_threshold, &param);
				1503	}
				1504	else
				1505	print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
				1506	"without FFT use mullo forever");
				1507	}
				1508
				1509	void
				1510	tune_sqrlo (void)
				1511	{
				1512	static struct param_t param;
				1513
				1514	param.function = speed_mpn_sqrlo;
				1515
				1516	param.name = "SQRLO_BASECASE_THRESHOLD";
				1517	param.min_size = 2;
				1518	param.min_is_always = 1;
				1519	param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
				1520	param.stop_factor = 1.5;
				1521	param.noprint = 1;
				1522	one (&sqrlo_basecase_threshold, &param);
				1523
				1524	param.name = "SQRLO_DC_THRESHOLD";
				1525	param.min_size = 8;
				1526	param.min_is_always = 0;
				1527	param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
				1528	one (&sqrlo_dc_threshold, &param);
				1529
				1530	if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
				1531	{
				1532	print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
				1533	print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
				1534	}
				1535	else
				1536	{
				1537	print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
				1538	print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
				1539	}
				1540
				1541	if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
				1542	{
				1543	param.name = "SQRLO_SQR_THRESHOLD";
				1544	param.min_size = sqrlo_dc_threshold;
				1545	param.max_size = 2 * sqr_fft_threshold;
				1546	param.noprint = 0;
				1547	param.step_factor = 0.03;
				1548	one (&sqrlo_sqr_threshold, &param);
				1549	}
				1550	else
				1551	print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
				1552	"without FFT use sqrlo forever");
				1553	}
				1554
				1555	void
				1556	tune_mulmid (void)
				1557	{
				1558	static struct param_t param;
				1559
				1560	param.name = "MULMID_TOOM42_THRESHOLD";
				1561	param.function = speed_mpn_mulmid_n;
				1562	param.min_size = 4;
				1563	param.max_size = 100;
				1564	one (&mulmid_toom42_threshold, &param);
				1565	}
				1566
				1567	void
				1568	tune_mulmod_bnm1 (void)
				1569	{
				1570	static struct param_t param;
				1571
				1572	param.name = "MULMOD_BNM1_THRESHOLD";
				1573	param.function = speed_mpn_mulmod_bnm1;
				1574	param.min_size = 4;
				1575	param.max_size = 100;
				1576	one (&mulmod_bnm1_threshold, &param);
				1577	}
				1578
				1579	void
				1580	tune_sqrmod_bnm1 (void)
				1581	{
				1582	static struct param_t param;
				1583
				1584	param.name = "SQRMOD_BNM1_THRESHOLD";
				1585	param.function = speed_mpn_sqrmod_bnm1;
				1586	param.min_size = 4;
				1587	param.max_size = 100;
				1588	one (&sqrmod_bnm1_threshold, &param);
				1589	}
				1590
				1591
				1592	/* Start the basecase from 3, since 1 is a special case, and if mul_basecase
				1593	is faster only at size==2 then we don't want to bother with extra code
				1594	just for that. Start karatsuba from 4 same as MUL above. */
				1595
				1596	void
				1597	tune_sqr (void)
				1598	{
				1599	/* disabled until tuned */
				1600	SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
				1601
				1602	if (HAVE_NATIVE_mpn_sqr_basecase)
				1603	{
				1604	print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
				1605	sqr_basecase_threshold = 0;
				1606	}
				1607	else
				1608	{
				1609	static struct param_t param;
				1610	param.name = "SQR_BASECASE_THRESHOLD";
				1611	param.function = speed_mpn_sqr;
				1612	param.min_size = 3;
				1613	param.min_is_always = 1;
				1614	param.max_size = TUNE_SQR_TOOM2_MAX;
				1615	param.noprint = 1;
				1616	one (&sqr_basecase_threshold, &param);
				1617	}
				1618
				1619	{
				1620	static struct param_t param;
				1621	param.name = "SQR_TOOM2_THRESHOLD";
				1622	param.function = speed_mpn_sqr;
				1623	param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
				1624	param.max_size = TUNE_SQR_TOOM2_MAX;
				1625	param.noprint = 1;
				1626	one (&sqr_toom2_threshold, &param);
				1627
				1628	if (! HAVE_NATIVE_mpn_sqr_basecase
				1629	&& sqr_toom2_threshold < sqr_basecase_threshold)
				1630	{
				1631	/* Karatsuba becomes faster than mul_basecase before
				1632	sqr_basecase does. Arrange for the expression
				1633	"BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
				1634	selects mpn_sqr_basecase in mpn_sqr to be false, by setting
				1635	SQR_TOOM2_THRESHOLD to zero, making
				1636	SQR_BASECASE_THRESHOLD the toom2 threshold. */
				1637
				1638	sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
				1639	SQR_TOOM2_THRESHOLD = 0;
				1640
				1641	print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
				1642	"toom2");
				1643	print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
				1644	"never sqr_basecase");
				1645	}
				1646	else
				1647	{
				1648	if (! HAVE_NATIVE_mpn_sqr_basecase)
				1649	print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
				1650	print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
				1651	}
				1652	}
				1653
				1654	{
				1655	static struct param_t param;
				1656	mp_size_t next_toom_start;
				1657	int something_changed;
				1658
				1659	param.function = speed_mpn_sqr;
				1660	param.noprint = 1;
				1661
				1662	/* Threshold sequence loop. Disable functions that would be used in a very
				1663	narrow range, re-measuring things when that happens. */
				1664	something_changed = 1;
				1665	while (something_changed)
				1666	{
				1667	something_changed = 0;
				1668
				1669	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
				1670
				1671	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
				1672	param.name = "SQR_TOOM3_THRESHOLD";
				1673	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
				1674	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
				1675	one (&sqr_toom3_threshold, &param);
				1676
				1677	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
				1678
				1679	if (sqr_toom4_threshold != 0)
				1680	{
				1681	param.name = "SQR_TOOM4_THRESHOLD";
				1682	sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
				1683	param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
				1684	param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
				1685	one (&sqr_toom4_threshold, &param);
				1686
				1687	if (next_toom_start * 1.05 >= sqr_toom4_threshold)
				1688	{
				1689	sqr_toom4_threshold = 0;
				1690	something_changed = 1;
				1691	}
				1692	}
				1693
				1694	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
				1695
				1696	if (sqr_toom6_threshold != 0)
				1697	{
				1698	param.name = "SQR_TOOM6_THRESHOLD";
				1699	sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
				1700	param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
				1701	param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
				1702	one (&sqr_toom6_threshold, &param);
				1703
				1704	if (next_toom_start * 1.05 >= sqr_toom6_threshold)
				1705	{
				1706	sqr_toom6_threshold = 0;
				1707	something_changed = 1;
				1708	}
				1709	}
				1710
				1711	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
				1712
				1713	if (sqr_toom8_threshold != 0)
				1714	{
				1715	param.name = "SQR_TOOM8_THRESHOLD";
				1716	sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
				1717	param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
				1718	param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
				1719	one (&sqr_toom8_threshold, &param);
				1720
				1721	if (next_toom_start * 1.05 >= sqr_toom8_threshold)
				1722	{
				1723	sqr_toom8_threshold = 0;
				1724	something_changed = 1;
				1725	}
				1726	}
				1727	}
				1728
				1729	print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
				1730	print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
				1731	print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
				1732	print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
				1733	}
				1734	}
				1735
				1736
				1737	void
				1738	tune_dc_div (void)
				1739	{
				1740	s.r = 0; /* clear to make speed function do 2n/n */
				1741	{
				1742	static struct param_t param;
				1743	param.name = "DC_DIV_QR_THRESHOLD";
				1744	param.function = speed_mpn_sbpi1_div_qr;
				1745	param.function2 = speed_mpn_dcpi1_div_qr;
				1746	param.min_size = 6;
				1747	one (&dc_div_qr_threshold, &param);
				1748	}
				1749	{
				1750	static struct param_t param;
				1751	param.name = "DC_DIVAPPR_Q_THRESHOLD";
				1752	param.function = speed_mpn_sbpi1_divappr_q;
				1753	param.function2 = speed_mpn_dcpi1_divappr_q;
				1754	param.min_size = 6;
				1755	one (&dc_divappr_q_threshold, &param);
				1756	}
				1757	}
				1758
				1759	static double
				1760	speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
				1761	{
				1762	if (s->size < DC_DIV_QR_THRESHOLD)
				1763	return speed_mpn_sbpi1_div_qr (s);
				1764	else
				1765	return speed_mpn_dcpi1_div_qr (s);
				1766	}
				1767
				1768	void
				1769	tune_mu_div (void)
				1770	{
				1771	s.r = 0; /* clear to make speed function do 2n/n */
				1772	{
				1773	static struct param_t param;
				1774	param.name = "MU_DIV_QR_THRESHOLD";
				1775	param.function = speed_mpn_dcpi1_div_qr;
				1776	param.function2 = speed_mpn_mu_div_qr;
				1777	param.min_size = mul_toom22_threshold;
				1778	param.max_size = 5000;
				1779	param.step_factor = 0.02;
				1780	one (&mu_div_qr_threshold, &param);
				1781	}
				1782	{
				1783	static struct param_t param;
				1784	param.name = "MU_DIVAPPR_Q_THRESHOLD";
				1785	param.function = speed_mpn_dcpi1_divappr_q;
				1786	param.function2 = speed_mpn_mu_divappr_q;
				1787	param.min_size = mul_toom22_threshold;
				1788	param.max_size = 5000;
				1789	param.step_factor = 0.02;
				1790	one (&mu_divappr_q_threshold, &param);
				1791	}
				1792	{
				1793	static struct param_t param;
				1794	param.name = "MUPI_DIV_QR_THRESHOLD";
				1795	param.function = speed_mpn_sbordcpi1_div_qr;
				1796	param.function2 = speed_mpn_mupi_div_qr;
				1797	param.min_size = 6;
				1798	param.min_is_always = 1;
				1799	param.max_size = 1000;
				1800	param.step_factor = 0.02;
				1801	one (&mupi_div_qr_threshold, &param);
				1802	}
				1803	}
				1804
				1805	void
				1806	tune_dc_bdiv (void)
				1807	{
				1808	s.r = 0; /* clear to make speed function do 2n/n*/
				1809	{
				1810	static struct param_t param;
				1811	param.name = "DC_BDIV_QR_THRESHOLD";
				1812	param.function = speed_mpn_sbpi1_bdiv_qr;
				1813	param.function2 = speed_mpn_dcpi1_bdiv_qr;
				1814	param.min_size = 4;
				1815	one (&dc_bdiv_qr_threshold, &param);
				1816	}
				1817	{
				1818	static struct param_t param;
				1819	param.name = "DC_BDIV_Q_THRESHOLD";
				1820	param.function = speed_mpn_sbpi1_bdiv_q;
				1821	param.function2 = speed_mpn_dcpi1_bdiv_q;
				1822	param.min_size = 4;
				1823	one (&dc_bdiv_q_threshold, &param);
				1824	}
				1825	}
				1826
				1827	void
				1828	tune_mu_bdiv (void)
				1829	{
				1830	s.r = 0; /* clear to make speed function do 2n/n*/
				1831	{
				1832	static struct param_t param;
				1833	param.name = "MU_BDIV_QR_THRESHOLD";
				1834	param.function = speed_mpn_dcpi1_bdiv_qr;
				1835	param.function2 = speed_mpn_mu_bdiv_qr;
				1836	param.min_size = dc_bdiv_qr_threshold;
				1837	param.max_size = 5000;
				1838	param.step_factor = 0.02;
				1839	one (&mu_bdiv_qr_threshold, &param);
				1840	}
				1841	{
				1842	static struct param_t param;
				1843	param.name = "MU_BDIV_Q_THRESHOLD";
				1844	param.function = speed_mpn_dcpi1_bdiv_q;
				1845	param.function2 = speed_mpn_mu_bdiv_q;
				1846	param.min_size = dc_bdiv_q_threshold;
				1847	param.max_size = 5000;
				1848	param.step_factor = 0.02;
				1849	one (&mu_bdiv_q_threshold, &param);
				1850	}
				1851	}
				1852
				1853	void
				1854	tune_invertappr (void)
				1855	{
				1856	static struct param_t param;
				1857
				1858	param.function = speed_mpn_ni_invertappr;
				1859	param.name = "INV_MULMOD_BNM1_THRESHOLD";
				1860	param.min_size = 5;
				1861	one (&inv_mulmod_bnm1_threshold, &param);
				1862
				1863	param.function = speed_mpn_invertappr;
				1864	param.name = "INV_NEWTON_THRESHOLD";
				1865	param.min_size = 5;
				1866	one (&inv_newton_threshold, &param);
				1867	}
				1868
				1869	void
				1870	tune_invert (void)
				1871	{
				1872	static struct param_t param;
				1873
				1874	param.function = speed_mpn_invert;
				1875	param.name = "INV_APPR_THRESHOLD";
				1876	param.min_size = 5;
				1877	one (&inv_appr_threshold, &param);
				1878	}
				1879
				1880	void
				1881	tune_binvert (void)
				1882	{
				1883	static struct param_t param;
				1884
				1885	param.function = speed_mpn_binvert;
				1886	param.name = "BINV_NEWTON_THRESHOLD";
				1887	param.min_size = 8; /* pointless with smaller operands */
				1888	one (&binv_newton_threshold, &param);
				1889	}
				1890
				1891	void
				1892	tune_redc (void)
				1893	{
				1894	#define TUNE_REDC_2_MAX 100
				1895	#if HAVE_NATIVE_mpn_addmul_2 \|\| HAVE_NATIVE_mpn_redc_2
				1896	#define WANT_REDC_2 1
				1897	#endif
				1898
				1899	#if WANT_REDC_2
				1900	{
				1901	static struct param_t param;
				1902	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
				1903	param.function = speed_mpn_redc_1;
				1904	param.function2 = speed_mpn_redc_2;
				1905	param.min_size = 1;
				1906	param.min_is_always = 1;
				1907	param.max_size = TUNE_REDC_2_MAX;
				1908	param.noprint = 1;
				1909	param.stop_factor = 1.5;
				1910	one (&redc_1_to_redc_2_threshold, &param);
				1911	}
				1912	{
				1913	static struct param_t param;
				1914	param.name = "REDC_2_TO_REDC_N_THRESHOLD";
				1915	param.function = speed_mpn_redc_2;
				1916	param.function2 = speed_mpn_redc_n;
				1917	param.min_size = 16;
				1918	param.noprint = 1;
				1919	one (&redc_2_to_redc_n_threshold, &param);
				1920	}
				1921	if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
				1922	{
				1923	redc_2_to_redc_n_threshold = 0; /* disable redc_2 */
				1924
				1925	/* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
				1926	REDC_1_TO_REDC_2_THRESHOLD. */
				1927	{
				1928	static struct param_t param;
				1929	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
				1930	param.function = speed_mpn_redc_1;
				1931	param.function2 = speed_mpn_redc_n;
				1932	param.min_size = 16;
				1933	param.noprint = 1;
				1934	one (&redc_1_to_redc_2_threshold, &param);
				1935	}
				1936	}
				1937	print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
				1938	print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
				1939	#else
				1940	{
				1941	static struct param_t param;
				1942	param.name = "REDC_1_TO_REDC_N_THRESHOLD";
				1943	param.function = speed_mpn_redc_1;
				1944	param.function2 = speed_mpn_redc_n;
				1945	param.min_size = 16;
				1946	one (&redc_1_to_redc_n_threshold, &param);
				1947	}
				1948	#endif
				1949	}
				1950
				1951	void
				1952	tune_matrix22_mul (void)
				1953	{
				1954	static struct param_t param;
				1955	param.name = "MATRIX22_STRASSEN_THRESHOLD";
				1956	param.function = speed_mpn_matrix22_mul;
				1957	param.min_size = 2;
				1958	one (&matrix22_strassen_threshold, &param);
				1959	}
				1960
				1961	void
				1962	tune_hgcd2 (void)
				1963	{
				1964	static struct param_t param;
				1965	hgcd2_func_t *f[5] =
				1966	{ mpn_hgcd2_1,
				1967	mpn_hgcd2_2,
				1968	mpn_hgcd2_3,
				1969	mpn_hgcd2_4,
				1970	mpn_hgcd2_5 };
				1971	speed_function_t speed_f[5] =
				1972	{ speed_mpn_hgcd2_1,
				1973	speed_mpn_hgcd2_2,
				1974	speed_mpn_hgcd2_3,
				1975	speed_mpn_hgcd2_4,
				1976	speed_mpn_hgcd2_5 };
				1977	int best;
				1978
				1979	s.size = 1;
				1980	best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
				1981
				1982	/* Use selected function when tuning hgcd and gcd */
				1983	hgcd2_func = f[best];
				1984	}
				1985
				1986	void
				1987	tune_hgcd (void)
				1988	{
				1989	static struct param_t param;
				1990	param.name = "HGCD_THRESHOLD";
				1991	param.function = speed_mpn_hgcd;
				1992	/* We seem to get strange results for small sizes */
				1993	param.min_size = 30;
				1994	one (&hgcd_threshold, &param);
				1995	}
				1996
				1997	void
				1998	tune_hgcd_appr (void)
				1999	{
				2000	static struct param_t param;
				2001	param.name = "HGCD_APPR_THRESHOLD";
				2002	param.function = speed_mpn_hgcd_appr;
				2003	/* We seem to get strange results for small sizes */
				2004	param.min_size = 50;
				2005	param.stop_since_change = 150;
				2006	one (&hgcd_appr_threshold, &param);
				2007	}
				2008
				2009	void
				2010	tune_hgcd_reduce (void)
				2011	{
				2012	static struct param_t param;
				2013	param.name = "HGCD_REDUCE_THRESHOLD";
				2014	param.function = speed_mpn_hgcd_reduce;
				2015	param.min_size = 30;
				2016	param.max_size = 7000;
				2017	param.step_factor = 0.04;
				2018	one (&hgcd_reduce_threshold, &param);
				2019	}
				2020
				2021	void
				2022	tune_gcd_dc (void)
				2023	{
				2024	static struct param_t param;
				2025	param.name = "GCD_DC_THRESHOLD";
				2026	param.function = speed_mpn_gcd;
				2027	param.min_size = hgcd_threshold;
				2028	param.max_size = 3000;
				2029	param.step_factor = 0.02;
				2030	one (&gcd_dc_threshold, &param);
				2031	}
				2032
				2033	void
				2034	tune_gcdext_dc (void)
				2035	{
				2036	static struct param_t param;
				2037	param.name = "GCDEXT_DC_THRESHOLD";
				2038	param.function = speed_mpn_gcdext;
				2039	param.min_size = hgcd_threshold;
				2040	param.max_size = 3000;
				2041	param.step_factor = 0.02;
				2042	one (&gcdext_dc_threshold, &param);
				2043	}
				2044
				2045	/* In tune_powm_sec we compute the table used by the win_size function. The
				2046	cutoff points are in exponent bits, disregarding other operand sizes. It is
				2047	not possible to use the one framework since it currently uses a granularity
				2048	of full limbs.
				2049	*/
				2050
				2051	/* This win_size replaces the variant in the powm code, allowing us to
				2052	control k in the k-ary algorithms. */
				2053	int winsize;
				2054	int
				2055	win_size (mp_bitcnt_t eb)
				2056	{
				2057	return winsize;
				2058	}
				2059
				2060	void
				2061	tune_powm_sec (void)
				2062	{
				2063	mp_size_t n;
				2064	int k, i;
				2065	mp_size_t itch;
				2066	mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
				2067	const int n_max = 3000 / GMP_NUMB_BITS;
				2068	const int n_measurements = 5;
				2069	mp_ptr rp, bp, ep, mp, tp;
				2070	double ttab[n_measurements], tk, tkp1;
				2071	TMP_DECL;
				2072	TMP_MARK;
				2073
				2074	possible_nbits_cutoff = 0;
				2075
				2076	k = 1;
				2077
				2078	winsize = 10; /* the itch function needs this */
				2079	itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
				2080
				2081	rp = TMP_ALLOC_LIMBS (n_max);
				2082	bp = TMP_ALLOC_LIMBS (n_max);
				2083	ep = TMP_ALLOC_LIMBS (n_max);
				2084	mp = TMP_ALLOC_LIMBS (n_max);
				2085	tp = TMP_ALLOC_LIMBS (itch);
				2086
				2087	mpn_random (bp, n_max);
				2088	mpn_random (mp, n_max);
				2089	mp[0] \|= 1;
				2090
				2091	/* How about taking the M operand size into account?
				2092
				2093	An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
				2094	B = O(M)).
				2095
				2096	Using k-ary and no sliding window, the precomputation will need time
				2097	O(2^(k-1)M(log(N))) and the main computation will need O(log(E)S(N)) +
				2098	O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
				2099
				2100	An operation R=powm_sec(B,E,N) will take time like powm.
				2101
				2102	Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
				2103	main computation will need O(log(E)S(N)) + O(log(E)/kM(N)) +
				2104	O(log(E)/k2^klog(N)), for the squarings, multiplications, and full
				2105	table reads, respectively. */
				2106
				2107	printf ("#define POWM_SEC_TABLE ");
				2108
				2109	/* For nbits == 1, we should always use k == 1, so no need to tune
				2110	that. Starting with nbits == 2 also ensure that nbits always is
				2111	larger than the windowsize k+1. */
				2112	for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
				2113	{
				2114	n = (nbits - 1) / GMP_NUMB_BITS + 1;
				2115
				2116	/* Generate E such that sliding-window for k and k+1 works equally
				2117	well/poorly (but sliding is not used in powm_sec, of course). */
				2118	for (i = 0; i < n; i++)
				2119	ep[i] = ~CNST_LIMB(0);
				2120
				2121	winsize = k;
				2122	for (i = 0; i < n_measurements; i++)
				2123	{
				2124	speed_starttime ();
				2125	mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
				2126	ttab[i] = speed_endtime ();
				2127	}
				2128	tk = median (ttab, n_measurements);
				2129
				2130	winsize = k + 1;
				2131	speed_starttime ();
				2132	for (i = 0; i < n_measurements; i++)
				2133	{
				2134	speed_starttime ();
				2135	mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
				2136	ttab[i] = speed_endtime ();
				2137	}
				2138	tkp1 = median (ttab, n_measurements);
				2139	/*
				2140	printf ("testing: %ld, %d", nbits, k, ep[n-1]);
				2141	printf (" %10.5f %10.5f\n", tk, tkp1);
				2142	*/
				2143	if (tkp1 < tk)
				2144	{
				2145	if (possible_nbits_cutoff)
				2146	{
				2147	/* Two consecutive sizes indicate k increase, obey. */
				2148
				2149	/* Must always have x[k] >= k */
				2150	ASSERT_ALWAYS (possible_nbits_cutoff >= k);
				2151
				2152	if (k > 1)
				2153	printf (",");
				2154	printf ("%ld", (long) possible_nbits_cutoff);
				2155	k++;
				2156	possible_nbits_cutoff = 0;
				2157	}
				2158	else
				2159	{
				2160	/* One measurement indicate k increase, save nbits for further
				2161	consideration. */
				2162	/* The new larger k gets used for sizes > the cutoff
				2163	value, hence the cutoff should be one less than the
				2164	smallest size where it gives a speedup. */
				2165	possible_nbits_cutoff = nbits - 1;
				2166	}
				2167	}
				2168	else
				2169	possible_nbits_cutoff = 0;
				2170
				2171	nbits_next = nbits * 65 / 64;
				2172	nbits = nbits_next + (nbits_next == nbits);
				2173	}
				2174	printf ("\n");
				2175	TMP_FREE;
				2176	}
				2177
				2178
				2179	/* size_extra==1 reflects the fact that with high<divisor one division is
				2180	always skipped. Forcing high<divisor while testing ensures consistency
				2181	while stepping through sizes, ie. that size-1 divides will be done each
				2182	time.
				2183
				2184	min_size==2 and min_is_always are used so that if plain division is only
				2185	better at size==1 then don't bother including that code just for that
				2186	case, instead go with preinv always and get a size saving. */
				2187
				2188	#define DIV_1_PARAMS \
				2189	param.check_size = 256; \
				2190	param.min_size = 2; \
				2191	param.min_is_always = 1; \
				2192	param.data_high = DATA_HIGH_LT_R; \
				2193	param.size_extra = 1; \
				2194	param.stop_factor = 2.0;
				2195
				2196
				2197	double (tuned_speed_mpn_divrem_1) (struct speed_params );
				2198
				2199	void
				2200	tune_divrem_1 (void)
				2201	{
				2202	/* plain version by default */
				2203	tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
				2204
				2205	/* No support for tuning native assembler code, do that by hand and put
				2206	the results in the .asm file, there's no need for such thresholds to
				2207	appear in gmp-mparam.h. */
				2208	if (HAVE_NATIVE_mpn_divrem_1)
				2209	return;
				2210
				2211	if (GMP_NAIL_BITS != 0)
				2212	{
				2213	print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
				2214	"no preinv with nails");
				2215	print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
				2216	"no preinv with nails");
				2217	return;
				2218	}
				2219
				2220	if (UDIV_PREINV_ALWAYS)
				2221	{
				2222	print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
				2223	print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
				2224	return;
				2225	}
				2226
				2227	tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
				2228
				2229	/* Tune for the integer part of mpn_divrem_1. This will very possibly be
				2230	a bit out for the fractional part, but that's too bad, the integer part
				2231	is more important. */
				2232	{
				2233	static struct param_t param;
				2234	param.name = "DIVREM_1_NORM_THRESHOLD";
				2235	DIV_1_PARAMS;
				2236	s.r = randlimb_norm ();
				2237	param.function = speed_mpn_divrem_1_tune;
				2238	one (&divrem_1_norm_threshold, &param);
				2239	}
				2240	{
				2241	static struct param_t param;
				2242	param.name = "DIVREM_1_UNNORM_THRESHOLD";
				2243	DIV_1_PARAMS;
				2244	s.r = randlimb_half ();
				2245	param.function = speed_mpn_divrem_1_tune;
				2246	one (&divrem_1_unnorm_threshold, &param);
				2247	}
				2248	}
				2249
				2250	void
				2251	tune_div_qr_1 (void)
				2252	{
				2253	if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
				2254	{
				2255	static struct param_t param;
				2256	speed_function_t f[2] =
				2257	{
				2258	speed_mpn_div_qr_1n_pi1_1,
				2259	speed_mpn_div_qr_1n_pi1_2,
				2260	};
				2261
				2262	s.size = 10;
				2263	s.r = randlimb_norm ();
				2264
				2265	one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
				2266	}
				2267
				2268	{
				2269	static struct param_t param;
				2270	param.name = "DIV_QR_1_NORM_THRESHOLD";
				2271	DIV_1_PARAMS;
				2272	param.min_size = 1;
				2273	param.min_is_always = 0;
				2274	s.r = randlimb_norm ();
				2275	param.function = speed_mpn_div_qr_1_tune;
				2276	one (&div_qr_1_norm_threshold, &param);
				2277	}
				2278	{
				2279	static struct param_t param;
				2280	param.name = "DIV_QR_1_UNNORM_THRESHOLD";
				2281	DIV_1_PARAMS;
				2282	param.min_size = 1;
				2283	param.min_is_always = 0;
				2284	s.r = randlimb_half();
				2285	param.function = speed_mpn_div_qr_1_tune;
				2286	one (&div_qr_1_unnorm_threshold, &param);
				2287	}
				2288	}
				2289
				2290
				2291	void
				2292	tune_mod_1 (void)
				2293	{
				2294	/* No support for tuning native assembler code, do that by hand and put
				2295	the results in the .asm file, there's no need for such thresholds to
				2296	appear in gmp-mparam.h. */
				2297	if (HAVE_NATIVE_mpn_mod_1)
				2298	return;
				2299
				2300	if (GMP_NAIL_BITS != 0)
				2301	{
				2302	print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
				2303	"no preinv with nails");
				2304	print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
				2305	"no preinv with nails");
				2306	return;
				2307	}
				2308
				2309	if (!HAVE_NATIVE_mpn_mod_1_1p)
				2310	{
				2311	static struct param_t param;
				2312	speed_function_t f[2] =
				2313	{
				2314	speed_mpn_mod_1_1_1,
				2315	speed_mpn_mod_1_1_2,
				2316	};
				2317
				2318	s.size = 10;
				2319	s.r = randlimb_half ();
				2320	one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
				2321	}
				2322
				2323	if (UDIV_PREINV_ALWAYS)
				2324	{
				2325	print_define ("MOD_1_NORM_THRESHOLD", 0L);
				2326	print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
				2327	}
				2328	else
				2329	{
				2330	{
				2331	static struct param_t param;
				2332	param.name = "MOD_1_NORM_THRESHOLD";
				2333	DIV_1_PARAMS;
				2334	s.r = randlimb_norm ();
				2335	param.function = speed_mpn_mod_1_tune;
				2336	one (&mod_1_norm_threshold, &param);
				2337	}
				2338	{
				2339	static struct param_t param;
				2340	param.name = "MOD_1_UNNORM_THRESHOLD";
				2341	DIV_1_PARAMS;
				2342	s.r = randlimb_half ();
				2343	param.function = speed_mpn_mod_1_tune;
				2344	one (&mod_1_unnorm_threshold, &param);
				2345	}
				2346	}
				2347	{
				2348	static struct param_t param;
				2349
				2350	param.check_size = 256;
				2351
				2352	s.r = randlimb_norm ();
				2353	param.function = speed_mpn_mod_1_tune;
				2354
				2355	param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
				2356	param.min_size = 2;
				2357	one (&mod_1n_to_mod_1_1_threshold, &param);
				2358	}
				2359
				2360	{
				2361	static struct param_t param;
				2362
				2363	param.check_size = 256;
				2364	s.r = randlimb_half ();
				2365	param.noprint = 1;
				2366
				2367	param.function = speed_mpn_mod_1_1;
				2368	param.function2 = speed_mpn_mod_1_2;
				2369	param.min_is_always = 1;
				2370	param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
				2371	param.min_size = 2;
				2372	one (&mod_1_1_to_mod_1_2_threshold, &param);
				2373
				2374	param.function = speed_mpn_mod_1_2;
				2375	param.function2 = speed_mpn_mod_1_4;
				2376	param.min_is_always = 1;
				2377	param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
				2378	param.min_size = 1;
				2379	one (&mod_1_2_to_mod_1_4_threshold, &param);
				2380
				2381	if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
				2382	{
				2383	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
				2384	mod_1_2_to_mod_1_4_threshold = 0;
				2385
				2386	param.function = speed_mpn_mod_1_1;
				2387	param.function2 = speed_mpn_mod_1_4;
				2388	param.min_is_always = 1;
				2389	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
				2390	param.min_size = 2;
				2391	one (&mod_1_1_to_mod_1_2_threshold, &param);
				2392	}
				2393
				2394	param.function = speed_mpn_mod_1_tune;
				2395	param.function2 = NULL;
				2396	param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
				2397	param.min_size = 2;
				2398	param.min_is_always = 0;
				2399	one (&mod_1u_to_mod_1_1_threshold, &param);
				2400
				2401	if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
				2402	mod_1_1_to_mod_1_2_threshold = 0;
				2403	if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
				2404	mod_1_2_to_mod_1_4_threshold = 0;
				2405
				2406	print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
				2407	print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
				2408	mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
				2409	print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
				2410	mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
				2411	}
				2412
				2413	{
				2414	static struct param_t param;
				2415
				2416	param.check_size = 256;
				2417
				2418	param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
				2419	s.r = randlimb_norm ();
				2420	param.function = speed_mpn_preinv_mod_1;
				2421	param.function2 = speed_mpn_mod_1_tune;
				2422	param.min_size = 1;
				2423	one (&preinv_mod_1_to_mod_1_threshold, &param);
				2424	}
				2425	}
				2426
				2427
				2428	/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
				2429	imply that udiv_qrnnd_preinv is worth using, but it seems most
				2430	straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
				2431	directly. */
				2432
				2433	void
				2434	tune_preinv_divrem_1 (void)
				2435	{
				2436	static struct param_t param;
				2437	speed_function_t divrem_1;
				2438	const char *divrem_1_name;
				2439	double t1, t2;
				2440
				2441	if (GMP_NAIL_BITS != 0)
				2442	{
				2443	print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
				2444	return;
				2445	}
				2446
				2447	/* Any native version of mpn_preinv_divrem_1 is assumed to exist because
				2448	it's faster than mpn_divrem_1. */
				2449	if (HAVE_NATIVE_mpn_preinv_divrem_1)
				2450	{
				2451	print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
				2452	return;
				2453	}
				2454
				2455	/* If udiv_qrnnd_preinv is the only division method then of course
				2456	mpn_preinv_divrem_1 should be used. */
				2457	if (UDIV_PREINV_ALWAYS)
				2458	{
				2459	print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
				2460	return;
				2461	}
				2462
				2463	/* If we've got an assembler version of mpn_divrem_1, then compare against
				2464	that, not the mpn_divrem_1_div generic C. */
				2465	if (HAVE_NATIVE_mpn_divrem_1)
				2466	{
				2467	divrem_1 = speed_mpn_divrem_1;
				2468	divrem_1_name = "mpn_divrem_1";
				2469	}
				2470	else
				2471	{
				2472	divrem_1 = speed_mpn_divrem_1_div;
				2473	divrem_1_name = "mpn_divrem_1_div";
				2474	}
				2475
				2476	param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
				2477	s.size = 200; /* generous but not too big */
				2478	/* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
				2479	since in general that's probably most common, though in fact for a
				2480	64-bit limb mp_bases[10].big_base is normalized. */
				2481	s.r = urandom() & (GMP_NUMB_MASK >> 4);
				2482	if (s.r == 0) s.r = 123;
				2483
				2484	t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
				2485	t2 = tuneup_measure (divrem_1, &param, &s);
				2486	if (t1 == -1.0 \|\| t2 == -1.0)
				2487	{
				2488	printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
				2489	divrem_1_name, (long) s.size);
				2490	abort ();
				2491	}
				2492	if (option_trace >= 1)
				2493	printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
				2494	(long) s.size, t1, divrem_1_name, t2);
				2495
				2496	print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
				2497	}
				2498
				2499
				2500
				2501	void
				2502	tune_divrem_2 (void)
				2503	{
				2504	static struct param_t param;
				2505
				2506	/* No support for tuning native assembler code, do that by hand and put
				2507	the results in the .asm file, and there's no need for such thresholds
				2508	to appear in gmp-mparam.h. */
				2509	if (HAVE_NATIVE_mpn_divrem_2)
				2510	return;
				2511
				2512	if (GMP_NAIL_BITS != 0)
				2513	{
				2514	print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
				2515	"no preinv with nails");
				2516	return;
				2517	}
				2518
				2519	if (UDIV_PREINV_ALWAYS)
				2520	{
				2521	print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
				2522	return;
				2523	}
				2524
				2525	/* Tune for the integer part of mpn_divrem_2. This will very possibly be
				2526	a bit out for the fractional part, but that's too bad, the integer part
				2527	is more important.
				2528
				2529	min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
				2530	code space if plain division is better only at size==2 or size==3. */
				2531	param.name = "DIVREM_2_THRESHOLD";
				2532	param.check_size = 256;
				2533	param.min_size = 4;
				2534	param.min_is_always = 1;
				2535	param.size_extra = 2; /* does qsize==nsize-2 divisions */
				2536	param.stop_factor = 2.0;
				2537
				2538	s.r = randlimb_norm ();
				2539	param.function = speed_mpn_divrem_2;
				2540	one (&divrem_2_threshold, &param);
				2541	}
				2542
				2543	void
				2544	tune_div_qr_2 (void)
				2545	{
				2546	static struct param_t param;
				2547	param.name = "DIV_QR_2_PI2_THRESHOLD";
				2548	param.function = speed_mpn_div_qr_2n;
				2549	param.check_size = 500;
				2550	param.min_size = 4;
				2551	one (&div_qr_2_pi2_threshold, &param);
				2552	}
				2553
				2554	/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
				2555	tune for that. Its speed can differ on odd or even divisor, so take an
				2556	average threshold for the two.
				2557
				2558	mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
				2559	might not vary that way, but don't test this since high<divisor isn't
				2560	expected to occur often with small divisors. */
				2561
				2562	void
				2563	tune_divexact_1 (void)
				2564	{
				2565	static struct param_t param;
				2566	mp_size_t thresh[2], average;
				2567	int low, i;
				2568
				2569	/* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
				2570	full mpn_divrem_1. */
				2571	if (HAVE_NATIVE_mpn_divexact_1)
				2572	{
				2573	print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
				2574	return;
				2575	}
				2576
				2577	ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
				2578
				2579	param.name = "DIVEXACT_1_THRESHOLD";
				2580	param.data_high = DATA_HIGH_GE_R;
				2581	param.check_size = 256;
				2582	param.min_size = 2;
				2583	param.stop_factor = 1.5;
				2584	param.function = tuned_speed_mpn_divrem_1;
				2585	param.function2 = speed_mpn_divexact_1;
				2586	param.noprint = 1;
				2587
				2588	print_define_start (param.name);
				2589
				2590	for (low = 0; low <= 1; low++)
				2591	{
				2592	s.r = randlimb_half();
				2593	if (low == 0)
				2594	s.r \|= 1;
				2595	else
				2596	s.r &= ~CNST_LIMB(7);
				2597
				2598	one (&thresh[low], &param);
				2599	if (option_trace)
				2600	printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
				2601
				2602	if (thresh[low] == MP_SIZE_T_MAX)
				2603	{
				2604	average = MP_SIZE_T_MAX;
				2605	goto divexact_1_done;
				2606	}
				2607	}
				2608
				2609	if (option_trace)
				2610	{
				2611	printf ("average of:");
				2612	for (i = 0; i < numberof(thresh); i++)
				2613	printf (" %ld", (long) thresh[i]);
				2614	printf ("\n");
				2615	}
				2616
				2617	average = 0;
				2618	for (i = 0; i < numberof(thresh); i++)
				2619	average += thresh[i];
				2620	average /= numberof(thresh);
				2621
				2622	/* If divexact turns out to be better as early as 3 limbs, then use it
				2623	always, so as to reduce code size and conditional jumps. */
				2624	if (average <= 3)
				2625	average = 0;
				2626
				2627	divexact_1_done:
				2628	print_define_end (param.name, average);
				2629	}
				2630
				2631
				2632	/* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
				2633	same as mpn_mod_1, but this might not be true of an assembler
				2634	implementation. The threshold used is an average based on data where a
				2635	divide can be skipped and where it can't.
				2636
				2637	If modexact turns out to be better as early as 3 limbs, then use it
				2638	always, so as to reduce code size and conditional jumps. */
				2639
				2640	void
				2641	tune_modexact_1_odd (void)
				2642	{
				2643	static struct param_t param;
				2644	mp_size_t thresh_lt, thresh_ge, average;
				2645
				2646	#if 0
				2647	/* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
				2648	of a full mpn_mod_1. */
				2649	if (HAVE_NATIVE_mpn_modexact_1_odd)
				2650	{
				2651	print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
				2652	return;
				2653	}
				2654	#endif
				2655
				2656	param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
				2657	param.check_size = 256;
				2658	param.min_size = 2;
				2659	param.stop_factor = 1.5;
				2660	param.function = speed_mpn_modexact_1c_odd;
				2661	param.function2 = speed_mpn_mod_1_tune;
				2662	param.noprint = 1;
				2663	s.r = randlimb_half () \| 1;
				2664
				2665	print_define_start (param.name);
				2666
				2667	param.data_high = DATA_HIGH_LT_R;
				2668	one (&thresh_lt, &param);
				2669	if (option_trace)
				2670	printf ("lt thresh %ld\n", (long) thresh_lt);
				2671
				2672	average = thresh_lt;
				2673	if (thresh_lt != MP_SIZE_T_MAX)
				2674	{
				2675	param.data_high = DATA_HIGH_GE_R;
				2676	one (&thresh_ge, &param);
				2677	if (option_trace)
				2678	printf ("ge thresh %ld\n", (long) thresh_ge);
				2679
				2680	if (thresh_ge != MP_SIZE_T_MAX)
				2681	{
				2682	average = (thresh_ge + thresh_lt) / 2;
				2683	if (thresh_ge <= 3)
				2684	average = 0;
				2685	}
				2686	}
				2687
				2688	print_define_end (param.name, average);
				2689	}
				2690
				2691
				2692	void
				2693	tune_jacobi_base (void)
				2694	{
				2695	static struct param_t param;
				2696	speed_function_t f[4] =
				2697	{
				2698	speed_mpn_jacobi_base_1,
				2699	speed_mpn_jacobi_base_2,
				2700	speed_mpn_jacobi_base_3,
				2701	speed_mpn_jacobi_base_4,
				2702	};
				2703
				2704	s.size = GMP_LIMB_BITS * 3 / 4;
				2705
				2706	one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
				2707	}
				2708
				2709
				2710	void
				2711	tune_get_str (void)
				2712	{
				2713	/* Tune for decimal, it being most common. Some rough testing suggests
				2714	other bases are different, but not by very much. */
				2715	s.r = 10;
				2716	{
				2717	static struct param_t param;
				2718	GET_STR_PRECOMPUTE_THRESHOLD = 0;
				2719	param.name = "GET_STR_DC_THRESHOLD";
				2720	param.function = speed_mpn_get_str;
				2721	param.min_size = 4;
				2722	param.max_size = GET_STR_THRESHOLD_LIMIT;
				2723	one (&get_str_dc_threshold, &param);
				2724	}
				2725	{
				2726	static struct param_t param;
				2727	param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
				2728	param.function = speed_mpn_get_str;
				2729	param.min_size = GET_STR_DC_THRESHOLD;
				2730	param.max_size = GET_STR_THRESHOLD_LIMIT;
				2731	one (&get_str_precompute_threshold, &param);
				2732	}
				2733	}
				2734
				2735
				2736	double
				2737	speed_mpn_pre_set_str (struct speed_params *s)
				2738	{
				2739	unsigned char *str;
				2740	mp_ptr wp;
				2741	mp_size_t wn;
				2742	unsigned i;
				2743	int base;
				2744	double t;
				2745	mp_ptr powtab_mem, tp;
				2746	powers_t powtab[GMP_LIMB_BITS];
				2747	mp_size_t un;
				2748	int chars_per_limb;
				2749	TMP_DECL;
				2750
				2751	SPEED_RESTRICT_COND (s->size >= 1);
				2752
				2753	base = s->r == 0 ? 10 : s->r;
				2754	SPEED_RESTRICT_COND (base >= 2 && base <= 256);
				2755
				2756	TMP_MARK;
				2757
				2758	str = (unsigned char *) TMP_ALLOC (s->size);
				2759	for (i = 0; i < s->size; i++)
				2760	str[i] = s->xp[i] % base;
				2761
				2762	LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
				2763	SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
				2764
				2765	/* use this during development to check wn is big enough */
				2766	/*
				2767	ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
				2768	*/
				2769
				2770	speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
				2771	speed_operand_dst (s, wp, wn);
				2772	speed_cache_fill (s);
				2773
				2774	chars_per_limb = mp_bases[base].chars_per_limb;
				2775	un = s->size / chars_per_limb + 1;
				2776	powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
				2777	size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
				2778	powers_t *pt = powtab + n_pows;
				2779	tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
				2780
				2781	speed_starttime ();
				2782	i = s->reps;
				2783	do
				2784	{
				2785	mpn_pre_set_str (wp, str, s->size, pt, tp);
				2786	}
				2787	while (--i != 0);
				2788	t = speed_endtime ();
				2789
				2790	TMP_FREE;
				2791	return t;
				2792	}
				2793
				2794	void
				2795	tune_set_str (void)
				2796	{
				2797	s.r = 10; /* decimal */
				2798	{
				2799	static struct param_t param;
				2800	SET_STR_PRECOMPUTE_THRESHOLD = 0;
				2801	param.step_factor = 0.01;
				2802	param.name = "SET_STR_DC_THRESHOLD";
				2803	param.function = speed_mpn_pre_set_str;
				2804	param.min_size = 100;
				2805	param.max_size = 50000;
				2806	one (&set_str_dc_threshold, &param);
				2807	}
				2808	{
				2809	static struct param_t param;
				2810	param.step_factor = 0.02;
				2811	param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
				2812	param.function = speed_mpn_set_str;
				2813	param.min_size = SET_STR_DC_THRESHOLD;
				2814	param.max_size = 100000;
				2815	one (&set_str_precompute_threshold, &param);
				2816	}
				2817	}
				2818
				2819
				2820	void
				2821	tune_fft_mul (void)
				2822	{
				2823	static struct fft_param_t param;
				2824
				2825	if (option_fft_max_size == 0)
				2826	return;
				2827
				2828	param.table_name = "MUL_FFT_TABLE3";
				2829	param.threshold_name = "MUL_FFT_THRESHOLD";
				2830	param.p_threshold = &mul_fft_threshold;
				2831	param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
				2832	param.p_modf_threshold = &mul_fft_modf_threshold;
				2833	param.first_size = MUL_TOOM33_THRESHOLD / 2;
				2834	param.max_size = option_fft_max_size;
				2835	param.function = speed_mpn_fft_mul;
				2836	param.mul_modf_function = speed_mpn_mul_fft;
				2837	param.mul_function = speed_mpn_mul_n;
				2838	param.sqr = 0;
				2839	fft (&param);
				2840	}
				2841
				2842
				2843	void
				2844	tune_fft_sqr (void)
				2845	{
				2846	static struct fft_param_t param;
				2847
				2848	if (option_fft_max_size == 0)
				2849	return;
				2850
				2851	param.table_name = "SQR_FFT_TABLE3";
				2852	param.threshold_name = "SQR_FFT_THRESHOLD";
				2853	param.p_threshold = &sqr_fft_threshold;
				2854	param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
				2855	param.p_modf_threshold = &sqr_fft_modf_threshold;
				2856	param.first_size = SQR_TOOM3_THRESHOLD / 2;
				2857	param.max_size = option_fft_max_size;
				2858	param.function = speed_mpn_fft_sqr;
				2859	param.mul_modf_function = speed_mpn_mul_fft_sqr;
				2860	param.mul_function = speed_mpn_sqr;
				2861	param.sqr = 1;
				2862	fft (&param);
				2863	}
				2864
				2865	void
				2866	tune_fac_ui (void)
				2867	{
				2868	static struct param_t param;
				2869
				2870	param.function = speed_mpz_fac_ui_tune;
				2871
				2872	param.name = "FAC_DSC_THRESHOLD";
				2873	param.min_size = 70;
				2874	param.max_size = FAC_DSC_THRESHOLD_LIMIT;
				2875	one (&fac_dsc_threshold, &param);
				2876
				2877	param.name = "FAC_ODD_THRESHOLD";
				2878	param.min_size = 22;
				2879	param.stop_factor = 1.7;
				2880	param.min_is_always = 1;
				2881	one (&fac_odd_threshold, &param);
				2882	}
				2883
				2884	void
				2885	all (void)
				2886	{
				2887	time_t start_time, end_time;
				2888	TMP_DECL;
				2889
				2890	TMP_MARK;
				2891	SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
				2892	SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
				2893
				2894	mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
				2895	mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
				2896
				2897	fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
				2898
				2899	speed_time_init ();
				2900	fprintf (stderr, "Using: %s\n", speed_time_string);
				2901
				2902	fprintf (stderr, "speed_precision %d", speed_precision);
				2903	if (speed_unittime == 1.0)
				2904	fprintf (stderr, ", speed_unittime 1 cycle");
				2905	else
				2906	fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
				2907	if (speed_cycletime == 1.0 \|\| speed_cycletime == 0.0)
				2908	fprintf (stderr, ", CPU freq unknown\n");
				2909	else
				2910	fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
				2911
				2912	fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
				2913	DEFAULT_MAX_SIZE, (long) option_fft_max_size);
				2914	fprintf (stderr, "\n");
				2915
				2916	time (&start_time);
				2917	{
				2918	struct tm *tp;
				2919	tp = localtime (&start_time);
				2920	printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
				2921	tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
				2922
				2923	#ifdef __GNUC__
				2924	/* gcc sub-minor version doesn't seem to come through as a define */
				2925	printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
				2926	#define PRINTED_COMPILER
				2927	#endif
				2928	#if defined (__SUNPRO_C)
				2929	printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
				2930	#define PRINTED_COMPILER
				2931	#endif
				2932	#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
				2933	/* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
				2934	printf ("MIPSpro C %d.%d.%d */\n",
				2935	_COMPILER_VERSION / 100,
				2936	_COMPILER_VERSION / 10 % 10,
				2937	_COMPILER_VERSION % 10);
				2938	#define PRINTED_COMPILER
				2939	#endif
				2940	#if defined (__DECC) && defined (__DECC_VER)
				2941	printf ("DEC C %d */\n", __DECC_VER);
				2942	#define PRINTED_COMPILER
				2943	#endif
				2944	#if ! defined (PRINTED_COMPILER)
				2945	printf ("system compiler */\n");
				2946	#endif
				2947	}
				2948	printf ("\n");
				2949
				2950	tune_divrem_1 ();
				2951	tune_mod_1 ();
				2952	tune_preinv_divrem_1 ();
				2953	tune_div_qr_1 ();
				2954	#if 0
				2955	tune_divrem_2 ();
				2956	#endif
				2957	tune_div_qr_2 ();
				2958	tune_divexact_1 ();
				2959	tune_modexact_1_odd ();
				2960	printf("\n");
				2961
				2962	relspeed_div_1_vs_mul_1 ();
				2963	printf("\n");
				2964
				2965	tune_mul_n ();
				2966	printf("\n");
				2967
				2968	tune_mul ();
				2969	printf("\n");
				2970
				2971	tune_sqr ();
				2972	printf("\n");
				2973
				2974	tune_mulmid ();
				2975	printf("\n");
				2976
				2977	tune_mulmod_bnm1 ();
				2978	tune_sqrmod_bnm1 ();
				2979	printf("\n");
				2980
				2981	tune_fft_mul ();
				2982	printf("\n");
				2983
				2984	tune_fft_sqr ();
				2985	printf ("\n");
				2986
				2987	tune_mullo ();
				2988	tune_sqrlo ();
				2989	printf("\n");
				2990
				2991	tune_dc_div ();
				2992	tune_dc_bdiv ();
				2993
				2994	printf("\n");
				2995	tune_invertappr ();
				2996	tune_invert ();
				2997	printf("\n");
				2998
				2999	tune_binvert ();
				3000	tune_redc ();
				3001	printf("\n");
				3002
				3003	tune_mu_div ();
				3004	tune_mu_bdiv ();
				3005	printf("\n");
				3006
				3007	tune_powm_sec ();
				3008	printf("\n");
				3009
				3010	tune_get_str ();
				3011	tune_set_str ();
				3012	printf("\n");
				3013
				3014	tune_fac_ui ();
				3015	printf("\n");
				3016
				3017	tune_matrix22_mul ();
				3018	tune_hgcd2 ();
				3019	tune_hgcd ();
				3020	tune_hgcd_appr ();
				3021	tune_hgcd_reduce();
				3022	tune_gcd_dc ();
				3023	tune_gcdext_dc ();
				3024	tune_jacobi_base ();
				3025	printf("\n");
				3026
				3027	time (&end_time);
				3028	printf ("/* Tuneup completed successfully, took %ld seconds */\n",
				3029	(long) (end_time - start_time));
				3030
				3031	TMP_FREE;
				3032	}
				3033
				3034
				3035	int
				3036	main (int argc, char *argv[])
				3037	{
				3038	int opt;
				3039
				3040	/* Unbuffered so if output is redirected to a file it isn't lost if the
				3041	program is killed part way through. */
				3042	setbuf (stdout, NULL);
				3043	setbuf (stderr, NULL);
				3044
				3045	while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
				3046	{
				3047	switch (opt) {
				3048	case 'f':
				3049	if (optarg[0] == 't')
				3050	option_fft_trace = 2;
				3051	else
				3052	option_fft_max_size = atol (optarg);
				3053	break;
				3054	case 'o':
				3055	speed_option_set (optarg);
				3056	break;
				3057	case 'p':
				3058	speed_precision = atoi (optarg);
				3059	break;
				3060	case 't':
				3061	option_trace++;
				3062	break;
				3063	case '?':
				3064	exit(1);
				3065	}
				3066	}
				3067
				3068	all ();
				3069	exit (0);
				3070	}