Austin Schuh | dace2a6 | 2020-08-18 10:56:48 -0700 | [diff] [blame] | 1 | /* Time routines for speed measurements. |
| 2 | |
| 3 | Copyright 1999-2004, 2010-2012 Free Software Foundation, Inc. |
| 4 | |
| 5 | This file is part of the GNU MP Library. |
| 6 | |
| 7 | The GNU MP Library is free software; you can redistribute it and/or modify |
| 8 | it under the terms of either: |
| 9 | |
| 10 | * the GNU Lesser General Public License as published by the Free |
| 11 | Software Foundation; either version 3 of the License, or (at your |
| 12 | option) any later version. |
| 13 | |
| 14 | or |
| 15 | |
| 16 | * the GNU General Public License as published by the Free Software |
| 17 | Foundation; either version 2 of the License, or (at your option) any |
| 18 | later version. |
| 19 | |
| 20 | or both in parallel, as here. |
| 21 | |
| 22 | The GNU MP Library is distributed in the hope that it will be useful, but |
| 23 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 24 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 25 | for more details. |
| 26 | |
| 27 | You should have received copies of the GNU General Public License and the |
| 28 | GNU Lesser General Public License along with the GNU MP Library. If not, |
| 29 | see https://www.gnu.org/licenses/. */ |
| 30 | |
| 31 | |
| 32 | /* Usage: |
| 33 | |
| 34 | The code in this file implements the lowest level of time measuring, |
| 35 | simple one-time measuring of time between two points. |
| 36 | |
| 37 | void speed_starttime (void) |
| 38 | double speed_endtime (void) |
| 39 | Call speed_starttime to start measuring, and then call speed_endtime |
| 40 | when done. |
| 41 | |
| 42 | speed_endtime returns the time taken, in seconds. Or if the timebase |
| 43 | is in CPU cycles and the CPU frequency is unknown then speed_endtime |
| 44 | returns cycles. Applications can identify the cycles return by |
| 45 | checking for speed_cycletime (described below) equal to 1.0. |
| 46 | |
| 47 | If some sort of temporary glitch occurs then speed_endtime returns |
| 48 | 0.0. Currently this is for various cases where a negative time has |
| 49 | occurred. This unfortunately occurs with getrusage on some systems, |
| 50 | and with the hppa cycle counter on hpux. |
| 51 | |
| 52 | double speed_cycletime |
| 53 | The time in seconds for each CPU cycle. For example on a 100 MHz CPU |
| 54 | this would be 1.0e-8. |
| 55 | |
| 56 | If the CPU frequency is unknown, then speed_cycletime is either 0.0 |
| 57 | or 1.0. It's 0.0 when speed_endtime is returning seconds, or it's |
| 58 | 1.0 when speed_endtime is returning cycles. |
| 59 | |
| 60 | It may be noted that "speed_endtime() / speed_cycletime" gives a |
| 61 | measured time in cycles, irrespective of whether speed_endtime is |
| 62 | returning cycles or seconds. (Assuming cycles can be had, ie. it's |
| 63 | either cycles already or the cpu frequency is known. See also |
| 64 | speed_cycletime_need_cycles below.) |
| 65 | |
| 66 | double speed_unittime |
| 67 | The unit of time measurement accuracy for the timing method in use. |
| 68 | This is in seconds or cycles, as per speed_endtime. |
| 69 | |
| 70 | char speed_time_string[] |
| 71 | A null-terminated string describing the time method in use. |
| 72 | |
| 73 | void speed_time_init (void) |
| 74 | Initialize time measuring. speed_starttime() does this |
| 75 | automatically, so it's only needed if an application wants to inspect |
| 76 | the above global variables before making a measurement. |
| 77 | |
| 78 | int speed_precision |
| 79 | The intended accuracy of time measurements. speed_measure() in |
| 80 | common.c for instance runs target routines with enough repetitions so |
| 81 | it takes at least "speed_unittime * speed_precision" (this expression |
| 82 | works for both cycles or seconds from speed_endtime). |
| 83 | |
| 84 | A program can provide an option so the user can set speed_precision. |
| 85 | If speed_precision is zero when speed_time_init or speed_starttime |
| 86 | first run then it gets a default based on the measuring method |
| 87 | chosen. (More precision for higher accuracy methods.) |
| 88 | |
| 89 | void speed_cycletime_need_seconds (void) |
| 90 | Call this to demand that speed_endtime will return seconds, and not |
| 91 | cycles. If only cycles are available then an error is printed and |
| 92 | the program exits. |
| 93 | |
| 94 | void speed_cycletime_need_cycles (void) |
| 95 | Call this to demand that speed_cycletime is non-zero, so that |
| 96 | "speed_endtime() / speed_cycletime" will give times in cycles. |
| 97 | |
| 98 | |
| 99 | |
| 100 | Notes: |
| 101 | |
| 102 | Various combinations of cycle counter, read_real_time(), getrusage(), |
| 103 | gettimeofday() and times() can arise, according to which are available |
| 104 | and their precision. |
| 105 | |
| 106 | |
| 107 | Allowing speed_endtime() to return either seconds or cycles is only a |
| 108 | slight complication and makes it possible for the speed program to do |
| 109 | some sensible things without demanding the CPU frequency. If seconds are |
| 110 | being measured then it can always print seconds, and if cycles are being |
| 111 | measured then it can always print them without needing to know how long |
| 112 | they are. Also the tune program doesn't care at all what the units are. |
| 113 | |
| 114 | GMP_CPU_FREQUENCY can always be set when the automated methods in freq.c |
| 115 | fail. This will be needed if times in seconds are wanted but a cycle |
| 116 | counter is being used, or if times in cycles are wanted but getrusage or |
| 117 | another seconds based timer is in use. |
| 118 | |
| 119 | If the measuring method uses a cycle counter but supplements it with |
| 120 | getrusage or the like, then knowing the CPU frequency is mandatory since |
| 121 | the code compares values from the two. |
| 122 | |
| 123 | |
| 124 | Not done: |
| 125 | |
| 126 | Solaris gethrtime() seems no more than a slow way to access the Sparc V9 |
| 127 | cycle counter. gethrvtime() seems to be relevant only to light weight |
| 128 | processes, it doesn't for instance give nanosecond virtual time. So |
| 129 | neither of these are used. |
| 130 | |
| 131 | |
| 132 | Bugs: |
| 133 | |
| 134 | getrusage_microseconds_p is fundamentally flawed, getrusage and |
| 135 | gettimeofday can have resolutions other than clock ticks or microseconds, |
| 136 | for instance IRIX 5 has a tick of 10 ms but a getrusage of 1 ms. |
| 137 | |
| 138 | |
| 139 | Enhancements: |
| 140 | |
| 141 | The SGI hardware counter has 64 bits on some machines, which could be |
| 142 | used when available. But perhaps 32 bits is enough range, and then rely |
| 143 | on the getrusage supplement. |
| 144 | |
| 145 | Maybe getrusage (or times) should be used as a supplement for any |
| 146 | wall-clock measuring method. Currently a wall clock with a good range |
| 147 | (eg. a 64-bit cycle counter) is used without a supplement. |
| 148 | |
| 149 | On PowerPC the timebase registers could be used, but would have to do |
| 150 | something to find out the speed. On 6xx chips it's normally 1/4 bus |
| 151 | speed, on 4xx chips it's either that or an external clock. Measuring |
| 152 | against gettimeofday might be ok. */ |
| 153 | |
| 154 | |
| 155 | #include "config.h" |
| 156 | |
| 157 | #include <errno.h> |
| 158 | #include <setjmp.h> |
| 159 | #include <signal.h> |
| 160 | #include <stddef.h> |
| 161 | #include <stdio.h> |
| 162 | #include <string.h> |
| 163 | #include <stdlib.h> /* for getenv() */ |
| 164 | |
| 165 | #if HAVE_FCNTL_H |
| 166 | #include <fcntl.h> /* for open() */ |
| 167 | #endif |
| 168 | |
| 169 | #if HAVE_STDINT_H |
| 170 | #include <stdint.h> /* for uint64_t */ |
| 171 | #endif |
| 172 | |
| 173 | #if HAVE_UNISTD_H |
| 174 | #include <unistd.h> /* for sysconf() */ |
| 175 | #endif |
| 176 | |
| 177 | #include <sys/types.h> |
| 178 | |
| 179 | #if TIME_WITH_SYS_TIME |
| 180 | # include <sys/time.h> /* for struct timeval */ |
| 181 | # include <time.h> |
| 182 | #else |
| 183 | # if HAVE_SYS_TIME_H |
| 184 | # include <sys/time.h> |
| 185 | # else |
| 186 | # include <time.h> |
| 187 | # endif |
| 188 | #endif |
| 189 | |
| 190 | #if HAVE_SYS_MMAN_H |
| 191 | #include <sys/mman.h> /* for mmap() */ |
| 192 | #endif |
| 193 | |
| 194 | #if HAVE_SYS_RESOURCE_H |
| 195 | #include <sys/resource.h> /* for struct rusage */ |
| 196 | #endif |
| 197 | |
| 198 | #if HAVE_SYS_SYSSGI_H |
| 199 | #include <sys/syssgi.h> /* for syssgi() */ |
| 200 | #endif |
| 201 | |
| 202 | #if HAVE_SYS_SYSTEMCFG_H |
| 203 | #include <sys/systemcfg.h> /* for RTC_POWER on AIX */ |
| 204 | #endif |
| 205 | |
| 206 | #if HAVE_SYS_TIMES_H |
| 207 | #include <sys/times.h> /* for times() and struct tms */ |
| 208 | #endif |
| 209 | |
| 210 | #include "gmp-impl.h" |
| 211 | |
| 212 | #include "speed.h" |
| 213 | |
| 214 | |
| 215 | /* strerror is only used for some stuff on newish systems, no need to have a |
| 216 | proper replacement */ |
| 217 | #if ! HAVE_STRERROR |
| 218 | #define strerror(n) "<strerror not available>" |
| 219 | #endif |
| 220 | |
| 221 | |
/* Public state of the timing layer (see the usage notes at the top of this
   file): seconds per CPU cycle, unit of measuring accuracy, requested
   precision, and a description of the method in use.  */
double speed_cycletime = 0.0;
double speed_unittime;
int    speed_precision = 0;
char   speed_time_string[256];


/* Powers of two as doubles.  */
#define M_2POW32  4294967296.0
#define M_2POW64  (M_2POW32 * M_2POW32)

/* UINT_MAX+1 as a double, built from INT_MAX because "unsigned" to
   "double" conversion is broken in the SunOS 4 native cc.  */
#define M_2POWU   (((double) INT_MAX + 1.0) * 2.0)
| 234 | |
| 235 | |
| 236 | /* Conditionals for the time functions available are done with normal C |
| 237 | code, which is a lot easier than wildly nested preprocessor directives. |
| 238 | |
| 239 | The choice of what to use is partly made at run-time, according to |
| 240 | whether the cycle counter works and the measured accuracy of getrusage |
| 241 | and gettimeofday. |
| 242 | |
| 243 | A routine that's not available won't be getting called, but is an abort() |
| 244 | to be sure it isn't called mistakenly. |
| 245 | |
| 246 | It can be assumed that if a function exists then its data type will, but |
| 247 | if the function doesn't then the data type might or might not exist, so |
| 248 | the type can't be used unconditionally. The "struct_rusage" etc macros |
| 249 | provide dummies when the respective function doesn't exist. */ |
| 250 | |
| 251 | |
/* Cycle counter availability.  Without one, speed_cyclecounter() becomes
   an assertion failure so a mistaken call is caught.  */
#if ! HAVE_SPEED_CYCLECOUNTER
#define speed_cyclecounter(p)  ASSERT_FAIL (speed_cyclecounter not available)
static const int have_cycles = 0;
#else
static const int have_cycles = HAVE_SPEED_CYCLECOUNTER;
#endif
| 258 | |
/* The "stck" instruction gives ticks since 1 Jan 1900 00:00 GMT, each tick
   being 2^-12 microseconds.  The conditionals here are the same as in
   longlong.h.  */
#if defined (__GNUC__) && ! defined (NO_ASM)                            \
  && (defined (__i370__) || defined (__s390__) || defined (__mvs__))
/* gcc for s390 is quite new, always has uint64_t */
typedef uint64_t stck_t;
#define STCK(timestamp)                         \
  do {                                          \
    asm ("stck %0" : "=Q" (timestamp));         \
  } while (0)
static const int use_stck = 1;   /* always use when available */
static const int have_stck = 1;
#else
typedef unsigned long stck_t;    /* dummy */
#define STCK(timestamp)  ASSERT_FAIL (stck instruction not available)
static const int use_stck = 0;
static const int have_stck = 0;
#endif

/* Length of one stck tick in seconds, ie. 2^-12 microseconds.  */
#define STCK_PERIOD  (1.0 / 4096e6)
| 277 | |
/* PowerPC time base, read with mftb/mftbu.

   MFTB(a) stores the low word in a[0] and the high word in a[1].  The
   upper half is read before and after the lower half, and the read is
   repeated until both upper values agree, so a carry between the two
   halves cannot produce an inconsistent pair.

   Enhancement: On 64-bit chips mftb gives a 64-bit value, no need for
   mftbu and a loop (see powerpc64.asm).  */
#if ! HAVE_HOST_CPU_FAMILY_powerpc

static const int have_mftb = 0;
#define MFTB(a)                                 \
  do {                                          \
    a[0] = 0;                                   \
    a[1] = 0;                                   \
    ASSERT_FAIL (mftb not available);           \
  } while (0)

#else /* powerpc */

static const int have_mftb = 1;
#if ! (defined (__GNUC__) && ! defined (NO_ASM))
#define MFTB(a)  mftb_function (a)
#else
#define MFTB(a)                                         \
  do {                                                  \
    unsigned  mftb_hi_before_, mftb_lo_, mftb_hi_after_; \
    do {                                                \
      asm volatile ("mftbu %0\n"                        \
                    "mftb  %1\n"                        \
                    "mftbu %2"                          \
                    : "=r" (mftb_hi_before_),           \
                      "=r" (mftb_lo_),                  \
                      "=r" (mftb_hi_after_));           \
    } while (mftb_hi_before_ != mftb_hi_after_);        \
    a[0] = mftb_lo_;                                    \
    a[1] = mftb_hi_before_;                             \
  } while (0)
#endif

#endif
| 310 | |
/* The SGI counter needs both syssgi() and mmap().  Unicos 10.X has the
   former but not the latter.  */
#if ! (HAVE_SYSSGI && HAVE_MMAP)
static const int have_sgi = 0;
#else
static const int have_sgi = 1;
#endif
| 317 | |
/* read_real_time() availability.  When missing, the functions become
   assertion failures and a stand-in timebasestruct_t with the fields used
   by this file is supplied.  */
#if ! HAVE_READ_REAL_TIME
static const int have_rrt = 0;
struct timebasestruct_dummy {
  int           flag;
  unsigned int  tb_high;
  unsigned int  tb_low;
};
#define timebasestruct_t        struct timebasestruct_dummy
#define RTC_POWER               1
#define RTC_POWER_PC            2
#define read_real_time(t,s)     ASSERT_FAIL (read_real_time not available)
#define time_base_to_time(t,s)  ASSERT_FAIL (time_base_to_time not available)
#else
static const int have_rrt = 1;
#endif
| 333 | |
/* clock_gettime() availability.  When missing, the calls become assertion
   failures yielding -1, and struct_timespec maps to a dummy type.  */
#if ! HAVE_CLOCK_GETTIME
#define clock_getres(id,ts)   (ASSERT_FAIL (clock_getres not available), -1)
#define clock_gettime(id,ts)  (ASSERT_FAIL (clock_gettime not available), -1)
#define struct_timespec       struct timespec_dummy
static const int have_cgt = 0;
#else
#define struct_timespec       struct timespec
static const int have_cgt = 1;
#endif
| 343 | |
/* getrusage() availability.  When missing, the call becomes an assertion
   failure and struct_rusage maps to a dummy type.  */
#if ! HAVE_GETRUSAGE
#define struct_rusage    struct rusage_dummy
#define getrusage(n,ru)  ASSERT_FAIL (getrusage not available)
static const int have_grus = 0;
#else
#define struct_rusage    struct rusage
static const int have_grus = 1;
#endif
| 352 | |
/* gettimeofday() availability.  When missing, the call becomes an
   assertion failure and struct_timeval maps to a dummy type.  */
#if ! HAVE_GETTIMEOFDAY
#define struct_timeval       struct timeval_dummy
#define gettimeofday(tv,tz)  ASSERT_FAIL (gettimeofday not available)
static const int have_gtod = 0;
#else
#define struct_timeval       struct timeval
static const int have_gtod = 1;
#endif
| 361 | |
/* times() availability.  When missing, the call becomes an assertion
   failure and struct_tms maps to a dummy type.  */
#if ! HAVE_TIMES
#define struct_tms  struct tms_dummy
#define times(tms)  ASSERT_FAIL (times not available)
static const int have_times = 0;
#else
#define struct_tms  struct tms
static const int have_times = 1;
#endif
| 370 | |
| 371 | struct tms_dummy { |
| 372 | long tms_utime; |
| 373 | }; |
| 374 | struct timeval_dummy { |
| 375 | long tv_sec; |
| 376 | long tv_usec; |
| 377 | }; |
| 378 | struct rusage_dummy { |
| 379 | struct_timeval ru_utime; |
| 380 | }; |
| 381 | struct timespec_dummy { |
| 382 | long tv_sec; |
| 383 | long tv_nsec; |
| 384 | }; |
| 385 | |
| 386 | static int use_cycles; |
| 387 | static int use_mftb; |
| 388 | static int use_sgi; |
| 389 | static int use_rrt; |
| 390 | static int use_cgt; |
| 391 | static int use_gtod; |
| 392 | static int use_grus; |
| 393 | static int use_times; |
| 394 | static int use_tick_boundary; |
| 395 | |
| 396 | static unsigned start_cycles[2]; |
| 397 | static stck_t start_stck; |
| 398 | static unsigned start_mftb[2]; |
| 399 | static unsigned start_sgi; |
| 400 | static timebasestruct_t start_rrt; |
| 401 | static struct_timespec start_cgt; |
| 402 | static struct_rusage start_grus; |
| 403 | static struct_timeval start_gtod; |
| 404 | static struct_tms start_times; |
| 405 | |
| 406 | static double cycles_limit = 1e100; |
| 407 | static double mftb_unittime; |
| 408 | static double sgi_unittime; |
| 409 | static double cgt_unittime; |
| 410 | static double grus_unittime; |
| 411 | static double gtod_unittime; |
| 412 | static double times_unittime; |
| 413 | |
/* Convert an RTC_POWER format time base (tb_high in seconds, tb_low in
   nanoseconds) to seconds as a double.  T is a pointer to the struct.  */
#define TIMEBASESTRUCT_SECS(t)  ((t)->tb_low * 1e-9 + (t)->tb_high)
| 416 | |
| 417 | |
/* Return a string representing a time in seconds, nicely formatted.
   Eg. "10.25ms".

   T is scaled to ns, us, ms or s and printed with about 4 significant
   figures.  The formatting is bounded by the buffer size; a value too
   large to fit in fixed notation is printed in exponent notation instead
   of overrunning the buffer.

   The result is in a static buffer which is overwritten by each call.  */
char *
unittime_string (double t)
{
  static char buf[128];

  const char *unit;
  int prec, len;

  /* choose units and scale */
  if (t < 1e-6)
    {
      t *= 1e9;
      unit = "ns";
    }
  else if (t < 1e-3)
    {
      t *= 1e6;
      unit = "us";
    }
  else if (t < 1.0)
    {
      t *= 1e3;
      unit = "ms";
    }
  else
    unit = "s";

  /* want 4 significant figures */
  if (t < 1.0)
    prec = 4;
  else if (t < 10.0)
    prec = 3;
  else if (t < 100.0)
    prec = 2;
  else
    prec = 1;

  len = snprintf (buf, sizeof (buf), "%.*f%s", prec, t, unit);
  if (len < 0 || (size_t) len >= sizeof (buf))
    {
      /* fixed notation doesn't fit (huge t), use a bounded width form */
      snprintf (buf, sizeof (buf), "%.3e%s", t, unit);
    }
  return buf;
}
| 451 | |
| 452 | |
| 453 | static jmp_buf cycles_works_buf; |
| 454 | |
| 455 | static RETSIGTYPE |
| 456 | cycles_works_handler (int sig) |
| 457 | { |
| 458 | longjmp (cycles_works_buf, 1); |
| 459 | } |
| 460 | |
| 461 | int |
| 462 | cycles_works_p (void) |
| 463 | { |
| 464 | static int result = -1; |
| 465 | |
| 466 | if (result != -1) |
| 467 | goto done; |
| 468 | |
| 469 | /* FIXME: On linux, the cycle counter is not saved and restored over |
| 470 | * context switches, making it almost useless for precise cputime |
| 471 | * measurements. When available, it's better to use clock_gettime, |
| 472 | * which seems to have reasonable accuracy (tested on x86_32, |
| 473 | * linux-2.6.26, glibc-2.7). However, there are also some linux |
| 474 | * systems where clock_gettime is broken in one way or the other, |
| 475 | * like CLOCK_PROCESS_CPUTIME_ID not implemented (easy case) or |
| 476 | * kind-of implemented but broken (needs code to detect that), and |
| 477 | * on those systems a wall-clock cycle counter is the least bad |
| 478 | * fallback. |
| 479 | * |
| 480 | * So we need some code to disable the cycle counter on some but not |
| 481 | * all linux systems. */ |
| 482 | #ifdef SIGILL |
| 483 | { |
| 484 | RETSIGTYPE (*old_handler) (int); |
| 485 | unsigned cycles[2]; |
| 486 | |
| 487 | old_handler = signal (SIGILL, cycles_works_handler); |
| 488 | if (old_handler == SIG_ERR) |
| 489 | { |
| 490 | if (speed_option_verbose) |
| 491 | printf ("cycles_works_p(): SIGILL not supported, assuming speed_cyclecounter() works\n"); |
| 492 | goto yes; |
| 493 | } |
| 494 | if (setjmp (cycles_works_buf)) |
| 495 | { |
| 496 | if (speed_option_verbose) |
| 497 | printf ("cycles_works_p(): SIGILL during speed_cyclecounter(), so doesn't work\n"); |
| 498 | result = 0; |
| 499 | goto done; |
| 500 | } |
| 501 | speed_cyclecounter (cycles); |
| 502 | signal (SIGILL, old_handler); |
| 503 | if (speed_option_verbose) |
| 504 | printf ("cycles_works_p(): speed_cyclecounter() works\n"); |
| 505 | } |
| 506 | #else |
| 507 | |
| 508 | if (speed_option_verbose) |
| 509 | printf ("cycles_works_p(): SIGILL not defined, assuming speed_cyclecounter() works\n"); |
| 510 | goto yes; |
| 511 | #endif |
| 512 | |
| 513 | yes: |
| 514 | result = 1; |
| 515 | |
| 516 | done: |
| 517 | return result; |
| 518 | } |
| 519 | |
| 520 | |
/* Return the number of clock ticks per second.  sysconf(_SC_CLK_TCK) is
   preferred where available, with CLK_TCK as the fallback.  The value is
   found once and remembered.  If neither source is usable the program
   aborts.  */
long
clk_tck (void)
{
  static long ticks = -1L;

  if (ticks == -1L)
    {
#if HAVE_SYSCONF
      ticks = sysconf (_SC_CLK_TCK);
      if (ticks != -1L)
        {
          if (speed_option_verbose)
            printf ("sysconf(_SC_CLK_TCK) is %ld per second\n", ticks);
          return ticks;
        }

      fprintf (stderr,
               "sysconf(_SC_CLK_TCK) not working, using CLK_TCK instead\n");
#endif

#ifdef CLK_TCK
      ticks = CLK_TCK;
      if (speed_option_verbose)
        printf ("CLK_TCK is %ld per second\n", ticks);
#else
      fprintf (stderr, "CLK_TCK not defined, cannot continue\n");
      abort ();
#endif
    }

  return ticks;
}
| 553 | |
| 554 | |
| 555 | /* If two times can be observed less than half a clock tick apart, then |
| 556 | assume "get" is microsecond accurate. |
| 557 | |
| 558 | Two times only 1 microsecond apart are not believed, since some kernels |
| 559 | take it upon themselves to ensure gettimeofday doesn't return the same |
| 560 | value twice, for the benefit of applications using it for a timestamp. |
| 561 | This is obviously very stupid given the speed of CPUs these days. |
| 562 | |
| 563 | Making "reps" many calls to noop_1() is designed to waste some CPU, with |
| 564 | a view to getting measurements 2 microseconds (or more) apart. "reps" is |
| 565 | increased progressively until such a period is seen. |
| 566 | |
| 567 | The outer loop "attempts" are just to allow for any random nonsense or |
| 568 | system load upsetting the measurements (ie. making two successive calls |
| 569 | to "get" come out as a longer interval than normal). |
| 570 | |
| 571 | Bugs: |
| 572 | |
| 573 | The assumption that any interval less than a half tick implies |
| 574 | microsecond resolution is obviously fairly rash, the true resolution |
| 575 | could be anything between a microsecond and that half tick. Perhaps |
| 576 | something special would have to be done on a system where this is the |
| 577 | case, since there's no obvious reliable way to detect it |
| 578 | automatically. */ |
| 579 | |
/* Function body deciding whether GET has microsecond resolution.

   NAME is a string for messages, TYPE the time structure filled in by
   GET, and SEC and USEC extract seconds and microseconds from it.  The
   expansion contains "return" statements, so it must be the entire body of
   a function returning int.  The answer is remembered in a static.

   Up to 5 attempts are made.  In each, the amount of busy work between two
   GET calls is doubled until they come out at least 2 microseconds apart;
   if that interval is under half a clock tick the answer is 1.  */
#define MICROSECONDS_P(name, type, get, sec, usec)                      \
  {                                                                     \
    static int  cached = -1;                                            \
    type        t_start, t_end;                                         \
    long        elapsed_us, half_tick_us;                               \
    unsigned    attempt, reps, outer, inner;                            \
                                                                        \
    if (cached != -1)                                                   \
      return cached;                                                    \
                                                                        \
    cached = 0;                                                         \
    half_tick_us = (1000000L / clk_tck ()) / 2;                         \
                                                                        \
    for (attempt = 0; attempt < 5; attempt++)                           \
      {                                                                 \
        reps = 0;                                                       \
        do                                                              \
          {                                                             \
            get (t_start);                                              \
            for (outer = 0; outer < reps; outer++)                      \
              for (inner = 0; inner < 100; inner++)                     \
                noop_1 (CNST_LIMB(0));                                  \
            get (t_end);                                                \
                                                                        \
            elapsed_us = (sec(t_end)-sec(t_start))*1000000L             \
              + usec(t_end)-usec(t_start);                              \
                                                                        \
            if (speed_option_verbose >= 2)                              \
              printf ("%s attempt=%u, reps=%u, dt=%ld\n",               \
                      name, attempt, reps, elapsed_us);                 \
                                                                        \
            if (elapsed_us >= 2)                                        \
              break;                                                    \
                                                                        \
            if (reps == 0)                                              \
              reps = 1;                                                 \
            else                                                        \
              reps *= 2;                                                \
          }                                                             \
        while (reps != 0);  /* zero is uint overflow, not normal */     \
                                                                        \
        if (elapsed_us < half_tick_us)                                  \
          {                                                             \
            cached = 1;                                                 \
            break;                                                      \
          }                                                             \
      }                                                                 \
                                                                        \
    if (speed_option_verbose)                                           \
      {                                                                 \
        if (! cached)                                                   \
          printf ("%s is only %s clock tick accurate\n",                \
                  name, unittime_string (1.0/clk_tck()));               \
        else                                                            \
          printf ("%s is microsecond accurate\n", name);                \
      }                                                                 \
    return cached;                                                      \
  }
| 635 | |
| 636 | |
| 637 | int |
| 638 | gettimeofday_microseconds_p (void) |
| 639 | { |
| 640 | #define call_gettimeofday(t) gettimeofday (&(t), NULL) |
| 641 | #define timeval_tv_sec(t) ((t).tv_sec) |
| 642 | #define timeval_tv_usec(t) ((t).tv_usec) |
| 643 | MICROSECONDS_P ("gettimeofday", struct_timeval, |
| 644 | call_gettimeofday, timeval_tv_sec, timeval_tv_usec); |
| 645 | } |
| 646 | |
| 647 | int |
| 648 | getrusage_microseconds_p (void) |
| 649 | { |
| 650 | #define call_getrusage(t) getrusage (0, &(t)) |
| 651 | #define rusage_tv_sec(t) ((t).ru_utime.tv_sec) |
| 652 | #define rusage_tv_usec(t) ((t).ru_utime.tv_usec) |
| 653 | MICROSECONDS_P ("getrusage", struct_rusage, |
| 654 | call_getrusage, rusage_tv_sec, rusage_tv_usec); |
| 655 | } |
| 656 | |
| 657 | /* Test whether getrusage goes backwards, return non-zero if it does |
| 658 | (suggesting it's flawed). |
| 659 | |
| 660 | On a macintosh m68040-unknown-netbsd1.4.1 getrusage looks like it's |
| 661 | microsecond accurate, but has been seen remaining unchanged after many |
| 662 | microseconds have elapsed. It also regularly goes backwards by 1000 to |
| 663 | 5000 usecs, this has been seen after between 500 and 4000 attempts taking |
| 664 | perhaps 0.03 seconds. We consider this too broken for good measuring. |
| 665 | We used to have configure pretend getrusage didn't exist on this system, |
| 666 | but a runtime test should be more reliable, since we imagine the problem |
| 667 | is not confined to just this exact system tuple. */ |
| 668 | |
| 669 | int |
| 670 | getrusage_backwards_p (void) |
| 671 | { |
| 672 | static int result = -1; |
| 673 | struct rusage start, prev, next; |
| 674 | long d; |
| 675 | int i; |
| 676 | |
| 677 | if (result != -1) |
| 678 | return result; |
| 679 | |
| 680 | getrusage (0, &start); |
| 681 | memcpy (&next, &start, sizeof (next)); |
| 682 | |
| 683 | result = 0; |
| 684 | i = 0; |
| 685 | for (;;) |
| 686 | { |
| 687 | memcpy (&prev, &next, sizeof (prev)); |
| 688 | getrusage (0, &next); |
| 689 | |
| 690 | if (next.ru_utime.tv_sec < prev.ru_utime.tv_sec |
| 691 | || (next.ru_utime.tv_sec == prev.ru_utime.tv_sec |
| 692 | && next.ru_utime.tv_usec < prev.ru_utime.tv_usec)) |
| 693 | { |
| 694 | if (speed_option_verbose) |
| 695 | printf ("getrusage went backwards (attempt %d: %ld.%06ld -> %ld.%06ld)\n", |
| 696 | i, |
| 697 | (long) prev.ru_utime.tv_sec, (long) prev.ru_utime.tv_usec, |
| 698 | (long) next.ru_utime.tv_sec, (long) next.ru_utime.tv_usec); |
| 699 | result = 1; |
| 700 | break; |
| 701 | } |
| 702 | |
| 703 | /* minimum 1000 attempts, then stop after either 0.1 seconds or 50000 |
| 704 | attempts, whichever comes first */ |
| 705 | d = 1000000 * (next.ru_utime.tv_sec - start.ru_utime.tv_sec) |
| 706 | + (next.ru_utime.tv_usec - start.ru_utime.tv_usec); |
| 707 | i++; |
| 708 | if (i > 50000 || (i > 1000 && d > 100000)) |
| 709 | break; |
| 710 | } |
| 711 | |
| 712 | return result; |
| 713 | } |
| 714 | |
| 715 | /* CLOCK_PROCESS_CPUTIME_ID looks like it's going to be in a future version |
| 716 | of glibc (some time post 2.2). |
| 717 | |
| 718 | CLOCK_VIRTUAL is process time, available in BSD systems (though sometimes |
| 719 | defined, but returning -1 for an error). */ |
| 720 | |
/* Clock ID for clock_gettime: CLOCK_PROCESS_CPUTIME_ID when defined,
   otherwise CLOCK_VIRTUAL.  With neither, have_cgt_id is 0 and CGT_ID is
   an assertion failure.  */
#if defined (CLOCK_PROCESS_CPUTIME_ID)
# define CGT_ID  CLOCK_PROCESS_CPUTIME_ID
#elif defined (CLOCK_VIRTUAL)
# define CGT_ID  CLOCK_VIRTUAL
#endif

#ifndef CGT_ID
const int have_cgt_id = 0;
# define CGT_ID  (ASSERT_FAIL (CGT_ID not determined), -1)
#else
const int have_cgt_id = 1;
#endif

/* Iterations of the delay loop cgt_works_p uses to check a claimed
   resolution.  */
#define CGT_DELAY_COUNT 1000
| 736 | |
| 737 | int |
| 738 | cgt_works_p (void) |
| 739 | { |
| 740 | static int result = -1; |
| 741 | struct_timespec unit; |
| 742 | |
| 743 | if (! have_cgt) |
| 744 | return 0; |
| 745 | |
| 746 | if (! have_cgt_id) |
| 747 | { |
| 748 | if (speed_option_verbose) |
| 749 | printf ("clock_gettime don't know what ID to use\n"); |
| 750 | result = 0; |
| 751 | return result; |
| 752 | } |
| 753 | |
| 754 | if (result != -1) |
| 755 | return result; |
| 756 | |
| 757 | /* trial run to see if it works */ |
| 758 | if (clock_gettime (CGT_ID, &unit) != 0) |
| 759 | { |
| 760 | if (speed_option_verbose) |
| 761 | printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno)); |
| 762 | result = 0; |
| 763 | return result; |
| 764 | } |
| 765 | |
| 766 | /* get the resolution */ |
| 767 | if (clock_getres (CGT_ID, &unit) != 0) |
| 768 | { |
| 769 | if (speed_option_verbose) |
| 770 | printf ("clock_getres id=%d error: %s\n", CGT_ID, strerror (errno)); |
| 771 | result = 0; |
| 772 | return result; |
| 773 | } |
| 774 | |
| 775 | cgt_unittime = unit.tv_sec + unit.tv_nsec * 1e-9; |
| 776 | if (speed_option_verbose) |
| 777 | printf ("clock_gettime is %s accurate\n", unittime_string (cgt_unittime)); |
| 778 | |
| 779 | if (cgt_unittime < 10e-9) |
| 780 | { |
| 781 | /* Do we believe this? */ |
| 782 | struct timespec start, end; |
| 783 | static volatile int counter; |
| 784 | double duration; |
| 785 | if (clock_gettime (CGT_ID, &start)) |
| 786 | { |
| 787 | if (speed_option_verbose) |
| 788 | printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno)); |
| 789 | result = 0; |
| 790 | return result; |
| 791 | } |
| 792 | /* Loop of at least 1000 memory accesses, ought to take at |
| 793 | least 100 ns*/ |
| 794 | for (counter = 0; counter < CGT_DELAY_COUNT; counter++) |
| 795 | ; |
| 796 | if (clock_gettime (CGT_ID, &end)) |
| 797 | { |
| 798 | if (speed_option_verbose) |
| 799 | printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno)); |
| 800 | result = 0; |
| 801 | return result; |
| 802 | } |
| 803 | duration = (end.tv_sec + end.tv_nsec * 1e-9 |
| 804 | - start.tv_sec - start.tv_nsec * 1e-9); |
| 805 | if (speed_option_verbose) |
| 806 | printf ("delay loop of %d rounds took %s (according to clock_gettime)\n", |
| 807 | CGT_DELAY_COUNT, unittime_string (duration)); |
| 808 | if (duration < 100e-9) |
| 809 | { |
| 810 | if (speed_option_verbose) |
| 811 | printf ("clock_gettime id=%d not believable\n", CGT_ID); |
| 812 | result = 0; |
| 813 | return result; |
| 814 | } |
| 815 | } |
| 816 | result = 1; |
| 817 | return result; |
| 818 | } |
| 819 | |
| 820 | |
| 821 | static double |
| 822 | freq_measure_mftb_one (void) |
| 823 | { |
| 824 | #define call_gettimeofday(t) gettimeofday (&(t), NULL) |
| 825 | #define timeval_tv_sec(t) ((t).tv_sec) |
| 826 | #define timeval_tv_usec(t) ((t).tv_usec) |
| 827 | FREQ_MEASURE_ONE ("mftb", struct_timeval, |
| 828 | call_gettimeofday, MFTB, |
| 829 | timeval_tv_sec, timeval_tv_usec); |
| 830 | } |
| 831 | |
| 832 | |
| 833 | static jmp_buf mftb_works_buf; |
| 834 | |
| 835 | static RETSIGTYPE |
| 836 | mftb_works_handler (int sig) |
| 837 | { |
| 838 | longjmp (mftb_works_buf, 1); |
| 839 | } |
| 840 | |
| 841 | int |
| 842 | mftb_works_p (void) |
| 843 | { |
| 844 | unsigned a[2]; |
| 845 | RETSIGTYPE (*old_handler) (int); |
| 846 | double cycletime; |
| 847 | |
| 848 | /* suppress a warning about a[] unused */ |
| 849 | a[0] = 0; |
| 850 | |
| 851 | if (! have_mftb) |
| 852 | return 0; |
| 853 | |
| 854 | #ifdef SIGILL |
| 855 | old_handler = signal (SIGILL, mftb_works_handler); |
| 856 | if (old_handler == SIG_ERR) |
| 857 | { |
| 858 | if (speed_option_verbose) |
| 859 | printf ("mftb_works_p(): SIGILL not supported, assuming mftb works\n"); |
| 860 | return 1; |
| 861 | } |
| 862 | if (setjmp (mftb_works_buf)) |
| 863 | { |
| 864 | if (speed_option_verbose) |
| 865 | printf ("mftb_works_p(): SIGILL during mftb, so doesn't work\n"); |
| 866 | return 0; |
| 867 | } |
| 868 | MFTB (a); |
| 869 | signal (SIGILL, old_handler); |
| 870 | if (speed_option_verbose) |
| 871 | printf ("mftb_works_p(): mftb works\n"); |
| 872 | #else |
| 873 | |
| 874 | if (speed_option_verbose) |
| 875 | printf ("mftb_works_p(): SIGILL not defined, assuming mftb works\n"); |
| 876 | #endif |
| 877 | |
| 878 | #if ! HAVE_GETTIMEOFDAY |
| 879 | if (speed_option_verbose) |
| 880 | printf ("mftb_works_p(): no gettimeofday available to measure mftb\n"); |
| 881 | return 0; |
| 882 | #endif |
| 883 | |
| 884 | /* The time base is normally 1/4 of the bus speed on 6xx and 7xx chips, on |
| 885 | other chips it can be driven from an external clock. */ |
| 886 | cycletime = freq_measure ("mftb", freq_measure_mftb_one); |
| 887 | if (cycletime == -1.0) |
| 888 | { |
| 889 | if (speed_option_verbose) |
| 890 | printf ("mftb_works_p(): cannot measure mftb period\n"); |
| 891 | return 0; |
| 892 | } |
| 893 | |
| 894 | mftb_unittime = cycletime; |
| 895 | return 1; |
| 896 | } |
| 897 | |
| 898 | |
/* Location of the least significant 32 bits of the memory mapped SGI cycle
   counter.  Set by sgi_works_p() when it succeeds.  */
volatile unsigned  *sgi_addr;

/* Set up access to the syssgi() memory mapped cycle counter.

   Returns 1 if the counter is available, with sgi_addr pointing at its low
   32 bits and sgi_unittime holding the counter period in seconds.  Returns
   0 otherwise, including always when HAVE_SYSSGI and HAVE_MMAP are not both
   set.  The answer is remembered, so the work is done only on the first
   call.

   The /dev/mmem descriptor is only needed to establish the mapping and is
   closed again whether or not mmap succeeds.  */
int
sgi_works_p (void)
{
#if HAVE_SYSSGI && HAVE_MMAP
  static int  result = -1;

  size_t          pagesize, offset;
  __psunsigned_t  phys, physpage;
  void            *virtpage;
  unsigned        period_picoseconds;
  int             size, fd;

  if (result != -1)
    return result;

  phys = syssgi (SGI_QUERY_CYCLECNTR, &period_picoseconds);
  if (phys == (__psunsigned_t) -1)
    {
      /* ENODEV is the error when a counter is not available */
      if (speed_option_verbose)
        printf ("syssgi SGI_QUERY_CYCLECNTR error: %s\n", strerror (errno));
      result = 0;
      return result;
    }
  sgi_unittime = period_picoseconds * 1e-12;

  /* IRIX 5 doesn't have SGI_CYCLECNTR_SIZE, assume 32 bits in that case.
     Challenge/ONYX hardware has a 64 bit counter, but there seems no
     obvious way to identify that without SGI_CYCLECNTR_SIZE.  */
#ifdef SGI_CYCLECNTR_SIZE
  size = syssgi (SGI_CYCLECNTR_SIZE);
  if (size == -1)
    {
      if (speed_option_verbose)
        {
          printf ("syssgi SGI_CYCLECNTR_SIZE error: %s\n", strerror (errno));
          /* size is counted in bits here, matching the check below */
          printf (" will assume size==32\n");
        }
      size = 32;
    }
#else
  size = 32;
#endif

  if (size < 32)
    {
      printf ("syssgi SGI_CYCLECNTR_SIZE gives %d, expected 32 or 64\n", size);
      result = 0;
      return result;
    }

  pagesize = getpagesize();
  offset = (size_t) phys & (pagesize-1);
  physpage = phys - offset;

  /* shouldn't cross over a page boundary */
  ASSERT_ALWAYS (offset + size/8 <= pagesize);

  fd = open("/dev/mmem", O_RDONLY);
  if (fd == -1)
    {
      if (speed_option_verbose)
        printf ("open /dev/mmem: %s\n", strerror (errno));
      result = 0;
      return result;
    }

  virtpage = mmap (0, pagesize, PROT_READ, MAP_PRIVATE, fd, (off_t) physpage);
  if (virtpage == (void *) -1)
    {
      /* report before close() gets a chance to disturb errno */
      if (speed_option_verbose)
        printf ("mmap /dev/mmem: %s\n", strerror (errno));
      close (fd);
      result = 0;
      return result;
    }

  /* the mapping stays valid without the descriptor */
  close (fd);

  /* address of least significant 4 bytes, knowing mips is big endian */
  sgi_addr = (unsigned *) ((char *) virtpage + offset
                           + size/8 - sizeof(unsigned));
  result = 1;
  return result;

#else /* ! (HAVE_SYSSGI && HAVE_MMAP) */
  return 0;
#endif
}
| 987 | |
| 988 | |
/* Store N in VAR, but only when VAR is currently zero, ie. when nothing
   has given it a value yet.  */
#define DEFAULT(var,n)          \
  do {                          \
    if ((var) == 0)             \
      (var) = (n);              \
  } while (0)
| 994 | |
| 995 | void |
| 996 | speed_time_init (void) |
| 997 | { |
| 998 | double supplement_unittime = 0.0; |
| 999 | |
| 1000 | static int speed_time_initialized = 0; |
| 1001 | if (speed_time_initialized) |
| 1002 | return; |
| 1003 | speed_time_initialized = 1; |
| 1004 | |
| 1005 | speed_cycletime_init (); |
| 1006 | |
| 1007 | if (!speed_option_cycles_broken && have_cycles && cycles_works_p ()) |
| 1008 | { |
| 1009 | use_cycles = 1; |
| 1010 | DEFAULT (speed_cycletime, 1.0); |
| 1011 | speed_unittime = speed_cycletime; |
| 1012 | DEFAULT (speed_precision, 10000); |
| 1013 | strcpy (speed_time_string, "CPU cycle counter"); |
| 1014 | |
| 1015 | /* only used if a supplementary method is chosen below */ |
| 1016 | cycles_limit = (have_cycles == 1 ? M_2POW32 : M_2POW64) / 2.0 |
| 1017 | * speed_cycletime; |
| 1018 | |
| 1019 | if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p()) |
| 1020 | { |
| 1021 | /* this is a good combination */ |
| 1022 | use_grus = 1; |
| 1023 | supplement_unittime = grus_unittime = 1.0e-6; |
| 1024 | strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond getrusage()"); |
| 1025 | } |
| 1026 | else if (have_cycles == 1) |
| 1027 | { |
| 1028 | /* When speed_cyclecounter has a limited range, look for something |
| 1029 | to supplement it. */ |
| 1030 | if (have_gtod && gettimeofday_microseconds_p()) |
| 1031 | { |
| 1032 | use_gtod = 1; |
| 1033 | supplement_unittime = gtod_unittime = 1.0e-6; |
| 1034 | strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond gettimeofday()"); |
| 1035 | } |
| 1036 | else if (have_grus) |
| 1037 | { |
| 1038 | use_grus = 1; |
| 1039 | supplement_unittime = grus_unittime = 1.0 / (double) clk_tck (); |
| 1040 | sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick getrusage()", unittime_string (supplement_unittime)); |
| 1041 | } |
| 1042 | else if (have_times) |
| 1043 | { |
| 1044 | use_times = 1; |
| 1045 | supplement_unittime = times_unittime = 1.0 / (double) clk_tck (); |
| 1046 | sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick times()", unittime_string (supplement_unittime)); |
| 1047 | } |
| 1048 | else if (have_gtod) |
| 1049 | { |
| 1050 | use_gtod = 1; |
| 1051 | supplement_unittime = gtod_unittime = 1.0 / (double) clk_tck (); |
| 1052 | sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick gettimeofday()", unittime_string (supplement_unittime)); |
| 1053 | } |
| 1054 | else |
| 1055 | { |
| 1056 | fprintf (stderr, "WARNING: cycle counter is 32 bits and there's no other functions.\n"); |
| 1057 | fprintf (stderr, " Wraparounds may produce bad results on long measurements.\n"); |
| 1058 | } |
| 1059 | } |
| 1060 | |
| 1061 | if (use_grus || use_times || use_gtod) |
| 1062 | { |
| 1063 | /* must know cycle period to compare cycles to other measuring |
| 1064 | (via cycles_limit) */ |
| 1065 | speed_cycletime_need_seconds (); |
| 1066 | |
| 1067 | if (speed_precision * supplement_unittime > cycles_limit) |
| 1068 | { |
| 1069 | fprintf (stderr, "WARNING: requested precision can't always be achieved due to limited range\n"); |
| 1070 | fprintf (stderr, " cycle counter and limited precision supplemental method\n"); |
| 1071 | fprintf (stderr, " (%s)\n", speed_time_string); |
| 1072 | } |
| 1073 | } |
| 1074 | } |
| 1075 | else if (have_stck) |
| 1076 | { |
| 1077 | strcpy (speed_time_string, "STCK timestamp"); |
| 1078 | /* stck is in units of 2^-12 microseconds, which is very likely higher |
| 1079 | resolution than a cpu cycle */ |
| 1080 | if (speed_cycletime == 0.0) |
| 1081 | speed_cycletime_fail |
| 1082 | ("Need to know CPU frequency for effective stck unit"); |
| 1083 | speed_unittime = MAX (speed_cycletime, STCK_PERIOD); |
| 1084 | DEFAULT (speed_precision, 10000); |
| 1085 | } |
| 1086 | else if (have_mftb && mftb_works_p ()) |
| 1087 | { |
| 1088 | use_mftb = 1; |
| 1089 | DEFAULT (speed_precision, 10000); |
| 1090 | speed_unittime = mftb_unittime; |
| 1091 | sprintf (speed_time_string, "mftb counter (%s)", |
| 1092 | unittime_string (speed_unittime)); |
| 1093 | } |
| 1094 | else if (have_sgi && sgi_works_p ()) |
| 1095 | { |
| 1096 | use_sgi = 1; |
| 1097 | DEFAULT (speed_precision, 10000); |
| 1098 | speed_unittime = sgi_unittime; |
| 1099 | sprintf (speed_time_string, "syssgi() mmap counter (%s), supplemented by millisecond getrusage()", |
| 1100 | unittime_string (speed_unittime)); |
| 1101 | /* supplemented with getrusage, which we assume to have 1ms resolution */ |
| 1102 | use_grus = 1; |
| 1103 | supplement_unittime = 1e-3; |
| 1104 | } |
| 1105 | else if (have_rrt) |
| 1106 | { |
| 1107 | timebasestruct_t t; |
| 1108 | use_rrt = 1; |
| 1109 | DEFAULT (speed_precision, 10000); |
| 1110 | read_real_time (&t, sizeof(t)); |
| 1111 | switch (t.flag) { |
| 1112 | case RTC_POWER: |
| 1113 | /* FIXME: What's the actual RTC resolution? */ |
| 1114 | speed_unittime = 1e-7; |
| 1115 | strcpy (speed_time_string, "read_real_time() power nanoseconds"); |
| 1116 | break; |
| 1117 | case RTC_POWER_PC: |
| 1118 | t.tb_high = 1; |
| 1119 | t.tb_low = 0; |
| 1120 | time_base_to_time (&t, sizeof(t)); |
| 1121 | speed_unittime = TIMEBASESTRUCT_SECS(&t) / M_2POW32; |
| 1122 | sprintf (speed_time_string, "%s read_real_time() powerpc ticks", |
| 1123 | unittime_string (speed_unittime)); |
| 1124 | break; |
| 1125 | default: |
| 1126 | fprintf (stderr, "ERROR: Unrecognised timebasestruct_t flag=%d\n", |
| 1127 | t.flag); |
| 1128 | abort (); |
| 1129 | } |
| 1130 | } |
| 1131 | else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5e-6) |
| 1132 | { |
| 1133 | /* use clock_gettime if microsecond or better resolution */ |
| 1134 | choose_cgt: |
| 1135 | use_cgt = 1; |
| 1136 | speed_unittime = cgt_unittime; |
| 1137 | DEFAULT (speed_precision, (cgt_unittime <= 0.1e-6 ? 10000 : 1000)); |
| 1138 | strcpy (speed_time_string, "microsecond accurate clock_gettime()"); |
| 1139 | } |
| 1140 | else if (have_times && clk_tck() > 1000000) |
| 1141 | { |
| 1142 | /* Cray vector systems have times() which is clock cycle resolution |
| 1143 | (eg. 450 MHz). */ |
| 1144 | DEFAULT (speed_precision, 10000); |
| 1145 | goto choose_times; |
| 1146 | } |
| 1147 | else if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p()) |
| 1148 | { |
| 1149 | use_grus = 1; |
| 1150 | speed_unittime = grus_unittime = 1.0e-6; |
| 1151 | DEFAULT (speed_precision, 1000); |
| 1152 | strcpy (speed_time_string, "microsecond accurate getrusage()"); |
| 1153 | } |
| 1154 | else if (have_gtod && gettimeofday_microseconds_p()) |
| 1155 | { |
| 1156 | use_gtod = 1; |
| 1157 | speed_unittime = gtod_unittime = 1.0e-6; |
| 1158 | DEFAULT (speed_precision, 1000); |
| 1159 | strcpy (speed_time_string, "microsecond accurate gettimeofday()"); |
| 1160 | } |
| 1161 | else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5/clk_tck()) |
| 1162 | { |
| 1163 | /* use clock_gettime if 1 tick or better resolution */ |
| 1164 | goto choose_cgt; |
| 1165 | } |
| 1166 | else if (have_times) |
| 1167 | { |
| 1168 | use_tick_boundary = 1; |
| 1169 | DEFAULT (speed_precision, 200); |
| 1170 | choose_times: |
| 1171 | use_times = 1; |
| 1172 | speed_unittime = times_unittime = 1.0 / (double) clk_tck (); |
| 1173 | sprintf (speed_time_string, "%s clock tick times()", |
| 1174 | unittime_string (speed_unittime)); |
| 1175 | } |
| 1176 | else if (have_grus) |
| 1177 | { |
| 1178 | use_grus = 1; |
| 1179 | use_tick_boundary = 1; |
| 1180 | speed_unittime = grus_unittime = 1.0 / (double) clk_tck (); |
| 1181 | DEFAULT (speed_precision, 200); |
| 1182 | sprintf (speed_time_string, "%s clock tick getrusage()\n", |
| 1183 | unittime_string (speed_unittime)); |
| 1184 | } |
| 1185 | else if (have_gtod) |
| 1186 | { |
| 1187 | use_gtod = 1; |
| 1188 | use_tick_boundary = 1; |
| 1189 | speed_unittime = gtod_unittime = 1.0 / (double) clk_tck (); |
| 1190 | DEFAULT (speed_precision, 200); |
| 1191 | sprintf (speed_time_string, "%s clock tick gettimeofday()", |
| 1192 | unittime_string (speed_unittime)); |
| 1193 | } |
| 1194 | else |
| 1195 | { |
| 1196 | fprintf (stderr, "No time measuring method available\n"); |
| 1197 | fprintf (stderr, "None of: speed_cyclecounter(), STCK(), getrusage(), gettimeofday(), times()\n"); |
| 1198 | abort (); |
| 1199 | } |
| 1200 | |
| 1201 | if (speed_option_verbose) |
| 1202 | { |
| 1203 | printf ("speed_time_init: %s\n", speed_time_string); |
| 1204 | printf (" speed_precision %d\n", speed_precision); |
| 1205 | printf (" speed_unittime %.2g\n", speed_unittime); |
| 1206 | if (supplement_unittime) |
| 1207 | printf (" supplement_unittime %.2g\n", supplement_unittime); |
| 1208 | printf (" use_tick_boundary %d\n", use_tick_boundary); |
| 1209 | if (have_cycles) |
| 1210 | printf (" cycles_limit %.2g seconds\n", cycles_limit); |
| 1211 | } |
| 1212 | } |
| 1213 | |
| 1214 | |
| 1215 | |
| 1216 | /* Burn up CPU until a clock tick boundary, for greater accuracy. Set the |
| 1217 | corresponding "start_foo" appropriately too. */ |
| 1218 | |
| 1219 | void |
| 1220 | grus_tick_boundary (void) |
| 1221 | { |
| 1222 | struct_rusage prev; |
| 1223 | getrusage (0, &prev); |
| 1224 | do { |
| 1225 | getrusage (0, &start_grus); |
| 1226 | } while (start_grus.ru_utime.tv_usec == prev.ru_utime.tv_usec); |
| 1227 | } |
| 1228 | |
| 1229 | void |
| 1230 | gtod_tick_boundary (void) |
| 1231 | { |
| 1232 | struct_timeval prev; |
| 1233 | gettimeofday (&prev, NULL); |
| 1234 | do { |
| 1235 | gettimeofday (&start_gtod, NULL); |
| 1236 | } while (start_gtod.tv_usec == prev.tv_usec); |
| 1237 | } |
| 1238 | |
| 1239 | void |
| 1240 | times_tick_boundary (void) |
| 1241 | { |
| 1242 | struct_tms prev; |
| 1243 | times (&prev); |
| 1244 | do |
| 1245 | times (&start_times); |
| 1246 | while (start_times.tms_utime == prev.tms_utime); |
| 1247 | } |
| 1248 | |
| 1249 | |
| 1250 | /* "have_" values are tested to let unused code go dead. */ |
| 1251 | |
| 1252 | void |
| 1253 | speed_starttime (void) |
| 1254 | { |
| 1255 | speed_time_init (); |
| 1256 | |
| 1257 | if (have_grus && use_grus) |
| 1258 | { |
| 1259 | if (use_tick_boundary) |
| 1260 | grus_tick_boundary (); |
| 1261 | else |
| 1262 | getrusage (0, &start_grus); |
| 1263 | } |
| 1264 | |
| 1265 | if (have_gtod && use_gtod) |
| 1266 | { |
| 1267 | if (use_tick_boundary) |
| 1268 | gtod_tick_boundary (); |
| 1269 | else |
| 1270 | gettimeofday (&start_gtod, NULL); |
| 1271 | } |
| 1272 | |
| 1273 | if (have_times && use_times) |
| 1274 | { |
| 1275 | if (use_tick_boundary) |
| 1276 | times_tick_boundary (); |
| 1277 | else |
| 1278 | times (&start_times); |
| 1279 | } |
| 1280 | |
| 1281 | if (have_cgt && use_cgt) |
| 1282 | clock_gettime (CGT_ID, &start_cgt); |
| 1283 | |
| 1284 | if (have_rrt && use_rrt) |
| 1285 | read_real_time (&start_rrt, sizeof(start_rrt)); |
| 1286 | |
| 1287 | if (have_sgi && use_sgi) |
| 1288 | start_sgi = *sgi_addr; |
| 1289 | |
| 1290 | if (have_mftb && use_mftb) |
| 1291 | MFTB (start_mftb); |
| 1292 | |
| 1293 | if (have_stck && use_stck) |
| 1294 | STCK (start_stck); |
| 1295 | |
| 1296 | /* Cycles sampled last for maximum accuracy. */ |
| 1297 | if (have_cycles && use_cycles) |
| 1298 | speed_cyclecounter (start_cycles); |
| 1299 | } |
| 1300 | |
| 1301 | |
| 1302 | /* Calculate the difference between two cycle counter samples, as a "double" |
| 1303 | counter of cycles. |
| 1304 | |
| 1305 | The start and end values are allowed to cancel in integers in case the |
| 1306 | counter values are bigger than the 53 bits that normally fit in a double. |
| 1307 | |
| 1308 | This works even if speed_cyclecounter() puts a value bigger than 32-bits |
| 1309 | in the low word (the high word always gets a 2**32 multiplier though). */ |
| 1310 | |
| 1311 | double |
| 1312 | speed_cyclecounter_diff (const unsigned end[2], const unsigned start[2]) |
| 1313 | { |
| 1314 | unsigned d; |
| 1315 | double t; |
| 1316 | |
| 1317 | if (have_cycles == 1) |
| 1318 | { |
| 1319 | t = (end[0] - start[0]); |
| 1320 | } |
| 1321 | else |
| 1322 | { |
| 1323 | d = end[0] - start[0]; |
| 1324 | t = d - (d > end[0] ? M_2POWU : 0.0); |
| 1325 | t += (end[1] - start[1]) * M_2POW32; |
| 1326 | } |
| 1327 | return t; |
| 1328 | } |
| 1329 | |
| 1330 | |
| 1331 | double |
| 1332 | speed_mftb_diff (const unsigned end[2], const unsigned start[2]) |
| 1333 | { |
| 1334 | unsigned d; |
| 1335 | double t; |
| 1336 | |
| 1337 | d = end[0] - start[0]; |
| 1338 | t = (double) d - (d > end[0] ? M_2POW32 : 0.0); |
| 1339 | t += (end[1] - start[1]) * M_2POW32; |
| 1340 | return t; |
| 1341 | } |
| 1342 | |
| 1343 | |
| 1344 | /* Calculate the difference between "start" and "end" using fields "sec" and |
| 1345 | "psec", where each "psec" is a "punit" of a second. |
| 1346 | |
| 1347 | The seconds parts are allowed to cancel before being combined with the |
| 1348 | psec parts, in case a simple "sec+psec*punit" exceeds the precision of a |
| 1349 | double. |
| 1350 | |
| 1351 | Total time is only calculated in a "double" since an integer count of |
| 1352 | psecs might overflow. 2^32 microseconds is only a bit over an hour, or |
| 1353 | 2^32 nanoseconds only about 4 seconds. |
| 1354 | |
| 1355 | The casts to "long" are for the benefit of timebasestruct_t, where the |
| 1356 | fields are only "unsigned int", but we want a signed difference. */ |
| 1357 | |
| 1358 | #define DIFF_SECS_ROUTINE(sec, psec, punit) \ |
| 1359 | { \ |
| 1360 | long sec_diff, psec_diff; \ |
| 1361 | sec_diff = (long) end->sec - (long) start->sec; \ |
| 1362 | psec_diff = (long) end->psec - (long) start->psec; \ |
| 1363 | return (double) sec_diff + punit * (double) psec_diff; \ |
| 1364 | } |
| 1365 | |
| 1366 | double |
| 1367 | timeval_diff_secs (const struct_timeval *end, const struct_timeval *start) |
| 1368 | { |
| 1369 | DIFF_SECS_ROUTINE (tv_sec, tv_usec, 1e-6); |
| 1370 | } |
| 1371 | |
| 1372 | double |
| 1373 | rusage_diff_secs (const struct_rusage *end, const struct_rusage *start) |
| 1374 | { |
| 1375 | DIFF_SECS_ROUTINE (ru_utime.tv_sec, ru_utime.tv_usec, 1e-6); |
| 1376 | } |
| 1377 | |
| 1378 | double |
| 1379 | timespec_diff_secs (const struct_timespec *end, const struct_timespec *start) |
| 1380 | { |
| 1381 | DIFF_SECS_ROUTINE (tv_sec, tv_nsec, 1e-9); |
| 1382 | } |
| 1383 | |
| 1384 | /* This is for use after time_base_to_time, ie. for seconds and nanoseconds. */ |
| 1385 | double |
| 1386 | timebasestruct_diff_secs (const timebasestruct_t *end, |
| 1387 | const timebasestruct_t *start) |
| 1388 | { |
| 1389 | DIFF_SECS_ROUTINE (tb_high, tb_low, 1e-9); |
| 1390 | } |
| 1391 | |
| 1392 | |
| 1393 | double |
| 1394 | speed_endtime (void) |
| 1395 | { |
| 1396 | #define END_USE(name,value) \ |
| 1397 | do { \ |
| 1398 | if (speed_option_verbose >= 3) \ |
| 1399 | printf ("speed_endtime(): used %s\n", name); \ |
| 1400 | result = value; \ |
| 1401 | goto done; \ |
| 1402 | } while (0) |
| 1403 | |
| 1404 | #define END_ENOUGH(name,value) \ |
| 1405 | do { \ |
| 1406 | if (speed_option_verbose >= 3) \ |
| 1407 | printf ("speed_endtime(): %s gives enough precision\n", name); \ |
| 1408 | result = value; \ |
| 1409 | goto done; \ |
| 1410 | } while (0) |
| 1411 | |
| 1412 | #define END_EXCEED(name,value) \ |
| 1413 | do { \ |
| 1414 | if (speed_option_verbose >= 3) \ |
| 1415 | printf ("speed_endtime(): cycle counter limit exceeded, used %s\n", \ |
| 1416 | name); \ |
| 1417 | result = value; \ |
| 1418 | goto done; \ |
| 1419 | } while (0) |
| 1420 | |
| 1421 | unsigned end_cycles[2]; |
| 1422 | stck_t end_stck; |
| 1423 | unsigned end_mftb[2]; |
| 1424 | unsigned end_sgi; |
| 1425 | timebasestruct_t end_rrt; |
| 1426 | struct_timespec end_cgt; |
| 1427 | struct_timeval end_gtod; |
| 1428 | struct_rusage end_grus; |
| 1429 | struct_tms end_times; |
| 1430 | double t_gtod, t_grus, t_times, t_cgt; |
| 1431 | double t_rrt, t_sgi, t_mftb, t_stck, t_cycles; |
| 1432 | double result; |
| 1433 | |
| 1434 | /* Cycles sampled first for maximum accuracy. |
| 1435 | "have_" values tested to let unused code go dead. */ |
| 1436 | |
| 1437 | if (have_cycles && use_cycles) speed_cyclecounter (end_cycles); |
| 1438 | if (have_stck && use_stck) STCK (end_stck); |
| 1439 | if (have_mftb && use_mftb) MFTB (end_mftb); |
| 1440 | if (have_sgi && use_sgi) end_sgi = *sgi_addr; |
| 1441 | if (have_rrt && use_rrt) read_real_time (&end_rrt, sizeof(end_rrt)); |
| 1442 | if (have_cgt && use_cgt) clock_gettime (CGT_ID, &end_cgt); |
| 1443 | if (have_gtod && use_gtod) gettimeofday (&end_gtod, NULL); |
| 1444 | if (have_grus && use_grus) getrusage (0, &end_grus); |
| 1445 | if (have_times && use_times) times (&end_times); |
| 1446 | |
| 1447 | result = -1.0; |
| 1448 | |
| 1449 | if (speed_option_verbose >= 4) |
| 1450 | { |
| 1451 | printf ("speed_endtime():\n"); |
| 1452 | if (use_cycles) |
| 1453 | printf (" cycles 0x%X,0x%X -> 0x%X,0x%X\n", |
| 1454 | start_cycles[1], start_cycles[0], |
| 1455 | end_cycles[1], end_cycles[0]); |
| 1456 | |
| 1457 | if (use_stck) |
| 1458 | printf (" stck 0x%lX -> 0x%lX\n", start_stck, end_stck); |
| 1459 | |
| 1460 | if (use_mftb) |
| 1461 | printf (" mftb 0x%X,%08X -> 0x%X,%08X\n", |
| 1462 | start_mftb[1], start_mftb[0], |
| 1463 | end_mftb[1], end_mftb[0]); |
| 1464 | |
| 1465 | if (use_sgi) |
| 1466 | printf (" sgi 0x%X -> 0x%X\n", start_sgi, end_sgi); |
| 1467 | |
| 1468 | if (use_rrt) |
| 1469 | printf (" read_real_time (%d)%u,%u -> (%d)%u,%u\n", |
| 1470 | start_rrt.flag, start_rrt.tb_high, start_rrt.tb_low, |
| 1471 | end_rrt.flag, end_rrt.tb_high, end_rrt.tb_low); |
| 1472 | |
| 1473 | if (use_cgt) |
| 1474 | printf (" clock_gettime %ld.%09ld -> %ld.%09ld\n", |
| 1475 | start_cgt.tv_sec, start_cgt.tv_nsec, |
| 1476 | end_cgt.tv_sec, end_cgt.tv_nsec); |
| 1477 | |
| 1478 | if (use_gtod) |
| 1479 | printf (" gettimeofday %ld.%06ld -> %ld.%06ld\n", |
| 1480 | start_gtod.tv_sec, start_gtod.tv_usec, |
| 1481 | end_gtod.tv_sec, end_gtod.tv_usec); |
| 1482 | |
| 1483 | if (use_grus) |
| 1484 | printf (" getrusage %ld.%06ld -> %ld.%06ld\n", |
| 1485 | start_grus.ru_utime.tv_sec, start_grus.ru_utime.tv_usec, |
| 1486 | end_grus.ru_utime.tv_sec, end_grus.ru_utime.tv_usec); |
| 1487 | |
| 1488 | if (use_times) |
| 1489 | printf (" times %ld -> %ld\n", |
| 1490 | start_times.tms_utime, end_times.tms_utime); |
| 1491 | } |
| 1492 | |
| 1493 | if (use_rrt) |
| 1494 | { |
| 1495 | time_base_to_time (&start_rrt, sizeof(start_rrt)); |
| 1496 | time_base_to_time (&end_rrt, sizeof(end_rrt)); |
| 1497 | t_rrt = timebasestruct_diff_secs (&end_rrt, &start_rrt); |
| 1498 | END_USE ("read_real_time()", t_rrt); |
| 1499 | } |
| 1500 | |
| 1501 | if (use_cgt) |
| 1502 | { |
| 1503 | t_cgt = timespec_diff_secs (&end_cgt, &start_cgt); |
| 1504 | END_USE ("clock_gettime()", t_cgt); |
| 1505 | } |
| 1506 | |
| 1507 | if (use_grus) |
| 1508 | { |
| 1509 | t_grus = rusage_diff_secs (&end_grus, &start_grus); |
| 1510 | |
| 1511 | /* Use getrusage() if the cycle counter limit would be exceeded, or if |
| 1512 | it provides enough accuracy already. */ |
| 1513 | if (use_cycles) |
| 1514 | { |
| 1515 | if (t_grus >= speed_precision*grus_unittime) |
| 1516 | END_ENOUGH ("getrusage()", t_grus); |
| 1517 | if (t_grus >= cycles_limit) |
| 1518 | END_EXCEED ("getrusage()", t_grus); |
| 1519 | } |
| 1520 | } |
| 1521 | |
| 1522 | if (use_times) |
| 1523 | { |
| 1524 | t_times = (end_times.tms_utime - start_times.tms_utime) * times_unittime; |
| 1525 | |
| 1526 | /* Use times() if the cycle counter limit would be exceeded, or if |
| 1527 | it provides enough accuracy already. */ |
| 1528 | if (use_cycles) |
| 1529 | { |
| 1530 | if (t_times >= speed_precision*times_unittime) |
| 1531 | END_ENOUGH ("times()", t_times); |
| 1532 | if (t_times >= cycles_limit) |
| 1533 | END_EXCEED ("times()", t_times); |
| 1534 | } |
| 1535 | } |
| 1536 | |
| 1537 | if (use_gtod) |
| 1538 | { |
| 1539 | t_gtod = timeval_diff_secs (&end_gtod, &start_gtod); |
| 1540 | |
| 1541 | /* Use gettimeofday() if it measured a value bigger than the cycle |
| 1542 | counter can handle. */ |
| 1543 | if (use_cycles) |
| 1544 | { |
| 1545 | if (t_gtod >= cycles_limit) |
| 1546 | END_EXCEED ("gettimeofday()", t_gtod); |
| 1547 | } |
| 1548 | } |
| 1549 | |
| 1550 | if (use_mftb) |
| 1551 | { |
| 1552 | t_mftb = speed_mftb_diff (end_mftb, start_mftb) * mftb_unittime; |
| 1553 | END_USE ("mftb", t_mftb); |
| 1554 | } |
| 1555 | |
| 1556 | if (use_stck) |
| 1557 | { |
| 1558 | t_stck = (end_stck - start_stck) * STCK_PERIOD; |
| 1559 | END_USE ("stck", t_stck); |
| 1560 | } |
| 1561 | |
| 1562 | if (use_sgi) |
| 1563 | { |
| 1564 | t_sgi = (end_sgi - start_sgi) * sgi_unittime; |
| 1565 | END_USE ("SGI hardware counter", t_sgi); |
| 1566 | } |
| 1567 | |
| 1568 | if (use_cycles) |
| 1569 | { |
| 1570 | t_cycles = speed_cyclecounter_diff (end_cycles, start_cycles) |
| 1571 | * speed_cycletime; |
| 1572 | END_USE ("cycle counter", t_cycles); |
| 1573 | } |
| 1574 | |
| 1575 | if (use_grus && getrusage_microseconds_p()) |
| 1576 | END_USE ("getrusage()", t_grus); |
| 1577 | |
| 1578 | if (use_gtod && gettimeofday_microseconds_p()) |
| 1579 | END_USE ("gettimeofday()", t_gtod); |
| 1580 | |
| 1581 | if (use_times) END_USE ("times()", t_times); |
| 1582 | if (use_grus) END_USE ("getrusage()", t_grus); |
| 1583 | if (use_gtod) END_USE ("gettimeofday()", t_gtod); |
| 1584 | |
| 1585 | fprintf (stderr, "speed_endtime(): oops, no time method available\n"); |
| 1586 | abort (); |
| 1587 | |
| 1588 | done: |
| 1589 | if (result < 0.0) |
| 1590 | { |
| 1591 | if (speed_option_verbose >= 2) |
| 1592 | fprintf (stderr, "speed_endtime(): warning, treating negative time as zero: %.9f\n", result); |
| 1593 | result = 0.0; |
| 1594 | } |
| 1595 | return result; |
| 1596 | } |