/* __gmp_doscan -- formatted input internals.
2
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
6
7Copyright 2001-2003 Free Software Foundation, Inc.
8
9This file is part of the GNU MP Library.
10
11The GNU MP Library is free software; you can redistribute it and/or modify
12it under the terms of either:
13
14 * the GNU Lesser General Public License as published by the Free
15 Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17
18or
19
20 * the GNU General Public License as published by the Free Software
21 Foundation; either version 2 of the License, or (at your option) any
22 later version.
23
24or both in parallel, as here.
25
26The GNU MP Library is distributed in the hope that it will be useful, but
27WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29for more details.
30
31You should have received copies of the GNU General Public License and the
32GNU Lesser General Public License along with the GNU MP Library. If not,
33see https://www.gnu.org/licenses/. */
34
35#define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */
36
37#include "config.h" /* needed for the HAVE_, could also move gmp incls */
38
39#include <stdarg.h>
40#include <ctype.h>
41#include <stddef.h> /* for ptrdiff_t */
42#include <stdio.h>
43#include <stdlib.h> /* for strtol */
44#include <string.h>
45
46#if HAVE_LANGINFO_H
47#include <langinfo.h> /* for nl_langinfo */
48#endif
49
50#if HAVE_LOCALE_H
51#include <locale.h> /* for localeconv */
52#endif
53
54#if HAVE_INTTYPES_H
55# include <inttypes.h> /* for intmax_t */
56#else
57# if HAVE_STDINT_H
58# include <stdint.h>
59# endif
60#endif
61
62#if HAVE_SYS_TYPES_H
63#include <sys/types.h> /* for quad_t */
64#endif
65
66#include "gmp-impl.h"
67
68
/* Debug tracing hook.  As defined here the argument is discarded, so
   TRACE (...) statements cost nothing.  Redefine it as
   "#define TRACE(x) x" to get some traces.  */
#define TRACE(x)
71
72
73/* General:
74
75 It's necessary to parse up the format string to recognise the GMP
76 extra types F, Q and Z. Other types and conversions are passed
77 across to the standard sscanf or fscanf via funs->scan, for ease of
78 implementation. This is essential in the case of something like glibc
79 %p where the pointer format isn't actually documented.
80
81 Because funs->scan doesn't get the whole input it can't put the right
82 values in for %n, so that's handled in __gmp_doscan. Neither sscanf
83 nor fscanf directly indicate how many characters were read, so an
84 extra %n is appended to each run for that. For fscanf this merely
85 supports our %n output, but for sscanf it lets funs->step move us
86 along the input string.
87
88 Whitespace and literal matches in the format string, including %%,
89 are handled directly within __gmp_doscan. This is reasonably
90 efficient, and avoids some suspicious behaviour observed in various
91 system libc's. GLIBC 2.2.4 for instance returns 0 on
92
93 sscanf(" ", " x")
94 or
95 sscanf(" ", " x%d",&n)
96
97 whereas we think they should return EOF, since end-of-string is
98 reached when a match of "x" is required.
99
100 For standard % conversions, funs->scan is called once for each
101 conversion. If we had vfscanf and vsscanf and could rely on their
102 fixed text matching behaviour then we could call them with multiple
103 consecutive standard conversions. But plain fscanf and sscanf work
104 fine, and parsing one field at a time shouldn't be too much of a
105 slowdown.
106
107 gmpscan:
108
109 gmpscan reads a gmp type. It's only used from one place, but is a
110 separate subroutine to avoid a big chunk of complicated code in the
111 middle of __gmp_doscan. Within gmpscan a couple of loopbacks make it
112 possible to share code for parsing integers, rationals and floats.
113
114 In gmpscan normally one char of lookahead is maintained, but when width
115 is reached that stops, on the principle that an fgetc/ungetc of a char
116 past where we're told to stop would be undesirable. "chars" is how many
117 characters have been read so far, including the current c. When
118 chars==width and another character is desired then a jump is done to the
119 "convert" stage. c is invalid and mustn't be unget'ed in this case;
120 chars is set to width+1 to indicate that.
121
122 gmpscan normally returns the number of characters read. -1 means an
123 invalid field, -2 means EOF reached before any matching characters
124 were read.
125
   For hex floats, the mantissa part is passed to mpf_set_str, then the
   exponent is applied with mpf_mul_2exp or mpf_div_2exp.  This is easier
   than teaching mpf_set_str about an exponent factor (ie. 2) differing
   from the mantissa radix point factor (ie. 16).  mpf_mul_2exp and
   mpf_div_2exp will preserve the application requested precision, so
   nothing in that respect is lost by making this a two-step process.
132
133 Matching and errors:
134
135 C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
136 string which is a match for the appropriate type, or a prefix of a
137 match. With that done, if it's only a prefix then the result is a
138 matching failure, ie. invalid input.
139
140 This rule seems fairly clear, but doesn't seem to be universally
141 applied in system C libraries. Even GLIBC doesn't seem to get it
142 right, insofar as it seems to accept some apparently invalid forms.
143 Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
144 standard would suggest a non-empty sequence of digits should be
145 required after an "0x".
146
147 A footnote to 7.19.6.2 para 17 notes how this input item reading can
148 mean inputs acceptable to strtol are not acceptable to fscanf. We
149 think this confirms our reading of "0x" as invalid.
150
151 Clearly gmp_sscanf could backtrack to a longest input which was a
152 valid match for a given item, but this is not done, since C99 says
153 sscanf is identical to fscanf, so we make gmp_sscanf identical to
154 gmp_fscanf.
155
156 Types:
157
158 C99 says "ll" is for long long, and "L" is for long double floats.
159 Unfortunately in GMP 4.1.1 we documented the two as equivalent. This
160 doesn't affect us directly, since both are passed through to plain
161 scanf. It seems wisest not to try to enforce the C99 rule. This is
162 consistent with what we said before, though whether it actually
163 worked was always up to the C library.
164
165 Alternatives:
166
167 Consideration was given to using separate code for gmp_fscanf and
168 gmp_sscanf. The sscanf case could zip across a string doing literal
169 matches or recognising digits in gmpscan, rather than making a
170 function call fun->get per character. The fscanf could use getc
171 rather than fgetc too, which might help those systems where getc is a
172 macro or otherwise inlined. But none of this scanning and converting
173 will be particularly fast, so the two are done together to keep it a
174 little simpler for now.
175
176 Various multibyte string issues are not addressed, for a start C99
177 scanf says the format string is multibyte. Since we pass %c, %s and
178 %[ to the system scanf, they might do multibyte reads already, but
179 it's another matter whether or not that can be used, since our digit
180 and whitespace parsing is only unibyte. The plan is to quietly
181 ignore multibyte locales for now. This is not as bad as it sounds,
182 since GMP is presumably used mostly on numbers, which can be
183 perfectly adequately treated in plain ASCII.
184
185*/
186
187
/* Description of one "%" conversion, as parsed out of the format string
   by __gmp_doscan and handed to gmpscan.  */
struct gmp_doscan_params_t {
  /* Radix requested by the conversion: 8, 10 or 16, or 0 to have gmpscan
     pick it from a leading "0" or "0x" in the input.  */
  int   base;

  /* Non-zero when "*" was given, ie. parse the field but store nothing.  */
  int   ignore;

  /* Type modifier character, '\0' if none.  gmpscan handles only 'F',
     'Q' and 'Z'.  */
  char  type;

  /* Maximum field width, 0 meaning no limit was given.  */
  int   width;
};
194
195
/* Number of bytes by which STORE grows the buffer "s" each time it
   fills up.  gmpscan also uses this as the initial allocation.  */
#define S_ALLOC_STEP  512

/* Read the next input character into "c".

   Relies on these names at the point of use: "chars" and "width" (ints),
   "funs" and "data" (the input callbacks and their handle), and a label
   "convert".  "chars" is bumped first; if that takes it past "width" no
   read is done and control jumps to "convert", leaving "c" untouched and
   chars == width+1.  */
#define GET(c)                                  \
  do {                                          \
    ASSERT (chars <= width);                    \
    if (++chars > width)                        \
      goto convert;                             \
    (c) = (*funs->get) (data);                  \
  } while (0)

/* Append "c" to the buffer "s", growing it by S_ALLOC_STEP bytes when it
   is full.  Relies on "s" (char *), "s_upto" (bytes used) and "s_alloc"
   (bytes allocated) at the point of use.  */
#define STORE(c)                                                        \
  do {                                                                  \
    ASSERT (s_upto <= s_alloc);                                         \
    if (s_upto >= s_alloc)                                              \
      {                                                                 \
        size_t  s_grown = s_alloc + S_ALLOC_STEP;                       \
        s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_grown, char);     \
        s_alloc = s_grown;                                              \
      }                                                                 \
    s[s_upto] = (c);                                                    \
    s_upto++;                                                           \
  } while (0)
219
220static int
221gmpscan (const struct gmp_doscan_funs_t *funs, void *data,
222 const struct gmp_doscan_params_t *p, void *dst)
223{
224 int chars, c, base, first, width, seen_point, seen_digit, hexfloat;
225 size_t s_upto, s_alloc, hexexp;
226 char *s;
227 int invalid = 0;
228
229 TRACE (printf ("gmpscan\n"));
230
231 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z');
232
233 c = (*funs->get) (data);
234 if (c == EOF)
235 return -2;
236
237 chars = 1;
238 first = 1;
239 seen_point = 0;
240 width = (p->width == 0 ? INT_MAX-1 : p->width);
241 base = p->base;
242 s_alloc = S_ALLOC_STEP;
243 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char);
244 s_upto = 0;
245 hexfloat = 0;
246 hexexp = 0;
247
248 another:
249 seen_digit = 0;
250 if (c == '-')
251 {
252 STORE (c);
253 goto get_for_sign;
254 }
255 else if (c == '+')
256 {
257 /* don't store '+', it's not accepted by mpz_set_str etc */
258 get_for_sign:
259 GET (c);
260 }
261
262 if (base == 0)
263 {
264 base = 10; /* decimal if no base indicator */
265 if (c == '0')
266 {
267 seen_digit = 1; /* 0 alone is a valid number */
268 if (p->type != 'F')
269 base = 8; /* leading 0 is octal, for non-floats */
270 STORE (c);
271 GET (c);
272 if (c == 'x' || c == 'X')
273 {
274 base = 16;
275 seen_digit = 0; /* must have digits after an 0x */
276 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */
277 hexfloat = 1;
278 else
279 STORE (c);
280 GET (c);
281 }
282 }
283 }
284
285 digits:
286 for (;;)
287 {
288 if (base == 16)
289 {
290 if (! isxdigit (c))
291 break;
292 }
293 else
294 {
295 if (! isdigit (c))
296 break;
297 if (base == 8 && (c == '8' || c == '9'))
298 break;
299 }
300
301 seen_digit = 1;
302 STORE (c);
303 GET (c);
304 }
305
306 if (first)
307 {
308 /* decimal point */
309 if (p->type == 'F' && ! seen_point)
310 {
311 /* For a multi-character decimal point, if the first character is
312 present then all of it must be, otherwise the input is
313 considered invalid. */
314 const char *point = GMP_DECIMAL_POINT;
315 int pc = (unsigned char) *point++;
316 if (c == pc)
317 {
318 for (;;)
319 {
320 STORE (c);
321 GET (c);
322 pc = (unsigned char) *point++;
323 if (pc == '\0')
324 break;
325 if (c != pc)
326 goto set_invalid;
327 }
328 seen_point = 1;
329 goto digits;
330 }
331 }
332
333 /* exponent */
334 if (p->type == 'F')
335 {
336 if (hexfloat && (c == 'p' || c == 'P'))
337 {
338 hexexp = s_upto; /* exponent location */
339 base = 10; /* exponent in decimal */
340 goto exponent;
341 }
342 else if (! hexfloat && (c == 'e' || c == 'E'))
343 {
344 exponent:
345 /* must have at least one digit in the mantissa, just an exponent
346 is not good enough */
347 if (! seen_digit)
348 goto set_invalid;
349
350 do_second:
351 first = 0;
352 STORE (c);
353 GET (c);
354 goto another;
355 }
356 }
357
358 /* denominator */
359 if (p->type == 'Q' && c == '/')
360 {
361 /* must have at least one digit in the numerator */
362 if (! seen_digit)
363 goto set_invalid;
364
365 /* now look for at least one digit in the denominator */
366 seen_digit = 0;
367
368 /* allow the base to be redetermined for "%i" */
369 base = p->base;
370 goto do_second;
371 }
372 }
373
374 convert:
375 if (! seen_digit)
376 {
377 set_invalid:
378 invalid = 1;
379 goto done;
380 }
381
382 if (! p->ignore)
383 {
384 STORE ('\0');
385 TRACE (printf (" convert \"%s\"\n", s));
386
387 /* We ought to have parsed out a valid string above, so just test
388 mpz_set_str etc with an ASSERT. */
389 switch (p->type) {
390 case 'F':
391 {
392 mpf_ptr f = (mpf_ptr) dst;
393 if (hexexp != 0)
394 s[hexexp] = '\0';
395 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 16 : 10));
396 if (hexexp != 0)
397 {
398 char *dummy;
399 long exp;
400 exp = strtol (s + hexexp + 1, &dummy, 10);
401 if (exp >= 0)
402 mpf_mul_2exp (f, f, (unsigned long) exp);
403 else
404 mpf_div_2exp (f, f, NEG_CAST (unsigned long, exp));
405 }
406 }
407 break;
408 case 'Q':
409 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base));
410 break;
411 case 'Z':
412 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base));
413 break;
414 default:
415 ASSERT (0);
416 /*FALLTHRU*/
417 break;
418 }
419 }
420
421 done:
422 ASSERT (chars <= width+1);
423 if (chars != width+1)
424 {
425 (*funs->unget) (c, data);
426 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1));
427 }
428 chars--;
429
430 (*__gmp_free_func) (s, s_alloc);
431
432 if (invalid)
433 {
434 TRACE (printf (" invalid\n"));
435 return -1;
436 }
437
438 TRACE (printf (" return %d chars (cf width %d)\n", chars, width));
439 return chars;
440}
441
442
443/* Read and discard whitespace, if any. Return number of chars skipped.
444 Whitespace skipping never provokes the EOF return from __gmp_doscan, so
445 it's not necessary to watch for EOF from funs->get, */
446static int
447skip_white (const struct gmp_doscan_funs_t *funs, void *data)
448{
449 int c;
450 int ret = 0;
451
452 do
453 {
454 c = (funs->get) (data);
455 ret++;
456 }
457 while (isspace (c));
458
459 (funs->unget) (c, data);
460 ret--;
461
462 TRACE (printf (" skip white %d\n", ret));
463 return ret;
464}
465
466
467int
468__gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data,
469 const char *orig_fmt, va_list orig_ap)
470{
471 struct gmp_doscan_params_t param;
472 va_list ap;
473 char *alloc_fmt;
474 const char *fmt, *this_fmt, *end_fmt;
475 size_t orig_fmt_len, alloc_fmt_size, len;
476 int new_fields, new_chars;
477 char fchar;
478 int fields = 0;
479 int chars = 0;
480
481 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt);
482 if (funs->scan == (gmp_doscan_scan_t) sscanf)
483 printf (" s=\"%s\"\n", * (const char **) data));
484
485 /* Don't modify orig_ap, if va_list is actually an array and hence call by
486 reference. It could be argued that it'd be more efficient to leave
487 callers to make a copy if they care, but doing so here is going to be a
488 very small part of the total work, and we may as well keep applications
489 out of trouble. */
490 va_copy (ap, orig_ap);
491
492 /* Parts of the format string are going to be copied so that a " %n" can
493 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be
494 needed if fmt consists of a single "%" specifier, but otherwise is an
495 overestimate. We're not going to be very fast here, so use
496 __gmp_allocate_func rather than TMP_ALLOC. */
497 orig_fmt_len = strlen (orig_fmt);
498 alloc_fmt_size = orig_fmt_len + 4;
499 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char);
500
501 fmt = orig_fmt;
502 end_fmt = orig_fmt + orig_fmt_len;
503
504 for (;;)
505 {
506 next:
507 fchar = *fmt++;
508
509 if (fchar == '\0')
510 break;
511
512 if (isspace (fchar))
513 {
514 chars += skip_white (funs, data);
515 continue;
516 }
517
518 if (fchar != '%')
519 {
520 int c;
521 literal:
522 c = (funs->get) (data);
523 if (c != fchar)
524 {
525 (funs->unget) (c, data);
526 if (c == EOF)
527 {
528 eof_no_match:
529 if (fields == 0)
530 fields = EOF;
531 }
532 goto done;
533 }
534 chars++;
535 continue;
536 }
537
538 param.type = '\0';
539 param.base = 0; /* for e,f,g,i */
540 param.ignore = 0;
541 param.width = 0;
542
543 this_fmt = fmt-1;
544 TRACE (printf (" this_fmt \"%s\"\n", this_fmt));
545
546 for (;;)
547 {
548 ASSERT (fmt <= end_fmt);
549
550 fchar = *fmt++;
551 switch (fchar) {
552
553 case '\0': /* unterminated % sequence */
554 ASSERT (0);
555 goto done;
556
557 case '%': /* literal % */
558 goto literal;
559
560 case '[': /* character range */
561 fchar = *fmt++;
562 if (fchar == '^')
563 fchar = *fmt++;
564 /* ']' allowed as the first char (possibly after '^') */
565 if (fchar == ']')
566 fchar = *fmt++;
567 for (;;)
568 {
569 ASSERT (fmt <= end_fmt);
570 if (fchar == '\0')
571 {
572 /* unterminated % sequence */
573 ASSERT (0);
574 goto done;
575 }
576 if (fchar == ']')
577 break;
578 fchar = *fmt++;
579 }
580 /*FALLTHRU*/
581 case 'c': /* characters */
582 case 's': /* string of non-whitespace */
583 case 'p': /* pointer */
584 libc_type:
585 len = fmt - this_fmt;
586 memcpy (alloc_fmt, this_fmt, len);
587 alloc_fmt[len++] = '%';
588 alloc_fmt[len++] = 'n';
589 alloc_fmt[len] = '\0';
590
591 TRACE (printf (" scan \"%s\"\n", alloc_fmt);
592 if (funs->scan == (gmp_doscan_scan_t) sscanf)
593 printf (" s=\"%s\"\n", * (const char **) data));
594
595 new_chars = -1;
596 if (param.ignore)
597 {
598 new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL);
599 ASSERT (new_fields == 0 || new_fields == EOF);
600 }
601 else
602 {
603 void *arg = va_arg (ap, void *);
604 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars);
605 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF);
606
607 if (new_fields == 0)
608 goto done; /* invalid input */
609
610 if (new_fields == 1)
611 ASSERT (new_chars != -1);
612 }
613 TRACE (printf (" new_fields %d new_chars %d\n",
614 new_fields, new_chars));
615
616 if (new_fields == -1)
617 goto eof_no_match; /* EOF before anything matched */
618
619 /* Under param.ignore, when new_fields==0 we don't know if
620 it's a successful match or an invalid field. new_chars
621 won't have been assigned if it was an invalid field. */
622 if (new_chars == -1)
623 goto done; /* invalid input */
624
625 chars += new_chars;
626 (*funs->step) (data, new_chars);
627
628 increment_fields:
629 if (! param.ignore)
630 fields++;
631 goto next;
632
633 case 'd': /* decimal */
634 case 'u': /* decimal */
635 param.base = 10;
636 goto numeric;
637
638 case 'e': /* float */
639 case 'E': /* float */
640 case 'f': /* float */
641 case 'g': /* float */
642 case 'G': /* float */
643 case 'i': /* integer with base marker */
644 numeric:
645 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z')
646 goto libc_type;
647
648 chars += skip_white (funs, data);
649
650 new_chars = gmpscan (funs, data, &param,
651 param.ignore ? NULL : va_arg (ap, void*));
652 if (new_chars == -2)
653 goto eof_no_match;
654 if (new_chars == -1)
655 goto done;
656
657 ASSERT (new_chars >= 0);
658 chars += new_chars;
659 goto increment_fields;
660
661 case 'a': /* glibc allocate string */
662 case '\'': /* glibc digit groupings */
663 break;
664
665 case 'F': /* mpf_t */
666 case 'j': /* intmax_t */
667 case 'L': /* long long */
668 case 'q': /* quad_t */
669 case 'Q': /* mpq_t */
670 case 't': /* ptrdiff_t */
671 case 'z': /* size_t */
672 case 'Z': /* mpz_t */
673 set_type:
674 param.type = fchar;
675 break;
676
677 case 'h': /* short or char */
678 if (param.type != 'h')
679 goto set_type;
680 param.type = 'H'; /* internal code for "hh" */
681 break;
682
683 goto numeric;
684
685 case 'l': /* long, long long, double or long double */
686 if (param.type != 'l')
687 goto set_type;
688 param.type = 'L'; /* "ll" means "L" */
689 break;
690
691 case 'n':
692 if (! param.ignore)
693 {
694 void *p;
695 p = va_arg (ap, void *);
696 TRACE (printf (" store %%n to %p\n", p));
697 switch (param.type) {
698 case '\0': * (int *) p = chars; break;
699 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break;
700 case 'H': * (char *) p = chars; break;
701 case 'h': * (short *) p = chars; break;
702#if HAVE_INTMAX_T
703 case 'j': * (intmax_t *) p = chars; break;
704#else
705 case 'j': ASSERT_FAIL (intmax_t not available); break;
706#endif
707 case 'l': * (long *) p = chars; break;
708#if HAVE_QUAD_T && HAVE_LONG_LONG
709 case 'q':
710 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long));
711 /*FALLTHRU*/
712#else
713 case 'q': ASSERT_FAIL (quad_t not available); break;
714#endif
715#if HAVE_LONG_LONG
716 case 'L': * (long long *) p = chars; break;
717#else
718 case 'L': ASSERT_FAIL (long long not available); break;
719#endif
720 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break;
721#if HAVE_PTRDIFF_T
722 case 't': * (ptrdiff_t *) p = chars; break;
723#else
724 case 't': ASSERT_FAIL (ptrdiff_t not available); break;
725#endif
726 case 'z': * (size_t *) p = chars; break;
727 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break;
728 default: ASSERT (0); break;
729 }
730 }
731 goto next;
732
733 case 'o':
734 param.base = 8;
735 goto numeric;
736
737 case 'x':
738 case 'X':
739 param.base = 16;
740 goto numeric;
741
742 case '0': case '1': case '2': case '3': case '4':
743 case '5': case '6': case '7': case '8': case '9':
744 param.width = 0;
745 do {
746 param.width = param.width * 10 + (fchar-'0');
747 fchar = *fmt++;
748 } while (isdigit (fchar));
749 fmt--; /* unget the non-digit */
750 break;
751
752 case '*':
753 param.ignore = 1;
754 break;
755
756 default:
757 /* something invalid in a % sequence */
758 ASSERT (0);
759 goto next;
760 }
761 }
762 }
763
764 done:
765 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size);
766 return fields;
767}