Daily bump.
[gcc.git] / libcpp / lex.c
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2021 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26
/* Spelling category recorded for every token type in the
   token_spellings table.  The enumerators are numbered from zero.  */
enum spell_type
{
  SPELL_OPERATOR,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
34
35 struct token_spelling
36 {
37 enum spell_type category;
38 const unsigned char *name;
39 };
40
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
64
65 static _cpp_buff *new_buff (size_t);
66
67
68 /* Utility routine:
69
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75 if (token->type != CPP_NAME)
76 return 0;
77
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86 if (buffer->notes_used == buffer->notes_cap)
87 {
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
91 }
92
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
96 }
97
98 \f
99 /* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
103
104 One of the paths through the ifdefs should provide
105
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
107
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
110
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
114
/* Configure only defines WORDS_BIGENDIAN on big-endian hosts.  Give it
   a value of zero otherwise so it can be tested in ordinary C
   expressions rather than with #ifdef.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif
119
/* WORD_TYPE should be the largest integer that fits into a register.
   <stdint.h> offers nothing suitable.  On most hosts unsigned long
   would do, but the LLP64 model used by MS makes that too narrow;
   when building with GCC the __word__ mode gives the real word
   size.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The scanning code below handles only 4- and 8-byte words.  Any
   other size gives this array a negative length, which is rejected
   at compile time.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
134
135 /* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
137
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
140 {
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
147 }
148
149 /* Return X replicated to all byte positions within WORD_TYPE. */
150
151 static inline word_type
152 acc_char_replicate (uchar x)
153 {
154 word_type ret;
155
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
160 }
161
162 /* Return non-zero if some byte of VAL is (probably) C. */
163
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
166 {
167 #if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
176
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179 #endif
180 }
181
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
184
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
188 {
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193 #else
194 unsigned int i;
195
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
199 {
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
205
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
208 }
209
210 return -1;
211 #endif
212 }
213
214 /* A version of the fast scanner using bit fiddling techniques.
215
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
220
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
224
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
227
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230 {
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
235
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
239
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
246
247 /* Main loop. */
248 while (1)
249 {
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
254
255 if (__builtin_expect (t != 0, 0))
256 {
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
260 }
261
262 val = *++p;
263 }
264 }
265
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267 autoconfed:
268
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
272
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Rows, in order: newline, carriage
   return, backslash, question mark.  */
#define REPL16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
289
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
291
292 This uses the PMOVMSKB instruction which was introduced with "MMX2",
293 which was packaged into SSE1; it is also present in the AMD MMX
294 extension. Mark the function as using "sse" so that we emit a real
295 "emms" instruction, rather than the 3dNOW "femms" instruction. */
296
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
302 {
303 typedef char v8qi __attribute__ ((__vector_size__ (8)));
304 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
305
306 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
310
311 unsigned int misalign, found, mask;
312 const v8qi *p;
313 v8qi data, t, c;
314
315 /* Align the source pointer. While MMX doesn't generate unaligned data
316 faults, this allows us to safely scan to the end of the buffer without
317 reading beyond the end of the last page. */
318 misalign = (uintptr_t)s & 7;
319 p = (const v8qi *)((uintptr_t)s & -8);
320 data = *p;
321
322 /* Create a mask for the bytes that are valid within the first
323 16-byte block. The Idea here is that the AND with the mask
324 within the loop is "free", since we need some AND or TEST
325 insn in order to set the flags for the branch anyway. */
326 mask = -1u << misalign;
327
328 /* Main loop processing 8 bytes at a time. */
329 goto start;
330 do
331 {
332 data = *++p;
333 mask = -1;
334
335 start:
336 t = __builtin_ia32_pcmpeqb(data, repl_nl);
337 c = __builtin_ia32_pcmpeqb(data, repl_cr);
338 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339 c = __builtin_ia32_pcmpeqb(data, repl_bs);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_qm);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 found = __builtin_ia32_pmovmskb (t);
344 found &= mask;
345 }
346 while (!found);
347
348 __builtin_ia32_emms ();
349
350 /* FOUND contains 1 in bits for which we matched a relevant
351 character. Conversion to the byte index is trivial. */
352 found = __builtin_ctz(found);
353 return (const uchar *)p + found;
354 }
355
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
357
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
363 {
364 typedef char v16qi __attribute__ ((__vector_size__ (16)));
365
366 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
370
371 unsigned int misalign, found, mask;
372 const v16qi *p;
373 v16qi data, t;
374
375 /* Align the source pointer. */
376 misalign = (uintptr_t)s & 15;
377 p = (const v16qi *)((uintptr_t)s & -16);
378 data = *p;
379
380 /* Create a mask for the bytes that are valid within the first
381 16-byte block. The Idea here is that the AND with the mask
382 within the loop is "free", since we need some AND or TEST
383 insn in order to set the flags for the branch anyway. */
384 mask = -1u << misalign;
385
386 /* Main loop processing 16 bytes at a time. */
387 goto start;
388 do
389 {
390 data = *++p;
391 mask = -1;
392
393 start:
394 t = __builtin_ia32_pcmpeqb128(data, repl_nl);
395 t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
396 t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
397 t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
398 found = __builtin_ia32_pmovmskb128 (t);
399 found &= mask;
400 }
401 while (!found);
402
403 /* FOUND contains 1 in bits for which we matched a relevant
404 character. Conversion to the byte index is trivial. */
405 found = __builtin_ctz(found);
406 return (const uchar *)p + found;
407 }
408
409 #ifdef HAVE_SSE4
410 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */
411
412 static const uchar *
413 #ifndef __SSE4_2__
414 __attribute__((__target__("sse4.2")))
415 #endif
416 search_line_sse42 (const uchar *s, const uchar *end)
417 {
418 typedef char v16qi __attribute__ ((__vector_size__ (16)));
419 static const v16qi search = { '\n', '\r', '?', '\\' };
420
421 uintptr_t si = (uintptr_t)s;
422 uintptr_t index;
423
424 /* Check for unaligned input. */
425 if (si & 15)
426 {
427 v16qi sv;
428
429 if (__builtin_expect (end - s < 16, 0)
430 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
431 {
432 /* There are less than 16 bytes left in the buffer, and less
433 than 16 bytes left on the page. Reading 16 bytes at this
434 point might generate a spurious page fault. Defer to the
435 SSE2 implementation, which already handles alignment. */
436 return search_line_sse2 (s, end);
437 }
438
439 /* ??? The builtin doesn't understand that the PCMPESTRI read from
440 memory need not be aligned. */
441 sv = __builtin_ia32_loaddqu ((const char *) s);
442 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
443
444 if (__builtin_expect (index < 16, 0))
445 goto found;
446
447 /* Advance the pointer to an aligned address. We will re-scan a
448 few bytes, but we no longer need care for reading past the
449 end of a page, since we're guaranteed a match. */
450 s = (const uchar *)((si + 15) & -16);
451 }
452
453 /* Main loop, processing 16 bytes at a time. */
454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
455 while (1)
456 {
457 char f;
458
459 /* By using inline assembly instead of the builtin,
460 we can use the result, as well as the flags set. */
461 __asm ("%vpcmpestri\t$0, %2, %3"
462 : "=c"(index), "=@ccc"(f)
463 : "m"(*s), "x"(search), "a"(4), "d"(16));
464 if (f)
465 break;
466
467 s += 16;
468 }
469 #else
470 s -= 16;
471 /* By doing the whole loop in inline assembly,
472 we can make proper use of the flags set. */
473 __asm ( ".balign 16\n"
474 "0: add $16, %1\n"
475 " %vpcmpestri\t$0, (%1), %2\n"
476 " jnc 0b"
477 : "=&c"(index), "+r"(s)
478 : "x"(search), "a"(4), "d"(16));
479 #endif
480
481 found:
482 return s + index;
483 }
484
485 #else
486 /* Work around out-dated assemblers without sse4 support. */
487 #define search_line_sse42 search_line_sse2
488 #endif
489
490 /* Check the CPU capabilities. */
491
492 #include "../gcc/config/i386/cpuid.h"
493
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
496
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
499 init_vectorized_lexer (void)
500 {
501 unsigned dummy, ecx = 0, edx = 0;
502 search_line_fast_type impl = search_line_acc_char;
503 int minimum = 0;
504
505 #if defined(__SSE4_2__)
506 minimum = 3;
507 #elif defined(__SSE2__)
508 minimum = 2;
509 #elif defined(__SSE__)
510 minimum = 1;
511 #endif
512
513 if (minimum == 3)
514 impl = search_line_sse42;
515 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
516 {
517 if (minimum == 3 || (ecx & bit_SSE4_2))
518 impl = search_line_sse42;
519 else if (minimum == 2 || (edx & bit_SSE2))
520 impl = search_line_sse2;
521 else if (minimum == 1 || (edx & bit_SSE))
522 impl = search_line_mmx;
523 }
524 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
525 {
526 if (minimum == 1
527 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528 impl = search_line_mmx;
529 }
530
531 search_line_fast = impl;
532 }
533
534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
535
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537 and VSX unaligned loads (when VSX is available). This is otherwise
538 the same as the AltiVec version. */
539
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
543 {
544 typedef __attribute__((altivec(vector))) unsigned char vc;
545
546 const vc repl_nl = {
547 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
549 };
550 const vc repl_cr = {
551 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
553 };
554 const vc repl_bs = {
555 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
557 };
558 const vc repl_qm = {
559 '?', '?', '?', '?', '?', '?', '?', '?',
560 '?', '?', '?', '?', '?', '?', '?', '?',
561 };
562 const vc zero = { 0 };
563
564 vc data, t;
565
566 /* Main loop processing 16 bytes at a time. */
567 do
568 {
569 vc m_nl, m_cr, m_bs, m_qm;
570
571 data = __builtin_vec_vsx_ld (0, s);
572 s += 16;
573
574 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578 t = (m_nl | m_cr) | (m_bs | m_qm);
579
580 /* T now contains 0xff in bytes for which we matched one of the relevant
581 characters. We want to exit the loop if any byte in T is non-zero.
582 Below is the expansion of vec_any_ne(t, zero). */
583 }
584 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
585
586 /* Restore s to to point to the 16 bytes we just processed. */
587 s -= 16;
588
589 {
590 #define N (sizeof(vc) / sizeof(long))
591
592 union {
593 vc v;
594 /* Statically assert that N is 2 or 4. */
595 unsigned long l[(N == 2 || N == 4) ? N : -1];
596 } u;
597 unsigned long l, i = 0;
598
599 u.v = t;
600
601 /* Find the first word of T that is non-zero. */
602 switch (N)
603 {
604 case 4:
605 l = u.l[i++];
606 if (l != 0)
607 break;
608 s += sizeof(unsigned long);
609 l = u.l[i++];
610 if (l != 0)
611 break;
612 s += sizeof(unsigned long);
613 /* FALLTHRU */
614 case 2:
615 l = u.l[i++];
616 if (l != 0)
617 break;
618 s += sizeof(unsigned long);
619 l = u.l[i];
620 }
621
622 /* L now contains 0xff in bytes for which we matched one of the
623 relevant characters. We can find the byte index by finding
624 its bit index and dividing by 8. */
625 #ifdef __BIG_ENDIAN__
626 l = __builtin_clzl(l) >> 3;
627 #else
628 l = __builtin_ctzl(l) >> 3;
629 #endif
630 return s + l;
631
632 #undef N
633 }
634 }
635
636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
637
638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
639 This cannot be used for little endian because vec_lvsl/lvsr are
640 deprecated for little endian and the code won't work properly. */
641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642 so we can't compile this function without -maltivec on the command line
643 (or implied by some other switch). */
644
645 static const uchar *
646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
647 {
648 typedef __attribute__((altivec(vector))) unsigned char vc;
649
650 const vc repl_nl = {
651 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
653 };
654 const vc repl_cr = {
655 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
657 };
658 const vc repl_bs = {
659 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
661 };
662 const vc repl_qm = {
663 '?', '?', '?', '?', '?', '?', '?', '?',
664 '?', '?', '?', '?', '?', '?', '?', '?',
665 };
666 const vc ones = {
667 -1, -1, -1, -1, -1, -1, -1, -1,
668 -1, -1, -1, -1, -1, -1, -1, -1,
669 };
670 const vc zero = { 0 };
671
672 vc data, mask, t;
673
674 /* Altivec loads automatically mask addresses with -16. This lets us
675 issue the first load as early as possible. */
676 data = __builtin_vec_ld(0, (const vc *)s);
677
678 /* Discard bytes before the beginning of the buffer. Do this by
679 beginning with all ones and shifting in zeros according to the
680 mis-alignment. The LVSR instruction pulls the exact shift we
681 want from the address. */
682 mask = __builtin_vec_lvsr(0, s);
683 mask = __builtin_vec_perm(zero, ones, mask);
684 data &= mask;
685
686 /* While altivec loads mask addresses, we still need to align S so
687 that the offset we compute at the end is correct. */
688 s = (const uchar *)((uintptr_t)s & -16);
689
690 /* Main loop processing 16 bytes at a time. */
691 goto start;
692 do
693 {
694 vc m_nl, m_cr, m_bs, m_qm;
695
696 s += 16;
697 data = __builtin_vec_ld(0, (const vc *)s);
698
699 start:
700 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704 t = (m_nl | m_cr) | (m_bs | m_qm);
705
706 /* T now contains 0xff in bytes for which we matched one of the relevant
707 characters. We want to exit the loop if any byte in T is non-zero.
708 Below is the expansion of vec_any_ne(t, zero). */
709 }
710 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
711
712 {
713 #define N (sizeof(vc) / sizeof(long))
714
715 union {
716 vc v;
717 /* Statically assert that N is 2 or 4. */
718 unsigned long l[(N == 2 || N == 4) ? N : -1];
719 } u;
720 unsigned long l, i = 0;
721
722 u.v = t;
723
724 /* Find the first word of T that is non-zero. */
725 switch (N)
726 {
727 case 4:
728 l = u.l[i++];
729 if (l != 0)
730 break;
731 s += sizeof(unsigned long);
732 l = u.l[i++];
733 if (l != 0)
734 break;
735 s += sizeof(unsigned long);
736 /* FALLTHROUGH */
737 case 2:
738 l = u.l[i++];
739 if (l != 0)
740 break;
741 s += sizeof(unsigned long);
742 l = u.l[i];
743 }
744
745 /* L now contains 0xff in bytes for which we matched one of the
746 relevant characters. We can find the byte index by finding
747 its bit index and dividing by 8. */
748 l = __builtin_clzl(l) >> 3;
749 return s + l;
750
751 #undef N
752 }
753 }
754
755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756 #include "arm_neon.h"
757
758 /* This doesn't have to be the exact page size, but no system may use
759 a size smaller than this. ARMv8 requires a minimum page size of
760 4k. The impact of being conservative here is a small number of
761 cases will take the slightly slower entry path into the main
762 loop. */
763
764 #define AARCH64_MIN_PAGE_SIZE 4096
765
766 static const uchar *
767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
768 {
769 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
774
775 #ifdef __ARM_BIG_ENDIAN
776 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777 #else
778 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779 #endif
780
781 unsigned int found;
782 const uint8_t *p;
783 uint8x16_t data;
784 uint8x16_t t;
785 uint16x8_t m;
786 uint8x16_t u, v, w;
787
788 /* Align the source pointer. */
789 p = (const uint8_t *)((uintptr_t)s & -16);
790
791 /* Assuming random string start positions, with a 4k page size we'll take
792 the slow path about 0.37% of the time. */
793 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795 < 16, 0))
796 {
797 /* Slow path: the string starts near a possible page boundary. */
798 uint32_t misalign, mask;
799
800 misalign = (uintptr_t)s & 15;
801 mask = (-1u << misalign) & 0xffff;
802 data = vld1q_u8 (p);
803 t = vceqq_u8 (data, repl_nl);
804 u = vceqq_u8 (data, repl_cr);
805 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807 t = vorrq_u8 (v, w);
808 t = vandq_u8 (t, xmask);
809 m = vpaddlq_u8 (t);
810 m = vshlq_u16 (m, shift);
811 found = vaddvq_u16 (m);
812 found &= mask;
813 if (found)
814 return (const uchar*)p + __builtin_ctz (found);
815 }
816 else
817 {
818 data = vld1q_u8 ((const uint8_t *) s);
819 t = vceqq_u8 (data, repl_nl);
820 u = vceqq_u8 (data, repl_cr);
821 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823 t = vorrq_u8 (v, w);
824 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825 goto done;
826 }
827
828 do
829 {
830 p += 16;
831 data = vld1q_u8 (p);
832 t = vceqq_u8 (data, repl_nl);
833 u = vceqq_u8 (data, repl_cr);
834 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836 t = vorrq_u8 (v, w);
837 } while (!vpaddd_u64 ((uint64x2_t)t));
838
839 done:
840 /* Now that we've found the terminating substring, work out precisely where
841 we need to stop. */
842 t = vandq_u8 (t, xmask);
843 m = vpaddlq_u8 (t);
844 m = vshlq_u16 (m, shift);
845 found = vaddvq_u16 (m);
846 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847 + __builtin_ctz (found));
848 }
849
850 #elif defined (__ARM_NEON)
851 #include "arm_neon.h"
852
853 static const uchar *
854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
855 {
856 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
860 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
861
862 unsigned int misalign, found, mask;
863 const uint8_t *p;
864 uint8x16_t data;
865
866 /* Align the source pointer. */
867 misalign = (uintptr_t)s & 15;
868 p = (const uint8_t *)((uintptr_t)s & -16);
869 data = vld1q_u8 (p);
870
871 /* Create a mask for the bytes that are valid within the first
872 16-byte block. The Idea here is that the AND with the mask
873 within the loop is "free", since we need some AND or TEST
874 insn in order to set the flags for the branch anyway. */
875 mask = (-1u << misalign) & 0xffff;
876
877 /* Main loop, processing 16 bytes at a time. */
878 goto start;
879
880 do
881 {
882 uint8x8_t l;
883 uint16x4_t m;
884 uint32x2_t n;
885 uint8x16_t t, u, v, w;
886
887 p += 16;
888 data = vld1q_u8 (p);
889 mask = 0xffff;
890
891 start:
892 t = vceqq_u8 (data, repl_nl);
893 u = vceqq_u8 (data, repl_cr);
894 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896 t = vandq_u8 (vorrq_u8 (v, w), xmask);
897 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898 m = vpaddl_u8 (l);
899 n = vpaddl_u16 (m);
900
901 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903 found &= mask;
904 }
905 while (!found);
906
907 /* FOUND contains 1 in bits for which we matched a relevant
908 character. Conversion to the byte index is trivial. */
909 found = __builtin_ctz (found);
910 return (const uchar *)p + found;
911 }
912
913 #else
914
915 /* We only have one accelerated alternative. Use a direct call so that
916 we encourage inlining. */
917
918 #define search_line_fast search_line_acc_char
919
920 #endif
921
/* Initialize the lexer if needed.  This only does something in
   configurations that define HAVE_init_vectorized_lexer, where it runs
   init_vectorized_lexer.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
931
932 /* Returns with a logical line that contains no escaped newlines or
933 trigraphs. This is a time-critical inner loop. */
934 void
935 _cpp_clean_line (cpp_reader *pfile)
936 {
937 cpp_buffer *buffer;
938 const uchar *s;
939 uchar c, *d, *p;
940
941 buffer = pfile->buffer;
942 buffer->cur_note = buffer->notes_used = 0;
943 buffer->cur = buffer->line_base = buffer->next_line;
944 buffer->need_line = false;
945 s = buffer->next_line;
946
947 if (!buffer->from_stage3)
948 {
949 const uchar *pbackslash = NULL;
950
951 /* Fast path. This is the common case of an un-escaped line with
952 no trigraphs. The primary win here is by not writing any
953 data back to memory until we have to. */
954 while (1)
955 {
956 /* Perform an optimized search for \n, \r, \\, ?. */
957 s = search_line_fast (s, buffer->rlimit);
958
959 c = *s;
960 if (c == '\\')
961 {
962 /* Record the location of the backslash and continue. */
963 pbackslash = s++;
964 }
965 else if (__builtin_expect (c == '?', 0))
966 {
967 if (__builtin_expect (s[1] == '?', false)
968 && _cpp_trigraph_map[s[2]])
969 {
970 /* Have a trigraph. We may or may not have to convert
971 it. Add a line note regardless, for -Wtrigraphs. */
972 add_line_note (buffer, s, s[2]);
973 if (CPP_OPTION (pfile, trigraphs))
974 {
975 /* We do, and that means we have to switch to the
976 slow path. */
977 d = (uchar *) s;
978 *d = _cpp_trigraph_map[s[2]];
979 s += 2;
980 goto slow_path;
981 }
982 }
983 /* Not a trigraph. Continue on fast-path. */
984 s++;
985 }
986 else
987 break;
988 }
989
990 /* This must be \r or \n. We're either done, or we'll be forced
991 to write back to the buffer and continue on the slow path. */
992 d = (uchar *) s;
993
994 if (__builtin_expect (s == buffer->rlimit, false))
995 goto done;
996
997 /* DOS line ending? */
998 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
999 {
1000 s++;
1001 if (s == buffer->rlimit)
1002 goto done;
1003 }
1004
1005 if (__builtin_expect (pbackslash == NULL, true))
1006 goto done;
1007
1008 /* Check for escaped newline. */
1009 p = d;
1010 while (is_nvspace (p[-1]))
1011 p--;
1012 if (p - 1 != pbackslash)
1013 goto done;
1014
1015 /* Have an escaped newline; process it and proceed to
1016 the slow path. */
1017 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018 d = p - 2;
1019 buffer->next_line = p - 1;
1020
1021 slow_path:
1022 while (1)
1023 {
1024 c = *++s;
1025 *++d = c;
1026
1027 if (c == '\n' || c == '\r')
1028 {
1029 /* Handle DOS line endings. */
1030 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031 s++;
1032 if (s == buffer->rlimit)
1033 break;
1034
1035 /* Escaped? */
1036 p = d;
1037 while (p != buffer->next_line && is_nvspace (p[-1]))
1038 p--;
1039 if (p == buffer->next_line || p[-1] != '\\')
1040 break;
1041
1042 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043 d = p - 2;
1044 buffer->next_line = p - 1;
1045 }
1046 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047 {
1048 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1049 add_line_note (buffer, d, s[2]);
1050 if (CPP_OPTION (pfile, trigraphs))
1051 {
1052 *d = _cpp_trigraph_map[s[2]];
1053 s += 2;
1054 }
1055 }
1056 }
1057 }
1058 else
1059 {
1060 while (*s != '\n' && *s != '\r')
1061 s++;
1062 d = (uchar *) s;
1063
1064 /* Handle DOS line endings. */
1065 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066 s++;
1067 }
1068
1069 done:
1070 *d = '\n';
1071 /* A sentinel note that should never be processed. */
1072 add_line_note (buffer, d + 1, '\n');
1073 buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077 about in a comment. */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081 const uchar *p;
1082
1083 /* Within comments we don't warn about trigraphs, unless the
1084 trigraph forms an escaped newline, as that may change
1085 behavior. */
1086 if (note->type != '/')
1087 return false;
1088
1089 /* If -trigraphs, then this was an escaped newline iff the next note
1090 is coincident. */
1091 if (CPP_OPTION (pfile, trigraphs))
1092 return note[1].pos == note->pos;
1093
1094 /* Otherwise, see if this forms an escaped newline. */
1095 p = note->pos + 3;
1096 while (is_nvspace (*p))
1097 p++;
1098
1099 /* There might have been escaped newlines between the trigraph and the
1100 newline we found. Hence the position test. */
1101 return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105 location. */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109 cpp_buffer *buffer = pfile->buffer;
1110
1111 for (;;)
1112 {
1113 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114 unsigned int col;
1115
1116 if (note->pos > buffer->cur)
1117 break;
1118
1119 buffer->cur_note++;
1120 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122 if (note->type == '\\' || note->type == ' ')
1123 {
1124 if (note->type == ' ' && !in_comment)
1125 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126 "backslash and newline separated by space");
1127
1128 if (buffer->next_line > buffer->rlimit)
1129 {
1130 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131 "backslash-newline at end of file");
1132 /* Prevent "no newline at end of file" warning. */
1133 buffer->next_line = buffer->rlimit;
1134 }
1135
1136 buffer->line_base = note->pos;
1137 CPP_INCREMENT_LINE (pfile, 0);
1138 }
1139 else if (_cpp_trigraph_map[note->type])
1140 {
1141 if (CPP_OPTION (pfile, warn_trigraphs)
1142 && (!in_comment || warn_in_comment (pfile, note)))
1143 {
1144 if (CPP_OPTION (pfile, trigraphs))
1145 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146 pfile->line_table->highest_line, col,
1147 "trigraph ??%c converted to %c",
1148 note->type,
1149 (int) _cpp_trigraph_map[note->type]);
1150 else
1151 {
1152 cpp_warning_with_line
1153 (pfile, CPP_W_TRIGRAPHS,
1154 pfile->line_table->highest_line, col,
1155 "trigraph ??%c ignored, use -trigraphs to enable",
1156 note->type);
1157 }
1158 }
1159 }
1160 else if (note->type == 0)
1161 /* Already processed in lex_raw_string. */;
1162 else
1163 abort ();
1164 }
1165 }
1166
1167 /* Skip a C-style block comment. We find the end of the comment by
1168 seeing if an asterisk is before every '/' we encounter. Returns
1169 nonzero if comment terminated by EOF, zero otherwise.
1170
1171 Buffer->cur points to the initial asterisk of the comment. */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175 cpp_buffer *buffer = pfile->buffer;
1176 const uchar *cur = buffer->cur;
1177 uchar c;
1178
1179 cur++;
1180 if (*cur == '/')
1181 cur++;
1182
1183 for (;;)
1184 {
1185 /* People like decorating comments with '*', so check for '/'
1186 instead for efficiency. */
1187 c = *cur++;
1188
1189 if (c == '/')
1190 {
1191 if (cur[-2] == '*')
1192 break;
1193
1194 /* Warn about potential nested comments, but not if the '/'
1195 comes immediately before the true comment delimiter.
1196 Don't bother to get it right across escaped newlines. */
1197 if (CPP_OPTION (pfile, warn_comments)
1198 && cur[0] == '*' && cur[1] != '/')
1199 {
1200 buffer->cur = cur;
1201 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202 pfile->line_table->highest_line,
1203 CPP_BUF_COL (buffer),
1204 "\"/*\" within comment");
1205 }
1206 }
1207 else if (c == '\n')
1208 {
1209 unsigned int cols;
1210 buffer->cur = cur - 1;
1211 _cpp_process_line_notes (pfile, true);
1212 if (buffer->next_line >= buffer->rlimit)
1213 return true;
1214 _cpp_clean_line (pfile);
1215
1216 cols = buffer->next_line - buffer->line_base;
1217 CPP_INCREMENT_LINE (pfile, cols);
1218
1219 cur = buffer->cur;
1220 }
1221 }
1222
1223 buffer->cur = cur;
1224 _cpp_process_line_notes (pfile, true);
1225 return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229 terminating newline. Handles escaped newlines. Returns nonzero
1230 if a multiline comment. */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234 cpp_buffer *buffer = pfile->buffer;
1235 location_t orig_line = pfile->line_table->highest_line;
1236
1237 while (*buffer->cur != '\n')
1238 buffer->cur++;
1239
1240 _cpp_process_line_notes (pfile, true);
1241 return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character. */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248 cpp_buffer *buffer = pfile->buffer;
1249 bool saw_NUL = false;
1250
1251 do
1252 {
1253 /* Horizontal space always OK. */
1254 if (c == ' ' || c == '\t')
1255 ;
1256 /* Just \f \v or \0 left. */
1257 else if (c == '\0')
1258 saw_NUL = true;
1259 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261 CPP_BUF_COL (buffer),
1262 "%s in preprocessing directive",
1263 c == '\f' ? "form feed" : "vertical tab");
1264
1265 c = *buffer->cur++;
1266 }
1267 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1268 while (is_nvspace (c));
1269
1270 if (saw_NUL)
1271 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273 buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277 '.', '+' or '-'). */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281 unsigned int i;
1282
1283 for (i = 0; i < string->len; i++)
1284 if (!is_idchar (string->text[i]))
1285 return 0;
1286
1287 return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291 sequences not in NFC/NFKC. */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294 const cpp_token *token,
1295 const struct normalize_state *s)
1296 {
1297 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298 && !pfile->state.skipping)
1299 {
1300 /* Make sure that the token is printed using UCNs, even
1301 if we'd otherwise happily print UTF-8. */
1302 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303 size_t sz;
1304
1305 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308 "`%.*s' is not in NFKC", (int) sz, buf);
1309 else
1310 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311 "`%.*s' is not in NFC", (int) sz, buf);
1312 free (buf);
1313 }
1314 }
1315
1316 static const cppchar_t utf8_signifier = 0xC0;
1317
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319 an identifier. FIRST is TRUE if this starts an identifier. */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322 struct normalize_state *state)
1323 {
1324 cpp_buffer *buffer = pfile->buffer;
1325
1326 if (*buffer->cur == '$')
1327 {
1328 if (!CPP_OPTION (pfile, dollars_in_ident))
1329 return false;
1330
1331 buffer->cur++;
1332 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333 {
1334 CPP_OPTION (pfile, warn_dollars) = 0;
1335 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336 }
1337
1338 return true;
1339 }
1340
1341 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1342 if (CPP_OPTION (pfile, extended_identifiers))
1343 {
1344 cppchar_t s;
1345 if (*buffer->cur >= utf8_signifier)
1346 {
1347 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348 state, &s))
1349 return true;
1350 }
1351 else if (*buffer->cur == '\\'
1352 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353 {
1354 buffer->cur += 2;
1355 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356 state, &s, NULL, NULL))
1357 return true;
1358 buffer->cur -= 2;
1359 }
1360 }
1361
1362 return false;
1363 }
1364
1365 /* Helper function to issue error about improper __VA_OPT__ use. */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370 {
1371 /* __VA_OPT__ should not be accepted at all, but allow it in
1372 system headers. */
1373 if (!_cpp_in_system_header (pfile))
1374 cpp_error (pfile, CPP_DL_PEDWARN,
1375 "__VA_OPT__ is not available until C++20");
1376 }
1377 else if (!pfile->state.va_args_ok)
1378 {
1379 /* __VA_OPT__ should only appear in the replacement list of a
1380 variadic macro. */
1381 cpp_error (pfile, CPP_DL_PEDWARN,
1382 "__VA_OPT__ can only appear in the expansion"
1383 " of a C++20 variadic macro");
1384 }
1385 }
1386
1387 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391 cpp_hashnode *result;
1392 const uchar *cur;
1393 unsigned int len;
1394 unsigned int hash = HT_HASHSTEP (0, *base);
1395
1396 cur = base + 1;
1397 while (ISIDNUM (*cur))
1398 {
1399 hash = HT_HASHSTEP (hash, *cur);
1400 cur++;
1401 }
1402 len = cur - base;
1403 hash = HT_HASHFINISH (hash, len);
1404 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405 base, len, hash, HT_ALLOC));
1406
1407 /* Rarely, identifiers require diagnostics when lexed. */
1408 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409 && !pfile->state.skipping, 0))
1410 {
1411 /* It is allowed to poison the same identifier twice. */
1412 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414 NODE_NAME (result));
1415
1416 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417 replacement list of a variadic macro. */
1418 if (result == pfile->spec_nodes.n__VA_ARGS__
1419 && !pfile->state.va_args_ok)
1420 {
1421 if (CPP_OPTION (pfile, cplusplus))
1422 cpp_error (pfile, CPP_DL_PEDWARN,
1423 "__VA_ARGS__ can only appear in the expansion"
1424 " of a C++11 variadic macro");
1425 else
1426 cpp_error (pfile, CPP_DL_PEDWARN,
1427 "__VA_ARGS__ can only appear in the expansion"
1428 " of a C99 variadic macro");
1429 }
1430
1431 if (result == pfile->spec_nodes.n__VA_OPT__)
1432 maybe_va_opt_error (pfile);
1433
1434 /* For -Wc++-compat, warn about use of C++ named operators. */
1435 if (result->flags & NODE_WARN_OPERATOR)
1436 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437 "identifier \"%s\" is a special operator name in C++",
1438 NODE_NAME (result));
1439 }
1440
1441 return result;
1442 }
1443
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445 the current cpp_reader object. If none is found, NULL is returned. */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449 cpp_hashnode *result;
1450 result = lex_identifier_intern (pfile, (uchar *) name);
1451 return result;
1452 }
1453
1454 /* Lex an identifier starting at BUFFER->CUR - 1. */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457 struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459 cpp_hashnode *result;
1460 const uchar *cur;
1461 unsigned int len;
1462 unsigned int hash = HT_HASHSTEP (0, *base);
1463
1464 cur = pfile->buffer->cur;
1465 if (! starts_ucn)
1466 {
1467 while (ISIDNUM (*cur))
1468 {
1469 hash = HT_HASHSTEP (hash, *cur);
1470 cur++;
1471 }
1472 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473 }
1474 pfile->buffer->cur = cur;
1475 if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476 {
1477 /* Slower version for identifiers containing UCNs
1478 or extended chars (including $). */
1479 do {
1480 while (ISIDNUM (*pfile->buffer->cur))
1481 {
1482 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483 pfile->buffer->cur++;
1484 }
1485 } while (forms_identifier_p (pfile, false, nst));
1486 result = _cpp_interpret_identifier (pfile, base,
1487 pfile->buffer->cur - base);
1488 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489 }
1490 else
1491 {
1492 len = cur - base;
1493 hash = HT_HASHFINISH (hash, len);
1494
1495 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496 base, len, hash, HT_ALLOC));
1497 *spelling = result;
1498 }
1499
1500 /* Rarely, identifiers require diagnostics when lexed. */
1501 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502 && !pfile->state.skipping, 0))
1503 {
1504 /* It is allowed to poison the same identifier twice. */
1505 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507 NODE_NAME (result));
1508
1509 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510 replacement list of a variadic macro. */
1511 if (result == pfile->spec_nodes.n__VA_ARGS__
1512 && !pfile->state.va_args_ok)
1513 {
1514 if (CPP_OPTION (pfile, cplusplus))
1515 cpp_error (pfile, CPP_DL_PEDWARN,
1516 "__VA_ARGS__ can only appear in the expansion"
1517 " of a C++11 variadic macro");
1518 else
1519 cpp_error (pfile, CPP_DL_PEDWARN,
1520 "__VA_ARGS__ can only appear in the expansion"
1521 " of a C99 variadic macro");
1522 }
1523
1524 /* __VA_OPT__ should only appear in the replacement list of a
1525 variadic macro. */
1526 if (result == pfile->spec_nodes.n__VA_OPT__)
1527 maybe_va_opt_error (pfile);
1528
1529 /* For -Wc++-compat, warn about use of C++ named operators. */
1530 if (result->flags & NODE_WARN_OPERATOR)
1531 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532 "identifier \"%s\" is a special operator name in C++",
1533 NODE_NAME (result));
1534 }
1535
1536 return result;
1537 }
1538
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542 struct normalize_state *nst)
1543 {
1544 const uchar *cur;
1545 const uchar *base;
1546 uchar *dest;
1547
1548 base = pfile->buffer->cur - 1;
1549 do
1550 {
1551 cur = pfile->buffer->cur;
1552
1553 /* N.B. ISIDNUM does not include $. */
1554 while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1555 || VALID_SIGN (*cur, cur[-1]))
1556 {
1557 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1558 cur++;
1559 }
1560 /* A number can't end with a digit separator. */
1561 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1562 --cur;
1563
1564 pfile->buffer->cur = cur;
1565 }
1566 while (forms_identifier_p (pfile, false, nst));
1567
1568 number->len = cur - base;
1569 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1570 memcpy (dest, base, number->len);
1571 dest[number->len] = '\0';
1572 number->text = dest;
1573 }
1574
1575 /* Create a token of type TYPE with a literal spelling. */
1576 static void
1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1578 unsigned int len, enum cpp_ttype type)
1579 {
1580 token->type = type;
1581 token->val.str.len = len;
1582 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1583 }
1584
1585 const uchar *
1586 cpp_alloc_token_string (cpp_reader *pfile,
1587 const unsigned char *ptr, unsigned len)
1588 {
1589 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1590
1591 dest[len] = 0;
1592 memcpy (dest, ptr, len);
1593 return dest;
1594 }
1595
1596 /* A pair of raw buffer pointers. The currently open one is [1], the
1597 first one is [0]. Used for string literal lexing. */
1598 struct lit_accum {
1599 _cpp_buff *first;
1600 _cpp_buff *last;
1601 const uchar *rpos;
1602 size_t accum;
1603
1604 lit_accum ()
1605 : first (NULL), last (NULL), rpos (0), accum (0)
1606 {
1607 }
1608
1609 void append (cpp_reader *, const uchar *, size_t);
1610
1611 void read_begin (cpp_reader *);
1612 bool reading_p () const
1613 {
1614 return rpos != NULL;
1615 }
1616 char read_char ()
1617 {
1618 char c = *rpos++;
1619 if (rpos == BUFF_FRONT (last))
1620 rpos = NULL;
1621 return c;
1622 }
1623 };
1624
1625 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1626 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
1627
1628 void
1629 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1630 {
1631 if (!last)
1632 /* Starting. */
1633 first = last = _cpp_get_buff (pfile, len);
1634 else if (len > BUFF_ROOM (last))
1635 {
1636 /* There is insufficient room in the buffer. Copy what we can,
1637 and then either extend or create a new one. */
1638 size_t room = BUFF_ROOM (last);
1639 memcpy (BUFF_FRONT (last), base, room);
1640 BUFF_FRONT (last) += room;
1641 base += room;
1642 len -= room;
1643 accum += room;
1644
1645 gcc_checking_assert (!rpos);
1646
1647 last = _cpp_append_extend_buff (pfile, last, len);
1648 }
1649
1650 memcpy (BUFF_FRONT (last), base, len);
1651 BUFF_FRONT (last) += len;
1652 accum += len;
1653 }
1654
1655 void
1656 lit_accum::read_begin (cpp_reader *pfile)
1657 {
1658 /* We never accumulate more than 4 chars to read. */
1659 if (BUFF_ROOM (last) < 4)
1660
1661 last = _cpp_append_extend_buff (pfile, last, 4);
1662 rpos = BUFF_FRONT (last);
1663 }
1664
1665 /* Returns true if a macro has been defined.
1666 This might not work if compile with -save-temps,
1667 or preprocess separately from compilation. */
1668
1669 static bool
1670 is_macro(cpp_reader *pfile, const uchar *base)
1671 {
1672 const uchar *cur = base;
1673 if (! ISIDST (*cur))
1674 return false;
1675 unsigned int hash = HT_HASHSTEP (0, *cur);
1676 ++cur;
1677 while (ISIDNUM (*cur))
1678 {
1679 hash = HT_HASHSTEP (hash, *cur);
1680 ++cur;
1681 }
1682 hash = HT_HASHFINISH (hash, cur - base);
1683
1684 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1685 base, cur - base, hash, HT_NO_INSERT));
1686
1687 return result && cpp_macro_p (result);
1688 }
1689
1690 /* Returns true if a literal suffix does not have the expected form
1691 and is defined as a macro. */
1692
1693 static bool
1694 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1695 {
1696 /* User-defined literals outside of namespace std must start with a single
1697 underscore, so assume anything of that form really is a UDL suffix.
1698 We don't need to worry about UDLs defined inside namespace std because
1699 their names are reserved, so cannot be used as macro names in valid
1700 programs. */
1701 if (base[0] == '_' && base[1] != '_')
1702 return false;
1703 return is_macro (pfile, base);
1704 }
1705
1706 /* Lexes a raw string. The stored string contains the spelling,
1707 including double quotes, delimiter string, '(' and ')', any leading
1708 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
1709 the type of the literal, or CPP_OTHER if it was not properly
1710 terminated.
1711
1712 BASE is the start of the token. Updates pfile->buffer->cur to just
1713 after the lexed string.
1714
1715 The spelling is NUL-terminated, but it is not guaranteed that this
1716 is the first NUL since embedded NULs are preserved. */
1717
1718 static void
1719 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1720 {
1721 const uchar *pos = base;
1722
1723 /* 'tis a pity this information isn't passed down from the lexer's
1724 initial categorization of the token. */
1725 enum cpp_ttype type = CPP_STRING;
1726
1727 if (*pos == 'L')
1728 {
1729 type = CPP_WSTRING;
1730 pos++;
1731 }
1732 else if (*pos == 'U')
1733 {
1734 type = CPP_STRING32;
1735 pos++;
1736 }
1737 else if (*pos == 'u')
1738 {
1739 if (pos[1] == '8')
1740 {
1741 type = CPP_UTF8STRING;
1742 pos++;
1743 }
1744 else
1745 type = CPP_STRING16;
1746 pos++;
1747 }
1748
1749 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1750 pos += 2;
1751
1752 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1753
1754 /* Skip notes before the ". */
1755 while (note->pos < pos)
1756 ++note;
1757
1758 lit_accum accum;
1759
1760 uchar prefix[17];
1761 unsigned prefix_len = 0;
1762 enum Phase
1763 {
1764 PHASE_PREFIX = -2,
1765 PHASE_NONE = -1,
1766 PHASE_SUFFIX = 0
1767 } phase = PHASE_PREFIX;
1768
1769 for (;;)
1770 {
1771 gcc_checking_assert (note->pos >= pos);
1772
1773 /* Undo any escaped newlines and trigraphs. */
1774 if (!accum.reading_p () && note->pos == pos)
1775 switch (note->type)
1776 {
1777 case '\\':
1778 case ' ':
1779 /* Restore backslash followed by newline. */
1780 accum.append (pfile, base, pos - base);
1781 base = pos;
1782 accum.read_begin (pfile);
1783 accum.append (pfile, UC"\\", 1);
1784
1785 after_backslash:
1786 if (note->type == ' ')
1787 /* GNU backslash whitespace newline extension. FIXME
1788 could be any sequence of non-vertical space. When we
1789 can properly restore any such sequence, we should
1790 mark this note as handled so _cpp_process_line_notes
1791 doesn't warn. */
1792 accum.append (pfile, UC" ", 1);
1793
1794 accum.append (pfile, UC"\n", 1);
1795 note++;
1796 break;
1797
1798 case '\n':
1799 /* This can happen for ??/<NEWLINE> when trigraphs are not
1800 being interpretted. */
1801 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1802 note->type = 0;
1803 note++;
1804 break;
1805
1806 default:
1807 gcc_checking_assert (_cpp_trigraph_map[note->type]);
1808
1809 /* Don't warn about this trigraph in
1810 _cpp_process_line_notes, since trigraphs show up as
1811 trigraphs in raw strings. */
1812 uchar type = note->type;
1813 note->type = 0;
1814
1815 if (CPP_OPTION (pfile, trigraphs))
1816 {
1817 accum.append (pfile, base, pos - base);
1818 base = pos;
1819 accum.read_begin (pfile);
1820 accum.append (pfile, UC"??", 2);
1821 accum.append (pfile, &type, 1);
1822
1823 /* ??/ followed by newline gets two line notes, one for
1824 the trigraph and one for the backslash/newline. */
1825 if (type == '/' && note[1].pos == pos)
1826 {
1827 note++;
1828 gcc_assert (note->type == '\\' || note->type == ' ');
1829 goto after_backslash;
1830 }
1831 /* Skip the replacement character. */
1832 base = ++pos;
1833 }
1834
1835 note++;
1836 break;
1837 }
1838
1839 /* Now get a char to process. Either from an expanded note, or
1840 from the line buffer. */
1841 bool read_note = accum.reading_p ();
1842 char c = read_note ? accum.read_char () : *pos++;
1843
1844 if (phase == PHASE_PREFIX)
1845 {
1846 if (c == '(')
1847 {
1848 /* Done. */
1849 phase = PHASE_NONE;
1850 prefix[prefix_len++] = '"';
1851 }
1852 else if (prefix_len < 16
1853 /* Prefix chars are any of the basic character set,
1854 [lex.charset] except for '
1855 ()\\\t\v\f\n'. Optimized for a contiguous
1856 alphabet. */
1857 /* Unlike a switch, this collapses down to one or
1858 two shift and bitmask operations on an ASCII
1859 system, with an outlier or two. */
1860 && (('Z' - 'A' == 25
1861 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1862 : ISIDST (c))
1863 || (c >= '0' && c <= '9')
1864 || c == '_' || c == '{' || c == '}'
1865 || c == '[' || c == ']' || c == '#'
1866 || c == '<' || c == '>' || c == '%'
1867 || c == ':' || c == ';' || c == '.' || c == '?'
1868 || c == '*' || c == '+' || c == '-' || c == '/'
1869 || c == '^' || c == '&' || c == '|' || c == '~'
1870 || c == '!' || c == '=' || c == ','
1871 || c == '"' || c == '\''))
1872 prefix[prefix_len++] = c;
1873 else
1874 {
1875 /* Something is wrong. */
1876 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1877 if (prefix_len == 16)
1878 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1879 col, "raw string delimiter longer "
1880 "than 16 characters");
1881 else if (c == '\n')
1882 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1883 col, "invalid new-line in raw "
1884 "string delimiter");
1885 else
1886 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1887 col, "invalid character '%c' in "
1888 "raw string delimiter", c);
1889 type = CPP_OTHER;
1890 phase = PHASE_NONE;
1891 /* Continue until we get a close quote, that's probably
1892 the best failure mode. */
1893 prefix_len = 0;
1894 }
1895 if (c != '\n')
1896 continue;
1897 }
1898
1899 if (phase != PHASE_NONE)
1900 {
1901 if (prefix[phase] != c)
1902 phase = PHASE_NONE;
1903 else if (unsigned (phase + 1) == prefix_len)
1904 break;
1905 else
1906 {
1907 phase = Phase (phase + 1);
1908 continue;
1909 }
1910 }
1911
1912 if (!prefix_len && c == '"')
1913 /* Failure mode lexing. */
1914 goto out;
1915 else if (prefix_len && c == ')')
1916 phase = PHASE_SUFFIX;
1917 else if (!read_note && c == '\n')
1918 {
1919 pos--;
1920 pfile->buffer->cur = pos;
1921 if (pfile->state.in_directive
1922 || (pfile->state.parsing_args
1923 && pfile->buffer->next_line >= pfile->buffer->rlimit))
1924 {
1925 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1926 "unterminated raw string");
1927 type = CPP_OTHER;
1928 goto out;
1929 }
1930
1931 accum.append (pfile, base, pos - base + 1);
1932 _cpp_process_line_notes (pfile, false);
1933
1934 if (pfile->buffer->next_line < pfile->buffer->rlimit)
1935 CPP_INCREMENT_LINE (pfile, 0);
1936 pfile->buffer->need_line = true;
1937
1938 if (!_cpp_get_fresh_line (pfile))
1939 {
1940 /* We ran out of file and failed to get a line. */
1941 location_t src_loc = token->src_loc;
1942 token->type = CPP_EOF;
1943 /* Tell the compiler the line number of the EOF token. */
1944 token->src_loc = pfile->line_table->highest_line;
1945 token->flags = BOL;
1946 if (accum.first)
1947 _cpp_release_buff (pfile, accum.first);
1948 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1949 "unterminated raw string");
1950 /* Now pop the buffer that _cpp_get_fresh_line did not. */
1951 _cpp_pop_buffer (pfile);
1952 return;
1953 }
1954
1955 pos = base = pfile->buffer->cur;
1956 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1957 }
1958 }
1959
1960 if (CPP_OPTION (pfile, user_literals))
1961 {
1962 /* If a string format macro, say from inttypes.h, is placed touching
1963 a string literal it could be parsed as a C++11 user-defined string
1964 literal thus breaking the program. */
1965 if (is_macro_not_literal_suffix (pfile, pos))
1966 {
1967 /* Raise a warning, but do not consume subsequent tokens. */
1968 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1969 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1970 token->src_loc, 0,
1971 "invalid suffix on literal; C++11 requires "
1972 "a space between literal and string macro");
1973 }
1974 /* Grab user defined literal suffix. */
1975 else if (ISIDST (*pos))
1976 {
1977 type = cpp_userdef_string_add_type (type);
1978 ++pos;
1979
1980 while (ISIDNUM (*pos))
1981 ++pos;
1982 }
1983 }
1984
1985 out:
1986 pfile->buffer->cur = pos;
1987 if (!accum.accum)
1988 create_literal (pfile, token, base, pos - base, type);
1989 else
1990 {
1991 size_t extra_len = pos - base;
1992 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
1993
1994 token->type = type;
1995 token->val.str.len = accum.accum + extra_len;
1996 token->val.str.text = dest;
1997 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
1998 {
1999 size_t len = BUFF_FRONT (buf) - buf->base;
2000 memcpy (dest, buf->base, len);
2001 dest += len;
2002 }
2003 _cpp_release_buff (pfile, accum.first);
2004 memcpy (dest, base, extra_len);
2005 dest[extra_len] = '\0';
2006 }
2007 }
2008
2009 /* Lexes a string, character constant, or angle-bracketed header file
2010 name. The stored string contains the spelling, including opening
2011 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2012 'R' modifier. It returns the type of the literal, or CPP_OTHER
2013 if it was not properly terminated, or CPP_LESS for an unterminated
2014 header name which must be relexed as normal tokens.
2015
2016 The spelling is NUL-terminated, but it is not guaranteed that this
2017 is the first NUL since embedded NULs are preserved. */
2018 static void
2019 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2020 {
2021 bool saw_NUL = false;
2022 const uchar *cur;
2023 cppchar_t terminator;
2024 enum cpp_ttype type;
2025
2026 cur = base;
2027 terminator = *cur++;
2028 if (terminator == 'L' || terminator == 'U')
2029 terminator = *cur++;
2030 else if (terminator == 'u')
2031 {
2032 terminator = *cur++;
2033 if (terminator == '8')
2034 terminator = *cur++;
2035 }
2036 if (terminator == 'R')
2037 {
2038 lex_raw_string (pfile, token, base);
2039 return;
2040 }
2041 if (terminator == '"')
2042 type = (*base == 'L' ? CPP_WSTRING :
2043 *base == 'U' ? CPP_STRING32 :
2044 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2045 : CPP_STRING);
2046 else if (terminator == '\'')
2047 type = (*base == 'L' ? CPP_WCHAR :
2048 *base == 'U' ? CPP_CHAR32 :
2049 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2050 : CPP_CHAR);
2051 else
2052 terminator = '>', type = CPP_HEADER_NAME;
2053
2054 for (;;)
2055 {
2056 cppchar_t c = *cur++;
2057
2058 /* In #include-style directives, terminators are not escapable. */
2059 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2060 cur++;
2061 else if (c == terminator)
2062 break;
2063 else if (c == '\n')
2064 {
2065 cur--;
2066 /* Unmatched quotes always yield undefined behavior, but
2067 greedy lexing means that what appears to be an unterminated
2068 header name may actually be a legitimate sequence of tokens. */
2069 if (terminator == '>')
2070 {
2071 token->type = CPP_LESS;
2072 return;
2073 }
2074 type = CPP_OTHER;
2075 break;
2076 }
2077 else if (c == '\0')
2078 saw_NUL = true;
2079 }
2080
2081 if (saw_NUL && !pfile->state.skipping)
2082 cpp_error (pfile, CPP_DL_WARNING,
2083 "null character(s) preserved in literal");
2084
2085 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2086 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2087 (int) terminator);
2088
2089 if (CPP_OPTION (pfile, user_literals))
2090 {
2091 /* If a string format macro, say from inttypes.h, is placed touching
2092 a string literal it could be parsed as a C++11 user-defined string
2093 literal thus breaking the program. */
2094 if (is_macro_not_literal_suffix (pfile, cur))
2095 {
2096 /* Raise a warning, but do not consume subsequent tokens. */
2097 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2098 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2099 token->src_loc, 0,
2100 "invalid suffix on literal; C++11 requires "
2101 "a space between literal and string macro");
2102 }
2103 /* Grab user defined literal suffix. */
2104 else if (ISIDST (*cur))
2105 {
2106 type = cpp_userdef_char_add_type (type);
2107 type = cpp_userdef_string_add_type (type);
2108 ++cur;
2109
2110 while (ISIDNUM (*cur))
2111 ++cur;
2112 }
2113 }
2114 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2115 && is_macro (pfile, cur)
2116 && !pfile->state.skipping)
2117 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2118 token->src_loc, 0, "C++11 requires a space "
2119 "between string literal and macro");
2120
2121 pfile->buffer->cur = cur;
2122 create_literal (pfile, token, base, cur - base, type);
2123 }
2124
2125 /* Return the comment table. The client may not make any assumption
2126 about the ordering of the table. */
2127 cpp_comment_table *
2128 cpp_get_comments (cpp_reader *pfile)
2129 {
2130 return &pfile->comments;
2131 }
2132
2133 /* Append a comment to the end of the comment table. */
2134 static void
2135 store_comment (cpp_reader *pfile, cpp_token *token)
2136 {
2137 int len;
2138
2139 if (pfile->comments.allocated == 0)
2140 {
2141 pfile->comments.allocated = 256;
2142 pfile->comments.entries = (cpp_comment *) xmalloc
2143 (pfile->comments.allocated * sizeof (cpp_comment));
2144 }
2145
2146 if (pfile->comments.count == pfile->comments.allocated)
2147 {
2148 pfile->comments.allocated *= 2;
2149 pfile->comments.entries = (cpp_comment *) xrealloc
2150 (pfile->comments.entries,
2151 pfile->comments.allocated * sizeof (cpp_comment));
2152 }
2153
2154 len = token->val.str.len;
2155
2156 /* Copy comment. Note, token may not be NULL terminated. */
2157 pfile->comments.entries[pfile->comments.count].comment =
2158 (char *) xmalloc (sizeof (char) * (len + 1));
2159 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2160 token->val.str.text, len);
2161 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2162
2163 /* Set source location. */
2164 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2165
2166 /* Increment the count of entries in the comment table. */
2167 pfile->comments.count++;
2168 }
2169
2170 /* The stored comment includes the comment start and any terminator. */
2171 static void
2172 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2173 cppchar_t type)
2174 {
2175 unsigned char *buffer;
2176 unsigned int len, clen, i;
2177
2178 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2179
2180 /* C++ comments probably (not definitely) have moved past a new
2181 line, which we don't want to save in the comment. */
2182 if (is_vspace (pfile->buffer->cur[-1]))
2183 len--;
2184
2185 /* If we are currently in a directive or in argument parsing, then
2186 we need to store all C++ comments as C comments internally, and
2187 so we need to allocate a little extra space in that case.
2188
2189 Note that the only time we encounter a directive here is
2190 when we are saving comments in a "#define". */
2191 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2192 && type == '/') ? len + 2 : len;
2193
2194 buffer = _cpp_unaligned_alloc (pfile, clen);
2195
2196 token->type = CPP_COMMENT;
2197 token->val.str.len = clen;
2198 token->val.str.text = buffer;
2199
2200 buffer[0] = '/';
2201 memcpy (buffer + 1, from, len - 1);
2202
2203 /* Finish conversion to a C comment, if necessary. */
2204 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2205 {
2206 buffer[1] = '*';
2207 buffer[clen - 2] = '*';
2208 buffer[clen - 1] = '/';
2209 /* As there can be in a C++ comments illegal sequences for C comments
2210 we need to filter them out. */
2211 for (i = 2; i < (clen - 2); i++)
2212 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2213 buffer[i] = '|';
2214 }
2215
2216 /* Finally store this comment for use by clients of libcpp. */
2217 store_comment (pfile, token);
2218 }
2219
2220 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2221 comment. */
     /* COMMENT_START points at the second character of the comment opener:
        '*' selects the block-comment rules, anything else is treated as a
        line comment.  All length checks are made against
        pfile->buffer->cur.  What is accepted depends on the
        cpp_warn_implicit_fallthrough option:
          0, or anything above 4: no comment is recognized;
          1: every comment is recognized;
          2: a loose, case-insensitive pattern may appear anywhere;
          3 and 4: the whole comment must match one of a fixed set of
             forms (level 4 accepts fewer forms than level 3).  */
2222
2223 static bool
2224 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2225 {
2226 const unsigned char *from = comment_start + 1;
2227
2228 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2229 {
2230 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2231 don't recognize any comments. The latter only checks attributes,
2232 the former doesn't warn. */
2233 case 0:
2234 default:
2235 return false;
2236 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2237 content it has. */
2238 case 1:
2239 return true;
2240 case 2:
2241 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2242 .*falls?[ \t-]*thr(u|ough).* regex. */
     /* Each `continue' below resumes the scan one character past wherever
        FROM has advanced to; the loop ends once fewer characters than
        "fallthru" has remain before pfile->buffer->cur.  */
2243 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2244 from++)
2245 {
2246 /* Is there anything like strpbrk with upper boundary, or
2247 memchr looking for 2 characters rather than just one? */
2248 if (from[0] != 'f' && from[0] != 'F')
2249 continue;
2250 if (from[1] != 'a' && from[1] != 'A')
2251 continue;
2252 if (from[2] != 'l' && from[2] != 'L')
2253 continue;
2254 if (from[3] != 'l' && from[3] != 'L')
2255 continue;
2256 from += sizeof "fall" - 1;
2257 if (from[0] == 's' || from[0] == 'S')
2258 from++;
2259 while (*from == ' ' || *from == '\t' || *from == '-')
2260 from++;
2261 if (from[0] != 't' && from[0] != 'T')
2262 continue;
2263 if (from[1] != 'h' && from[1] != 'H')
2264 continue;
2265 if (from[2] != 'r' && from[2] != 'R')
2266 continue;
2267 if (from[3] == 'u' || from[3] == 'U')
2268 return true;
2269 if (from[3] != 'o' && from[3] != 'O')
2270 continue;
2271 if (from[4] != 'u' && from[4] != 'U')
2272 continue;
2273 if (from[5] != 'g' && from[5] != 'G')
2274 continue;
2275 if (from[6] != 'h' && from[6] != 'H')
2276 continue;
2277 return true;
2278 }
2279 return false;
     /* Levels 3 and 4 share the whole-comment matching that follows the
        switch.  */
2280 case 3:
2281 case 4:
2282 break;
2283 }
2284
2285 /* Whole comment contents:
2286 -fallthrough
2287 @fallthrough@
2288 */
2289 if (*from == '-' || *from == '@')
2290 {
2291 size_t len = sizeof "fallthrough" - 1;
2292 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2293 return false;
2294 if (memcmp (from + 1, "fallthrough", len))
2295 return false;
2296 if (*from == '@')
2297 {
     /* The '@' form needs a matching trailing '@', which is then counted
        as part of the matched text.  */
2298 if (from[len + 1] != '@')
2299 return false;
2300 len++;
2301 }
2302 from += 1 + len;
2303 }
2304 /* Whole comment contents (regex):
2305 lint -fallthrough[ \t]*
2306 */
2307 else if (*from == 'l')
2308 {
2309 size_t len = sizeof "int -fallthrough" - 1;
2310 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2311 return false;
2312 if (memcmp (from + 1, "int -fallthrough", len))
2313 return false;
2314 from += 1 + len;
2315 while (*from == ' ' || *from == '\t')
2316 from++;
2317 }
2318 /* Whole comment contents (regex):
2319 [ \t]*FALLTHR(U|OUGH)[ \t]*
2320 */
     /* At level 4 this is the last form tried; the final `else' below is
        therefore reached only at level 3.  */
2321 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2322 {
2323 while (*from == ' ' || *from == '\t')
2324 from++;
2325 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2326 return false;
2327 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2328 return false;
2329 from += sizeof "FALLTHR" - 1;
2330 if (*from == 'U')
2331 from++;
2332 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2333 return false;
2334 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2335 return false;
2336 else
2337 from += sizeof "OUGH" - 1;
2338 while (*from == ' ' || *from == '\t')
2339 from++;
2340 }
2341 /* Whole comment contents (regex):
2342 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2343 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2344 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2345 */
2346 else
2347 {
2348 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2349 from++;
     /* F tracks the first letter of the word currently being matched.
        ALL_UPPER is set once a word has been seen entirely in upper case;
        from then on the remaining words must be upper case too.  */
2350 unsigned char f = *from;
2351 bool all_upper = false;
2352 if (f == 'E' || f == 'e')
2353 {
2354 if ((size_t) (pfile->buffer->cur - from)
2355 < sizeof "else fallthru" - 1)
2356 return false;
2357 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2358 all_upper = true;
2359 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2360 return false;
2361 from += sizeof "else" - 1;
2362 if (*from == ',')
2363 from++;
2364 if (*from != ' ')
2365 return false;
2366 from++;
     /* Reject mixed capitalisation: "ELSE fall..." and "else Fall...".  */
2367 if (all_upper && *from == 'f')
2368 return false;
2369 if (f == 'e' && *from == 'F')
2370 return false;
2371 f = *from;
2372 }
2373 else if (f == 'I' || f == 'i')
2374 {
2375 if ((size_t) (pfile->buffer->cur - from)
2376 < sizeof "intentional fallthru" - 1)
2377 return false;
2378 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2379 sizeof "NTENTIONAL" - 1) == 0)
2380 all_upper = true;
2381 else if (memcmp (from + 1, "ntentional",
2382 sizeof "ntentional" - 1))
2383 return false;
2384 from += sizeof "intentional" - 1;
2385 if (*from == ' ')
2386 {
2387 from++;
2388 if (all_upper && *from == 'f')
2389 return false;
2390 }
2391 else if (all_upper)
2392 {
     /* "LY F" is compared, but only "LY " is consumed, so FROM is left on
        the 'F'.  */
2393 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2394 return false;
2395 from += sizeof "LY " - 1;
2396 }
2397 else
2398 {
2399 if (memcmp (from, "ly ", sizeof "ly " - 1))
2400 return false;
2401 from += sizeof "ly " - 1;
2402 }
2403 if (f == 'i' && *from == 'F')
2404 return false;
2405 f = *from;
2406 }
2407 if (f != 'F' && f != 'f')
2408 return false;
2409 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2410 return false;
2411 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2412 all_upper = true;
2413 else if (all_upper)
2414 return false;
2415 else if (memcmp (from + 1, "all", sizeof "all" - 1))
2416 return false;
2417 from += sizeof "fall" - 1;
2418 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2419 from += 2;
2420 else if (*from == ' ' || *from == '-')
2421 from++;
2422 else if (*from != (all_upper ? 'T' : 't'))
2423 return false;
     /* Accept 'T' only when the word began with a capital 'F', and 't'
        only when not in all-upper-case mode.  */
2424 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2425 return false;
2426 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2427 return false;
2428 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2429 {
2430 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2431 return false;
2432 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2433 sizeof "hrough" - 1))
2434 return false;
2435 from += sizeof "through" - 1;
2436 }
2437 else
2438 from += sizeof "thru" - 1;
2439 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2440 from++;
     /* An optional '-' introduces free text.  It is skipped up to a NUL,
        a newline or carriage return, or (in a block comment) a '*' that
        is immediately followed by '/'.  */
2441 if (*from == '-')
2442 {
2443 from++;
2444 if (*comment_start == '*')
2445 {
2446 do
2447 {
2448 while (*from && *from != '*'
2449 && *from != '\n' && *from != '\r')
2450 from++;
2451 if (*from != '*' || from[1] == '/')
2452 break;
2453 from++;
2454 }
2455 while (1);
2456 }
2457 else
2458 while (*from && *from != '\n' && *from != '\r')
2459 from++;
2460 }
2461 }
     /* Whatever form matched, FROM must now sit exactly on the comment
        terminator; any other trailing text disqualifies the comment.  */
2462 /* C block comment. */
2463 if (*comment_start == '*')
2464 {
2465 if (*from != '*' || from[1] != '/')
2466 return false;
2467 }
2468 /* C++ line comment. */
2469 else if (*from != '\n')
2470 return false;
2471
2472 return true;
2473 }
2474
2475 /* Allocate COUNT tokens for RUN. */
2476 void
2477 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2478 {
2479 run->base = XNEWVEC (cpp_token, count);
2480 run->limit = run->base + count;
2481 run->next = NULL;
2482 }
2483
2484 /* Returns the next tokenrun, or creates one if there is none. */
2485 static tokenrun *
2486 next_tokenrun (tokenrun *run)
2487 {
2488 if (run->next == NULL)
2489 {
2490 run->next = XNEW (tokenrun);
2491 run->next->prev = run;
2492 _cpp_init_tokenrun (run->next, 250);
2493 }
2494
2495 return run->next;
2496 }
2497
2498 /* Return the number of not yet processed token in a given
2499 context. */
2500 int
2501 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2502 {
2503 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2504 return (LAST (context).token - FIRST (context).token);
2505 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2506 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2507 return (LAST (context).ptoken - FIRST (context).ptoken);
2508 else
2509 abort ();
2510 }
2511
2512 /* Returns the token present at index INDEX in a given context. If
2513 INDEX is zero, the next token to be processed is returned. */
2514 static const cpp_token*
2515 _cpp_token_from_context_at (cpp_context *context, int index)
2516 {
2517 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2518 return &(FIRST (context).token[index]);
2519 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2520 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2521 return FIRST (context).ptoken[index];
2522 else
2523 abort ();
2524 }
2525
2526 /* Look ahead in the input stream. */
2527 const cpp_token *
2528 cpp_peek_token (cpp_reader *pfile, int index)
2529 {
2530 cpp_context *context = pfile->context;
2531 const cpp_token *peektok;
2532 int count;
2533
2534 /* First, scan through any pending cpp_context objects. */
2535 while (context->prev)
2536 {
2537 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2538
2539 if (index < (int) sz)
2540 return _cpp_token_from_context_at (context, index);
2541 index -= (int) sz;
2542 context = context->prev;
2543 }
2544
2545 /* We will have to read some new tokens after all (and do so
2546 without invalidating preceding tokens). */
2547 count = index;
2548 pfile->keep_tokens++;
2549
2550 /* For peeked tokens temporarily disable line_change reporting,
2551 until the tokens are parsed for real. */
2552 void (*line_change) (cpp_reader *, const cpp_token *, int)
2553 = pfile->cb.line_change;
2554 pfile->cb.line_change = NULL;
2555
2556 do
2557 {
2558 peektok = _cpp_lex_token (pfile);
2559 if (peektok->type == CPP_EOF)
2560 {
2561 index--;
2562 break;
2563 }
2564 else if (peektok->type == CPP_PRAGMA)
2565 {
2566 /* Don't peek past a pragma. */
2567 if (peektok == &pfile->directive_result)
2568 /* Save the pragma in the buffer. */
2569 *pfile->cur_token++ = *peektok;
2570 index--;
2571 break;
2572 }
2573 }
2574 while (index--);
2575
2576 _cpp_backup_tokens_direct (pfile, count - index);
2577 pfile->keep_tokens--;
2578 pfile->cb.line_change = line_change;
2579
2580 return peektok;
2581 }
2582
2583 /* Allocate a single token that is invalidated at the same time as the
2584 rest of the tokens on the line. Has its line and col set to the
2585 same as the last lexed token, so that diagnostics appear in the
2586 right place. */
2587 cpp_token *
2588 _cpp_temp_token (cpp_reader *pfile)
2589 {
2590 cpp_token *old, *result;
2591 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2592 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2593
2594 old = pfile->cur_token - 1;
2595 /* Any pre-existing lookaheads must not be clobbered. */
2596 if (la)
2597 {
2598 if (sz <= la)
2599 {
2600 tokenrun *next = next_tokenrun (pfile->cur_run);
2601
2602 if (sz < la)
2603 memmove (next->base + 1, next->base,
2604 (la - sz) * sizeof (cpp_token));
2605
2606 next->base[0] = pfile->cur_run->limit[-1];
2607 }
2608
2609 if (sz > 1)
2610 memmove (pfile->cur_token + 1, pfile->cur_token,
2611 MIN (la, sz - 1) * sizeof (cpp_token));
2612 }
2613
2614 if (!sz && pfile->cur_token == pfile->cur_run->limit)
2615 {
2616 pfile->cur_run = next_tokenrun (pfile->cur_run);
2617 pfile->cur_token = pfile->cur_run->base;
2618 }
2619
2620 result = pfile->cur_token++;
2621 result->src_loc = old->src_loc;
2622 return result;
2623 }
2624
2625 /* We're at the beginning of a logical line (so not in
2626 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
2627 if we should enter deferred_pragma mode to tokenize the rest of the
2628 line as a module control-line. */
     /* Any tokens lexed here to reach the decision are handed back with
        _cpp_backup_tokens_direct before returning.  If the line is accepted,
        in_deferred_pragma stays set, the keyword tokens are flagged
        NO_EXPAND and their nodes are replaced by the alternates held in
        spec_nodes.n_modules.  If it is rejected, the lexer state changed
        below is put back.  */
2629
2630 static void
2631 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2632 {
2633 unsigned backup = 0; /* Tokens we peeked. */
2634 cpp_hashnode *node = result->val.node.node;
2635 cpp_token *peek = result;
2636 cpp_token *keyword = peek;
     /* Column [0] holds the nodes the lexed identifiers are compared with;
        column [1] holds the replacements installed once a control-line has
        been recognized.  */
2637 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
     /* Nonzero once import or __import has been seen; stored into
        state.directive_file_token on success.  */
2638 int header_count = 0;
2639
2640 /* Make sure the incoming state is as we expect it. This way we
2641 can restore it using constants. */
2642 gcc_checking_assert (!pfile->state.in_deferred_pragma
2643 && !pfile->state.skipping
2644 && !pfile->state.parsing_args
2645 && !pfile->state.angled_headers
2646 && (pfile->state.save_comments
2647 == !CPP_OPTION (pfile, discard_comments)));
2648
2649 /* Enter directives mode sufficiently for peeking. We don't have
2650 to actually set in_directive. */
2651 pfile->state.in_deferred_pragma = true;
2652
2653 /* These two fields are needed to process tokenization in deferred
2654 pragma mode. They are not used outside deferred pragma mode or
2655 directives mode. */
2656 pfile->state.pragma_allow_expansion = true;
2657 pfile->directive_line = result->src_loc;
2658
2659 /* Saving comments is incompatible with directives mode. */
2660 pfile->state.save_comments = 0;
2661
     /* A leading export must be followed by another NODE_MODULE name,
        which then becomes KEYWORD.  */
2662 if (node == n_modules[spec_nodes::M_EXPORT][0])
2663 {
2664 peek = _cpp_lex_direct (pfile);
2665 keyword = peek;
2666 backup++;
2667 if (keyword->type != CPP_NAME)
2668 goto not_module;
2669 node = keyword->val.node.node;
2670 if (!(node->flags & NODE_MODULE))
2671 goto not_module;
2672 }
2673
     /* NOTE(review): how the header_count value is encoded (the 2 and the
        16) is decided by whatever consumes state.directive_file_token,
        which is outside this function -- confirm there.  */
2674 if (node == n_modules[spec_nodes::M__IMPORT][0])
2675 /* __import */
2676 header_count = backup + 2 + 16;
2677 else if (node == n_modules[spec_nodes::M_IMPORT][0])
2678 /* import */
2679 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2680 else if (node == n_modules[spec_nodes::M_MODULE][0])
2681 ; /* module */
2682 else
2683 goto not_module;
2684
2685 /* We've seen [export] {module|import|__import}. Check the next token. */
2686 if (header_count)
2687 /* After '{,__}import' a header name may appear. */
2688 pfile->state.angled_headers = true;
2689 peek = _cpp_lex_direct (pfile);
2690 backup++;
2691
2692 /* ... import followed by identifier, ':', '<' or
2693 header-name preprocessing tokens, or module
2694 followed by cpp-identifier, ':' or ';' preprocessing
2695 tokens. C++ keywords are not yet relevant. */
2696 if (peek->type == CPP_NAME
2697 || peek->type == CPP_COLON
2698 || (header_count
2699 ? (peek->type == CPP_LESS
2700 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2701 || peek->type == CPP_HEADER_NAME)
2702 : peek->type == CPP_SEMICOLON))
2703 {
     /* When expansion is not allowed, prevent_expansion is raised here;
        the lexer lowers it again when it emits CPP_PRAGMA_EOL.  */
2704 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2705 if (!pfile->state.pragma_allow_expansion)
2706 pfile->state.prevent_expansion++;
2707
2708 if (!header_count && linemap_included_from
2709 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2710 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2711 "module control-line cannot be in included file");
2712
2713 /* The first one or two tokens cannot be macro names. */
     /* IX runs from BACKUP - 1 down to 0: index 0 is RESULT, any other
        index is KEYWORD.  */
2714 for (int ix = backup; ix--;)
2715 {
2716 cpp_token *tok = ix ? keyword : result;
2717 cpp_hashnode *node = tok->val.node.node;
2718
2719 /* Don't attempt to expand the token. */
2720 tok->flags |= NO_EXPAND;
2721 if (_cpp_defined_macro_p (node)
2722 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2723 && !cpp_fun_like_macro_p (node))
2724 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2725 "module control-line \"%s\" cannot be"
2726 " an object-like macro",
2727 NODE_NAME (node));
2728 }
2729
2730 /* Map to underbar variants. */
2731 keyword->val.node.node = n_modules[header_count
2732 ? spec_nodes::M_IMPORT
2733 : spec_nodes::M_MODULE][1];
     /* BACKUP is 2 here only when the line began with export, in which
        case RESULT is that export token.  */
2734 if (backup != 1)
2735 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2736
2737 /* Maybe tell the tokenizer we expect a header-name down the
2738 road. */
2739 pfile->state.directive_file_token = header_count;
2740 }
2741 else
2742 {
     /* Also reached by goto from the early rejections above.  */
2743 not_module:
2744 /* Drop out of directive mode. */
2745 /* We asserted save_comments had this value upon entry. */
2746 pfile->state.save_comments
2747 = !CPP_OPTION (pfile, discard_comments);
2748 pfile->state.in_deferred_pragma = false;
2749 /* Do not let this remain on. */
2750 pfile->state.angled_headers = false;
2751 }
2752
2753 /* In either case we want to backup the peeked tokens. */
2754 if (backup)
2755 {
2756 /* If we saw EOL, we should drop it, because this isn't a module
2757 control-line after all. */
2758 bool eol = peek->type == CPP_PRAGMA_EOL;
2759 if (!eol || backup > 1)
2760 {
2761 /* Put the peeked tokens back */
2762 _cpp_backup_tokens_direct (pfile, backup);
2763 /* But if the last one was an EOL, forget it. */
2764 if (eol)
2765 pfile->lookaheads--;
2766 }
2767 }
2768 }
2769
2770 /* Lex a token into RESULT (external interface). Takes care of issues
2771 like directive handling, token lookahead, multiple include
2772 optimization and skipping. */
     /* The loop runs until there is a token to hand back.  A directive
        whose result is CPP_PADDING is swallowed.  While state.skipping is
        set (and we are in neither a directive nor a deferred pragma), every
        token except CPP_EOF is discarded.  */
2773 const cpp_token *
2774 _cpp_lex_token (cpp_reader *pfile)
2775 {
2776 cpp_token *result;
2777
2778 for (;;)
2779 {
     /* The current run is full: continue in the next one, creating it if
        necessary.  */
2780 if (pfile->cur_token == pfile->cur_run->limit)
2781 {
2782 pfile->cur_run = next_tokenrun (pfile->cur_run);
2783 pfile->cur_token = pfile->cur_run->base;
2784 }
2785 /* We assume that the current token is somewhere in the current
2786 run. */
2787 if (pfile->cur_token < pfile->cur_run->base
2788 || pfile->cur_token >= pfile->cur_run->limit)
2789 abort ();
2790
     /* With lookaheads pending, hand back the token already stored at
        cur_token instead of lexing new text.  */
2791 if (pfile->lookaheads)
2792 {
2793 pfile->lookaheads--;
2794 result = pfile->cur_token++;
2795 }
2796 else
2797 result = _cpp_lex_direct (pfile);
2798
2799 if (result->flags & BOL)
2800 {
2801 /* Is this a directive. If _cpp_handle_directive returns
2802 false, it is an assembler #. */
2803 if (result->type == CPP_HASH
2804 /* 6.10.3 p 11: Directives in a list of macro arguments
2805 gives undefined behavior. This implementation
2806 handles the directive as normal. */
2807 && pfile->state.parsing_args != 1)
2808 {
2809 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2810 {
     /* The directive produced nothing to return; lex the next token.  */
2811 if (pfile->directive_result.type == CPP_PADDING)
2812 continue;
2813 result = &pfile->directive_result;
2814 }
2815 }
     /* NOTE(review): presumably directive_result already holds the token
        to return when a deferred pragma is active at the start of a line
        -- confirm against the directive-handling code.  */
2816 else if (pfile->state.in_deferred_pragma)
2817 result = &pfile->directive_result;
2818 else if (result->type == CPP_NAME
2819 && (result->val.node.node->flags & NODE_MODULE)
2820 && !pfile->state.skipping
2821 /* Unlike regular directives, we do not deal with
2822 tokenizing module directives as macro arguments.
2823 That's not permitted. */
2824 && !pfile->state.parsing_args)
2825 {
2826 /* P1857. Before macro expansion, At start of logical
2827 line ... */
2828 /* We don't have to consider lookaheads at this point. */
2829 gcc_checking_assert (!pfile->lookaheads);
2830
2831 cpp_maybe_module_directive (pfile, result);
2832 }
2833
     /* Tell the client about the new line, unless the callback is unset
        or we are skipping.  */
2834 if (pfile->cb.line_change && !pfile->state.skipping)
2835 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2836 }
2837
2838 /* We don't skip tokens in directives. */
2839 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2840 break;
2841
2842 /* Outside a directive, invalidate controlling macros. At file
2843 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2844 get here and MI optimization works. */
2845 pfile->mi_valid = false;
2846
     /* While skipping, only CPP_EOF is handed to the caller; any other
        token is dropped and the loop goes round again.  */
2847 if (!pfile->state.skipping || result->type == CPP_EOF)
2848 break;
2849 }
2850
2851 return result;
2852 }
2853
2854 /* Returns true if a fresh line has been loaded. */
2855 bool
2856 _cpp_get_fresh_line (cpp_reader *pfile)
2857 {
2858 /* We can't get a new line until we leave the current directive. */
2859 if (pfile->state.in_directive)
2860 return false;
2861
2862 for (;;)
2863 {
2864 cpp_buffer *buffer = pfile->buffer;
2865
2866 if (!buffer->need_line)
2867 return true;
2868
2869 if (buffer->next_line < buffer->rlimit)
2870 {
2871 _cpp_clean_line (pfile);
2872 return true;
2873 }
2874
2875 /* First, get out of parsing arguments state. */
2876 if (pfile->state.parsing_args)
2877 return false;
2878
2879 /* End of buffer. Non-empty files should end in a newline. */
2880 if (buffer->buf != buffer->rlimit
2881 && buffer->next_line > buffer->rlimit
2882 && !buffer->from_stage3)
2883 {
2884 /* Clip to buffer size. */
2885 buffer->next_line = buffer->rlimit;
2886 }
2887
2888 if (buffer->prev && !buffer->return_at_eof)
2889 _cpp_pop_buffer (pfile);
2890 else
2891 {
2892 /* End of translation. Do not pop the buffer yet. Increment
2893 line number so that the EOF token is on a line of its own
2894 (_cpp_lex_direct doesn't increment in that case, because
2895 it's hard for it to distinguish this special case). */
2896 CPP_INCREMENT_LINE (pfile, 0);
2897 return false;
2898 }
2899 }
2900 }
2901
/* Helper for two-character operators.  If the next input character is
   CHAR, consume it and give the token THEN_TYPE; otherwise give it
   ELSE_TYPE.  Relies on `buffer' and `result' being in scope at the
   point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      if (*buffer->cur == CHAR)                 \
        {                                       \
          buffer->cur++;                        \
          result->type = THEN_TYPE;             \
        }                                       \
      else                                      \
        result->type = ELSE_TYPE;               \
    }                                           \
  while (0)
2910
2911 /* Lex a token into pfile->cur_token, which is also incremented, to
2912 get diagnostics pointing to the correct location.
2913
2914 Does not handle issues such as token lookahead, multiple-include
2915 optimization, directives, skipping etc. This function is only
2916 suitable for use by _cpp_lex_token, and in special cases like
2917 lex_expansion_token which doesn't care for any of these issues.
2918
2919 When meeting a newline, returns CPP_EOF if parsing a directive,
2920 otherwise returns to the start of the token buffer if permissible.
2921 Returns the location of the lexed token. */
2922 cpp_token *
2923 _cpp_lex_direct (cpp_reader *pfile)
2924 {
2925 cppchar_t c;
2926 cpp_buffer *buffer;
2927 const unsigned char *comment_start;
2928 bool fallthrough_comment = false;
2929 cpp_token *result = pfile->cur_token++;
2930
2931 fresh_line:
2932 result->flags = 0;
2933 buffer = pfile->buffer;
2934 if (buffer->need_line)
2935 {
2936 gcc_assert (!pfile->state.in_deferred_pragma);
2937 if (!_cpp_get_fresh_line (pfile))
2938 {
2939 result->type = CPP_EOF;
2940 /* Not a real EOF in a directive or arg parsing -- we refuse
2941 to advance to the next file now, and will once we're out
2942 of those modes. */
2943 if (!pfile->state.in_directive && !pfile->state.parsing_args)
2944 {
2945 /* Tell the compiler the line number of the EOF token. */
2946 result->src_loc = pfile->line_table->highest_line;
2947 result->flags = BOL;
2948 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2949 _cpp_pop_buffer (pfile);
2950 }
2951 return result;
2952 }
2953 if (buffer != pfile->buffer)
2954 fallthrough_comment = false;
2955 if (!pfile->keep_tokens)
2956 {
2957 pfile->cur_run = &pfile->base_run;
2958 result = pfile->base_run.base;
2959 pfile->cur_token = result + 1;
2960 }
2961 result->flags = BOL;
2962 if (pfile->state.parsing_args == 2)
2963 result->flags |= PREV_WHITE;
2964 }
2965 buffer = pfile->buffer;
2966 update_tokens_line:
2967 result->src_loc = pfile->line_table->highest_line;
2968
2969 skipped_white:
2970 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2971 && !pfile->overlaid_buffer)
2972 {
2973 _cpp_process_line_notes (pfile, false);
2974 result->src_loc = pfile->line_table->highest_line;
2975 }
2976 c = *buffer->cur++;
2977
2978 if (pfile->forced_token_location)
2979 result->src_loc = pfile->forced_token_location;
2980 else
2981 result->src_loc = linemap_position_for_column (pfile->line_table,
2982 CPP_BUF_COLUMN (buffer, buffer->cur));
2983
2984 switch (c)
2985 {
2986 case ' ': case '\t': case '\f': case '\v': case '\0':
2987 result->flags |= PREV_WHITE;
2988 skip_whitespace (pfile, c);
2989 goto skipped_white;
2990
2991 case '\n':
2992 /* Increment the line, unless this is the last line ... */
2993 if (buffer->cur < buffer->rlimit
2994 /* ... or this is a #include, (where _cpp_stack_file needs to
2995 unwind by one line) ... */
2996 || (pfile->state.in_directive > 1
2997 /* ... except traditional-cpp increments this elsewhere. */
2998 && !CPP_OPTION (pfile, traditional)))
2999 CPP_INCREMENT_LINE (pfile, 0);
3000 buffer->need_line = true;
3001 if (pfile->state.in_deferred_pragma)
3002 {
3003 /* Produce the PRAGMA_EOL on this line. File reading
3004 ensures there is always a \n at end of the buffer, thus
3005 in a deferred pragma we always see CPP_PRAGMA_EOL before
3006 any CPP_EOF. */
3007 result->type = CPP_PRAGMA_EOL;
3008 result->flags &= ~PREV_WHITE;
3009 pfile->state.in_deferred_pragma = false;
3010 if (!pfile->state.pragma_allow_expansion)
3011 pfile->state.prevent_expansion--;
3012 return result;
3013 }
3014 goto fresh_line;
3015
3016 case '0': case '1': case '2': case '3': case '4':
3017 case '5': case '6': case '7': case '8': case '9':
3018 {
3019 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3020 result->type = CPP_NUMBER;
3021 lex_number (pfile, &result->val.str, &nst);
3022 warn_about_normalization (pfile, result, &nst);
3023 break;
3024 }
3025
3026 case 'L':
3027 case 'u':
3028 case 'U':
3029 case 'R':
3030 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3031 wide strings or raw strings. */
3032 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3033 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3034 {
3035 if ((*buffer->cur == '\'' && c != 'R')
3036 || *buffer->cur == '"'
3037 || (*buffer->cur == 'R'
3038 && c != 'R'
3039 && buffer->cur[1] == '"'
3040 && CPP_OPTION (pfile, rliterals))
3041 || (*buffer->cur == '8'
3042 && c == 'u'
3043 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3044 && CPP_OPTION (pfile, utf8_char_literals)))
3045 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3046 && CPP_OPTION (pfile, rliterals)))))
3047 {
3048 lex_string (pfile, result, buffer->cur - 1);
3049 break;
3050 }
3051 }
3052 /* Fall through. */
3053
3054 case '_':
3055 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3056 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3057 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3058 case 's': case 't': case 'v': case 'w': case 'x':
3059 case 'y': case 'z':
3060 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3061 case 'G': case 'H': case 'I': case 'J': case 'K':
3062 case 'M': case 'N': case 'O': case 'P': case 'Q':
3063 case 'S': case 'T': case 'V': case 'W': case 'X':
3064 case 'Y': case 'Z':
3065 result->type = CPP_NAME;
3066 {
3067 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3068 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3069 &nst,
3070 &result->val.node.spelling);
3071 warn_about_normalization (pfile, result, &nst);
3072 }
3073
3074 /* Convert named operators to their proper types. */
3075 if (result->val.node.node->flags & NODE_OPERATOR)
3076 {
3077 result->flags |= NAMED_OP;
3078 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3079 }
3080
3081 /* Signal FALLTHROUGH comment followed by another token. */
3082 if (fallthrough_comment)
3083 result->flags |= PREV_FALLTHROUGH;
3084 break;
3085
3086 case '\'':
3087 case '"':
3088 lex_string (pfile, result, buffer->cur - 1);
3089 break;
3090
3091 case '/':
3092 /* A potential block or line comment. */
3093 comment_start = buffer->cur;
3094 c = *buffer->cur;
3095
3096 if (c == '*')
3097 {
3098 if (_cpp_skip_block_comment (pfile))
3099 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3100 }
3101 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3102 {
3103 /* Don't warn for system headers. */
3104 if (_cpp_in_system_header (pfile))
3105 ;
3106 /* Warn about comments if pedantically GNUC89, and not
3107 in system headers. */
3108 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3109 && CPP_PEDANTIC (pfile)
3110 && ! buffer->warned_cplusplus_comments)
3111 {
3112 if (cpp_error (pfile, CPP_DL_PEDWARN,
3113 "C++ style comments are not allowed in ISO C90"))
3114 cpp_error (pfile, CPP_DL_NOTE,
3115 "(this will be reported only once per input file)");
3116 buffer->warned_cplusplus_comments = 1;
3117 }
3118 /* Or if specifically desired via -Wc90-c99-compat. */
3119 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3120 && ! CPP_OPTION (pfile, cplusplus)
3121 && ! buffer->warned_cplusplus_comments)
3122 {
3123 if (cpp_error (pfile, CPP_DL_WARNING,
3124 "C++ style comments are incompatible with C90"))
3125 cpp_error (pfile, CPP_DL_NOTE,
3126 "(this will be reported only once per input file)");
3127 buffer->warned_cplusplus_comments = 1;
3128 }
3129 /* In C89/C94, C++ style comments are forbidden. */
3130 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3131 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3132 {
3133 /* But don't be confused about valid code such as
3134 - // immediately followed by *,
3135 - // in a preprocessing directive,
3136 - // in an #if 0 block. */
3137 if (buffer->cur[1] == '*'
3138 || pfile->state.in_directive
3139 || pfile->state.skipping)
3140 {
3141 result->type = CPP_DIV;
3142 break;
3143 }
3144 else if (! buffer->warned_cplusplus_comments)
3145 {
3146 if (cpp_error (pfile, CPP_DL_ERROR,
3147 "C++ style comments are not allowed in "
3148 "ISO C90"))
3149 cpp_error (pfile, CPP_DL_NOTE,
3150 "(this will be reported only once per input "
3151 "file)");
3152 buffer->warned_cplusplus_comments = 1;
3153 }
3154 }
3155 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3156 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3157 }
3158 else if (c == '=')
3159 {
3160 buffer->cur++;
3161 result->type = CPP_DIV_EQ;
3162 break;
3163 }
3164 else
3165 {
3166 result->type = CPP_DIV;
3167 break;
3168 }
3169
3170 if (fallthrough_comment_p (pfile, comment_start))
3171 fallthrough_comment = true;
3172
3173 if (pfile->cb.comment)
3174 {
3175 size_t len = pfile->buffer->cur - comment_start;
3176 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3177 len + 1);
3178 }
3179
3180 if (!pfile->state.save_comments)
3181 {
3182 result->flags |= PREV_WHITE;
3183 goto update_tokens_line;
3184 }
3185
3186 if (fallthrough_comment)
3187 result->flags |= PREV_FALLTHROUGH;
3188
3189 /* Save the comment as a token in its own right. */
3190 save_comment (pfile, result, comment_start, c);
3191 break;
3192
3193 case '<':
3194 if (pfile->state.angled_headers)
3195 {
3196 lex_string (pfile, result, buffer->cur - 1);
3197 if (result->type != CPP_LESS)
3198 break;
3199 }
3200
3201 result->type = CPP_LESS;
3202 if (*buffer->cur == '=')
3203 {
3204 buffer->cur++, result->type = CPP_LESS_EQ;
3205 if (*buffer->cur == '>'
3206 && CPP_OPTION (pfile, cplusplus)
3207 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3208 buffer->cur++, result->type = CPP_SPACESHIP;
3209 }
3210 else if (*buffer->cur == '<')
3211 {
3212 buffer->cur++;
3213 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3214 }
3215 else if (CPP_OPTION (pfile, digraphs))
3216 {
3217 if (*buffer->cur == ':')
3218 {
3219 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3220 three characters are <:: and the subsequent character
3221 is neither : nor >, the < is treated as a preprocessor
3222 token by itself". */
3223 if (CPP_OPTION (pfile, cplusplus)
3224 && CPP_OPTION (pfile, lang) != CLK_CXX98
3225 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3226 && buffer->cur[1] == ':'
3227 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3228 break;
3229
3230 buffer->cur++;
3231 result->flags |= DIGRAPH;
3232 result->type = CPP_OPEN_SQUARE;
3233 }
3234 else if (*buffer->cur == '%')
3235 {
3236 buffer->cur++;
3237 result->flags |= DIGRAPH;
3238 result->type = CPP_OPEN_BRACE;
3239 }
3240 }
3241 break;
3242
3243 case '>':
3244 result->type = CPP_GREATER;
3245 if (*buffer->cur == '=')
3246 buffer->cur++, result->type = CPP_GREATER_EQ;
3247 else if (*buffer->cur == '>')
3248 {
3249 buffer->cur++;
3250 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3251 }
3252 break;
3253
3254 case '%':
3255 result->type = CPP_MOD;
3256 if (*buffer->cur == '=')
3257 buffer->cur++, result->type = CPP_MOD_EQ;
3258 else if (CPP_OPTION (pfile, digraphs))
3259 {
3260 if (*buffer->cur == ':')
3261 {
3262 buffer->cur++;
3263 result->flags |= DIGRAPH;
3264 result->type = CPP_HASH;
3265 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3266 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3267 }
3268 else if (*buffer->cur == '>')
3269 {
3270 buffer->cur++;
3271 result->flags |= DIGRAPH;
3272 result->type = CPP_CLOSE_BRACE;
3273 }
3274 }
3275 break;
3276
3277 case '.':
3278 result->type = CPP_DOT;
3279 if (ISDIGIT (*buffer->cur))
3280 {
3281 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3282 result->type = CPP_NUMBER;
3283 lex_number (pfile, &result->val.str, &nst);
3284 warn_about_normalization (pfile, result, &nst);
3285 }
3286 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3287 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3288 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3289 buffer->cur++, result->type = CPP_DOT_STAR;
3290 break;
3291
3292 case '+':
3293 result->type = CPP_PLUS;
3294 if (*buffer->cur == '+')
3295 buffer->cur++, result->type = CPP_PLUS_PLUS;
3296 else if (*buffer->cur == '=')
3297 buffer->cur++, result->type = CPP_PLUS_EQ;
3298 break;
3299
3300 case '-':
3301 result->type = CPP_MINUS;
3302 if (*buffer->cur == '>')
3303 {
3304 buffer->cur++;
3305 result->type = CPP_DEREF;
3306 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3307 buffer->cur++, result->type = CPP_DEREF_STAR;
3308 }
3309 else if (*buffer->cur == '-')
3310 buffer->cur++, result->type = CPP_MINUS_MINUS;
3311 else if (*buffer->cur == '=')
3312 buffer->cur++, result->type = CPP_MINUS_EQ;
3313 break;
3314
3315 case '&':
3316 result->type = CPP_AND;
3317 if (*buffer->cur == '&')
3318 buffer->cur++, result->type = CPP_AND_AND;
3319 else if (*buffer->cur == '=')
3320 buffer->cur++, result->type = CPP_AND_EQ;
3321 break;
3322
3323 case '|':
3324 result->type = CPP_OR;
3325 if (*buffer->cur == '|')
3326 buffer->cur++, result->type = CPP_OR_OR;
3327 else if (*buffer->cur == '=')
3328 buffer->cur++, result->type = CPP_OR_EQ;
3329 break;
3330
3331 case ':':
3332 result->type = CPP_COLON;
3333 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3334 buffer->cur++, result->type = CPP_SCOPE;
3335 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3336 {
3337 buffer->cur++;
3338 result->flags |= DIGRAPH;
3339 result->type = CPP_CLOSE_SQUARE;
3340 }
3341 break;
3342
3343 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3344 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3345 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3346 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3347 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3348
3349 case '?': result->type = CPP_QUERY; break;
3350 case '~': result->type = CPP_COMPL; break;
3351 case ',': result->type = CPP_COMMA; break;
3352 case '(': result->type = CPP_OPEN_PAREN; break;
3353 case ')': result->type = CPP_CLOSE_PAREN; break;
3354 case '[': result->type = CPP_OPEN_SQUARE; break;
3355 case ']': result->type = CPP_CLOSE_SQUARE; break;
3356 case '{': result->type = CPP_OPEN_BRACE; break;
3357 case '}': result->type = CPP_CLOSE_BRACE; break;
3358 case ';': result->type = CPP_SEMICOLON; break;
3359
3360 /* @ is a punctuator in Objective-C. */
3361 case '@': result->type = CPP_ATSIGN; break;
3362
3363 default:
3364 {
3365 const uchar *base = --buffer->cur;
3366
3367 /* Check for an extended identifier ($ or UCN or UTF-8). */
3368 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3369 if (forms_identifier_p (pfile, true, &nst))
3370 {
3371 result->type = CPP_NAME;
3372 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3373 &result->val.node.spelling);
3374 warn_about_normalization (pfile, result, &nst);
3375 break;
3376 }
3377
3378 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3379 single token. */
3380 buffer->cur++;
3381 if (c >= utf8_signifier)
3382 {
3383 const uchar *pstr = base;
3384 cppchar_t s;
3385 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3386 buffer->cur = pstr;
3387 }
3388 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3389 break;
3390 }
3391
3392 }
3393
3394 /* Potentially convert the location of the token to a range. */
3395 if (result->src_loc >= RESERVED_LOCATION_COUNT
3396 && result->type != CPP_EOF)
3397 {
3398 /* Ensure that any line notes are processed, so that we have the
3399 correct physical line/column for the end-point of the token even
3400 when a logical line is split via one or more backslashes. */
3401 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3402 && !pfile->overlaid_buffer)
3403 _cpp_process_line_notes (pfile, false);
3404
3405 source_range tok_range;
3406 tok_range.m_start = result->src_loc;
3407 tok_range.m_finish
3408 = linemap_position_for_column (pfile->line_table,
3409 CPP_BUF_COLUMN (buffer, buffer->cur));
3410
3411 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3412 result->src_loc,
3413 tok_range, NULL);
3414 }
3415
3416 return result;
3417 }
3418
3419 /* An upper bound on the number of bytes needed to spell TOKEN.
3420 Does not include preceding whitespace. */
3421 unsigned int
3422 cpp_token_len (const cpp_token *token)
3423 {
3424 unsigned int len;
3425
3426 switch (TOKEN_SPELL (token))
3427 {
3428 default: len = 6; break;
3429 case SPELL_LITERAL: len = token->val.str.len; break;
3430 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
3431 }
3432
3433 return len;
3434 }
3435
/* Decode the UTF-8 sequence that starts at NAME and store the
   equivalent \UXXXXXXXX escape (lower-case hex) in BUFFER.  Exactly
   10 bytes are written to BUFFER and no terminating nul is added.
   The return value is the number of bytes consumed from NAME, taken
   from the count of leading one bits in the first byte.  A trailing
   byte that is not of the form 10xxxxxx aborts the program.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int seen;
  int nibble;

  /* The number of leading one bits gives the sequence length.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six bits per trailing byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (seen = 1; seen < seq_len; seen++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: trailing bytes must look like 10xxxxxx.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (nibble = 0; nibble < 8; nibble++)
    buffer[2 + nibble] = hex_digits[(code_point >> (4 * (7 - nibble))) & 0xF];

  return seq_len;
}
3469
3470 /* Given a token TYPE corresponding to a digraph, return a pointer to
3471 the spelling of the digraph. */
3472 static const unsigned char *
3473 cpp_digraph2name (enum cpp_ttype type)
3474 {
3475 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3476 }
3477
3478 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3479 The buffer must already contain the enough space to hold the
3480 token's spelling. Returns a pointer to the character after the
3481 last character written. */
3482 unsigned char *
3483 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3484 {
3485 size_t i;
3486 const unsigned char *name = NODE_NAME (ident);
3487
3488 for (i = 0; i < NODE_LEN (ident); i++)
3489 if (name[i] & ~0x7F)
3490 {
3491 i += utf8_to_ucn (buffer, name + i) - 1;
3492 buffer += 10;
3493 }
3494 else
3495 *buffer++ = name[i];
3496
3497 return buffer;
3498 }
3499
3500 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
3501 already contain the enough space to hold the token's spelling.
3502 Returns a pointer to the character after the last character written.
3503 FORSTRING is true if this is to be the spelling after translation
3504 phase 1 (with the original spelling of extended identifiers), false
3505 if extended identifiers should always be written using UCNs (there is
3506 no option for always writing them in the internal UTF-8 form).
3507 FIXME: Would be nice if we didn't need the PFILE argument. */
3508 unsigned char *
3509 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3510 unsigned char *buffer, bool forstring)
3511 {
3512 switch (TOKEN_SPELL (token))
3513 {
3514 case SPELL_OPERATOR:
3515 {
3516 const unsigned char *spelling;
3517 unsigned char c;
3518
3519 if (token->flags & DIGRAPH)
3520 spelling = cpp_digraph2name (token->type);
3521 else if (token->flags & NAMED_OP)
3522 goto spell_ident;
3523 else
3524 spelling = TOKEN_NAME (token);
3525
3526 while ((c = *spelling++) != '\0')
3527 *buffer++ = c;
3528 }
3529 break;
3530
3531 spell_ident:
3532 case SPELL_IDENT:
3533 if (forstring)
3534 {
3535 memcpy (buffer, NODE_NAME (token->val.node.spelling),
3536 NODE_LEN (token->val.node.spelling));
3537 buffer += NODE_LEN (token->val.node.spelling);
3538 }
3539 else
3540 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3541 break;
3542
3543 case SPELL_LITERAL:
3544 memcpy (buffer, token->val.str.text, token->val.str.len);
3545 buffer += token->val.str.len;
3546 break;
3547
3548 case SPELL_NONE:
3549 cpp_error (pfile, CPP_DL_ICE,
3550 "unspellable token %s", TOKEN_NAME (token));
3551 break;
3552 }
3553
3554 return buffer;
3555 }
3556
3557 /* Returns TOKEN spelt as a null-terminated string. The string is
3558 freed when the reader is destroyed. Useful for diagnostics. */
3559 unsigned char *
3560 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3561 {
3562 unsigned int len = cpp_token_len (token) + 1;
3563 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3564
3565 end = cpp_spell_token (pfile, token, start, false);
3566 end[0] = '\0';
3567
3568 return start;
3569 }
3570
3571 /* Returns a pointer to a string which spells the token defined by
3572 TYPE and FLAGS. Used by C front ends, which really should move to
3573 using cpp_token_as_text. */
3574 const char *
3575 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3576 {
3577 if (flags & DIGRAPH)
3578 return (const char *) cpp_digraph2name (type);
3579 else if (flags & NAMED_OP)
3580 return cpp_named_operator2name (type);
3581
3582 return (const char *) token_spellings[type].name;
3583 }
3584
3585 /* Writes the spelling of token to FP, without any preceding space.
3586 Separated from cpp_spell_token for efficiency - to avoid stdio
3587 double-buffering. */
3588 void
3589 cpp_output_token (const cpp_token *token, FILE *fp)
3590 {
3591 switch (TOKEN_SPELL (token))
3592 {
3593 case SPELL_OPERATOR:
3594 {
3595 const unsigned char *spelling;
3596 int c;
3597
3598 if (token->flags & DIGRAPH)
3599 spelling = cpp_digraph2name (token->type);
3600 else if (token->flags & NAMED_OP)
3601 goto spell_ident;
3602 else
3603 spelling = TOKEN_NAME (token);
3604
3605 c = *spelling;
3606 do
3607 putc (c, fp);
3608 while ((c = *++spelling) != '\0');
3609 }
3610 break;
3611
3612 spell_ident:
3613 case SPELL_IDENT:
3614 {
3615 size_t i;
3616 const unsigned char * name = NODE_NAME (token->val.node.node);
3617
3618 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3619 if (name[i] & ~0x7F)
3620 {
3621 unsigned char buffer[10];
3622 i += utf8_to_ucn (buffer, name + i) - 1;
3623 fwrite (buffer, 1, 10, fp);
3624 }
3625 else
3626 fputc (NODE_NAME (token->val.node.node)[i], fp);
3627 }
3628 break;
3629
3630 case SPELL_LITERAL:
3631 if (token->type == CPP_HEADER_NAME)
3632 fputc ('"', fp);
3633 fwrite (token->val.str.text, 1, token->val.str.len, fp);
3634 if (token->type == CPP_HEADER_NAME)
3635 fputc ('"', fp);
3636 break;
3637
3638 case SPELL_NONE:
3639 /* An error, most probably. */
3640 break;
3641 }
3642 }
3643
3644 /* Compare two tokens. */
3645 int
3646 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3647 {
3648 if (a->type == b->type && a->flags == b->flags)
3649 switch (TOKEN_SPELL (a))
3650 {
3651 default: /* Keep compiler happy. */
3652 case SPELL_OPERATOR:
3653 /* token_no is used to track where multiple consecutive ##
3654 tokens were originally located. */
3655 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3656 case SPELL_NONE:
3657 return (a->type != CPP_MACRO_ARG
3658 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3659 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3660 case SPELL_IDENT:
3661 return (a->val.node.node == b->val.node.node
3662 && a->val.node.spelling == b->val.node.spelling);
3663 case SPELL_LITERAL:
3664 return (a->val.str.len == b->val.str.len
3665 && !memcmp (a->val.str.text, b->val.str.text,
3666 a->val.str.len));
3667 }
3668
3669 return 0;
3670 }
3671
3672 /* Returns nonzero if a space should be inserted to avoid an
3673 accidental token paste for output. For simplicity, it is
3674 conservative, and occasionally advises a space where one is not
3675 needed, e.g. "." and ".2". */
3676 int
3677 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3678 const cpp_token *token2)
3679 {
3680 enum cpp_ttype a = token1->type, b = token2->type;
3681 cppchar_t c;
3682
3683 if (token1->flags & NAMED_OP)
3684 a = CPP_NAME;
3685 if (token2->flags & NAMED_OP)
3686 b = CPP_NAME;
3687
3688 c = EOF;
3689 if (token2->flags & DIGRAPH)
3690 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3691 else if (token_spellings[b].category == SPELL_OPERATOR)
3692 c = token_spellings[b].name[0];
3693
3694 /* Quickly get everything that can paste with an '='. */
3695 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3696 return 1;
3697
3698 switch (a)
3699 {
3700 case CPP_GREATER: return c == '>';
3701 case CPP_LESS: return c == '<' || c == '%' || c == ':';
3702 case CPP_PLUS: return c == '+';
3703 case CPP_MINUS: return c == '-' || c == '>';
3704 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
3705 case CPP_MOD: return c == ':' || c == '>';
3706 case CPP_AND: return c == '&';
3707 case CPP_OR: return c == '|';
3708 case CPP_COLON: return c == ':' || c == '>';
3709 case CPP_DEREF: return c == '*';
3710 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
3711 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
3712 case CPP_NAME: return ((b == CPP_NUMBER
3713 && name_p (pfile, &token2->val.str))
3714 || b == CPP_NAME
3715 || b == CPP_CHAR || b == CPP_STRING); /* L */
3716 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
3717 || c == '.' || c == '+' || c == '-');
3718 /* UCNs */
3719 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
3720 && b == CPP_NAME)
3721 || (CPP_OPTION (pfile, objc)
3722 && token1->val.str.text[0] == '@'
3723 && (b == CPP_NAME || b == CPP_STRING)));
3724 case CPP_LESS_EQ: return c == '>';
3725 case CPP_STRING:
3726 case CPP_WSTRING:
3727 case CPP_UTF8STRING:
3728 case CPP_STRING16:
3729 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
3730 && (b == CPP_NAME
3731 || (TOKEN_SPELL (token2) == SPELL_LITERAL
3732 && ISIDST (token2->val.str.text[0]))));
3733
3734 default: break;
3735 }
3736
3737 return 0;
3738 }
3739
3740 /* Output all the remaining tokens on the current line, and a newline
3741 character, to FP. Leading whitespace is removed. If there are
3742 macros, special token padding is not performed. */
3743 void
3744 cpp_output_line (cpp_reader *pfile, FILE *fp)
3745 {
3746 const cpp_token *token;
3747
3748 token = cpp_get_token (pfile);
3749 while (token->type != CPP_EOF)
3750 {
3751 cpp_output_token (token, fp);
3752 token = cpp_get_token (pfile);
3753 if (token->flags & PREV_WHITE)
3754 putc (' ', fp);
3755 }
3756
3757 putc ('\n', fp);
3758 }
3759
3760 /* Return a string representation of all the remaining tokens on the
3761 current line. The result is allocated using xmalloc and must be
3762 freed by the caller. */
3763 unsigned char *
3764 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3765 {
3766 const cpp_token *token;
3767 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3768 unsigned int alloced = 120 + out;
3769 unsigned char *result = (unsigned char *) xmalloc (alloced);
3770
3771 /* If DIR_NAME is empty, there are no initial contents. */
3772 if (dir_name)
3773 {
3774 sprintf ((char *) result, "#%s ", dir_name);
3775 out += 2;
3776 }
3777
3778 token = cpp_get_token (pfile);
3779 while (token->type != CPP_EOF)
3780 {
3781 unsigned char *last;
3782 /* Include room for a possible space and the terminating nul. */
3783 unsigned int len = cpp_token_len (token) + 2;
3784
3785 if (out + len > alloced)
3786 {
3787 alloced *= 2;
3788 if (out + len > alloced)
3789 alloced = out + len;
3790 result = (unsigned char *) xrealloc (result, alloced);
3791 }
3792
3793 last = cpp_spell_token (pfile, token, &result[out], 0);
3794 out = last - result;
3795
3796 token = cpp_get_token (pfile);
3797 if (token->flags & PREV_WHITE)
3798 result[out++] = ' ';
3799 }
3800
3801 result[out] = '\0';
3802 return result;
3803 }
3804
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that an
   arbitrary expression can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3819
3820 /* Create a new allocation buffer. Place the control block at the end
3821 of the buffer, so that buffer overflows will cause immediate chaos. */
3822 static _cpp_buff *
3823 new_buff (size_t len)
3824 {
3825 _cpp_buff *result;
3826 unsigned char *base;
3827
3828 if (len < MIN_BUFF_SIZE)
3829 len = MIN_BUFF_SIZE;
3830 len = CPP_ALIGN (len);
3831
3832 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3833 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3834 struct first. */
3835 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3836 base = XNEWVEC (unsigned char, len + slen);
3837 result = (_cpp_buff *) base;
3838 base += slen;
3839 #else
3840 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3841 result = (_cpp_buff *) (base + len);
3842 #endif
3843 result->base = base;
3844 result->cur = base;
3845 result->limit = base + len;
3846 result->next = NULL;
3847 return result;
3848 }
3849
3850 /* Place a chain of unwanted allocation buffers on the free list. */
3851 void
3852 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3853 {
3854 _cpp_buff *end = buff;
3855
3856 while (end->next)
3857 end = end->next;
3858 end->next = pfile->free_buffs;
3859 pfile->free_buffs = buff;
3860 }
3861
3862 /* Return a free buffer of size at least MIN_SIZE. */
3863 _cpp_buff *
3864 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3865 {
3866 _cpp_buff *result, **p;
3867
3868 for (p = &pfile->free_buffs;; p = &(*p)->next)
3869 {
3870 size_t size;
3871
3872 if (*p == NULL)
3873 return new_buff (min_size);
3874 result = *p;
3875 size = result->limit - result->base;
3876 /* Return a buffer that's big enough, but don't waste one that's
3877 way too big. */
3878 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3879 break;
3880 }
3881
3882 *p = result->next;
3883 result->next = NULL;
3884 result->cur = result->base;
3885 return result;
3886 }
3887
3888 /* Creates a new buffer with enough space to hold the uncommitted
3889 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
3890 the excess bytes to the new buffer. Chains the new buffer after
3891 BUFF, and returns the new buffer. */
3892 _cpp_buff *
3893 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3894 {
3895 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3896 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3897
3898 buff->next = new_buff;
3899 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3900 return new_buff;
3901 }
3902
3903 /* Creates a new buffer with enough space to hold the uncommitted
3904 remaining bytes of the buffer pointed to by BUFF, and at least
3905 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
3906 Chains the new buffer before the buffer pointed to by BUFF, and
3907 updates the pointer to point to the new buffer. */
3908 void
3909 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3910 {
3911 _cpp_buff *new_buff, *old_buff = *pbuff;
3912 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3913
3914 new_buff = _cpp_get_buff (pfile, size);
3915 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3916 new_buff->next = old_buff;
3917 *pbuff = new_buff;
3918 }
3919
3920 /* Free a chain of buffers starting at BUFF. */
3921 void
3922 _cpp_free_buff (_cpp_buff *buff)
3923 {
3924 _cpp_buff *next;
3925
3926 for (; buff; buff = next)
3927 {
3928 next = buff->next;
3929 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3930 free (buff);
3931 #else
3932 free (buff->base);
3933 #endif
3934 }
3935 }
3936
3937 /* Allocate permanent, unaligned storage of length LEN. */
3938 unsigned char *
3939 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3940 {
3941 _cpp_buff *buff = pfile->u_buff;
3942 unsigned char *result = buff->cur;
3943
3944 if (len > (size_t) (buff->limit - result))
3945 {
3946 buff = _cpp_get_buff (pfile, len);
3947 buff->next = pfile->u_buff;
3948 pfile->u_buff = buff;
3949 result = buff->cur;
3950 }
3951
3952 buff->cur = result + len;
3953 return result;
3954 }
3955
3956 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3957 That buffer is used for growing allocations when saving macro
3958 replacement lists in a #define, and when parsing an answer to an
3959 assertion in #assert, #unassert or #if (and therefore possibly
3960 whilst expanding macros). It therefore must not be used by any
3961 code that they might call: specifically the lexer and the guts of
3962 the macro expander.
3963
3964 All existing other uses clearly fit this restriction: storing
3965 registered pragmas during initialization. */
3966 unsigned char *
3967 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3968 {
3969 _cpp_buff *buff = pfile->a_buff;
3970 unsigned char *result = buff->cur;
3971
3972 if (len > (size_t) (buff->limit - result))
3973 {
3974 buff = _cpp_get_buff (pfile, len);
3975 buff->next = pfile->a_buff;
3976 pfile->a_buff = buff;
3977 result = buff->cur;
3978 }
3979
3980 buff->cur = result + len;
3981 return result;
3982 }
3983
3984 /* Commit or allocate storage from a buffer. */
3985
3986 void *
3987 _cpp_commit_buff (cpp_reader *pfile, size_t size)
3988 {
3989 void *ptr = BUFF_FRONT (pfile->a_buff);
3990
3991 if (pfile->hash_table->alloc_subobject)
3992 {
3993 void *copy = pfile->hash_table->alloc_subobject (size);
3994 memcpy (copy, ptr, size);
3995 ptr = copy;
3996 }
3997 else
3998 BUFF_FRONT (pfile->a_buff) += size;
3999
4000 return ptr;
4001 }
4002
4003 /* Say which field of TOK is in use. */
4004
4005 enum cpp_token_fld_kind
4006 cpp_token_val_index (const cpp_token *tok)
4007 {
4008 switch (TOKEN_SPELL (tok))
4009 {
4010 case SPELL_IDENT:
4011 return CPP_TOKEN_FLD_NODE;
4012 case SPELL_LITERAL:
4013 return CPP_TOKEN_FLD_STR;
4014 case SPELL_OPERATOR:
4015 /* Operands which were originally spelled as ident keep around
4016 the node for the exact spelling. */
4017 if (tok->flags & NAMED_OP)
4018 return CPP_TOKEN_FLD_NODE;
4019 else if (tok->type == CPP_PASTE)
4020 return CPP_TOKEN_FLD_TOKEN_NO;
4021 else
4022 return CPP_TOKEN_FLD_NONE;
4023 case SPELL_NONE:
4024 if (tok->type == CPP_MACRO_ARG)
4025 return CPP_TOKEN_FLD_ARG_NO;
4026 else if (tok->type == CPP_PADDING)
4027 return CPP_TOKEN_FLD_SOURCE;
4028 else if (tok->type == CPP_PRAGMA)
4029 return CPP_TOKEN_FLD_PRAGMA;
4030 /* fall through */
4031 default:
4032 return CPP_TOKEN_FLD_NONE;
4033 }
4034 }
4035
4036 /* All tokens lexed in R after calling this function will be forced to
4037 have their location_t to be P, until
4038 cpp_stop_forcing_token_locations is called for R. */
4039
4040 void
4041 cpp_force_token_locations (cpp_reader *r, location_t loc)
4042 {
4043 r->forced_token_location = loc;
4044 }
4045
4046 /* Go back to assigning locations naturally for lexed tokens. */
4047
4048 void
4049 cpp_stop_forcing_token_locations (cpp_reader *r)
4050 {
4051 r->forced_token_location = 0;
4052 }
4053
/* PEEK points at a backslash.  While that backslash escapes an end of
   line (LF, CR or CR LF), step past the escaped line end, repeating
   for as long as another backslash follows immediately.  Never
   advance to or beyond LIMIT.  Return the resulting position, which
   is PEEK itself when nothing was skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A CR may be followed by an LF belonging to the same line
           ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        break;

      if (!__builtin_expect (after < limit, true))
        break;

      peek = after;

      /* The user might be perverse and chain several of these.  */
      if (*peek != '\\')
        break;
    }

  return peek;
}
4083
/* Return the position to examine next: PEEK itself unless it points
   at a backslash, in which case any escaped line ends are skipped
   with do_peek_backslash, which never advances to or beyond LIMIT.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4091
/* Step backwards from PEEK to the previous character, never reading
   before BOUND.  Return NULL if PEEK is already at BOUND.  Otherwise
   return a pointer to the preceding character, except that a line end
   (LF, CR or CR LF) preceded by a backslash is an escaped newline and
   is skipped, in which case the search continues from the backslash.
   A line end that is not escaped is returned as is.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for a carriage return, not the letter 'r'; otherwise a
     backslash followed by 'r' would be taken for a line
     continuation.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR LF: look one further back for the backslash, unless
             the CR is the first character available.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4120
/* MATCH is an identifier whose first character the caller has already
   matched at PEEK[-1].  Check that the rest of MATCH follows at PEEK,
   allowing for escaped newlines, and that the identifier ends there.
   On success return the position after the identifier and any
   trailing spaces, tabs and escaped newlines; otherwise return
   NULL.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  /* Compare from the second character of MATCH onwards.  */
  while (*++match)
    {
      if (*peek != *match)
        {
          /* Retry once past any escaped newline.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *match)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      peek = do_peek_backslash (peek, limit);
      /* A backslash that did not escape a newline ends the scan.  */
      if (*peek == '\\')
        break;
    }

  return peek;
}
4154
4155 /* Are we looking at a module control line starting as PEEK - 1? */
4156
4157 static bool
4158 do_peek_module (cpp_reader *pfile, unsigned char c,
4159 const unsigned char *peek, const unsigned char *limit)
4160 {
4161 bool import = false;
4162
4163 if (__builtin_expect (c == 'e', false))
4164 {
4165 if (!((peek[0] == 'x' || peek[0] == '\\')
4166 && (peek = do_peek_ident ("export", peek, limit))))
4167 return false;
4168
4169 /* export, peek for import or module. No need to peek __import
4170 here. */
4171 if (peek[0] == 'i')
4172 {
4173 if (!((peek[1] == 'm' || peek[1] == '\\')
4174 && (peek = do_peek_ident ("import", peek + 1, limit))))
4175 return false;
4176 import = true;
4177 }
4178 else if (peek[0] == 'm')
4179 {
4180 if (!((peek[1] == 'o' || peek[1] == '\\')
4181 && (peek = do_peek_ident ("module", peek + 1, limit))))
4182 return false;
4183 }
4184 else
4185 return false;
4186 }
4187 else if (__builtin_expect (c == 'i', false))
4188 {
4189 if (!((peek[0] == 'm' || peek[0] == '\\')
4190 && (peek = do_peek_ident ("import", peek, limit))))
4191 return false;
4192 import = true;
4193 }
4194 else if (__builtin_expect (c == '_', false))
4195 {
4196 /* Needed for translated includes. */
4197 if (!((peek[0] == '_' || peek[0] == '\\')
4198 && (peek = do_peek_ident ("__import", peek, limit))))
4199 return false;
4200 import = true;
4201 }
4202 else if (__builtin_expect (c == 'm', false))
4203 {
4204 if (!((peek[0] == 'o' || peek[0] == '\\')
4205 && (peek = do_peek_ident ("module", peek, limit))))
4206 return false;
4207 }
4208 else
4209 return false;
4210
4211 /* Peek the next character to see if it's good enough. We'll be at
4212 the first non-whitespace char, including skipping an escaped
4213 newline. */
4214 /* ... import followed by identifier, ':', '<' or header-name
4215 preprocessing tokens, or module followed by identifier, ':' or
4216 ';' preprocessing tokens. */
4217 unsigned char p = *peek++;
4218
4219 /* A character literal is ... single quotes, ... optionally preceded
4220 by u8, u, U, or L */
4221 /* A string-literal is a ... double quotes, optionally prefixed by
4222 R, u8, u8R, u, uR, U, UR, L, or LR */
4223 if (p == 'u')
4224 {
4225 peek = do_peek_next (peek, limit);
4226 if (*peek == '8')
4227 {
4228 peek++;
4229 goto peek_u8;
4230 }
4231 goto peek_u;
4232 }
4233 else if (p == 'U' || p == 'L')
4234 {
4235 peek_u8:
4236 peek = do_peek_next (peek, limit);
4237 peek_u:
4238 if (*peek == '\"' || *peek == '\'')
4239 return false;
4240
4241 if (*peek == 'R')
4242 goto peek_R;
4243 /* Identifier. Ok. */
4244 }
4245 else if (p == 'R')
4246 {
4247 peek_R:
4248 if (CPP_OPTION (pfile, rliterals))
4249 {
4250 peek = do_peek_next (peek, limit);
4251 if (*peek == '\"')
4252 return false;
4253 }
4254 /* Identifier. Ok. */
4255 }
4256 else if ('Z' - 'A' == 25
4257 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4258 : ISIDST (p))
4259 {
4260 /* Identifier. Ok. */
4261 }
4262 else if (p == '<')
4263 {
4264 /* Maybe angle header, ok for import. Reject
4265 '<=', '<<' digraph:'<:'. */
4266 if (!import)
4267 return false;
4268 peek = do_peek_next (peek, limit);
4269 if (*peek == '=' || *peek == '<'
4270 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4271 return false;
4272 }
4273 else if (p == ';')
4274 {
4275 /* SEMICOLON, ok for module. */
4276 if (import)
4277 return false;
4278 }
4279 else if (p == '"')
4280 {
4281 /* STRING, ok for import. */
4282 if (!import)
4283 return false;
4284 }
4285 else if (p == ':')
4286 {
4287 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4288 peek = do_peek_next (peek, limit);
4289 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4290 return false;
4291 }
4292 else
4293 /* FIXME: Detect a unicode character, excluding those not
4294 permitted as the initial character. [lex.name]/1. I presume
4295 we need to check the \[uU] spellings, and directly using
4296 Unicode in say UTF8 form? Or perhaps we do the phase-1
4297 conversion of UTF8 to universal-character-names? */
4298 return false;
4299
4300 return true;
4301 }
4302
4303 /* Directives-only scanning. Somewhat more relaxed than correct
4304 parsing -- some ill-formed programs will not be rejected. */
4305
4306 void
4307 cpp_directive_only_process (cpp_reader *pfile,
4308 void *data,
4309 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4310 {
4311 bool module_p = CPP_OPTION (pfile, module_directives);
4312
4313 do
4314 {
4315 restart:
4316 /* Buffer initialization, but no line cleaning. */
4317 cpp_buffer *buffer = pfile->buffer;
4318 buffer->cur_note = buffer->notes_used = 0;
4319 buffer->cur = buffer->line_base = buffer->next_line;
4320 buffer->need_line = false;
4321 /* Files always end in a newline. We rely on this for
4322 character peeking safety. */
4323 gcc_assert (buffer->rlimit[-1] == '\n');
4324
4325 const unsigned char *base = buffer->cur;
4326 unsigned line_count = 0;
4327 const unsigned char *line_start = base;
4328
4329 bool bol = true;
4330 bool raw = false;
4331
4332 const unsigned char *lwm = base;
4333 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4334 pos < limit;)
4335 {
4336 unsigned char c = *pos++;
4337 /* This matches the switch in _cpp_lex_direct. */
4338 switch (c)
4339 {
4340 case ' ': case '\t': case '\f': case '\v':
4341 /* Whitespace, do nothing. */
4342 break;
4343
4344 case '\r': /* MAC line ending, or Windows \r\n */
4345 if (*pos == '\n')
4346 pos++;
4347 /* FALLTHROUGH */
4348
4349 case '\n':
4350 bol = true;
4351
4352 next_line:
4353 CPP_INCREMENT_LINE (pfile, 0);
4354 line_count++;
4355 line_start = pos;
4356 break;
4357
4358 case '\\':
4359 /* <backslash><newline> is removed, and doesn't undo any
4360 preceeding escape or whatnot. */
4361 if (*pos == '\n')
4362 {
4363 pos++;
4364 goto next_line;
4365 }
4366 else if (*pos == '\r')
4367 {
4368 if (pos[1] == '\n')
4369 pos++;
4370 pos++;
4371 goto next_line;
4372 }
4373 goto dflt;
4374
4375 case '#':
4376 if (bol)
4377 {
4378 /* Line directive. */
4379 if (pos - 1 > base && !pfile->state.skipping)
4380 cb (pfile, CPP_DO_print, data,
4381 line_count, base, pos - 1 - base);
4382
4383 /* Prep things for directive handling. */
4384 buffer->next_line = pos;
4385 buffer->need_line = true;
4386 bool ok = _cpp_get_fresh_line (pfile);
4387 gcc_checking_assert (ok);
4388
4389 /* Ensure proper column numbering for generated
4390 error messages. */
4391 buffer->line_base -= pos - line_start;
4392
4393 _cpp_handle_directive (pfile, line_start + 1 != pos);
4394
4395 /* Sanitize the line settings. Duplicate #include's can
4396 mess things up. */
4397 // FIXME: Necessary?
4398 pfile->line_table->highest_location
4399 = pfile->line_table->highest_line;
4400
4401 if (!pfile->state.skipping
4402 && pfile->buffer->next_line < pfile->buffer->rlimit)
4403 cb (pfile, CPP_DO_location, data,
4404 pfile->line_table->highest_line);
4405
4406 goto restart;
4407 }
4408 goto dflt;
4409
4410 case '/':
4411 {
4412 const unsigned char *peek = do_peek_next (pos, limit);
4413 if (!(*peek == '/' || *peek == '*'))
4414 goto dflt;
4415
4416 /* Line or block comment */
4417 bool is_block = *peek == '*';
4418 bool star = false;
4419 bool esc = false;
4420 location_t sloc
4421 = linemap_position_for_column (pfile->line_table,
4422 pos - line_start);
4423
4424 while (pos < limit)
4425 {
4426 char c = *pos++;
4427 switch (c)
4428 {
4429 case '\\':
4430 esc = true;
4431 break;
4432
4433 case '\r':
4434 if (*pos == '\n')
4435 pos++;
4436 /* FALLTHROUGH */
4437
4438 case '\n':
4439 {
4440 CPP_INCREMENT_LINE (pfile, 0);
4441 line_count++;
4442 line_start = pos;
4443 if (!esc && !is_block)
4444 {
4445 bol = true;
4446 goto done_comment;
4447 }
4448 }
4449 if (!esc)
4450 star = false;
4451 esc = false;
4452 break;
4453
4454 case '*':
4455 if (pos > peek && !esc)
4456 star = is_block;
4457 esc = false;
4458 break;
4459
4460 case '/':
4461 if (star)
4462 goto done_comment;
4463 /* FALLTHROUGH */
4464
4465 default:
4466 star = false;
4467 esc = false;
4468 break;
4469 }
4470 }
4471 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4472 "unterminated comment");
4473 done_comment:
4474 lwm = pos;
4475 break;
4476 }
4477
4478 case '\'':
4479 if (!CPP_OPTION (pfile, digit_separators))
4480 goto delimited_string;
4481
4482 /* Possibly a number punctuator. */
4483 if (!ISIDNUM (*do_peek_next (pos, limit)))
4484 goto delimited_string;
4485
4486 goto quote_peek;
4487
4488 case '\"':
4489 if (!CPP_OPTION (pfile, rliterals))
4490 goto delimited_string;
4491
4492 quote_peek:
4493 {
4494 /* For ' see if it's a number punctuator
4495 \.?<digit>(<digit>|<identifier-nondigit>
4496 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4497 /* For " see if it's a raw string
4498 {U,L,u,u8}R. This includes CPP_NUMBER detection,
4499 because that could be 0e+R. */
4500 const unsigned char *peek = pos - 1;
4501 bool quote_first = c == '"';
4502 bool quote_eight = false;
4503 bool maybe_number_start = false;
4504 bool want_number = false;
4505
4506 while ((peek = do_peek_prev (peek, lwm)))
4507 {
4508 unsigned char p = *peek;
4509 if (quote_first)
4510 {
4511 if (!raw)
4512 {
4513 if (p != 'R')
4514 break;
4515 raw = true;
4516 continue;
4517 }
4518
4519 quote_first = false;
4520 if (p == 'L' || p == 'U' || p == 'u')
4521 ;
4522 else if (p == '8')
4523 quote_eight = true;
4524 else
4525 goto second_raw;
4526 }
4527 else if (quote_eight)
4528 {
4529 if (p != 'u')
4530 {
4531 raw = false;
4532 break;
4533 }
4534 quote_eight = false;
4535 }
4536 else if (c == '"')
4537 {
4538 second_raw:;
4539 if (!want_number && ISIDNUM (p))
4540 {
4541 raw = false;
4542 break;
4543 }
4544 }
4545
4546 if (ISDIGIT (p))
4547 maybe_number_start = true;
4548 else if (p == '.')
4549 want_number = true;
4550 else if (ISIDNUM (p))
4551 maybe_number_start = false;
4552 else if (p == '+' || p == '-')
4553 {
4554 if (const unsigned char *peek_prev
4555 = do_peek_prev (peek, lwm))
4556 {
4557 p = *peek_prev;
4558 if (p == 'e' || p == 'E'
4559 || p == 'p' || p == 'P')
4560 {
4561 want_number = true;
4562 maybe_number_start = false;
4563 }
4564 else
4565 break;
4566 }
4567 else
4568 break;
4569 }
4570 else if (p == '\'' || p == '\"')
4571 {
4572 /* If this is lwm, this must be the end of a
4573 previous string. So this is a trailing
4574 literal type, (a) if those are allowed,
4575 and (b) maybe_start is false. Otherwise
4576 this must be a CPP_NUMBER because we've
4577 met another ', and we'd have checked that
4578 in its own right. */
4579 if (peek == lwm && CPP_OPTION (pfile, uliterals))
4580 {
4581 if (!maybe_number_start && !want_number)
4582 /* Must be a literal type. */
4583 raw = false;
4584 }
4585 else if (p == '\''
4586 && CPP_OPTION (pfile, digit_separators))
4587 maybe_number_start = true;
4588 break;
4589 }
4590 else if (c == '\'')
4591 break;
4592 else if (!quote_first && !quote_eight)
4593 break;
4594 }
4595
4596 if (maybe_number_start)
4597 {
4598 if (c == '\'')
4599 /* A CPP NUMBER. */
4600 goto dflt;
4601 raw = false;
4602 }
4603
4604 goto delimited_string;
4605 }
4606
4607 delimited_string:
4608 {
4609 /* (Possibly raw) string or char literal. */
4610 unsigned char end = c;
4611 int delim_len = -1;
4612 const unsigned char *delim = NULL;
4613 location_t sloc = linemap_position_for_column (pfile->line_table,
4614 pos - line_start);
4615 int esc = 0;
4616
4617 if (raw)
4618 {
4619 /* There can be no line breaks in the delimiter. */
4620 delim = pos;
4621 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4622 {
4623 if (delim_len == 16)
4624 {
4625 cpp_error_with_line (pfile, CPP_DL_ERROR,
4626 sloc, 0,
4627 "raw string delimiter"
4628 " longer than %d"
4629 " characters",
4630 delim_len);
4631 raw = false;
4632 pos = delim;
4633 break;
4634 }
4635 if (strchr (") \\\t\v\f\n", c))
4636 {
4637 cpp_error_with_line (pfile, CPP_DL_ERROR,
4638 sloc, 0,
4639 "invalid character '%c'"
4640 " in raw string"
4641 " delimiter", c);
4642 raw = false;
4643 pos = delim;
4644 break;
4645 }
4646 if (pos >= limit)
4647 goto bad_string;
4648 }
4649 }
4650
4651 while (pos < limit)
4652 {
4653 char c = *pos++;
4654 switch (c)
4655 {
4656 case '\\':
4657 if (!raw)
4658 esc++;
4659 break;
4660
4661 case '\r':
4662 if (*pos == '\n')
4663 pos++;
4664 /* FALLTHROUGH */
4665
4666 case '\n':
4667 {
4668 CPP_INCREMENT_LINE (pfile, 0);
4669 line_count++;
4670 line_start = pos;
4671 }
4672 if (esc)
4673 esc--;
4674 break;
4675
4676 case ')':
4677 if (raw
4678 && pos + delim_len + 1 < limit
4679 && pos[delim_len] == end
4680 && !memcmp (delim, pos, delim_len))
4681 {
4682 pos += delim_len + 1;
4683 raw = false;
4684 goto done_string;
4685 }
4686 break;
4687
4688 default:
4689 if (!raw && !(esc & 1) && c == end)
4690 goto done_string;
4691 esc = 0;
4692 break;
4693 }
4694 }
4695 bad_string:
4696 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4697 "unterminated literal");
4698
4699 done_string:
4700 raw = false;
4701 lwm = pos - 1;
4702 }
4703 goto dflt;
4704
4705 case '_':
4706 case 'e':
4707 case 'i':
4708 case 'm':
4709 if (bol && module_p && !pfile->state.skipping
4710 && do_peek_module (pfile, c, pos, limit))
4711 {
4712 /* We've seen the start of a module control line.
4713 Start up the tokenizer. */
4714 pos--; /* Backup over the first character. */
4715
4716 /* Backup over whitespace to start of line. */
4717 while (pos > line_start
4718 && (pos[-1] == ' ' || pos[-1] == '\t'))
4719 pos--;
4720
4721 if (pos > base)
4722 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4723
4724 /* Prep things for directive handling. */
4725 buffer->next_line = pos;
4726 buffer->need_line = true;
4727
4728 /* Now get tokens until the PRAGMA_EOL. */
4729 do
4730 {
4731 location_t spelling;
4732 const cpp_token *tok
4733 = cpp_get_token_with_location (pfile, &spelling);
4734
4735 gcc_assert (pfile->state.in_deferred_pragma
4736 || tok->type == CPP_PRAGMA_EOL);
4737 cb (pfile, CPP_DO_token, data, tok, spelling);
4738 }
4739 while (pfile->state.in_deferred_pragma);
4740
4741 if (pfile->buffer->next_line < pfile->buffer->rlimit)
4742 cb (pfile, CPP_DO_location, data,
4743 pfile->line_table->highest_line);
4744
4745 pfile->mi_valid = false;
4746 goto restart;
4747 }
4748 goto dflt;
4749
4750 default:
4751 dflt:
4752 bol = false;
4753 pfile->mi_valid = false;
4754 break;
4755 }
4756 }
4757
4758 if (buffer->rlimit > base && !pfile->state.skipping)
4759 cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base);
4760
4761 _cpp_pop_buffer (pfile);
4762 }
4763 while (pfile->buffer);
4764 }