Remove path name from test case
[binutils-gdb.git] / gas / app.c
1 /* This is the Assembler Pre-Processor
2 Copyright (C) 1987-2023 Free Software Foundation, Inc.
3
4 This file is part of GAS, the GNU Assembler.
5
6 GAS is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GAS is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
14 License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GAS; see the file COPYING. If not, write to the Free
18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
19 02110-1301, USA. */
20
21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */
22 /* App, the assembler pre-processor. This pre-processor strips out
23 excess spaces, turns single-quoted characters into a decimal
24 constant, and turns the # in # <number> <filename> <garbage> into a
25 .linefile. This needs better error-handling. */
26
27 #include "as.h"
28
/* Compilers that do not claim ISO C conformance may not understand
   `const'; define it away for them unless a definition already
   exists.  */
#if (__STDC__ != 1)
#ifndef const
#define const /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When non-zero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H,
   and do_scrub_chars warns about a single quote seen directly after
   a symbol.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  The matcher there also accepts '1'
   where this template has '0', and any whitespace character where it
   has a space.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without m68k support MRI mode never applies, so tests of this name
   become compile-time constants.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char symver_pseudo[] = ".symver";
/* NULL while no match is in progress; otherwise points at the next
   character of symver_pseudo that is still to be matched.  */
static const char * symver_state;
#endif
58
/* Part of the scrubber state that app_push saves and app_pop
   restores.  */
static char last_char;

/* Lexical class of each input byte, indexed by the byte's value as an
   unsigned char.  Filled in by do_scrub_begin; a zero entry means no
   class has been assigned.  */
static char lex[256];
/* Characters that do_scrub_begin marks as symbol components before
   applying the comment and separator classifications, which may
   override them.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* The values stored in lex[].  */
#define LEX_IS_SYMBOL_COMPONENT 1
#define LEX_IS_WHITESPACE 2 /* Space, tab and carriage return.  */
#define LEX_IS_LINE_SEPARATOR 3 /* From tc_line_separator_chars.  */
#define LEX_IS_COMMENT_START 4 /* From tc_comment_chars.  */
#define LEX_IS_LINE_COMMENT_START 5 /* From line_comment_chars.  */
#define LEX_IS_TWOCHAR_COMMENT_1ST 6 /* '/' when not otherwise in use.  */
#define LEX_IS_STRINGQUOTE 8
#define LEX_IS_COLON 9
#define LEX_IS_NEWLINE 10
#define LEX_IS_ONECHAR_QUOTE 11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST 12 /* '-', possible start of a "--" comment.  */
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST 13 /* '|', possible start of "||".  */
#endif
#define LEX_IS_PARALLEL_SEPARATOR 14 /* From tc_parallel_separator_chars.  */
#ifdef H_TICK_HEX
#define LEX_IS_H 15 /* 'h' and 'H' when enable_h_tick_hex is set.  */
#endif
/* Class predicates.  C is used directly as the index into lex[], so
   it must be in the range 0..255 (in particular, it must not be
   EOF).  */
#define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE)

/* Translates the character following a backslash in a one-character
   quote; defined below.  */
static int process_escape (int);
97
98 /* FIXME-soon: The entire lexer/parser thingy should be
99 built statically at compile time rather than dynamically
100 each and every time the assembler is run. xoxorich. */
101
102 void
103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
104 {
105 const char *p;
106 int c;
107
108 lex[' '] = LEX_IS_WHITESPACE;
109 lex['\t'] = LEX_IS_WHITESPACE;
110 lex['\r'] = LEX_IS_WHITESPACE;
111 lex['\n'] = LEX_IS_NEWLINE;
112 lex[':'] = LEX_IS_COLON;
113
114 #ifdef TC_M68K
115 scrub_m68k_mri = m68k_mri;
116
117 if (! m68k_mri)
118 #endif
119 {
120 lex['"'] = LEX_IS_STRINGQUOTE;
121
122 #if ! defined (TC_HPPA)
123 lex['\''] = LEX_IS_ONECHAR_QUOTE;
124 #endif
125
126 #ifdef SINGLE_QUOTE_STRINGS
127 lex['\''] = LEX_IS_STRINGQUOTE;
128 #endif
129 }
130
131 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
132 in state 5 of do_scrub_chars must be changed. */
133
134 /* Note that these override the previous defaults, e.g. if ';' is a
135 comment char, then it isn't a line separator. */
136 for (p = symbol_chars; *p; ++p)
137 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
138
139 for (c = 128; c < 256; ++c)
140 lex[c] = LEX_IS_SYMBOL_COMPONENT;
141
142 #ifdef tc_symbol_chars
143 /* This macro permits the processor to specify all characters which
144 may appears in an operand. This will prevent the scrubber from
145 discarding meaningful whitespace in certain cases. The i386
146 backend uses this to support prefixes, which can confuse the
147 scrubber as to whether it is parsing operands or opcodes. */
148 for (p = tc_symbol_chars; *p; ++p)
149 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
150 #endif
151
152 /* The m68k backend wants to be able to change comment_chars. */
153 #ifndef tc_comment_chars
154 #define tc_comment_chars comment_chars
155 #endif
156 for (p = tc_comment_chars; *p; p++)
157 lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
158
159 for (p = line_comment_chars; *p; p++)
160 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
161
162 #ifndef tc_line_separator_chars
163 #define tc_line_separator_chars line_separator_chars
164 #endif
165 for (p = tc_line_separator_chars; *p; p++)
166 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
167
168 #ifdef tc_parallel_separator_chars
169 /* This macro permits the processor to specify all characters which
170 separate parallel insns on the same line. */
171 for (p = tc_parallel_separator_chars; *p; p++)
172 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
173 #endif
174
175 /* Only allow slash-star comments if slash is not in use.
176 FIXME: This isn't right. We should always permit them. */
177 if (lex['/'] == 0)
178 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
179
180 #ifdef TC_M68K
181 if (m68k_mri)
182 {
183 lex['\''] = LEX_IS_STRINGQUOTE;
184 lex[';'] = LEX_IS_COMMENT_START;
185 lex['*'] = LEX_IS_LINE_COMMENT_START;
186 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
187 then it can't be used in an expression. */
188 lex['!'] = LEX_IS_LINE_COMMENT_START;
189 }
190 #endif
191
192 #ifdef TC_V850
193 lex['-'] = LEX_IS_DOUBLEDASH_1ST;
194 #endif
195 #ifdef DOUBLEBAR_PARALLEL
196 lex['|'] = LEX_IS_DOUBLEBAR_1ST;
197 #endif
198 #ifdef TC_D30V
199 /* Must do this is we want VLIW instruction with "->" or "<-". */
200 lex['-'] = LEX_IS_SYMBOL_COMPONENT;
201 #endif
202
203 #ifdef H_TICK_HEX
204 if (enable_h_tick_hex)
205 {
206 lex['h'] = LEX_IS_H;
207 lex['H'] = LEX_IS_H;
208 }
209 #endif
210 }
211
/* Saved state of the scrubber.  These persist between calls of
   do_scrub_chars so that it can return at any point and resume
   later.  */
/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to go back to once the current string, comment or queued
   output has been dealt with.  */
static int old_state;
/* Text still to be emitted while in state -1.  */
static const char *out_string;
/* Holds the decimal text generated for a one-character quote.  */
static char out_buf[20];
/* Number of newlines consumed inside comments or continued strings
   that have yet to be re-emitted.  */
static int add_newlines;
/* Unprocessed input carried over to the next do_scrub_chars call, or
   NULL if there is none, and its length.  */
static char *saved_input;
static size_t saved_input_len;
/* Buffer into which the GET callback reads raw input.  */
static char input_buffer[32 * 1024];
/* While a possible .mri pseudo-op is being matched, points at the
   next character of mri_pseudo to match; NULL otherwise.  */
static const char *mri_state;
/* Most recent character accepted by the .mri matcher.  */
static char mri_last_ch;
223
224 /* Data structure for saving the state of app across #include's. Note that
225 app is called asynchronously to the parsing of the .include's, so our
226 state at the time .include is interpreted is completely unrelated.
227 That's why we have to save it all. */
228
/* Each member holds the value of the file-scope scrubber variable of
   the same name; app_push fills the structure in and app_pop copies
   it back.  */
struct app_save
{
  int state;
  int old_state;
  const char * out_string;
  char out_buf[sizeof (out_buf)];
  int add_newlines;
  /* Separately allocated copy of the pending input, or NULL if there
     was none.  saved_input_len is only set when saved_input is not
     NULL.  */
  char * saved_input;
  size_t saved_input_len;
#ifdef TC_M68K
  int scrub_m68k_mri;
#endif
  const char * mri_state;
  char mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
  char last_char;
};
248
249 char *
250 app_push (void)
251 {
252 struct app_save *saved;
253
254 saved = XNEW (struct app_save);
255 saved->state = state;
256 saved->old_state = old_state;
257 saved->out_string = out_string;
258 memcpy (saved->out_buf, out_buf, sizeof (out_buf));
259 saved->add_newlines = add_newlines;
260 if (saved_input == NULL)
261 saved->saved_input = NULL;
262 else
263 {
264 saved->saved_input = XNEWVEC (char, saved_input_len);
265 memcpy (saved->saved_input, saved_input, saved_input_len);
266 saved->saved_input_len = saved_input_len;
267 }
268 #ifdef TC_M68K
269 saved->scrub_m68k_mri = scrub_m68k_mri;
270 #endif
271 saved->mri_state = mri_state;
272 saved->mri_last_ch = mri_last_ch;
273 #if defined TC_ARM && defined OBJ_ELF
274 saved->symver_state = symver_state;
275 #endif
276 saved->last_char = last_char;
277
278 /* do_scrub_begin() is not useful, just wastes time. */
279
280 state = 0;
281 saved_input = NULL;
282 add_newlines = 0;
283
284 return (char *) saved;
285 }
286
287 void
288 app_pop (char *arg)
289 {
290 struct app_save *saved = (struct app_save *) arg;
291
292 /* There is no do_scrub_end (). */
293 state = saved->state;
294 old_state = saved->old_state;
295 out_string = saved->out_string;
296 memcpy (out_buf, saved->out_buf, sizeof (out_buf));
297 add_newlines = saved->add_newlines;
298 if (saved->saved_input == NULL)
299 saved_input = NULL;
300 else
301 {
302 gas_assert (saved->saved_input_len <= sizeof (input_buffer));
303 memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
304 saved_input = input_buffer;
305 saved_input_len = saved->saved_input_len;
306 free (saved->saved_input);
307 }
308 #ifdef TC_M68K
309 scrub_m68k_mri = saved->scrub_m68k_mri;
310 #endif
311 mri_state = saved->mri_state;
312 mri_last_ch = saved->mri_last_ch;
313 #if defined TC_ARM && defined OBJ_ELF
314 symver_state = saved->symver_state;
315 #endif
316 last_char = saved->last_char;
317
318 free (arg);
319 }
320
321 /* @@ This assumes that \n &c are the same on host and target. This is not
322 necessarily true. */
323
/* Return the character denoted by a backslash followed by CH.  The
   letters b, f, n, r and t yield the corresponding control
   characters.  Every other value, including the quote characters '
   and ", is returned unchanged.  */

static int
process_escape (int ch)
{
  static const struct
  {
    char letter;
    char value;
  }
  escapes[] =
  {
    { 'b', '\b' },
    { 'f', '\f' },
    { 'n', '\n' },
    { 'r', '\r' },
    { 't', '\t' }
  };
  size_t i;

  for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++)
    if (ch == escapes[i].letter)
      return escapes[i].value;

  /* Not a recognised escape letter: the character stands for
     itself.  */
  return ch;
}
347
348 #define MULTIBYTE_WARN_COUNT_LIMIT 10
349 static unsigned int multibyte_warn_count = 0;
350
351 bool
352 scan_for_multibyte_characters (const unsigned char * start,
353 const unsigned char * end,
354 bool warn)
355 {
356 if (end <= start)
357 return false;
358
359 if (warn && multibyte_warn_count > MULTIBYTE_WARN_COUNT_LIMIT)
360 return false;
361
362 bool found = false;
363
364 while (start < end)
365 {
366 unsigned char c;
367
368 if ((c = * start++) <= 0x7f)
369 continue;
370
371 if (!warn)
372 return true;
373
374 found = true;
375
376 const char * filename;
377 unsigned int lineno;
378
379 filename = as_where (& lineno);
380 if (filename == NULL)
381 as_warn (_("multibyte character (%#x) encountered in input"), c);
382 else if (lineno == 0)
383 as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
384 else
385 as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);
386
387 if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
388 {
389 as_warn (_("further multibyte character warnings suppressed"));
390 break;
391 }
392 }
393
394 return found;
395 }
396
397 /* This function is called to process input characters. The GET
398 parameter is used to retrieve more input characters. GET should
399 set its parameter to point to a buffer, and return the length of
400 the buffer; it should return 0 at end of file. The scrubbed output
401 characters are put into the buffer starting at TOSTART; the TOSTART
402 buffer is TOLEN bytes in length. The function returns the number
403 of scrubbed characters put into TOSTART. This will be TOLEN unless
404 end of file was seen. This function is arranged as a state
405 machine, and saves its state so that it may return at any point.
406 This is the way the old code used to work. */
407
408 size_t
409 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
410 {
411 char *to = tostart;
412 char *toend = tostart + tolen;
413 char *from;
414 char *fromend;
415 size_t fromlen;
416 int ch, ch2 = 0;
417 /* Character that started the string we're working on. */
418 static char quotechar;
419
420 /*State 0: beginning of normal line
421 1: After first whitespace on line (flush more white)
422 2: After first non-white (opcode) on line (keep 1white)
423 3: after second white on line (into operands) (flush white)
424 4: after putting out a .linefile, put out digits
425 5: parsing a string, then go to old-state
426 6: putting out \ escape in a "d string.
427 7: no longer used
428 8: no longer used
429 9: After seeing symbol char in state 3 (keep 1white after symchar)
430 10: After seeing whitespace in state 9 (keep white before symchar)
431 11: After seeing a symbol character in state 0 (eg a label definition)
432 -1: output string in out_string and go to the state in old_state
433 -2: flush text until a '*' '/' is seen, then go to state old_state
434 #ifdef TC_V850
435 12: After seeing a dash, looking for a second dash as a start
436 of comment.
437 #endif
438 #ifdef DOUBLEBAR_PARALLEL
439 13: After seeing a vertical bar, looking for a second
440 vertical bar as a parallel expression separator.
441 #endif
442 #ifdef TC_PREDICATE_START_CHAR
443 14: After seeing a predicate start character at state 0, looking
444 for a predicate end character as predicate.
445 15: After seeing a predicate start character at state 1, looking
446 for a predicate end character as predicate.
447 #endif
448 #ifdef TC_Z80
449 16: After seeing an 'a' or an 'A' at the start of a symbol
450 17: After seeing an 'f' or an 'F' in state 16
451 #endif
452 */
453
454 /* I added states 9 and 10 because the MIPS ECOFF assembler uses
455 constructs like ``.loc 1 20''. This was turning into ``.loc
456 120''. States 9 and 10 ensure that a space is never dropped in
457 between characters which could appear in an identifier. Ian
458 Taylor, ian@cygnus.com.
459
460 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
461 correctly on the PA (and any other target where colons are optional).
462 Jeff Law, law@cs.utah.edu.
463
464 I added state 13 so that something like "cmp r1, r2 || trap #1" does not
465 get squashed into "cmp r1,r2||trap#1", with the all important space
466 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */
467
468 /* This macro gets the next input character. */
469
470 #define GET() \
471 (from < fromend \
472 ? * (unsigned char *) (from++) \
473 : (saved_input = NULL, \
474 fromlen = (*get) (input_buffer, sizeof input_buffer), \
475 from = input_buffer, \
476 fromend = from + fromlen, \
477 (fromlen == 0 \
478 ? EOF \
479 : * (unsigned char *) (from++))))
480
481 /* This macro pushes a character back on the input stream. */
482
483 #define UNGET(uch) (*--from = (uch))
484
485 /* This macro puts a character into the output buffer. If this
486 character fills the output buffer, this macro jumps to the label
487 TOFULL. We use this rather ugly approach because we need to
488 handle two different termination conditions: EOF on the input
489 stream, and a full output buffer. It would be simpler if we
490 always read in the entire input stream before processing it, but
491 I don't want to make such a significant change to the assembler's
492 memory usage. */
493
494 #define PUT(pch) \
495 do \
496 { \
497 *to++ = (pch); \
498 if (to >= toend) \
499 goto tofull; \
500 } \
501 while (0)
502
503 if (saved_input != NULL)
504 {
505 from = saved_input;
506 fromend = from + saved_input_len;
507 }
508 else
509 {
510 fromlen = (*get) (input_buffer, sizeof input_buffer);
511 if (fromlen == 0)
512 return 0;
513 from = input_buffer;
514 fromend = from + fromlen;
515
516 if (multibyte_handling == multibyte_warn)
517 (void) scan_for_multibyte_characters ((const unsigned char *) from,
518 (const unsigned char* ) fromend,
519 true /* Generate warnings. */);
520 }
521
522 while (1)
523 {
524 /* The cases in this switch end with continue, in order to
525 branch back to the top of this while loop and generate the
526 next output character in the appropriate state. */
527 switch (state)
528 {
529 case -1:
530 ch = *out_string++;
531 if (*out_string == '\0')
532 {
533 state = old_state;
534 old_state = 3;
535 }
536 PUT (ch);
537 continue;
538
539 case -2:
540 for (;;)
541 {
542 do
543 {
544 ch = GET ();
545
546 if (ch == EOF)
547 {
548 as_warn (_("end of file in comment"));
549 goto fromeof;
550 }
551
552 if (ch == '\n')
553 PUT ('\n');
554 }
555 while (ch != '*');
556
557 while ((ch = GET ()) == '*')
558 ;
559
560 if (ch == EOF)
561 {
562 as_warn (_("end of file in comment"));
563 goto fromeof;
564 }
565
566 if (ch == '/')
567 break;
568
569 UNGET (ch);
570 }
571
572 state = old_state;
573 UNGET (' ');
574 continue;
575
576 case 4:
577 ch = GET ();
578 if (ch == EOF)
579 goto fromeof;
580 else if (ch >= '0' && ch <= '9')
581 PUT (ch);
582 else
583 {
584 while (ch != EOF && IS_WHITESPACE (ch))
585 ch = GET ();
586 if (ch == '"')
587 {
588 quotechar = ch;
589 state = 5;
590 old_state = 3;
591 PUT (ch);
592 }
593 else
594 {
595 while (ch != EOF && ch != '\n')
596 ch = GET ();
597 state = 0;
598 PUT (ch);
599 }
600 }
601 continue;
602
603 case 5:
604 /* We are going to copy everything up to a quote character,
605 with special handling for a backslash. We try to
606 optimize the copying in the simple case without using the
607 GET and PUT macros. */
608 {
609 char *s;
610 ptrdiff_t len;
611
612 for (s = from; s < fromend; s++)
613 {
614 ch = *s;
615 if (ch == '\\'
616 || ch == quotechar
617 || ch == '\n')
618 break;
619 }
620 len = s - from;
621 if (len > toend - to)
622 len = toend - to;
623 if (len > 0)
624 {
625 memcpy (to, from, len);
626 to += len;
627 from += len;
628 if (to >= toend)
629 goto tofull;
630 }
631 }
632
633 ch = GET ();
634 if (ch == EOF)
635 {
636 /* This buffer is here specifically so
637 that the UNGET below will work. */
638 static char one_char_buf[1];
639
640 as_warn (_("end of file in string; '%c' inserted"), quotechar);
641 state = old_state;
642 from = fromend = one_char_buf + 1;
643 fromlen = 1;
644 UNGET ('\n');
645 PUT (quotechar);
646 }
647 else if (ch == quotechar)
648 {
649 state = old_state;
650 PUT (ch);
651 }
652 else if (TC_STRING_ESCAPES && ch == '\\')
653 {
654 state = 6;
655 PUT (ch);
656 }
657 else if (scrub_m68k_mri && ch == '\n')
658 {
659 /* Just quietly terminate the string. This permits lines like
660 bne label loop if we haven't reach end yet. */
661 state = old_state;
662 UNGET (ch);
663 PUT ('\'');
664 }
665 else
666 {
667 PUT (ch);
668 }
669 continue;
670
671 case 6:
672 state = 5;
673 ch = GET ();
674 switch (ch)
675 {
676 /* Handle strings broken across lines, by turning '\n' into
677 '\\' and 'n'. */
678 case '\n':
679 UNGET ('n');
680 add_newlines++;
681 PUT ('\\');
682 continue;
683
684 case EOF:
685 as_warn (_("end of file in string; '%c' inserted"), quotechar);
686 PUT (quotechar);
687 continue;
688
689 case '"':
690 case '\\':
691 case 'b':
692 case 'f':
693 case 'n':
694 case 'r':
695 case 't':
696 case 'v':
697 case 'x':
698 case 'X':
699 case '0':
700 case '1':
701 case '2':
702 case '3':
703 case '4':
704 case '5':
705 case '6':
706 case '7':
707 break;
708
709 default:
710 #ifdef ONLY_STANDARD_ESCAPES
711 as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
712 #endif
713 break;
714 }
715 PUT (ch);
716 continue;
717
718 #ifdef DOUBLEBAR_PARALLEL
719 case 13:
720 ch = GET ();
721 if (ch != '|')
722 abort ();
723
724 /* Reset back to state 1 and pretend that we are parsing a
725 line from just after the first white space. */
726 state = 1;
727 PUT ('|');
728 #ifdef TC_TIC6X
729 /* "||^" is used for SPMASKed instructions. */
730 ch = GET ();
731 if (ch == EOF)
732 goto fromeof;
733 else if (ch == '^')
734 PUT ('^');
735 else
736 UNGET (ch);
737 #endif
738 continue;
739 #endif
740 #ifdef TC_Z80
741 case 16:
742 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */
743 ch = GET ();
744 if (ch == 'f' || ch == 'F')
745 {
746 state = 17;
747 PUT (ch);
748 }
749 else
750 {
751 if (ch != EOF)
752 UNGET (ch);
753 state = 9;
754 break;
755 }
756 /* Fall through. */
757 case 17:
758 /* We have seen "af" at the start of a symbol,
759 a ' here is a part of that symbol. */
760 ch = GET ();
761 state = 9;
762 if (ch == '\'')
763 /* Change to avoid warning about unclosed string. */
764 PUT ('`');
765 else if (ch != EOF)
766 UNGET (ch);
767 break;
768 #endif
769 }
770
771 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */
772
773 /* flushchar: */
774 ch = GET ();
775
776 #ifdef TC_PREDICATE_START_CHAR
777 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
778 {
779 state += 14;
780 PUT (ch);
781 continue;
782 }
783 else if (state == 14 || state == 15)
784 {
785 if (ch == TC_PREDICATE_END_CHAR)
786 {
787 state -= 14;
788 PUT (ch);
789 ch = GET ();
790 }
791 else
792 {
793 PUT (ch);
794 continue;
795 }
796 }
797 #endif
798
799 recycle:
800
801 #if defined TC_ARM && defined OBJ_ELF
802 /* We need to watch out for .symver directives. See the comment later
803 in this function. */
804 if (symver_state == NULL)
805 {
806 if ((state == 0 || state == 1) && ch == symver_pseudo[0])
807 symver_state = symver_pseudo + 1;
808 }
809 else
810 {
811 /* We advance to the next state if we find the right
812 character. */
813 if (ch != '\0' && (*symver_state == ch))
814 ++symver_state;
815 else if (*symver_state != '\0')
816 /* We did not get the expected character, or we didn't
817 get a valid terminating character after seeing the
818 entire pseudo-op, so we must go back to the beginning. */
819 symver_state = NULL;
820 else
821 {
822 /* We've read the entire pseudo-op. If this is the end
823 of the line, go back to the beginning. */
824 if (IS_NEWLINE (ch))
825 symver_state = NULL;
826 }
827 }
828 #endif /* TC_ARM && OBJ_ELF */
829
830 #ifdef TC_M68K
831 /* We want to have pseudo-ops which control whether we are in
832 MRI mode or not. Unfortunately, since m68k MRI mode affects
833 the scrubber, that means that we need a special purpose
834 recognizer here. */
835 if (mri_state == NULL)
836 {
837 if ((state == 0 || state == 1)
838 && ch == mri_pseudo[0])
839 mri_state = mri_pseudo + 1;
840 }
841 else
842 {
843 /* We advance to the next state if we find the right
844 character, or if we need a space character and we get any
845 whitespace character, or if we need a '0' and we get a
846 '1' (this is so that we only need one state to handle
847 ``.mri 0'' and ``.mri 1''). */
848 if (ch != '\0'
849 && (*mri_state == ch
850 || (*mri_state == ' '
851 && lex[ch] == LEX_IS_WHITESPACE)
852 || (*mri_state == '0'
853 && ch == '1')))
854 {
855 mri_last_ch = ch;
856 ++mri_state;
857 }
858 else if (*mri_state != '\0'
859 || (lex[ch] != LEX_IS_WHITESPACE
860 && lex[ch] != LEX_IS_NEWLINE))
861 {
862 /* We did not get the expected character, or we didn't
863 get a valid terminating character after seeing the
864 entire pseudo-op, so we must go back to the
865 beginning. */
866 mri_state = NULL;
867 }
868 else
869 {
870 /* We've read the entire pseudo-op. mips_last_ch is
871 either '0' or '1' indicating whether to enter or
872 leave MRI mode. */
873 do_scrub_begin (mri_last_ch == '1');
874 mri_state = NULL;
875
876 /* We continue handling the character as usual. The
877 main gas reader must also handle the .mri pseudo-op
878 to control expression parsing and the like. */
879 }
880 }
881 #endif
882
883 if (ch == EOF)
884 {
885 if (state != 0)
886 {
887 as_warn (_("end of file not at end of a line; newline inserted"));
888 state = 0;
889 PUT ('\n');
890 }
891 goto fromeof;
892 }
893
894 switch (lex[ch])
895 {
896 case LEX_IS_WHITESPACE:
897 do
898 {
899 ch = GET ();
900 }
901 while (ch != EOF && IS_WHITESPACE (ch));
902 if (ch == EOF)
903 goto fromeof;
904
905 if (state == 0)
906 {
907 /* Preserve a single whitespace character at the
908 beginning of a line. */
909 state = 1;
910 UNGET (ch);
911 PUT (' ');
912 break;
913 }
914
915 #ifdef KEEP_WHITE_AROUND_COLON
916 if (lex[ch] == LEX_IS_COLON)
917 {
918 /* Only keep this white if there's no white *after* the
919 colon. */
920 ch2 = GET ();
921 if (ch2 != EOF)
922 UNGET (ch2);
923 if (!IS_WHITESPACE (ch2))
924 {
925 state = 9;
926 UNGET (ch);
927 PUT (' ');
928 break;
929 }
930 }
931 #endif
932 if (IS_COMMENT (ch)
933 || IS_LINE_SEPARATOR (ch)
934 || IS_PARALLEL_SEPARATOR (ch))
935 {
936 if (scrub_m68k_mri)
937 {
938 /* In MRI mode, we keep these spaces. */
939 UNGET (ch);
940 PUT (' ');
941 break;
942 }
943 goto recycle;
944 }
945
946 /* If we're in state 2 or 11, we've seen a non-white
947 character followed by whitespace. If the next character
948 is ':', this is whitespace after a label name which we
949 normally must ignore. In MRI mode, though, spaces are
950 not permitted between the label and the colon. */
951 if ((state == 2 || state == 11)
952 && lex[ch] == LEX_IS_COLON
953 && ! scrub_m68k_mri)
954 {
955 state = 1;
956 PUT (ch);
957 break;
958 }
959
960 switch (state)
961 {
962 case 1:
963 /* We can arrive here if we leave a leading whitespace
964 character at the beginning of a line. */
965 goto recycle;
966 case 2:
967 state = 3;
968 if (to + 1 < toend)
969 {
970 /* Optimize common case by skipping UNGET/GET. */
971 PUT (' '); /* Sp after opco */
972 goto recycle;
973 }
974 UNGET (ch);
975 PUT (' ');
976 break;
977 case 3:
978 #ifndef TC_KEEP_OPERAND_SPACES
979 /* For TI C6X, we keep these spaces as they may separate
980 functional unit specifiers from operands. */
981 if (scrub_m68k_mri)
982 #endif
983 {
984 /* In MRI mode, we keep these spaces. */
985 UNGET (ch);
986 PUT (' ');
987 break;
988 }
989 goto recycle; /* Sp in operands */
990 case 9:
991 case 10:
992 #ifndef TC_KEEP_OPERAND_SPACES
993 if (scrub_m68k_mri)
994 #endif
995 {
996 /* In MRI mode, we keep these spaces. */
997 state = 3;
998 UNGET (ch);
999 PUT (' ');
1000 break;
1001 }
1002 state = 10; /* Sp after symbol char */
1003 goto recycle;
1004 case 11:
1005 if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
1006 state = 1;
1007 else
1008 {
1009 /* We know that ch is not ':', since we tested that
1010 case above. Therefore this is not a label, so it
1011 must be the opcode, and we've just seen the
1012 whitespace after it. */
1013 state = 3;
1014 }
1015 UNGET (ch);
1016 PUT (' '); /* Sp after label definition. */
1017 break;
1018 default:
1019 BAD_CASE (state);
1020 }
1021 break;
1022
1023 case LEX_IS_TWOCHAR_COMMENT_1ST:
1024 ch2 = GET ();
1025 if (ch2 == '*')
1026 {
1027 for (;;)
1028 {
1029 do
1030 {
1031 ch2 = GET ();
1032 if (ch2 != EOF && IS_NEWLINE (ch2))
1033 add_newlines++;
1034 }
1035 while (ch2 != EOF && ch2 != '*');
1036
1037 while (ch2 == '*')
1038 ch2 = GET ();
1039
1040 if (ch2 == EOF || ch2 == '/')
1041 break;
1042
1043 /* This UNGET will ensure that we count newlines
1044 correctly. */
1045 UNGET (ch2);
1046 }
1047
1048 if (ch2 == EOF)
1049 as_warn (_("end of file in multiline comment"));
1050
1051 ch = ' ';
1052 goto recycle;
1053 }
1054 #ifdef DOUBLESLASH_LINE_COMMENTS
1055 else if (ch2 == '/')
1056 {
1057 do
1058 {
1059 ch = GET ();
1060 }
1061 while (ch != EOF && !IS_NEWLINE (ch));
1062 if (ch == EOF)
1063 as_warn ("end of file in comment; newline inserted");
1064 state = 0;
1065 PUT ('\n');
1066 break;
1067 }
1068 #endif
1069 else
1070 {
1071 if (ch2 != EOF)
1072 UNGET (ch2);
1073 if (state == 9 || state == 10)
1074 state = 3;
1075 PUT (ch);
1076 }
1077 break;
1078
1079 case LEX_IS_STRINGQUOTE:
1080 quotechar = ch;
1081 if (state == 10)
1082 {
1083 /* Preserve the whitespace in foo "bar". */
1084 UNGET (ch);
1085 state = 3;
1086 PUT (' ');
1087
1088 /* PUT didn't jump out. We could just break, but we
1089 know what will happen, so optimize a bit. */
1090 ch = GET ();
1091 old_state = 9;
1092 }
1093 else if (state == 3)
1094 old_state = 9;
1095 else
1096 old_state = state;
1097 state = 5;
1098 PUT (ch);
1099 break;
1100
1101 case LEX_IS_ONECHAR_QUOTE:
1102 #ifdef H_TICK_HEX
1103 if (state == 9 && enable_h_tick_hex)
1104 {
1105 char c;
1106
1107 c = GET ();
1108 as_warn ("'%c found after symbol", c);
1109 UNGET (c);
1110 }
1111 #endif
1112 if (state == 10)
1113 {
1114 /* Preserve the whitespace in foo 'b'. */
1115 UNGET (ch);
1116 state = 3;
1117 PUT (' ');
1118 break;
1119 }
1120 ch = GET ();
1121 if (ch == EOF)
1122 {
1123 as_warn (_("end of file after a one-character quote; \\0 inserted"));
1124 ch = 0;
1125 }
1126 if (ch == '\\')
1127 {
1128 ch = GET ();
1129 if (ch == EOF)
1130 {
1131 as_warn (_("end of file in escape character"));
1132 ch = '\\';
1133 }
1134 else
1135 ch = process_escape (ch);
1136 }
1137 sprintf (out_buf, "%d", (int) (unsigned char) ch);
1138
1139 /* None of these 'x constants for us. We want 'x'. */
1140 if ((ch = GET ()) != '\'')
1141 {
1142 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1143 as_warn (_("missing close quote; (assumed)"));
1144 #else
1145 if (ch != EOF)
1146 UNGET (ch);
1147 #endif
1148 }
1149 if (strlen (out_buf) == 1)
1150 {
1151 PUT (out_buf[0]);
1152 break;
1153 }
1154 if (state == 9)
1155 old_state = 3;
1156 else
1157 old_state = state;
1158 state = -1;
1159 out_string = out_buf;
1160 PUT (*out_string++);
1161 break;
1162
1163 case LEX_IS_COLON:
1164 #ifdef KEEP_WHITE_AROUND_COLON
1165 state = 9;
1166 #else
1167 if (state == 9 || state == 10)
1168 state = 3;
1169 else if (state != 3)
1170 state = 1;
1171 #endif
1172 PUT (ch);
1173 break;
1174
1175 case LEX_IS_NEWLINE:
1176 /* Roll out a bunch of newlines from inside comments, etc. */
1177 if (add_newlines)
1178 {
1179 --add_newlines;
1180 UNGET (ch);
1181 }
1182 /* Fall through. */
1183
1184 case LEX_IS_LINE_SEPARATOR:
1185 state = 0;
1186 PUT (ch);
1187 break;
1188
1189 case LEX_IS_PARALLEL_SEPARATOR:
1190 state = 1;
1191 PUT (ch);
1192 break;
1193
1194 #ifdef TC_V850
1195 case LEX_IS_DOUBLEDASH_1ST:
1196 ch2 = GET ();
1197 if (ch2 != '-')
1198 {
1199 if (ch2 != EOF)
1200 UNGET (ch2);
1201 goto de_fault;
1202 }
1203 /* Read and skip to end of line. */
1204 do
1205 {
1206 ch = GET ();
1207 }
1208 while (ch != EOF && ch != '\n');
1209
1210 if (ch == EOF)
1211 as_warn (_("end of file in comment; newline inserted"));
1212
1213 state = 0;
1214 PUT ('\n');
1215 break;
1216 #endif
1217 #ifdef DOUBLEBAR_PARALLEL
1218 case LEX_IS_DOUBLEBAR_1ST:
1219 ch2 = GET ();
1220 if (ch2 != EOF)
1221 UNGET (ch2);
1222 if (ch2 != '|')
1223 goto de_fault;
1224
1225 /* Handle '||' in two states as invoking PUT twice might
1226 result in the first one jumping out of this loop. We'd
1227 then lose track of the state and one '|' char. */
1228 state = 13;
1229 PUT ('|');
1230 break;
1231 #endif
1232 case LEX_IS_LINE_COMMENT_START:
1233 /* FIXME-someday: The two character comment stuff was badly
1234 thought out. On i386, we want '/' as line comment start
1235 AND we want C style comments. hence this hack. The
1236 whole lexical process should be reworked. xoxorich. */
1237 if (ch == '/')
1238 {
1239 ch2 = GET ();
1240 if (ch2 == '*')
1241 {
1242 old_state = 3;
1243 state = -2;
1244 break;
1245 }
1246 else if (ch2 != EOF)
1247 {
1248 UNGET (ch2);
1249 }
1250 }
1251
1252 if (state == 0 || state == 1) /* Only comment at start of line. */
1253 {
1254 int startch;
1255
1256 startch = ch;
1257
1258 do
1259 {
1260 ch = GET ();
1261 }
1262 while (ch != EOF && IS_WHITESPACE (ch));
1263
1264 if (ch == EOF)
1265 {
1266 as_warn (_("end of file in comment; newline inserted"));
1267 PUT ('\n');
1268 break;
1269 }
1270
1271 if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1272 {
1273 /* Not a cpp line. */
1274 while (ch != EOF && !IS_NEWLINE (ch))
1275 ch = GET ();
1276 if (ch == EOF)
1277 {
1278 as_warn (_("end of file in comment; newline inserted"));
1279 PUT ('\n');
1280 }
1281 else /* IS_NEWLINE (ch) */
1282 {
1283 /* To process non-zero add_newlines. */
1284 UNGET (ch);
1285 }
1286 state = 0;
1287 break;
1288 }
1289 /* Looks like `# 123 "filename"' from cpp. */
1290 UNGET (ch);
1291 old_state = 4;
1292 state = -1;
1293 if (scrub_m68k_mri)
1294 out_string = "\tlinefile ";
1295 else
1296 out_string = "\t.linefile ";
1297 PUT (*out_string++);
1298 break;
1299 }
1300
1301 #ifdef TC_D10V
1302 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1303 Trap is the only short insn that has a first operand that is
1304 neither register nor label.
1305 We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1306 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1307 already LEX_IS_LINE_COMMENT_START. However, it is the
1308 only character in line_comment_chars for d10v, hence we
1309 can recognize it as such. */
1310 /* An alternative approach would be to reset the state to 1 when
1311 we see '||', '<'- or '->', but that seems to be overkill. */
1312 if (state == 10)
1313 PUT (' ');
1314 #endif
1315 /* We have a line comment character which is not at the
1316 start of a line. If this is also a normal comment
1317 character, fall through. Otherwise treat it as a default
1318 character. */
1319 if (strchr (tc_comment_chars, ch) == NULL
1320 && (! scrub_m68k_mri
1321 || (ch != '!' && ch != '*')))
1322 goto de_fault;
1323 if (scrub_m68k_mri
1324 && (ch == '!' || ch == '*' || ch == '#')
1325 && state != 1
1326 && state != 10)
1327 goto de_fault;
1328 /* Fall through. */
1329 case LEX_IS_COMMENT_START:
1330 #if defined TC_ARM && defined OBJ_ELF
1331 /* On the ARM, `@' is the comment character.
1332 Unfortunately this is also a special character in ELF .symver
1333 directives (and .type, though we deal with those another way).
1334 So we check if this line is such a directive, and treat
1335 the character as default if so. This is a hack. */
1336 if ((symver_state != NULL) && (*symver_state == 0))
1337 goto de_fault;
1338 #endif
1339
1340 /* Care is needed not to damage occurrences of \<comment-char>
1341 by stripping the <comment-char> onwards. Yuck. */
1342 if ((to > tostart ? to[-1] : last_char) == '\\')
1343 /* Do not treat the <comment-char> as a start-of-comment. */
1344 goto de_fault;
1345
1346 #ifdef WARN_COMMENTS
1347 if (!found_comment)
1348 found_comment_file = as_where (&found_comment);
1349 #endif
1350 do
1351 {
1352 ch = GET ();
1353 }
1354 while (ch != EOF && !IS_NEWLINE (ch));
1355 if (ch == EOF)
1356 as_warn (_("end of file in comment; newline inserted"));
1357 state = 0;
1358 PUT ('\n');
1359 break;
1360
1361 #ifdef H_TICK_HEX
1362 case LEX_IS_H:
1363 /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1364 the H' with 0x to make them gas-style hex characters. */
1365 if (enable_h_tick_hex)
1366 {
1367 char quot;
1368
1369 quot = GET ();
1370 if (quot == '\'')
1371 {
1372 UNGET ('x');
1373 ch = '0';
1374 }
1375 else
1376 UNGET (quot);
1377 }
1378 #endif
1379 /* Fall through. */
1380
1381 case LEX_IS_SYMBOL_COMPONENT:
1382 if (state == 10)
1383 {
1384 /* This is a symbol character following another symbol
1385 character, with whitespace in between. We skipped
1386 the whitespace earlier, so output it now. */
1387 UNGET (ch);
1388 state = 3;
1389 PUT (' ');
1390 break;
1391 }
1392
1393 #ifdef TC_Z80
1394 /* "af'" is a symbol containing '\''. */
1395 if (state == 3 && (ch == 'a' || ch == 'A'))
1396 {
1397 state = 16;
1398 PUT (ch);
1399 ch = GET ();
1400 if (ch == 'f' || ch == 'F')
1401 {
1402 state = 17;
1403 PUT (ch);
1404 break;
1405 }
1406 else
1407 {
1408 state = 9;
1409 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1410 {
1411 if (ch != EOF)
1412 UNGET (ch);
1413 break;
1414 }
1415 }
1416 }
1417 #endif
1418 if (state == 3)
1419 state = 9;
1420
1421 /* This is a common case. Quickly copy CH and all the
1422 following symbol component or normal characters. */
1423 if (to + 1 < toend
1424 && mri_state == NULL
1425 #if defined TC_ARM && defined OBJ_ELF
1426 && symver_state == NULL
1427 #endif
1428 )
1429 {
1430 char *s;
1431 ptrdiff_t len;
1432
1433 for (s = from; s < fromend; s++)
1434 {
1435 int type;
1436
1437 ch2 = *(unsigned char *) s;
1438 type = lex[ch2];
1439 if (type != 0
1440 && type != LEX_IS_SYMBOL_COMPONENT)
1441 break;
1442 }
1443
1444 if (s > from)
1445 /* Handle the last character normally, for
1446 simplicity. */
1447 --s;
1448
1449 len = s - from;
1450
1451 if (len > (toend - to) - 1)
1452 len = (toend - to) - 1;
1453
1454 if (len > 0)
1455 {
1456 PUT (ch);
1457 memcpy (to, from, len);
1458 to += len;
1459 from += len;
1460 if (to >= toend)
1461 goto tofull;
1462 ch = GET ();
1463 }
1464 }
1465
1466 /* Fall through. */
1467 default:
1468 de_fault:
1469 /* Some relatively `normal' character. */
1470 if (state == 0)
1471 {
1472 state = 11; /* Now seeing label definition. */
1473 }
1474 else if (state == 1)
1475 {
1476 state = 2; /* Ditto. */
1477 }
1478 else if (state == 9)
1479 {
1480 if (!IS_SYMBOL_COMPONENT (ch))
1481 state = 3;
1482 }
1483 else if (state == 10)
1484 {
1485 if (ch == '\\')
1486 {
1487 /* Special handling for backslash: a backslash may
1488 be the beginning of a formal parameter (of a
1489 macro) following another symbol character, with
1490 whitespace in between. If that is the case, we
1491 output a space before the parameter. Strictly
1492 speaking, correct handling depends upon what the
1493 macro parameter expands into; if the parameter
1494 expands into something which does not start with
1495 an operand character, then we don't want to keep
1496 the space. We don't have enough information to
1497 make the right choice, so here we are making the
1498 choice which is more likely to be correct. */
1499 if (to + 1 >= toend)
1500 {
1501 /* If we're near the end of the buffer, save the
1502 character for the next time round. Otherwise
1503 we'll lose our state. */
1504 UNGET (ch);
1505 goto tofull;
1506 }
1507 *to++ = ' ';
1508 }
1509
1510 state = 3;
1511 }
1512 PUT (ch);
1513 break;
1514 }
1515 }
1516
1517 /*NOTREACHED*/
1518
1519 fromeof:
1520 /* We have reached the end of the input. */
1521 if (to > tostart)
1522 last_char = to[-1];
1523 return to - tostart;
1524
1525 tofull:
1526 /* The output buffer is full. Save any input we have not yet
1527 processed. */
1528 if (fromend > from)
1529 {
1530 saved_input = from;
1531 saved_input_len = fromend - from;
1532 }
1533 else
1534 saved_input = NULL;
1535
1536 if (to > tostart)
1537 last_char = to[-1];
1538 return to - tostart;
1539 }
1540
1541 /* Return amount of pending input. */
1542
1543 size_t
1544 do_scrub_pending (void)
1545 {
1546 size_t len = 0;
1547 if (saved_input)
1548 len += saved_input_len;
1549 if (state == -1)
1550 len += strlen (out_string);
1551 return len;
1552 }