compiler: read embedcfg files, parse go:embed directives
[gcc.git] / gcc / go / gofrontend / lex.h
1 // lex.h -- Go frontend lexer. -*- C++ -*-
2
3 // Copyright 2009 The Go Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
6
7 #ifndef GO_LEX_H
8 #define GO_LEX_H
9
10 #include <mpfr.h>
11
12 #include "operator.h"
13 #include "go-linemap.h"
14
15 struct Unicode_range;
16
// The keywords.  These must be in sorted order, other than
// KEYWORD_INVALID.  They must match the Keywords::mapping_ array in
// lex.cc.
//
// The enumerators carry no explicit values, so each one's numeric
// value is its position in this list (KEYWORD_INVALID is 0).  Adding
// or reordering entries changes those values.

enum Keyword
{
  KEYWORD_INVALID,	// Not a keyword.
  KEYWORD_ASM,
  KEYWORD_BREAK,
  KEYWORD_CASE,
  KEYWORD_CHAN,
  KEYWORD_CONST,
  KEYWORD_CONTINUE,
  KEYWORD_DEFAULT,
  KEYWORD_DEFER,
  KEYWORD_ELSE,
  KEYWORD_FALLTHROUGH,
  KEYWORD_FOR,
  KEYWORD_FUNC,
  KEYWORD_GO,
  KEYWORD_GOTO,
  KEYWORD_IF,
  KEYWORD_IMPORT,
  KEYWORD_INTERFACE,
  KEYWORD_MAP,
  KEYWORD_PACKAGE,
  KEYWORD_RANGE,
  KEYWORD_RETURN,
  KEYWORD_SELECT,
  KEYWORD_STRUCT,
  KEYWORD_SWITCH,
  KEYWORD_TYPE,
  KEYWORD_VAR
};
51
// Pragmas built from magic comments and recorded for functions.
// These are used as bits in a bitmask.
// The set of values is intended to be the same as the gc compiler.
//
// Each enumerator is a distinct single bit, so values may be combined
// with bitwise OR and stored in an unsigned int.

enum GoPragma
{
  GOPRAGMA_NOINTERFACE = 1 << 0,	// Method not in type descriptor.
  GOPRAGMA_NOESCAPE = 1 << 1,		// Args do not escape.
  GOPRAGMA_NORACE = 1 << 2,		// No race detector.
  GOPRAGMA_NOSPLIT = 1 << 3,		// Do not split stack.
  GOPRAGMA_NOINLINE = 1 << 4,		// Do not inline.
  GOPRAGMA_SYSTEMSTACK = 1 << 5,	// Must run on system stack.
  GOPRAGMA_NOWRITEBARRIER = 1 << 6,	// No write barriers.
  GOPRAGMA_NOWRITEBARRIERREC = 1 << 7,	// No write barriers here or callees.
  GOPRAGMA_YESWRITEBARRIERREC = 1 << 8,	// Stops nowritebarrierrec.
  GOPRAGMA_MARK = 1 << 9,		// Marker for nowritebarrierrec.
  GOPRAGMA_CGOUNSAFEARGS = 1 << 10,	// Pointer to arg is pointer to all.
  GOPRAGMA_UINTPTRESCAPES = 1 << 11,	// uintptr(p) escapes.
  GOPRAGMA_NOTINHEAP = 1 << 12		// type is not in heap.
};
72
73 // A token returned from the lexer.
74
75 class Token
76 {
77 public:
78 // Token classification.
79 enum Classification
80 {
81 // Token is invalid.
82 TOKEN_INVALID,
83 // Token indicates end of input.
84 TOKEN_EOF,
85 // Token is a keyword.
86 TOKEN_KEYWORD,
87 // Token is an identifier.
88 TOKEN_IDENTIFIER,
89 // Token is a string of characters.
90 TOKEN_STRING,
91 // Token is an operator.
92 TOKEN_OPERATOR,
93 // Token is a character constant.
94 TOKEN_CHARACTER,
95 // Token is an integer.
96 TOKEN_INTEGER,
97 // Token is a floating point number.
98 TOKEN_FLOAT,
99 // Token is an imaginary number.
100 TOKEN_IMAGINARY
101 };
102
103 ~Token();
104 Token(const Token&);
105 Token& operator=(const Token&);
106
107 // Get token classification.
108 Classification
109 classification() const
110 { return this->classification_; }
111
112 // Make a token for an invalid value.
113 static Token
114 make_invalid_token(Location location)
115 { return Token(TOKEN_INVALID, location); }
116
117 // Make a token representing end of file.
118 static Token
119 make_eof_token(Location location)
120 { return Token(TOKEN_EOF, location); }
121
122 // Make a keyword token.
123 static Token
124 make_keyword_token(Keyword keyword, Location location)
125 {
126 Token tok(TOKEN_KEYWORD, location);
127 tok.u_.keyword = keyword;
128 return tok;
129 }
130
131 // Make an identifier token.
132 static Token
133 make_identifier_token(const std::string& value, bool is_exported,
134 Location location)
135 {
136 Token tok(TOKEN_IDENTIFIER, location);
137 tok.u_.identifier_value.name = new std::string(value);
138 tok.u_.identifier_value.is_exported = is_exported;
139 return tok;
140 }
141
142 // Make a quoted string token.
143 static Token
144 make_string_token(const std::string& value, Location location)
145 {
146 Token tok(TOKEN_STRING, location);
147 tok.u_.string_value = new std::string(value);
148 return tok;
149 }
150
151 // Make an operator token.
152 static Token
153 make_operator_token(Operator op, Location location)
154 {
155 Token tok(TOKEN_OPERATOR, location);
156 tok.u_.op = op;
157 return tok;
158 }
159
160 // Make a character constant token.
161 static Token
162 make_character_token(mpz_t val, Location location)
163 {
164 Token tok(TOKEN_CHARACTER, location);
165 mpz_init(tok.u_.integer_value);
166 mpz_swap(tok.u_.integer_value, val);
167 return tok;
168 }
169
170 // Make an integer token.
171 static Token
172 make_integer_token(mpz_t val, Location location)
173 {
174 Token tok(TOKEN_INTEGER, location);
175 mpz_init(tok.u_.integer_value);
176 mpz_swap(tok.u_.integer_value, val);
177 return tok;
178 }
179
180 // Make a float token.
181 static Token
182 make_float_token(mpfr_t val, Location location)
183 {
184 Token tok(TOKEN_FLOAT, location);
185 mpfr_init(tok.u_.float_value);
186 mpfr_swap(tok.u_.float_value, val);
187 return tok;
188 }
189
190 // Make a token for an imaginary number.
191 static Token
192 make_imaginary_token(mpfr_t val, Location location)
193 {
194 Token tok(TOKEN_IMAGINARY, location);
195 mpfr_init(tok.u_.float_value);
196 mpfr_swap(tok.u_.float_value, val);
197 return tok;
198 }
199
200 // Get the location of the token.
201 Location
202 location() const
203 { return this->location_; }
204
205 // Return whether this is an invalid token.
206 bool
207 is_invalid() const
208 { return this->classification_ == TOKEN_INVALID; }
209
210 // Return whether this is the EOF token.
211 bool
212 is_eof() const
213 { return this->classification_ == TOKEN_EOF; }
214
215 // Return the keyword value for a keyword token.
216 Keyword
217 keyword() const
218 {
219 go_assert(this->classification_ == TOKEN_KEYWORD);
220 return this->u_.keyword;
221 }
222
223 // Return whether this is an identifier.
224 bool
225 is_identifier() const
226 { return this->classification_ == TOKEN_IDENTIFIER; }
227
228 // Return the identifier.
229 const std::string&
230 identifier() const
231 {
232 go_assert(this->classification_ == TOKEN_IDENTIFIER);
233 return *this->u_.identifier_value.name;
234 }
235
236 // Return whether the identifier is exported.
237 bool
238 is_identifier_exported() const
239 {
240 go_assert(this->classification_ == TOKEN_IDENTIFIER);
241 return this->u_.identifier_value.is_exported;
242 }
243
244 // Return whether this is a string.
245 bool
246 is_string() const
247 {
248 return this->classification_ == TOKEN_STRING;
249 }
250
251 // Return the value of a string. The returned value is a string of
252 // UTF-8 characters.
253 std::string
254 string_value() const
255 {
256 go_assert(this->classification_ == TOKEN_STRING);
257 return *this->u_.string_value;
258 }
259
260 // Return the value of a character constant.
261 const mpz_t*
262 character_value() const
263 {
264 go_assert(this->classification_ == TOKEN_CHARACTER);
265 return &this->u_.integer_value;
266 }
267
268 // Return the value of an integer.
269 const mpz_t*
270 integer_value() const
271 {
272 go_assert(this->classification_ == TOKEN_INTEGER);
273 return &this->u_.integer_value;
274 }
275
276 // Return the value of a float.
277 const mpfr_t*
278 float_value() const
279 {
280 go_assert(this->classification_ == TOKEN_FLOAT);
281 return &this->u_.float_value;
282 }
283
284 // Return the value of an imaginary number.
285 const mpfr_t*
286 imaginary_value() const
287 {
288 go_assert(this->classification_ == TOKEN_IMAGINARY);
289 return &this->u_.float_value;
290 }
291
292 // Return the operator value for an operator token.
293 Operator
294 op() const
295 {
296 go_assert(this->classification_ == TOKEN_OPERATOR);
297 return this->u_.op;
298 }
299
300 // Return whether this token is KEYWORD.
301 bool
302 is_keyword(Keyword keyword) const
303 {
304 return (this->classification_ == TOKEN_KEYWORD
305 && this->u_.keyword == keyword);
306 }
307
308 // Return whether this token is OP.
309 bool
310 is_op(Operator op) const
311 { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
312
313 // Print the token for debugging.
314 void
315 print(FILE*) const;
316
317 private:
318 // Private constructor used by make_..._token functions above.
319 Token(Classification, Location);
320
321 // Clear the token.
322 void
323 clear();
324
325 // The token classification.
326 Classification classification_;
327 union
328 {
329 // The keyword value for TOKEN_KEYWORD.
330 Keyword keyword;
331 // The token value for TOKEN_IDENTIFIER.
332 struct
333 {
334 // The name of the identifier. This has been mangled to only
335 // include ASCII characters.
336 std::string* name;
337 // Whether this name should be exported. This is true if the
338 // first letter in the name is upper case.
339 bool is_exported;
340 } identifier_value;
341 // The string value for TOKEN_STRING.
342 std::string* string_value;
343 // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
344 mpz_t integer_value;
345 // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
346 mpfr_t float_value;
347 // The token value for TOKEN_OPERATOR or the keyword value
348 Operator op;
349 } u_;
350 // The source location.
351 Location location_;
352 };
353
354 // The lexer itself.
355
356 class Lex
357 {
358 public:
359 Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
360
361 ~Lex();
362
363 // Return the next token.
364 Token
365 next_token();
366
367 // Return the contents of any current //extern comment.
368 const std::string&
369 extern_name() const
370 { return this->extern_; }
371
372 // Return the current set of pragmas, and clear them.
373 unsigned int
374 get_and_clear_pragmas()
375 {
376 unsigned int ret = this->pragmas_;
377 this->pragmas_ = 0;
378 return ret;
379 }
380
381 struct Linkname
382 {
383 std::string ext_name; // External name; empty to just export.
384 bool is_exported; // Whether the internal name is exported.
385 Location loc; // Location of go:linkname directive.
386
387 Linkname()
388 : ext_name(), is_exported(false), loc()
389 { }
390
391 Linkname(const std::string& ext_name_a, bool is_exported_a, Location loc_a)
392 : ext_name(ext_name_a), is_exported(is_exported_a), loc(loc_a)
393 { }
394 };
395
396 typedef std::map<std::string, Linkname> Linknames;
397
398 // Return the linknames seen so far, or NULL if none, and clear the
399 // set. These are from go:linkname compiler directives.
400 Linknames*
401 get_and_clear_linknames()
402 {
403 Linknames* ret = this->linknames_;
404 this->linknames_ = NULL;
405 return ret;
406 }
407
408 // Return whether there are any current go:embed patterns.
409 bool
410 has_embeds() const
411 { return !this->embeds_.empty(); }
412
413 // If there are any go:embed patterns seen so far, store them in
414 // *EMBEDS and clear the saved set. *EMBEDS must be an empty
415 // vector.
416 void
417 get_and_clear_embeds(std::vector<std::string>* embeds)
418 {
419 go_assert(embeds->empty());
420 std::swap(*embeds, this->embeds_);
421 }
422
423 // Return whether the identifier NAME should be exported. NAME is a
424 // mangled name which includes only ASCII characters.
425 static bool
426 is_exported_mangled_name(const std::string& name);
427
428 // Return whether the identifier NAME should be exported. NAME is
429 // an unmangled utf-8 string and may contain non-ASCII characters.
430 static bool
431 is_exported_name(const std::string& name);
432
433 // Return whether the identifier NAME is invalid. When we see an
434 // invalid character we still build an identifier, but we use a
435 // magic string to indicate that the identifier is invalid. We then
436 // use this to avoid knockon errors.
437 static bool
438 is_invalid_identifier(const std::string& name);
439
440 // A helper function. Append V to STR. IS_CHARACTER is true if V
441 // is a Unicode character which should be converted into UTF-8,
442 // false if it is a byte value to be appended directly. The
443 // location is used to warn about an out of range character.
444 static void
445 append_char(unsigned int v, bool is_charater, std::string* str,
446 Location);
447
448 // A helper function. Fetch a UTF-8 character from STR and store it
449 // in *VALUE. Return the number of bytes read from STR. Return 0
450 // if STR does not point to a valid UTF-8 character.
451 static int
452 fetch_char(const char* str, unsigned int *value);
453
454 // Return whether C is a Unicode or "C" locale space character.
455 static bool
456 is_unicode_space(unsigned int c);
457
458 // Convert the specified hex char into an unsigned integer value.
459 static unsigned
460 hex_val(char c);
461
462 private:
463 ssize_t
464 get_line();
465
466 bool
467 require_line();
468
469 // The current location.
470 Location
471 location() const;
472
473 // A position CHARS column positions before the current location.
474 Location
475 earlier_location(int chars) const;
476
477 static bool
478 is_hex_digit(char);
479
480 static bool
481 is_base_digit(int base, char);
482
483 static unsigned char
484 octal_value(char c)
485 { return c - '0'; }
486
487 Token
488 make_invalid_token()
489 { return Token::make_invalid_token(this->location()); }
490
491 Token
492 make_eof_token()
493 { return Token::make_eof_token(this->location()); }
494
495 Token
496 make_operator(Operator op, int chars)
497 { return Token::make_operator_token(op, this->earlier_location(chars)); }
498
499 Token
500 gather_identifier();
501
502 static bool
503 could_be_exponent(int base, const char*, const char*);
504
505 Token
506 gather_number();
507
508 void
509 skip_exponent();
510
511 Token
512 gather_character();
513
514 Token
515 gather_string();
516
517 Token
518 gather_raw_string();
519
520 const char*
521 advance_one_utf8_char(const char*, unsigned int*, bool*);
522
523 const char*
524 advance_one_char(const char*, bool, unsigned int*, bool*);
525
526 static bool
527 is_unicode_digit(unsigned int c);
528
529 static bool
530 is_unicode_letter(unsigned int c);
531
532 static bool
533 is_unicode_uppercase(unsigned int c);
534
535 static bool
536 is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
537 size_t range_size);
538
539 Operator
540 three_character_operator(char, char, char);
541
542 Operator
543 two_character_operator(char, char);
544
545 Operator
546 one_character_operator(char);
547
548 bool
549 skip_c_comment(bool* found_newline);
550
551 void
552 skip_cpp_comment();
553
554 void
555 gather_embed(const char*, const char*);
556
557 // The input file name.
558 const char* input_file_name_;
559 // The input file.
560 FILE* input_file_;
561 // The object used to keep track of file names and line numbers.
562 Linemap* linemap_;
563 // The line buffer. This holds the current line.
564 char* linebuf_;
565 // The size of the line buffer.
566 size_t linebufsize_;
567 // The nmber of characters in the current line.
568 size_t linesize_;
569 // The current offset in linebuf_.
570 size_t lineoff_;
571 // The current line number.
572 size_t lineno_;
573 // Whether to add a semicolon if we see a newline now.
574 bool add_semi_at_eol_;
575 // Pragmas for the next function, from magic comments.
576 unsigned int pragmas_;
577 // The external name to use for a function declaration, from a magic
578 // //extern comment.
579 std::string extern_;
580 // The list of //go:linkname comments, if any.
581 Linknames* linknames_;
582 // The list of //go:embed patterns, if any.
583 std::vector<std::string> embeds_;
584 };
585
586 #endif // !defined(GO_LEX_H)