diff --git a/doc/langref.md b/doc/langref.md index d1b3bba468..6db5a10ef6 100644 --- a/doc/langref.md +++ b/doc/langref.md @@ -272,10 +272,26 @@ Literal Example Characters Escapes Null Term Type Byte 'H' All ASCII Byte No u8 UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8 UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8 -UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8 -UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8 +UTF-8 Raw String r"X(hello)X" All Unicode None No [5]u8 +UTF-8 Raw C String rc"X(hello)X" All Unicode None Yes &const u8 ``` +### Escapes + + Escape | Name +----------|------------------------------------------------------------------- + \n | Newline + \r | Carriage Return + \t | Tab + \\ | Backslash + \' | Single Quote + \" | Double Quote + \xNN | hexadecimal 8-bit character code (2 digits) + \uNNNN | hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits) + \UNNNNNN | hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits) + +Note that the maximum valid Unicode code point is 0x10ffff. + ##### Raw Strings Raw string literals have no escapes and can span across multiple lines. To @@ -283,25 +299,6 @@ start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('. To end a raw string, use ')' followed by the same unique bytes, followed by '"'. 
-``` -Escape Name - -\xNN hexadecimal 8-bit character code (exactly 2 digits) -\n Newline -\r Carriage return -\t Tab -\\ Backslash -\0 Null -\' Single quote -\" Double quote -``` - -### Unicode Escapes - - Escape | Name -------------|----------------------------------------------- - \u{NNNNNN} | hexadecimal 24-bit Unicode character code (up to 6 digits) - #### Numeric Literals ``` diff --git a/src/parser.cpp b/src/parser.cpp index 62f83fd080..04434f82e6 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) { return return_value; } -static int get_hex_digit(uint8_t c) { +static uint32_t get_hex_digit(uint8_t c) { switch (c) { case '0': return 0; case '1': return 1; @@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) { case 'F': return 15; default: - return -1; + return UINT32_MAX; } } @@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool StateEscape, StateHex1, StateHex2, + StateUnicode, }; buf_resize(buf, 0); + int unicode_index; + int unicode_end; + State state = StatePre; SrcPos pos = {token->start_line, token->start_column}; - int hex_value = 0; + uint32_t hex_value = 0; for (int i = token->start_pos; i < token->end_pos - 1; i += 1) { uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i); @@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool if (offset_map) offset_map->append(pos); state = StateStart; break; + case '\'': + buf_append_char(buf, '\''); + if (offset_map) offset_map->append(pos); + state = StateStart; + break; case 'x': state = StateHex1; break; + case 'u': + state = StateUnicode; + unicode_index = 0; + unicode_end = 4; + hex_value = 0; + break; + case 'U': + state = StateUnicode; + unicode_index = 0; + unicode_end = 6; + hex_value = 0; + break; default: ast_error(pc, token, "invalid escape character"); } break; case StateHex1: { - int hex_digit = get_hex_digit(c); - if 
(hex_digit == -1) { + uint32_t hex_digit = get_hex_digit(c); + if (hex_digit == UINT32_MAX) { ast_error(pc, token, "invalid hex digit: '%c'", c); } hex_value = hex_digit * 16; @@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool } case StateHex2: { - int hex_digit = get_hex_digit(c); - if (hex_digit == -1) { + uint32_t hex_digit = get_hex_digit(c); + if (hex_digit == UINT32_MAX) { ast_error(pc, token, "invalid hex digit: '%c'", c); } hex_value += hex_digit; @@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool state = StateStart; break; } + case StateUnicode: + { + uint32_t hex_digit = get_hex_digit(c); + if (hex_digit == UINT32_MAX) { + ast_error(pc, token, "invalid hex digit: '%c'", c); + } + hex_value *= 16; + hex_value += hex_digit; + unicode_index += 1; + if (unicode_index >= unicode_end) { + if (hex_value <= 0x7f) { + // 00000000 00000000 00000000 0xxxxxxx + buf_append_char(buf, hex_value); + } else if (hex_value <= 0x7ff) { + // 00000000 00000000 00000xxx xx000000 + buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6))); + // 00000000 00000000 00000000 00xxxxxx + buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); + } else if (hex_value <= 0xffff) { + // 00000000 00000000 xxxx0000 00000000 + buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12))); + // 00000000 00000000 0000xxxx xx000000 + buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); + } else if (hex_value <= 0x10ffff) { + // 00000000 000xxx00 00000000 00000000 + buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18))); + // 00000000 000000xx xxxx0000 00000000 + buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f))); + // 00000000 00000000 0000xxxx xx000000 + buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 
0x3f))); + // 00000000 00000000 00000000 00xxxxxx + buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); + } else { + ast_error(pc, token, "unicode value out of range: %x", hex_value); + } + state = StateStart; + } + break; + } } if (c == '\n') { pos.line += 1; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 373f815857..f89d9236e8 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -103,6 +103,21 @@ ALPHA: \ case '_' +#define HEX_DIGIT \ + 'a': \ + case 'b': \ + case 'c': \ + case 'd': \ + case 'e': \ + case 'f': \ + case 'A': \ + case 'B': \ + case 'C': \ + case 'D': \ + case 'E': \ + case 'F': \ + case DIGIT + const char * zig_keywords[] = { "true", "false", "null", "fn", "return", "var", "const", "extern", "pub", "export", "use", "if", "else", "goto", "asm", @@ -132,11 +147,11 @@ enum TokenizeState { TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p" TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5" TokenizeStateString, + TokenizeStateStringEscape, TokenizeStateRawString, TokenizeStateRawStringContents, TokenizeStateRawStringMaybeEnd, TokenizeStateCharLiteral, - TokenizeStateCharLiteralEscape, TokenizeStateCharLiteralEnd, TokenizeStateSawStar, TokenizeStateSawSlash, @@ -162,6 +177,7 @@ enum TokenizeState { TokenizeStateSawDotDot, TokenizeStateSawQuestionMark, TokenizeStateSawAtSign, + TokenizeStateHex, TokenizeStateError, }; @@ -179,6 +195,7 @@ struct Tokenize { int raw_string_id_start; int raw_string_id_end; int raw_string_id_cmp_pos; + int hex_chars_left; }; __attribute__ ((format (printf, 2, 3))) @@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) { case '\n': tokenize_error(&t, "use raw string for multiline string literal"); break; + case '\\': + t.state = TokenizeStateStringEscape; + break; default: break; } break; + case TokenizeStateStringEscape: + switch (c) { + case 'x': + t.state = TokenizeStateHex; + t.hex_chars_left = 2; + break; + case 'u': + t.state = 
TokenizeStateHex; + t.hex_chars_left = 4; + break; + case 'U': + t.state = TokenizeStateHex; + t.hex_chars_left = 6; + break; + case 'n': + case 'r': + case '\\': + case 't': + case '\'': + case '"': + if (t.cur_tok->id == TokenIdCharLiteral) { + t.state = TokenizeStateCharLiteralEnd; + } else if (t.cur_tok->id == TokenIdStringLiteral) { + t.state = TokenizeStateString; + } else { + zig_unreachable(); + } + break; + default: + tokenize_error(&t, "invalid character: '%c'", c); + } + break; + case TokenizeStateHex: + switch (c) { + case HEX_DIGIT: + t.hex_chars_left -= 1; + if (t.hex_chars_left == 0) { + if (t.cur_tok->id == TokenIdCharLiteral) { + t.state = TokenizeStateCharLiteralEnd; + } else if (t.cur_tok->id == TokenIdStringLiteral) { + t.state = TokenizeStateString; + } else { + zig_unreachable(); + } + } + break; + default: + tokenize_error(&t, "invalid character: '%c'", c); + } + break; case TokenizeStateRawString: if (c == '(') { t.raw_string_id_end = t.pos; @@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) { t.state = TokenizeStateStart; break; case '\\': - t.state = TokenizeStateCharLiteralEscape; + t.state = TokenizeStateStringEscape; break; default: t.state = TokenizeStateCharLiteralEnd; break; } break; - case TokenizeStateCharLiteralEscape: - t.state = TokenizeStateCharLiteralEnd; - break; case TokenizeStateCharLiteralEnd: switch (c) { case '\'': @@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateString: tokenize_error(&t, "unterminated string"); break; + case TokenizeStateStringEscape: + case TokenizeStateHex: + if (t.cur_tok->id == TokenIdStringLiteral) { + tokenize_error(&t, "unterminated string"); + } else if (t.cur_tok->id == TokenIdCharLiteral) { + tokenize_error(&t, "unterminated character literal"); + } else { + zig_unreachable(); + } + break; case TokenizeStateRawString: case TokenizeStateRawStringContents: case TokenizeStateRawStringMaybeEnd: tokenize_error(&t, "unterminated raw string"); 
break; case TokenizeStateCharLiteral: - case TokenizeStateCharLiteralEscape: case TokenizeStateCharLiteralEnd: tokenize_error(&t, "unterminated character literal"); break; diff --git a/test/self_hosted.zig b/test/self_hosted.zig index 4a787ac665..e33f726590 100644 --- a/test/self_hosted.zig +++ b/test/self_hosted.zig @@ -1398,3 +1398,14 @@ fn test_take_address_of_parameter_noeval(f: f32) { fn array_mult_operator() { assert(str.eql("ab" ** 5, "ababababab")); } + +#attribute("test") +fn string_escapes() { + assert(str.eql("\"", "\x22")); + assert(str.eql("\'", "\x27")); + assert(str.eql("\n", "\x0a")); + assert(str.eql("\r", "\x0d")); + assert(str.eql("\t", "\x09")); + assert(str.eql("\\", "\x5c")); + assert(str.eql("\u1234\u0069", "\xe1\x88\xb4\x69")); +}