allow _ separators in number literals (stage 1)

* Underscores `_` may be placed between two digits in an int/float literal
* Consecutive underscores are not allowed
* Fixed a parsing bug in the exponents of hexadecimal float literals.
  Exponents should always be base 10, but hex characters would be parsed
  inside the exponent and everything after them would be ignored, e.g.
  `0x1.0p1ab1` would be parsed as `0x1.0p1`.
This commit is contained in:
momumi 2020-03-15 11:37:36 +10:00
parent eb4d313dbc
commit 7aac21c6f5
6 changed files with 297 additions and 92 deletions

View File

@ -885,6 +885,12 @@ const hex_int = 0xff;
const another_hex_int = 0xFF;
const octal_int = 0o755;
const binary_int = 0b11110000;
// underscores may be placed between two digits as a visual separator
const one_billion = 1_000_000_000;
const binary_mask = 0b1_1111_1111;
const permissions = 0o7_5_5;
const big_address = 0xFF80_0000_0000_0000;
{#code_end#}
{#header_close#}
{#header_open|Runtime Integer Values#}
@ -947,6 +953,11 @@ const yet_another = 123.0e+77;
const hex_floating_point = 0x103.70p-5;
const another_hex_float = 0x103.70;
const yet_another_hex_float = 0x103.70P-5;
// underscores may be placed between two digits as a visual separator
const lightspeed = 299_792_458.000_000;
const nanosecond = 0.000_000_001;
const more_hex = 0x1234_5678.9ABC_CDEFp-10;
{#code_end#}
<p>
There is no syntax for NaN, infinity, or negative infinity. For these special values,

View File

@ -69,23 +69,23 @@ test "floatundisf" {
test__floatundisf(0, 0.0);
test__floatundisf(1, 1.0);
test__floatundisf(2, 2.0);
test__floatundisf(0x7FFFFF8000000000, 0x1.FFFFFEp+62F);
test__floatundisf(0x7FFFFF0000000000, 0x1.FFFFFCp+62F);
test__floatundisf(0x8000008000000000, 0x1p+63F);
test__floatundisf(0x8000010000000000, 0x1.000002p+63F);
test__floatundisf(0x8000000000000000, 0x1p+63F);
test__floatundisf(0x8000000000000001, 0x1p+63F);
test__floatundisf(0xFFFFFFFFFFFFFFFE, 0x1p+64F);
test__floatundisf(0xFFFFFFFFFFFFFFFF, 0x1p+64F);
test__floatundisf(0x0007FB72E8000000, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72EA000000, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72EB000000, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72EBFFFFFF, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72EC000000, 0x1.FEDCBCp+50F);
test__floatundisf(0x0007FB72E8000001, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72E6000000, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72E7000000, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72E7FFFFFF, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72E4000001, 0x1.FEDCBAp+50F);
test__floatundisf(0x0007FB72E4000000, 0x1.FEDCB8p+50F);
test__floatundisf(0x7FFFFF8000000000, 0x1.FFFFFEp+62);
test__floatundisf(0x7FFFFF0000000000, 0x1.FFFFFCp+62);
test__floatundisf(0x8000008000000000, 0x1p+63);
test__floatundisf(0x8000010000000000, 0x1.000002p+63);
test__floatundisf(0x8000000000000000, 0x1p+63);
test__floatundisf(0x8000000000000001, 0x1p+63);
test__floatundisf(0xFFFFFFFFFFFFFFFE, 0x1p+64);
test__floatundisf(0xFFFFFFFFFFFFFFFF, 0x1p+64);
test__floatundisf(0x0007FB72E8000000, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72EA000000, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72EB000000, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72EBFFFFFF, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72EC000000, 0x1.FEDCBCp+50);
test__floatundisf(0x0007FB72E8000001, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72E6000000, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72E7000000, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72E7FFFFFF, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72E4000001, 0x1.FEDCBAp+50);
test__floatundisf(0x0007FB72E4000000, 0x1.FEDCB8p+50);
}

View File

@ -165,22 +165,36 @@ static long long scanexp(struct MuslFILE *f, int pok)
int x;
long long y;
int neg = 0;
c = shgetc(f);
if (c=='+' || c=='-') {
neg = (c=='-');
c = shgetc(f);
if (c-'0'>=10U && pok) shunget(f);
}
if (c-'0'>=10U) {
if (c-'0'>=10U && c!='_') {
shunget(f);
return LLONG_MIN;
}
for (x=0; c-'0'<10U && x<INT_MAX/10; c = shgetc(f))
x = 10*x + c-'0';
for (y=x; c-'0'<10U && y<LLONG_MAX/100; c = shgetc(f))
y = 10*y + c-'0';
for (; c-'0'<10U; c = shgetc(f));
for (x=0; ; c = shgetc(f)) {
if (c=='_') {
continue;
} else if (c-'0'<10U && x<INT_MAX/10) {
x = 10*x + c-'0';
} else {
break;
}
}
for (y=x; ; c = shgetc(f)) {
if (c=='_') {
continue;
} else if (c-'0'<10U && y<LLONG_MAX/100) {
y = 10*y + c-'0';
} else {
break;
}
}
for (; c-'0'<10U || c=='_'; c = shgetc(f));
shunget(f);
return neg ? -y : y;
}
@ -450,16 +464,36 @@ static float128_t decfloat(struct MuslFILE *f, int c, int bits, int emin, int si
j=0;
k=0;
/* Don't let leading zeros consume buffer space */
for (; c=='0'; c = shgetc(f)) gotdig=1;
/* Don't let leading zeros/underscores consume buffer space */
for (; ; c = shgetc(f)) {
if (c=='_') {
continue;
} else if (c=='0') {
gotdig=1;
} else {
break;
}
}
if (c=='.') {
gotrad = 1;
for (c = shgetc(f); c=='0'; c = shgetc(f)) gotdig=1, lrp--;
for (c = shgetc(f); ; c = shgetc(f)) {
if (c == '_') {
continue;
} else if (c=='0') {
gotdig=1;
lrp--;
} else {
break;
}
}
}
x[0] = 0;
for (; c-'0'<10U || c=='.'; c = shgetc(f)) {
if (c == '.') {
for (; c-'0'<10U || c=='.' || c=='_'; c = shgetc(f)) {
if (c == '_') {
continue;
} else if (c == '.') {
if (gotrad) break;
gotrad = 1;
lrp = dc;
@ -773,18 +807,29 @@ static float128_t hexfloat(struct MuslFILE *f, int bits, int emin, int sign, int
c = shgetc(f);
/* Skip leading zeros */
for (; c=='0'; c = shgetc(f)) gotdig = 1;
/* Skip leading zeros/underscores. Only an actual '0' counts as a digit:
 * a separator on its own must not set gotdig, which is how the
 * leading-zero loop in decfloat already treats '_'. */
for (; c=='0' || c=='_'; c = shgetc(f)) {
	if (c=='0') gotdig = 1;
}
if (c=='.') {
gotrad = 1;
c = shgetc(f);
/* Count zeros after the radix point before significand */
for (rp=0; c=='0'; c = shgetc(f), rp--) gotdig = 1;
for (rp=0; ; c = shgetc(f)) {
if (c == '_') {
continue;
} else if (c == '0') {
gotdig = 1;
rp--;
} else {
break;
}
}
}
for (; c-'0'<10U || (c|32)-'a'<6U || c=='.'; c = shgetc(f)) {
if (c=='.') {
for (; c-'0'<10U || (c|32)-'a'<6U || c=='.' || c=='_'; c = shgetc(f)) {
if (c=='_') {
continue;
} else if (c=='.') {
if (gotrad) break;
rp = dc;
gotrad = 1;

View File

@ -177,10 +177,13 @@ enum TokenizeState {
TokenizeStateSymbol,
TokenizeStateZero, // "0", which might lead to "0x"
TokenizeStateNumber, // "123", "0x123"
TokenizeStateNumberNoUnderscore, // "12_", "0x12_" next char must be digit
TokenizeStateNumberDot,
TokenizeStateFloatFraction, // "123.456", "0x123.456"
TokenizeStateFloatFractionNoUnderscore, // "123.45_", "0x123.45_"
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateFloatExponentNumber, // "123.456e7", "123.456e+7", "123.456e-7"
TokenizeStateFloatExponentNumberNoUnderscore, // "123.456e7_", "123.456e+7_", "123.456e-7_"
TokenizeStateString,
TokenizeStateStringEscape,
TokenizeStateStringEscapeUnicodeStart,
@ -233,14 +236,10 @@ struct Tokenize {
Token *cur_tok;
Tokenization *out;
uint32_t radix;
int32_t exp_add_amt;
bool is_exp_negative;
bool is_trailing_underscore;
size_t char_code_index;
bool unicode;
uint32_t char_code;
int exponent_in_bin_or_dec;
BigInt specified_exponent;
BigInt significand;
size_t remaining_code_units;
};
@ -426,20 +425,16 @@ void tokenize(Buf *buf, Tokenization *out) {
case '0':
t.state = TokenizeStateZero;
begin_token(&t, TokenIdIntLiteral);
t.is_trailing_underscore = false;
t.radix = 10;
t.exp_add_amt = 1;
t.exponent_in_bin_or_dec = 0;
bigint_init_unsigned(&t.cur_tok->data.int_lit.bigint, 0);
bigint_init_unsigned(&t.specified_exponent, 0);
break;
case DIGIT_NON_ZERO:
t.state = TokenizeStateNumber;
begin_token(&t, TokenIdIntLiteral);
t.is_trailing_underscore = false;
t.radix = 10;
t.exp_add_amt = 1;
t.exponent_in_bin_or_dec = 0;
bigint_init_unsigned(&t.cur_tok->data.int_lit.bigint, get_digit_value(c));
bigint_init_unsigned(&t.specified_exponent, 0);
break;
case '"':
begin_token(&t, TokenIdStringLiteral);
@ -1189,17 +1184,15 @@ void tokenize(Buf *buf, Tokenization *out) {
switch (c) {
case 'b':
t.radix = 2;
t.state = TokenizeStateNumber;
t.state = TokenizeStateNumberNoUnderscore;
break;
case 'o':
t.radix = 8;
t.exp_add_amt = 3;
t.state = TokenizeStateNumber;
t.state = TokenizeStateNumberNoUnderscore;
break;
case 'x':
t.radix = 16;
t.exp_add_amt = 4;
t.state = TokenizeStateNumber;
t.state = TokenizeStateNumberNoUnderscore;
break;
default:
// reinterpret as normal number
@ -1208,9 +1201,27 @@ void tokenize(Buf *buf, Tokenization *out) {
continue;
}
break;
case TokenizeStateNumberNoUnderscore:
if (c == '_') {
invalid_char_error(&t, c);
break;
} else if (get_digit_value(c) < t.radix) {
t.is_trailing_underscore = false;
t.state = TokenizeStateNumber;
}
// fall through
case TokenizeStateNumber:
{
if (c == '_') {
t.is_trailing_underscore = true;
t.state = TokenizeStateNumberNoUnderscore;
break;
}
if (c == '.') {
if (t.is_trailing_underscore) {
invalid_char_error(&t, c);
break;
}
if (t.radix != 16 && t.radix != 10) {
invalid_char_error(&t, c);
}
@ -1222,13 +1233,18 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
t.state = TokenizeStateFloatExponentUnsigned;
t.radix = 10; // exponent is always base 10
assert(t.cur_tok->id == TokenIdIntLiteral);
bigint_init_bigint(&t.significand, &t.cur_tok->data.int_lit.bigint);
set_token_id(&t, t.cur_tok, TokenIdFloatLiteral);
break;
}
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
if (t.is_trailing_underscore) {
invalid_char_error(&t, c);
break;
}
if (is_symbol_char(c)) {
invalid_char_error(&t, c);
}
@ -1259,20 +1275,37 @@ void tokenize(Buf *buf, Tokenization *out) {
continue;
}
t.pos -= 1;
t.state = TokenizeStateFloatFraction;
t.state = TokenizeStateFloatFractionNoUnderscore;
assert(t.cur_tok->id == TokenIdIntLiteral);
bigint_init_bigint(&t.significand, &t.cur_tok->data.int_lit.bigint);
set_token_id(&t, t.cur_tok, TokenIdFloatLiteral);
continue;
}
case TokenizeStateFloatFractionNoUnderscore:
    // Reached right after '.' or after a '_' inside the fraction: the next
    // character must be a digit of the current radix.
    if (c == '_') {
        invalid_char_error(&t, c);
        // Stop here, as TokenizeStateNumberNoUnderscore does. Without this
        // break the fall-through below would treat the '_' as a valid
        // separator and reassign t.state after the error was reported.
        break;
    } else if (get_digit_value(c) < t.radix) {
        t.is_trailing_underscore = false;
        t.state = TokenizeStateFloatFraction;
    }
    // fall through
case TokenizeStateFloatFraction:
{
if (c == '_') {
t.is_trailing_underscore = true;
t.state = TokenizeStateFloatFractionNoUnderscore;
break;
}
if (is_exponent_signifier(c, t.radix)) {
t.state = TokenizeStateFloatExponentUnsigned;
t.radix = 10; // exponent is always base 10
break;
}
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
if (t.is_trailing_underscore) {
invalid_char_error(&t, c);
break;
}
if (is_symbol_char(c)) {
invalid_char_error(&t, c);
}
@ -1282,46 +1315,47 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateStart;
continue;
}
t.exponent_in_bin_or_dec -= t.exp_add_amt;
if (t.radix == 10) {
// For now we use strtod to parse decimal floats, so we just have to get to the
// end of the token.
break;
}
BigInt digit_value_bi;
bigint_init_unsigned(&digit_value_bi, digit_value);
BigInt radix_bi;
bigint_init_unsigned(&radix_bi, t.radix);
BigInt multiplied;
bigint_mul(&multiplied, &t.significand, &radix_bi);
bigint_add(&t.significand, &multiplied, &digit_value_bi);
break;
// we use parse_f128 to generate the float literal, so just
// need to get to the end of the token
}
break;
case TokenizeStateFloatExponentUnsigned:
switch (c) {
case '+':
t.is_exp_negative = false;
t.state = TokenizeStateFloatExponentNumber;
t.state = TokenizeStateFloatExponentNumberNoUnderscore;
break;
case '-':
t.is_exp_negative = true;
t.state = TokenizeStateFloatExponentNumber;
t.state = TokenizeStateFloatExponentNumberNoUnderscore;
break;
default:
// reinterpret as normal exponent number
t.pos -= 1;
t.is_exp_negative = false;
t.state = TokenizeStateFloatExponentNumber;
t.state = TokenizeStateFloatExponentNumberNoUnderscore;
continue;
}
break;
case TokenizeStateFloatExponentNumberNoUnderscore:
    // Reached right after the exponent marker/sign or after a '_' inside the
    // exponent: the next character must be a digit of the current radix.
    if (c == '_') {
        invalid_char_error(&t, c);
        // Stop here, as TokenizeStateNumberNoUnderscore does. Without this
        // break the fall-through below would treat the '_' as a valid
        // separator and reassign t.state after the error was reported.
        break;
    } else if (get_digit_value(c) < t.radix) {
        t.is_trailing_underscore = false;
        t.state = TokenizeStateFloatExponentNumber;
    }
    // fall through
case TokenizeStateFloatExponentNumber:
{
if (c == '_') {
t.is_trailing_underscore = true;
t.state = TokenizeStateFloatExponentNumberNoUnderscore;
break;
}
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
if (t.is_trailing_underscore) {
invalid_char_error(&t, c);
break;
}
if (is_symbol_char(c)) {
invalid_char_error(&t, c);
}
@ -1331,21 +1365,9 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateStart;
continue;
}
if (t.radix == 10) {
// For now we use strtod to parse decimal floats, so we just have to get to the
// end of the token.
break;
}
BigInt digit_value_bi;
bigint_init_unsigned(&digit_value_bi, digit_value);
BigInt radix_bi;
bigint_init_unsigned(&radix_bi, 10);
BigInt multiplied;
bigint_mul(&multiplied, &t.specified_exponent, &radix_bi);
bigint_add(&t.specified_exponent, &multiplied, &digit_value_bi);
// we use parse_f128 to generate the float literal, so just
// need to get to the end of the token
}
break;
case TokenizeStateSawDash:
@ -1399,6 +1421,9 @@ void tokenize(Buf *buf, Tokenization *out) {
case TokenizeStateStart:
case TokenizeStateError:
break;
case TokenizeStateNumberNoUnderscore:
case TokenizeStateFloatFractionNoUnderscore:
case TokenizeStateFloatExponentNumberNoUnderscore:
case TokenizeStateNumberDot:
tokenize_error(&t, "unterminated number literal");
break;

View File

@ -389,6 +389,102 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
"tmp.zig:5:29: error: invalid token: '.'",
});
cases.add("invalid underscore placement in float literal - 1",
\\fn main() void {
\\ var bad: f128 = 0._0;
\\})
, &[_][]const u8{
"tmp.zig:2:23: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 2",
\\fn main() void {
\\ var bad: f128 = 0_.0;
\\})
, &[_][]const u8{
"tmp.zig:2:23: error: invalid character: '.'",
});
cases.add("invalid underscore placement in float literal - 3",
\\fn main() void {
\\ var bad: f128 = 0.0_;
\\})
, &[_][]const u8{
"tmp.zig:2:25: error: invalid character: ';'",
});
cases.add("invalid underscore placement in float literal - 4",
\\fn main() void {
\\ var bad: f128 = 1.0e_1;
\\})
, &[_][]const u8{
"tmp.zig:2:25: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 5",
\\fn main() void {
\\ var bad: f128 = 1.0e+_1;
\\})
, &[_][]const u8{
"tmp.zig:2:26: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 6",
\\fn main() void {
\\ var bad: f128 = 1.0e-_1;
\\})
, &[_][]const u8{
"tmp.zig:2:26: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 7",
\\fn main() void {
\\ var bad: f128 = 1.0e-1_;
\\})
, &[_][]const u8{
"tmp.zig:2:28: error: invalid character: ';'",
});
cases.add("invalid underscore placement in float literal - 9",
\\fn main() void {
\\ var bad: f128 = 1__0.0e-1;
\\})
, &[_][]const u8{
"tmp.zig:2:23: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 10",
\\fn main() void {
\\ var bad: f128 = 1.0__0e-1;
\\})
, &[_][]const u8{
"tmp.zig:2:25: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 11",
\\fn main() void {
\\ var bad: f128 = 1.0e-1__0;
\\})
, &[_][]const u8{
"tmp.zig:2:28: error: invalid character: '_'",
});
cases.add("invalid underscore placement in float literal - 12",
\\fn main() void {
\\ var bad: f128 = 0_x0.0;
\\})
, &[_][]const u8{
"tmp.zig:2:23: error: invalid character: 'x'",
});
cases.add("invalid underscore placement in float literal - 13",
\\fn main() void {
\\ var bad: f128 = 0x_0.0;
\\})
, &[_][]const u8{
"tmp.zig:2:23: error: invalid character: '_'",
});
cases.add("var args without c calling conv",
\\fn foo(args: ...) void {}
\\comptime {

View File

@ -411,6 +411,34 @@ test "quad hex float literal parsing accurate" {
comptime S.doTheTest();
}
// Checks that `_` digit separators do not change the value of a literal.
// Each integer radix is exercised in four forms: all zeros, grouped digits,
// zero-padded grouped digits, and a separator between every digit.
test "underscore separator parsing" {
// decimal integers
expect(0_0_0_0 == 0);
expect(1_234_567 == 1234567);
expect(001_234_567 == 1234567);
expect(0_0_1_2_3_4_5_6_7 == 1234567);
// binary integers
expect(0b0_0_0_0 == 0);
expect(0b1010_1010 == 0b10101010);
expect(0b0000_1010_1010 == 0b10101010);
expect(0b1_0_1_0_1_0_1_0 == 0b10101010);
// octal integers
expect(0o0_0_0_0 == 0);
expect(0o1010_1010 == 0o10101010);
expect(0o0000_1010_1010 == 0o10101010);
expect(0o1_0_1_0_1_0_1_0 == 0o10101010);
// hexadecimal integers
expect(0x0_0_0_0 == 0);
expect(0x1010_1010 == 0x10101010);
expect(0x0000_1010_1010 == 0x10101010);
expect(0x1_0_1_0_1_0_1_0 == 0x10101010);
// decimal floats: separators in the integer part, fraction and exponent
expect(123_456.789_000e1_0 == 123456.789000e10);
expect(0_1_2_3_4_5_6.7_8_9_0_0_0e0_0_1_0 == 123456.789000e10);
// hex floats: separators in the integer part, fraction and (decimal) exponent
expect(0x1234_5678.9ABC_DEF0p-1_0 == 0x12345678.9ABCDEF0p-10);
expect(0x1_2_3_4_5_6_7_8.9_A_B_C_D_E_F_0p-0_0_0_1_0 == 0x12345678.9ABCDEF0p-10);
}
test "hex float literal within range" {
const a = 0x1.0p16383;
const b = 0x0.1p16387;