From f5f33f0f6a540cf776712bec42e067b9a0e923b2 Mon Sep 17 00:00:00 2001
From: Przemek Kitszel
Date: Tue, 19 Mar 2024 11:27:33 +0100
Subject: [PATCH] lexer: allow TABS for indentation

Allow the TAB character to be used for indentation.

Accepting TABs as indentation is useful, especially when parts of
build.ninja are hand-written as HEREDOCs in an otherwise TAB-indented file
(either mandated by the style of other parts of the project, or required by
the language itself).

Changing the lexer is easy thanks to the use of re2c. The syntax is perhaps
a bit too permissive now, but it is the job of the parser to reject the use
of mixed indentation.

Let's stop complaining that:
    ninja: error: build.ninja:3: expected 'command =' line
when it is exactly:
    command = cc $cflags -c $in -o $out

tests: print string instead of int for Lexer::Token

Extend the C++ tests with a wrapper for printing Lexer::Token, so that the
string value is reported instead of the numeric value; that gives "newline"
instead of "8" in test results. Original line numbers are kept in error
messages as they were before.

Extend the Python functional test to have TABs used in both variable and
build statements.

Closes #1598 Signed-off-by: Przemek Kitszel --- misc/output_test.py | 24 ++++++++++++++++ src/lexer.cc | 70 +++++++++++++++++++++++---------------------- src/lexer.in.cc | 21 ++++++-------- src/lexer_test.cc | 52 ++++++++++++++++++++++----------- 4 files changed, 105 insertions(+), 62 deletions(-) diff --git a/misc/output_test.py b/misc/output_test.py index 78848cbd4c..ff6e668240 100755 --- a/misc/output_test.py +++ b/misc/output_test.py @@ -162,6 +162,30 @@ def test_tool_inputs(self): out2 ''') + def test_tabs_indent(self): + content = ''' +rule exec + command = $cmd + +var_hello = hell$ + o + +build foo: exec + cmd = touch foo + +build bar: exec $ + foo + cmd = touch bar + +build $var_hello: exec + cmd = touch $var_hello + +build baz: exec $ + bar $var_hello + cmd = touch baz +''' + run(content) + if __name__ == '__main__': unittest.main() diff --git a/src/lexer.cc b/src/lexer.cc index e5729f00a0..efca1be1c3 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -105,12 +105,9 @@ const char* Lexer::TokenErrorHint(Token expected) { string Lexer::DescribeLastError() { if (last_token_) { - switch (last_token_[0]) { - case '\t': - return "tabs are not allowed, use spaces"; - } + return "lexing error <"+string(last_token_)+">"; } - return "lexing error"; + return "lexing error (EOF?)"; } void Lexer::UnreadToken() { @@ -130,7 +127,7 @@ Lexer::Token Lexer::ReadToken() { unsigned int yyaccept = 0; static const unsigned char yybm[] = { 0, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 0, 128, 128, 128, 128, 128, + 128, 160, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 160, 128, 128, 128, 128, 128, 128, 128, @@ -164,16 +161,17 @@ Lexer::Token Lexer::ReadToken() { }; yych = *p; if (yybm[0+yych] & 32) { - goto yy9; + goto yy6; } if (yych <= '^') { if (yych <= ',') { if (yych <= '\f') { if (yych <= 0x00) goto yy2; - if (yych == '\n') goto yy6; + if (yych <= 0x08) goto yy4; + if (yych <= '\n') goto yy9; goto yy4; } else { 
- if (yych <= '\r') goto yy8; + if (yych <= '\r') goto yy11; if (yych == '#') goto yy12; goto yy4; } @@ -228,31 +226,32 @@ Lexer::Token Lexer::ReadToken() { yy5: { token = ERROR; break; } yy6: - ++p; - { token = NEWLINE; break; } -yy8: - yych = *++p; - if (yych == '\n') goto yy28; - goto yy5; -yy9: yyaccept = 0; yych = *(q = ++p); if (yybm[0+yych] & 32) { - goto yy9; + goto yy6; } if (yych <= '\f') { - if (yych == '\n') goto yy6; + if (yych <= 0x08) goto yy8; + if (yych <= '\n') goto yy9; } else { - if (yych <= '\r') goto yy30; - if (yych == '#') goto yy32; + if (yych <= '\r') goto yy28; + if (yych == '#') goto yy30; } -yy11: +yy8: { token = INDENT; break; } +yy9: + ++p; + { token = NEWLINE; break; } +yy11: + yych = *++p; + if (yych == '\n') goto yy32; + goto yy5; yy12: yyaccept = 1; yych = *(q = ++p); if (yych <= 0x00) goto yy5; - goto yy33; + goto yy31; yy13: yych = *++p; yy14: @@ -296,25 +295,27 @@ Lexer::Token Lexer::ReadToken() { if (yych == '|') goto yy44; { token = PIPE; break; } yy28: - ++p; - { token = NEWLINE; break; } -yy30: yych = *++p; - if (yych == '\n') goto yy28; -yy31: + if (yych == '\n') goto yy32; +yy29: p = q; if (yyaccept == 0) { - goto yy11; + goto yy8; } else { goto yy5; } -yy32: +yy30: yych = *++p; -yy33: +yy31: if (yybm[0+yych] & 128) { - goto yy32; + goto yy30; } - if (yych <= 0x00) goto yy31; + if (yych <= 0x00) goto yy29; + goto yy34; +yy32: + ++p; + { token = NEWLINE; break; } +yy34: ++p; { continue; } yy36: @@ -478,7 +479,7 @@ void Lexer::EatWhitespace() { unsigned char yych; static const unsigned char yybm[] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, @@ -631,7 +632,7 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { unsigned char yych; static const unsigned char yybm[] = { 0, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 0, 16, 16, 0, 16, 16, + 16, 48, 0, 16, 16, 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 0, 16, 16, 16, @@ -797,6 +798,7 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { goto yy113; yy128: yych = *++p; + if (yych == '\t') goto yy128; if (yych == ' ') goto yy128; { continue; diff --git a/src/lexer.in.cc b/src/lexer.in.cc index 6f1d8e7937..4dc3cdda18 100644 --- a/src/lexer.in.cc +++ b/src/lexer.in.cc @@ -104,12 +104,9 @@ const char* Lexer::TokenErrorHint(Token expected) { string Lexer::DescribeLastError() { if (last_token_) { - switch (last_token_[0]) { - case '\t': - return "tabs are not allowed, use spaces"; - } + return "lexing error <"+string(last_token_)+">"; } - return "lexing error"; + return "lexing error (EOF?)"; } void Lexer::UnreadToken() { @@ -133,10 +130,10 @@ Lexer::Token Lexer::ReadToken() { simple_varname = [a-zA-Z0-9_-]+; varname = [a-zA-Z0-9_.-]+; - [ ]*"#"[^\000\n]*"\n" { continue; } - [ ]*"\r\n" { token = NEWLINE; break; } - [ ]*"\n" { token = NEWLINE; break; } - [ ]+ { token = INDENT; break; } + [ \t]*"#"[^\000\n]*"\n" { continue; } + [ \t]*"\r\n" { token = NEWLINE; break; } + [ \t]*"\n" { token = NEWLINE; break; } + [ \t]+ { token = INDENT; break; } "build" { token = BUILD; break; } "pool" { token = POOL; break; } "rule" { token = RULE; break; } @@ -175,7 +172,7 @@ void Lexer::EatWhitespace() { for (;;) { ofs_ = p; /*!re2c - [ ]+ { continue; } + [ \t]+ { continue; } "$\r\n" { continue; } "$\n" { continue; } nul { break; } @@ -241,10 +238,10 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { eval->AddText(StringPiece(" ", 1)); continue; } - "$\r\n"[ ]* { + "$\r\n"[ \t]* { continue; } - "$\n"[ ]* { + "$\n"[ \t]* { continue; } "${"varname"}" { diff --git a/src/lexer_test.cc b/src/lexer_test.cc index c5c416dc54..ab884dbe62 100644 --- a/src/lexer_test.cc +++ b/src/lexer_test.cc @@ -17,12 +17,20 @@ #include "eval_env.h" #include "test.h" -using namespace std; +std::string tok(Lexer::Token t) { + const char *str = Lexer::TokenName(t); + if 
(!str) + return "TokenOutOfRange: " + std::to_string(t); + return str; +} + +#define EXPECT_EQ_TOK(t1, t2) \ + EXPECT_EQ(tok(t1), tok(t2)) TEST(Lexer, ReadVarValue) { Lexer lexer("plain text $var $VaR ${x}\n"); EvalString eval; - string err; + std::string err; EXPECT_TRUE(lexer.ReadVarValue(&eval, &err)); EXPECT_EQ("", err); EXPECT_EQ("[plain text ][$var][ ][$VaR][ ][$x]", @@ -32,7 +40,7 @@ TEST(Lexer, ReadVarValue) { TEST(Lexer, ReadEvalStringEscapes) { Lexer lexer("$ $$ab c$: $\ncde\n"); EvalString eval; - string err; + std::string err; EXPECT_TRUE(lexer.ReadVarValue(&eval, &err)); EXPECT_EQ("", err); EXPECT_EQ("[ $ab c: cde]", @@ -41,7 +49,7 @@ TEST(Lexer, ReadEvalStringEscapes) { TEST(Lexer, ReadIdent) { Lexer lexer("foo baR baz_123 foo-bar"); - string ident; + std::string ident; EXPECT_TRUE(lexer.ReadIdent(&ident)); EXPECT_EQ("foo", ident); EXPECT_TRUE(lexer.ReadIdent(&ident)); @@ -56,12 +64,12 @@ TEST(Lexer, ReadIdentCurlies) { // Verify that ReadIdent includes dots in the name, // but in an expansion $bar.dots stops at the dot. Lexer lexer("foo.dots $bar.dots ${bar.dots}\n"); - string ident; + std::string ident; EXPECT_TRUE(lexer.ReadIdent(&ident)); EXPECT_EQ("foo.dots", ident); EvalString eval; - string err; + std::string err; EXPECT_TRUE(lexer.ReadVarValue(&eval, &err)); EXPECT_EQ("", err); EXPECT_EQ("[$bar][.dots ][$bar.dots]", @@ -71,7 +79,7 @@ TEST(Lexer, ReadIdentCurlies) { TEST(Lexer, Error) { Lexer lexer("foo$\nbad $"); EvalString eval; - string err; + std::string err; ASSERT_FALSE(lexer.ReadVarValue(&eval, &err)); EXPECT_EQ("input:2: bad $-escape (literal $ must be written as $$)\n" "bad $\n" @@ -83,16 +91,28 @@ TEST(Lexer, CommentEOF) { // Verify we don't run off the end of the string when the EOF is // mid-comment. Lexer lexer("# foo"); - Lexer::Token token = lexer.ReadToken(); - EXPECT_EQ(Lexer::ERROR, token); + EXPECT_EQ_TOK(Lexer::ERROR, lexer.ReadToken()); } TEST(Lexer, Tabs) { - // Verify we print a useful error on a disallowed character. 
- Lexer lexer(" \tfoobar"); - Lexer::Token token = lexer.ReadToken(); - EXPECT_EQ(Lexer::INDENT, token); - token = lexer.ReadToken(); - EXPECT_EQ(Lexer::ERROR, token); - EXPECT_EQ("tabs are not allowed, use spaces", lexer.DescribeLastError()); + Lexer lexer("rule foo\n" + "\tcommand = foobin $in"); + + EXPECT_EQ_TOK(Lexer::RULE, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::NEWLINE, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::INDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::EQUALS, lexer.ReadToken()); +} + +TEST(Lexer, TabsInVars) { + Lexer lexer("cflags =\n" + "\t-std=c11"); + + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::EQUALS, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::NEWLINE, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::INDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); }