/**
 * Hand-written lexer that splits D source text into a flat array of `Token`s.
 *
 * Tokens produced by this module carry one of the following types:
 * `""` (operators, punctuation and keywords), `"identifier"`,
 * `"stringLiteral"`, `"characterLiteral"` and `"integerLiteral"`.
 * Whitespace and comments are discarded.
 */
module tcenal.d.lexer;

import std.array : Appender;
import std.ascii : isAlpha, isAlphaNum, isWhite, isDigit;
import std.range.primitives : empty;
import std.algorithm : startsWith;
import std.meta : AliasSeq;

import tcenal.parser_combinator.token : Token;

import compile_time_unittest : enableCompileTimeUnittest;
import assert_that : assertThat, eq, array, fields;

mixin enableCompileTimeUnittest;


/**
 * Lexes a complete source text.
 *
 * Params:
 *   src = the source text to tokenize
 * Returns: the tokens found in `src`, in source order
 * Throws: `Exception` when the input cannot be tokenized (see `root`)
 */
Token[] lex(string src)
{
    return root(src);
}
unittest
{
    mixin assertThat!(
        "tokens", q{
            lex(q{
                import std.stdio : writeln;
                void main()
                {
                    writeln("Hello, world!");
                }
            })
        },
        array!()._!(
            fields!()._!(eq!q{import}, eq!""),
            fields!()._!(eq!q{std}, eq!"identifier"),
            fields!()._!(eq!q{.}, eq!""),
            fields!()._!(eq!q{stdio}, eq!"identifier"),
            fields!()._!(eq!q{:}, eq!""),
            fields!()._!(eq!q{writeln}, eq!"identifier"),
            fields!()._!(eq!q{;}, eq!""),
            fields!()._!(eq!q{void}, eq!""),
            fields!()._!(eq!q{main}, eq!"identifier"),
            fields!()._!(eq!q{(}, eq!""),
            fields!()._!(eq!q{)}, eq!""),
            fields!()._!(eq!"{", eq!""),
            fields!()._!(eq!q{writeln}, eq!"identifier"),
            fields!()._!(eq!q{(}, eq!""),
            fields!()._!(eq!q{"Hello, world!"}, eq!"stringLiteral"),
            fields!()._!(eq!q{)}, eq!""),
            fields!()._!(eq!q{;}, eq!""),
            fields!()._!(eq!"}", eq!""),
        )
    );
}

/**
 * Main lexer loop: repeatedly strips whitespace and comments from the front
 * of `src` and dispatches to the specialised sub-lexers for everything else.
 *
 * Params:
 *   src = the source text to tokenize
 * Returns: the tokens found in `src`, in source order
 * Throws: `Exception` carrying the remaining input as its message when no
 *         rule matches the current position; exceptions thrown by the
 *         sub-lexers propagate unchanged
 */
Token[] root(string src)
{
    Appender!(Token[]) tokenAppender;

    loop:
    while (!src.empty) {
        if (src[0].isWhite()) {
            src = src[1..$];
            continue;
        }

        // Comments must be recognised before operators, otherwise the
        // leading '/' would be emitted as a division token.
        if (src.startsWith("//"))
        {
            lineComment(src);
            continue;
        }

        if (src.startsWith("/*"))
        {
            blockComment(src);
            continue;
        }

        if (src.startsWith("/+"))
        {
            nestingBlockComment(src);
            continue;
        }

        // Every operator appears after all longer operators it is a prefix
        // of, so the first match is always the longest one.
        alias untypedTokens = AliasSeq!(
            "~=", "~", "}", "||", "|=", "|", "{", "^^=", "^^", "^=", "^",
            "]", "[", "@", "?", ">>>=", ">>>", ">>=", ">>", ">=", ">",
            "=>", "==", "=", "<>=", "<>", "<=", "<<=", "<<", "<", ";", ":",
            "/=", "/", "...", "..", ".", "-=", "--", "-", ",", "+=", "++",
            "+", "*=", "*", ")", "(", "&=", "&&", "&", "%=", "%", "$", "#",
            "!>=", "!>", "!=", "!<>=", "!<>", "!<=", "!<", "!");
        foreach (untypedToken; untypedTokens)
        {
            if (src.startsWith(untypedToken)) {
                tokenAppender.put(Token(untypedToken));
                src = src[untypedToken.length..$];

                continue loop;
            }
        }

        // `r"` has to be tested before the identifier rule, which would
        // otherwise consume the leading 'r'.
        if (src.startsWith(`r"`) || src[0] == '`' || src[0] == '"')
        {
            tokenAppender.put(stringLiteral(src));
            continue;
        }

        if (src[0] == '\'')
        {
            tokenAppender.put(characterLiteral(src));
            continue;
        }

        if (src[0].isDigit())
        {
            tokenAppender.put(numericLiteral(src));
            continue;
        }

        if (src[0].isAlpha() || src[0] == '_')
        {
            tokenAppender.put(identifier(src));
            continue;
        }

        throw new Exception(src);
    }

    return tokenAppender.data;
}

/**
 * Consumes a `//` line comment from the front of `src`, including the
 * terminating newline. A comment that runs to the end of the input
 * consumes everything.
 *
 * Params:
 *   src = input positioned at the comment; advanced past it when passed
 *         as an lvalue
 */
void lineComment()(auto ref string src)
{
    while (!src.empty)
    {
        if (src[0] == '\n')
        {
            src = src[1..$];
            break;
        }

        src = src[1..$];
    }
}

/**
 * Consumes a `/* ... *` `/` block comment from the front of `src`.
 * An unterminated comment consumes the rest of the input.
 *
 * Params:
 *   src = input positioned at the comment; advanced past it when passed
 *         as an lvalue
 */
void blockComment()(auto ref string src)
{
    // Skip the opener first so that its '*' cannot pair with a directly
    // following '/' and make "/*/" look like a complete comment.
    if (src.startsWith("/*"))
    {
        src = src[2..$];
    }

    while (!src.empty)
    {
        if (src.startsWith("*/"))
        {
            src = src[2..$];
            break;
        }

        src = src[1..$];
    }
}
unittest
{
    string simple = "/* x */y";
    blockComment(simple);
    assert(simple == "y");

    string slashAfterOpener = "/*/ x */y";
    blockComment(slashAfterOpener);
    assert(slashAfterOpener == "y");
}

/**
 * Consumes a nesting `/+ ... +/` comment from the front of `src`, including
 * any comments of the same kind nested inside it. An unterminated comment
 * consumes the rest of the input.
 *
 * Params:
 *   src = input positioned at the `/+` opener; advanced past the matching
 *         `+/` when passed as an lvalue
 */
void nestingBlockComment()(auto ref string src)
{
    // Drop the "/+" opener.
    src = src[2..$];

    while (!src.empty)
    {
        if (src.startsWith("/+"))
        {
            nestingBlockComment(src);
            // Re-examine the input right after the nested comment: it may
            // be empty, start another nested comment, or close this one.
            continue;
        }

        if (src.startsWith("+/"))
        {
            src = src[2..$];
            break;
        }

        src = src[1..$];
    }
}
unittest
{
    string nested = "/+ a /+ b +/ c +/rest";
    nestingBlockComment(nested);
    assert(nested == "rest");

    string adjacent = "/+ /+a+//+b+/ +/rest";
    nestingBlockComment(adjacent);
    assert(adjacent == "rest");

    string unterminated = "/+ /+a+/";
    nestingBlockComment(unterminated);
    assert(unterminated.empty);
}

/**
 * Lexes a string literal at the front of `src`.
 *
 * Three forms are recognised: double-quoted strings (`"..."`, with
 * backslash escapes), WYSIWYG strings (`r"..."`) and backtick strings.
 * Backslashes have no special meaning in the latter two.
 *
 * Params:
 *   src = input positioned at the literal; advanced past it when passed
 *         as an lvalue
 * Returns: a token of type `"stringLiteral"` whose value is the literal
 *          including its delimiters
 * Throws: `Exception` when the closing delimiter is missing
 */
Token stringLiteral()(auto ref string src)
{
    char closingQuote;
    bool escapeSequenceAvailable;
    size_t contentStartingIndex;

    switch (src[0])
    {
        case 'r':
            closingQuote = '"';
            contentStartingIndex = 2;
            break;

        case '`':
            closingQuote = '`';
            contentStartingIndex = 1;
            break;

        case '"':
            closingQuote = '"';
            escapeSequenceAvailable = true;
            contentStartingIndex = 1;
            break;

        default:
            assert(0);
    }

    size_t closingIndex;
    bool closed;
    bool inEscapeSequence;
    foreach (i, c; src) {
        if (i < contentStartingIndex) continue;

        if (inEscapeSequence)
        {
            // The character after a backslash is taken verbatim. Handling
            // this before the backslash test lets "\\" end the escape
            // instead of starting a new one.
            inEscapeSequence = false;
            continue;
        }

        if (escapeSequenceAvailable && c == '\\')
        {
            inEscapeSequence = true;
            continue;
        }

        if (c == closingQuote)
        {
            closingIndex = i;
            closed = true;
            break;
        }
    }

    if (!closed) throw new Exception("unterminated string literal");

    Token token = Token(src[0..(closingIndex + 1)], "stringLiteral");
    src = src[(closingIndex + 1)..$];

    return token;
}
unittest
{
    assert(stringLiteral(q{"foo"}) == Token(q{"foo"}, "stringLiteral"));
    assert(stringLiteral(q{"foo\"bar"}) == Token(q{"foo\"bar"}, "stringLiteral"));
    assert(stringLiteral(q{r"f\o\o"}) == Token(q{r"f\o\o"}, "stringLiteral"));
    assert(stringLiteral(q{`"f\o\o"`}) == Token(q{`"f\o\o"`}, "stringLiteral"));

    // An escaped backslash directly before the closing quote.
    assert(stringLiteral(`"\\"`) == Token(`"\\"`, "stringLiteral"));
    assert(stringLiteral(`"a\\" ~ b`) == Token(`"a\\"`, "stringLiteral"));
}

/**
 * Lexes a character literal at the front of `src`.
 *
 * The literal extends from the opening quote to the next single quote that
 * follows the first character or escape sequence, so multi-character
 * escapes (`'\x41'`, `'\u00e9'`, `'\&amp;'`) and multi-byte UTF-8
 * characters are taken whole.
 *
 * Params:
 *   src = input positioned at the opening `'`; advanced past the literal
 *         when passed as an lvalue
 * Returns: a token of type `"characterLiteral"` whose value is the literal
 *          including its quotes
 * Throws: `Exception` when the closing quote is missing
 */
Token characterLiteral()(auto ref string src)
{
    // Step over the opening quote and the first content unit: either a
    // backslash plus the character it escapes, or one plain code unit.
    // This keeps '\'' and '\\' from ending at the wrong quote.
    size_t closingIndex = (src.length > 1 && src[1] == '\\') ? 3 : 2;

    // Remaining code units (rest of a long escape or of a multi-byte
    // character) belong to the literal up to the closing quote.
    while (closingIndex < src.length && src[closingIndex] != '\'')
    {
        ++closingIndex;
    }

    if (closingIndex >= src.length) throw new Exception("unterminated character literal");

    Token token;
    token.type = "characterLiteral";
    token.value = src[0..(closingIndex + 1)];
    src = src[(closingIndex + 1)..$];

    return token;
}
unittest
{
    assert(characterLiteral(q{'c'}) == Token(q{'c'}, "characterLiteral"));
    assert(characterLiteral(q{'\n'}) == Token(q{'\n'}, "characterLiteral"));
    assert(characterLiteral(q{'\\'}) == Token(q{'\\'}, "characterLiteral"));

    assert(characterLiteral(`'\''`) == Token(`'\''`, "characterLiteral"));
    assert(characterLiteral(`'\x41';`) == Token(`'\x41'`, "characterLiteral"));
    assert(characterLiteral(`'\u00e9'`) == Token(`'\u00e9'`, "characterLiteral"));
}

/**
 * Lexes an identifier or keyword at the front of `src`.
 *
 * The longest run of ASCII letters, digits and underscores is taken.
 *
 * Params:
 *   src = input positioned at the identifier; advanced past it when passed
 *         as an lvalue
 * Returns: a token of type `"identifier"`, or of type `""` when the text is
 *          a D keyword
 */
Token identifier()(auto ref string src)
{
    // Defaults to "runs to the end of the input".
    size_t immediatelyFollowingNonAlphaNumIndex = src.length;

    foreach (i, c; src) {
        if (!c.isAlphaNum() && c != '_')
        {
            immediatelyFollowingNonAlphaNumIndex = i;
            break;
        }
    }

    // Preserved quirk: when the very first character is not part of an
    // identifier, the whole input is taken. root never calls with such input.
    if (immediatelyFollowingNonAlphaNumIndex == 0) immediatelyFollowingNonAlphaNumIndex = src.length;

    Token token = Token(src[0..immediatelyFollowingNonAlphaNumIndex], "identifier");
    src = src[immediatelyFollowingNonAlphaNumIndex..$];

    alias keywords = AliasSeq!(
        "abstract", "alias", "align", "asm", "assert", "auto",
        "body", "bool", "break", "byte",
        "case", "cast", "catch", "cdouble", "cent", "cfloat", "char",
        "class", "const", "continue", "creal",
        "dchar", "debug", "default", "delegate", "delete", "deprecated",
        "do", "double",
        "else", "enum", "export", "extern",
        "false", "final", "finally", "float", "for", "foreach",
        "foreach_reverse", "function",
        "goto",
        "idouble", "if", "ifloat", "immutable", "import", "in", "inout",
        "int", "interface", "invariant", "ireal", "is",
        "lazy", "long",
        "macro", "mixin", "module",
        "new", "nothrow", "null",
        "out", "override",
        "package", "pragma", "private", "protected", "public", "pure",
        "real", "ref", "return",
        "scope", "shared", "short", "static", "struct", "super", "switch",
        "synchronized",
        "template", "this", "throw", "true", "try", "typedef", "typeid",
        "typeof",
        "ubyte", "ucent", "uint", "ulong", "union", "unittest", "ushort",
        "version", "void", "volatile",
        "wchar", "while", "with",
        "__FILE__", "__FILE_FULL_PATH__", "__MODULE__", "__LINE__",
        "__FUNCTION__", "__PRETTY_FUNCTION__", "__gshared", "__traits",
        "__vector", "__parameters");
    foreach (keyword; keywords)
    {
        if (token.value == keyword) {
            // Keywords are emitted as untyped tokens, like operators.
            token.type = "";
            break;
        }
    }

    return token;
}
unittest
{
    mixin assertThat!("token", q{identifier(q{foo})}, fields!()._!(eq!q{foo}, eq!"identifier"));
    mixin assertThat!("token", q{identifier(q{bar57})}, fields!()._!(eq!q{bar57}, eq!"identifier"));
    mixin assertThat!("token", q{identifier(q{_foo_bar_57})}, fields!()._!(eq!q{_foo_bar_57}, eq!"identifier"));

    // `delete` and `macro` are keywords and must come out untyped.
    assert(identifier("delete").value == "delete");
    assert(identifier("delete").type == "");
    assert(identifier("macro").type == "");
    assert(identifier("deleted").type == "identifier");
}

/**
 * Lexes a decimal integer literal at the front of `src`.
 *
 * Only the longest run of decimal digits and underscores is taken; radix
 * prefixes, suffixes and floating-point forms are not recognised here.
 *
 * Params:
 *   src = input positioned at the first digit; advanced past the literal
 *         when passed as an lvalue
 * Returns: a token of type `"integerLiteral"`
 */
Token numericLiteral()(auto ref string src)
{
    // Defaults to "runs to the end of the input".
    size_t immediatelyFollowingNonDigitIndex = src.length;

    foreach (i, c; src)
    {
        if (!c.isDigit() && c != '_')
        {
            immediatelyFollowingNonDigitIndex = i;
            break;
        }
    }

    // Preserved quirk: when the very first character is neither a digit nor
    // an underscore, the whole input is taken. root never calls with such input.
    if (immediatelyFollowingNonDigitIndex == 0) immediatelyFollowingNonDigitIndex = src.length;

    Token token = Token(src[0..immediatelyFollowingNonDigitIndex], "integerLiteral");
    src = src[immediatelyFollowingNonDigitIndex..$];

    return token;
}
unittest
{
    assert(numericLiteral(q{123}) == Token(q{123}, "integerLiteral"));
}