1 module tcenal.d.lexer;
2 
3 import std.array : Appender;
4 import std.ascii : isAlpha, isAlphaNum, isWhite, isDigit;
5 import std.range.primitives : empty;
6 import std.algorithm : startsWith;
7 import std.meta : AliasSeq;
8 
9 import tcenal.parser_combinator.token : Token;
10 
11 import compile_time_unittest : enableCompileTimeUnittest;
12 import assert_that : assertThat, eq, array, fields;
13 
14 mixin enableCompileTimeUnittest;
15 
16 
/// Public entry point of the lexer: turns D source text into a token array.
///
/// Params:
///     src = the source text to tokenize
/// Returns: the tokens produced by `root` for `src`.
Token[] lex(string src)
{
    auto tokens = src.root();
    return tokens;
}
// End-to-end check of `lex` on a small "Hello, world" program.
// Each expected entry is a (value, type) pair: keywords and punctuation carry
// an empty type, while names and literals carry "identifier" / "stringLiteral".
unittest
{
    mixin assertThat!(
        "tokens", q{
            lex(q{
                import std.stdio : writeln;
                void main()
                {
                    writeln("Hello, world!");
                }
            })
        },
        array!()._!(
            fields!()._!(eq!q{import}, eq!""),
            fields!()._!(eq!q{std}, eq!"identifier"),
            fields!()._!(eq!q{.}, eq!""),
            fields!()._!(eq!q{stdio}, eq!"identifier"),
            fields!()._!(eq!q{:}, eq!""),
            fields!()._!(eq!q{writeln}, eq!"identifier"),
            fields!()._!(eq!q{;}, eq!""),
            fields!()._!(eq!q{void}, eq!""),
            fields!()._!(eq!q{main}, eq!"identifier"),
            fields!()._!(eq!q{(}, eq!""),
            fields!()._!(eq!q{)}, eq!""),
            // Braces are spelled as ordinary strings: an unbalanced brace
            // cannot appear inside a q{} token string.
            fields!()._!(eq!"{", eq!""),
            fields!()._!(eq!q{writeln}, eq!"identifier"),
            fields!()._!(eq!q{(}, eq!""),
            fields!()._!(eq!q{"Hello, world!"}, eq!"stringLiteral"),
            fields!()._!(eq!q{)}, eq!""),
            fields!()._!(eq!q{;}, eq!""),
            fields!()._!(eq!"}", eq!""),
        )
    );
}
55 
/// Main scanning loop: repeatedly inspects the front of `src`, discards
/// whitespace and comments, and appends one token per recognised lexeme.
///
/// Params:
///     src = the source text to tokenize
/// Returns: all tokens found, in source order.
/// Throws: `Exception` carrying the remaining input as its message when the
///         front of the input matches no known lexeme.
Token[] root(string src)
{
    // Operators and punctuation, emitted as tokens with no type. Longer
    // spellings are listed before their prefixes so that the first match
    // is the longest one.
    alias untypedTokens = AliasSeq!("~=", "~", "}", "||", "|=", "|", "{", "^^=", "^^", "^=", "^", "]", "[", "@", "?", ">>>=", ">>>", ">>=", ">>", ">=", ">", "=>", "==", "=", "<>=", "<>", "<=", "<<=", "<<", "<", ";", ":", "/=", "/", "...", "..", ".", "-=", "--", "-", ",", "+=", "++", "+", "*=", "*", ")", "(", "&=", "&&", "&", "%=", "%", "$", "#", "!>=", "!>", "!=", "!<>=", "!<>", "!<=", "!<", "!");

    Appender!(Token[]) collected;

    scan:
    while (!src.empty)
    {
        immutable head = src[0];

        // Whitespace separates tokens but produces none.
        if (head.isWhite())
        {
            src = src[1..$];
            continue;
        }

        // Comments are tried before the operator table so that their
        // leading '/' is not emitted as a division token.
        if (src.startsWith("//"))
        {
            lineComment(src);
            continue;
        }

        if (src.startsWith("/*"))
        {
            blockComment(src);
            continue;
        }

        if (src.startsWith("/+"))
        {
            nestingBlockComment(src);
            continue;
        }

        // Unrolled at compile time over the operator table.
        foreach (candidate; untypedTokens)
        {
            if (src.startsWith(candidate))
            {
                collected.put(Token(candidate));
                src = src[candidate.length..$];

                continue scan;
            }
        }

        // The r" check must run before the identifier check below,
        // otherwise the 'r' prefix would be lexed as a name.
        if (head == '"' || head == '`' || src.startsWith(`r"`))
        {
            collected.put(stringLiteral(src));
            continue;
        }

        if (head == '\'')
        {
            collected.put(characterLiteral(src));
            continue;
        }

        if (head.isDigit())
        {
            collected.put(numericLiteral(src));
            continue;
        }

        if (head == '_' || head.isAlpha())
        {
            collected.put(identifier(src));
            continue;
        }

        // Nothing matched: report the unconsumed input.
        throw new Exception(src);
    }

    return collected.data;
}
125 
/// Drops a `//` line comment from the front of `src`.
///
/// Everything up to and including the first '\n' is removed. When there is
/// no newline the whole remaining input is consumed.
///
/// Params:
///     src = remaining source text; advanced in place when passed as an lvalue
void lineComment()(auto ref string src)
{
    foreach (position, unit; src)
    {
        if (unit == '\n')
        {
            // Resume right after the line terminator.
            src = src[position + 1 .. $];
            return;
        }
    }

    // No newline found: the comment runs to the end of the input.
    src = src[$ .. $];
}
139 
/// Drops a non-nesting block comment (slash-star ... star-slash) from the
/// front of `src`.
///
/// The opening delimiter is stepped over before the search for the closing
/// delimiter begins. Otherwise the '*' of the opener could pair with an
/// immediately following '/', and an input beginning with `/*/` would be
/// treated as a complete comment.
///
/// An unterminated comment consumes the rest of the input without error.
///
/// Params:
///     src = remaining source text, expected to start with the opening
///           delimiter; advanced in place when passed as an lvalue
void blockComment()(auto ref string src)
{
    // Guarded rather than unconditional so that input which does not start
    // with the opener is still scanned from its first character.
    if (src.startsWith("/*"))
    {
        src = src[2..$];
    }

    while (!src.empty)
    {
        if (src.startsWith("*/"))
        {
            src = src[2..$];
            break;
        }

        src = src[1..$];
    }
}
unittest
{
    string plain = "/* a */b";
    blockComment(plain);
    assert(plain == "b");

    // The opener's '*' must not be reused as part of the closer.
    string overlapping = "/*/ a */b";
    blockComment(overlapping);
    assert(overlapping == "b");

    string unterminated = "/* a";
    blockComment(unterminated);
    assert(unterminated.empty);
}
153 
/// Drops a nesting block comment (slash-plus ... plus-slash) from the front
/// of `src`, honouring nested comments of the same kind.
///
/// Nesting is tracked with a depth counter. Every delimiter is consumed as
/// a whole, so no character following a nested comment is skipped and the
/// input is never sliced past its end.
///
/// An unterminated comment consumes the rest of the input without error.
///
/// Params:
///     src = remaining source text, which must start with the two-character
///           opening delimiter; advanced in place when passed as an lvalue
void nestingBlockComment()(auto ref string src)
{
    // Step over the opener of the outermost comment.
    src = src[2..$];
    size_t depth = 1;

    while (!src.empty)
    {
        if (src.startsWith("/+"))
        {
            // A nested comment begins.
            ++depth;
            src = src[2..$];
        }
        else if (src.startsWith("+/"))
        {
            src = src[2..$];

            // Only the closer matching the outermost opener ends the scan.
            if (--depth == 0) break;
        }
        else
        {
            src = src[1..$];
        }
    }
}
unittest
{
    string flat = "/+ a +/x";
    nestingBlockComment(flat);
    assert(flat == "x");

    string nested = "/+ a /+ b +/ c +/x";
    nestingBlockComment(nested);
    assert(nested == "x");

    // Two nested comments back to back inside the outer one.
    string adjacent = "/+ /+a+//+b+/ +/x";
    nestingBlockComment(adjacent);
    assert(adjacent == "x");

    // Outer comment never closed: input is consumed, nothing is thrown.
    string unterminated = "/+ /+ a +/";
    nestingBlockComment(unterminated);
    assert(unterminated.empty);
}
174 
/// Lexes one string literal from the front of `src`.
///
/// Three forms are recognised, selected by the first character:
/// a WYSIWYG string introduced by `r` and a double quote, a backquoted
/// WYSIWYG string, and a double-quoted string. Backslash escapes are
/// honoured in the double-quoted form only.
///
/// Params:
///     src = remaining source text, starting at the literal; advanced past
///           the literal when passed as an lvalue
/// Returns: a token of type "stringLiteral" whose value is the literal
///          including its delimiters (and the `r` prefix, if any).
/// Throws: `Exception` when no closing quote is found.
Token stringLiteral()(auto ref string src)
{
    char closingQuote;
    bool escapeSequenceAvailable;
    size_t contentStartingIndex;

    switch (src[0])
    {
        case 'r':
            closingQuote = '"';
            contentStartingIndex = 2;
            break;

        case '`':
            closingQuote = '`';
            contentStartingIndex = 1;
            break;

        case '"':
            closingQuote = '"';
            escapeSequenceAvailable = true;
            contentStartingIndex = 1;
            break;

        default:
            assert(0);
    }

    // 0 doubles as "not found": a real closing quote can never sit at
    // index 0 because the opening delimiter occupies it.
    size_t closingIndex;
    bool inEscapeSequence;
    foreach (i, c; src)
    {
        if (i < contentStartingIndex) continue;

        if (inEscapeSequence)
        {
            // The character after a backslash is taken verbatim. This check
            // comes first so that an escaped backslash does not start a new
            // escape and hide the closing quote that follows it.
            inEscapeSequence = false;
            continue;
        }

        if (escapeSequenceAvailable && c == '\\')
        {
            inEscapeSequence = true;
            continue;
        }

        if (c == closingQuote)
        {
            closingIndex = i;
            break;
        }
    }

    if (closingIndex == 0) throw new Exception("unterminated string literal");

    Token token = Token(src[0..(closingIndex + 1)], "stringLiteral");
    src = src[(closingIndex + 1)..$];

    return token;
}
unittest
{
    assert(stringLiteral(q{"foo"}) == Token(q{"foo"}, "stringLiteral"));
    assert(stringLiteral(q{"foo\"bar"}) == Token(q{"foo\"bar"}, "stringLiteral"));
    assert(stringLiteral(q{r"f\o\o"}) == Token(q{r"f\o\o"}, "stringLiteral"));
    assert(stringLiteral(q{`"f\o\o"`}) == Token(q{`"f\o\o"`}, "stringLiteral"));

    // An escaped backslash directly before the closing quote.
    assert(stringLiteral(q{"\\"}) == Token(q{"\\"}, "stringLiteral"));
    assert(stringLiteral(q{"a\\\"b"}) == Token(q{"a\\\"b"}, "stringLiteral"));

    // The literal must stop at its own closing quote and leave the rest.
    string rest = q{"\\" ~ x};
    assert(stringLiteral(rest) == Token(q{"\\"}, "stringLiteral"));
    assert(rest == " ~ x");
}
237 
/// Lexes one character literal from the front of `src`.
///
/// The literal extends from the opening single quote to the next single
/// quote that is not preceded by a backslash. Scanning for the closer,
/// instead of assuming a fixed width, keeps multi-character escapes such as
/// `\x41` or `\u00e9` and multi-byte UTF-8 characters in one piece.
///
/// Params:
///     src = remaining source text, starting at the opening quote; advanced
///           past the literal when passed as an lvalue
/// Returns: a token of type "characterLiteral" whose value is the literal
///          including both quotes.
/// Throws: `Exception` when no closing quote is found.
Token characterLiteral()(auto ref string src)
{
    // 0 doubles as "not found": index 0 holds the opening quote.
    size_t closingIndex;
    bool inEscapeSequence;

    foreach (i, c; src)
    {
        if (i == 0) continue;

        if (inEscapeSequence)
        {
            // The character after a backslash is taken verbatim, so an
            // escaped quote or backslash cannot end the literal.
            inEscapeSequence = false;
            continue;
        }

        if (c == '\\')
        {
            inEscapeSequence = true;
            continue;
        }

        if (c == '\'')
        {
            closingIndex = i;
            break;
        }
    }

    if (closingIndex == 0) throw new Exception("unterminated character literal");

    Token token;
    token.type = "characterLiteral";
    token.value = src[0..(closingIndex + 1)];
    src = src[(closingIndex + 1)..$];

    return token;
}
unittest
{
    assert(characterLiteral(q{'c'}) == Token(q{'c'}, "characterLiteral"));
    assert(characterLiteral(q{'\n'}) == Token(q{'\n'}, "characterLiteral"));
    assert(characterLiteral(q{'\\'}) == Token(q{'\\'}, "characterLiteral"));

    // Escaped quote and escapes longer than one character.
    assert(characterLiteral(q{'\''}) == Token(q{'\''}, "characterLiteral"));
    assert(characterLiteral(q{'\x41'}) == Token(q{'\x41'}, "characterLiteral"));
    assert(characterLiteral(q{'\u00e9'}) == Token(q{'\u00e9'}, "characterLiteral"));

    // A character encoded as more than one UTF-8 code unit.
    assert(characterLiteral("'\u00e9'") == Token("'\u00e9'", "characterLiteral"));

    // Only the literal itself is consumed.
    string rest = q{'\x41';};
    assert(characterLiteral(rest) == Token(q{'\x41'}, "characterLiteral"));
    assert(rest == ";");
}
262 
/// Lexes one identifier or keyword from the front of `src`.
///
/// The lexeme is the longest leading run of ASCII letters, digits and
/// underscores. If `src` does not start with such a character the returned
/// token has an empty value and `src` is left untouched.
///
/// Params:
///     src = remaining source text; advanced past the lexeme when passed as
///           an lvalue
/// Returns: a token holding the lexeme. Its type is "identifier", or the
///          empty string when the lexeme is one of the listed keywords.
Token identifier()(auto ref string src)
{
    // Length of the leading run of identifier characters.
    size_t end;
    while (end < src.length && (src[end].isAlphaNum() || src[end] == '_'))
    {
        ++end;
    }

    Token token = Token(src[0..end], "identifier");
    src = src[end..$];

    // Each entry must be the bare keyword spelling: it is compared against
    // the lexeme for exact equality.
    alias keywords = AliasSeq!("abstract", "alias", "align", "asm", "assert", "auto", "body", "bool", "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat", "char", "class", "const", "continue", "creal", "dchar", "debug", "default", "delegate", "delete", "deprecated", "do", "double", "else", "enum", "export", "extern", "false", "final", "finally", "float", "for", "foreach", "foreach_reverse", "function", "goto", "idouble", "if", "ifloat", "immutable", "import", "in", "inout", "int", "interface", "invariant", "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow", "null", "out", "override", "package", "pragma", "private", "protected", "public", "pure", "real", "ref", "return", "scope", "shared", "short", "static", "struct", "super", "switch", "synchronized", "template", "this", "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent", "uint", "ulong", "union", "unittest", "ushort", "version", "void", "volatile", "wchar", "while", "with", "__FILE__", "__FILE_FULL_PATH__", "__MODULE__", "__LINE__", "__FUNCTION__", "__PRETTY_FUNCTION__", "__gshared", "__traits", "__vector", "__parameters");
    foreach (keyword; keywords)
    {
        if (token.value == keyword) {
            // Keywords are emitted as untyped tokens.
            token.type = "";
            break;
        }
    }

    return token;
}
unittest
{
    mixin assertThat!("token", q{identifier(q{foo})},         fields!()._!(eq!q{foo},         eq!"identifier"));
    mixin assertThat!("token", q{identifier(q{bar57})},       fields!()._!(eq!q{bar57},       eq!"identifier"));
    mixin assertThat!("token", q{identifier(q{_foo_bar_57})}, fields!()._!(eq!q{_foo_bar_57}, eq!"identifier"));

    // Keywords lose the "identifier" type, including the two whose table
    // entries used to carry a trailing annotation.
    assert(identifier(q{import}) == Token(q{import}, ""));
    assert(identifier(q{delete}) == Token(q{delete}, ""));
    assert(identifier(q{macro}) == Token(q{macro}, ""));

    // Only the identifier itself is consumed.
    string rest = "foo.bar";
    assert(identifier(rest) == Token("foo", "identifier"));
    assert(rest == ".bar");
}
297 
/// Lexes a decimal integer literal from the front of `src`.
///
/// The lexeme is the leading run of ASCII digits and underscores. When that
/// run is empty, the entire remaining input is taken as the lexeme.
///
/// Params:
///     src = remaining source text; advanced past the lexeme when passed as
///           an lvalue
/// Returns: a token of type "integerLiteral" holding the lexeme.
Token numericLiteral()(auto ref string src)
{
    // Count the leading digit/underscore characters.
    size_t end = 0;
    while (end < src.length && (src[end].isDigit() || src[end] == '_'))
    {
        ++end;
    }

    // An empty run falls back to taking everything that is left.
    if (end == 0)
    {
        end = src.length;
    }

    auto lexeme = src[0..end];
    src = src[end..$];

    return Token(lexeme, "integerLiteral");
}
// A plain run of decimal digits is returned whole as an "integerLiteral" token.
unittest
{
    assert(numericLiteral(q{123}) == Token(q{123}, "integerLiteral"));
}