// This file is part of Visual D
//
// Visual D integrates the D programming language into Visual Studio
// Copyright (c) 2010-2011 by Rainer Schuetze, All Rights Reserved
//
// Distributed under the Boost Software License, Version 1.0.
// See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt

module vdc.lexer;

import std.ascii;
import std.uni : isAlpha;
import std.utf;
import std.conv;

enum supportUnorderedCompareOps = false;

// current limitations:
// - nested comments must not nest more than 255 times
// - braces must not nest more than 255 times inside token string
// - number of different delimiters must not exceed 256

enum TokenCat : int
{
    // assumed to match beginning of visuald.colorizer.TokenColor
    Text,
    Keyword,
    Comment,
    Identifier,
    String,
    Literal,
    Text2,
    Operator,
}

struct TokenInfo
{
    TokenCat type;
    int tokid;
    int StartIndex;
    int EndIndex;
}

///////////////////////////////////////////////////////////////////////////////

struct Lexer
{
    enum State
    {
        kWhite,
        kBlockComment,
        kNestedComment,
        kStringCStyle,
        kStringWysiwyg,
        kStringAltWysiwyg,
        kStringDelimited,
        kStringDelimitedNestedBracket,
        kStringDelimitedNestedParen,
        kStringDelimitedNestedBrace,
        kStringDelimitedNestedAngle,
        kStringTokenFirst,  // after 'q', but before '{' to pass '{' as single operator
        kStringToken,  // encoded by tokenStringLevel > 0
        kStringHex,    // for now, treated as State.kStringWysiwyg
        kStringEscape, // removed in D2.026, not supported
    }

    // lexer scan state is: ___TTNNS
    // TT: token string nesting level
    // NN: comment nesting level/string delimiter id
    // S: State
    static State scanState(int state) { return cast(State) (state & 0xf); }
    static int nestingLevel(int state) { return (state >> 4) & 0xff; } // used for state kNestedComment and kStringDelimited
    static int tokenStringLevel(int state) { return (state >> 12) & 0xff; }
    static int getOtherState(int state) { return (state & 0xfff00000); }

    bool mTokenizeTokenString = true;
    bool mSplitNestedComments = true;
    bool mAllowDollarInIdentifiers = false;

    static int toState(State s, int nesting, int tokLevel, int otherState)
    {
        static assert(State.kStringToken <= 15);
        assert(s >= State.kWhite && s <= State.kStringToken);
        assert(nesting < 32);
        assert(tokLevel < 32);

        return s | ((nesting & 0xff) << 4) | ((tokLevel & 0xff) << 12) | otherState;
    }

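    // Minimal sketch of the state packing above: toState combines scan state,
    // nesting level and token string level, and the accessor functions extract
    // them again. Kept under version(unspecified), like the existing unittest
    // further below, so it is not part of the regular build.
    version(unspecified) unittest
    {
        int st = toState(State.kNestedComment, 3, 1, 0);
        assert(scanState(st) == State.kNestedComment);
        assert(nestingLevel(st) == 3);
        assert(tokenStringLevel(st) == 1);
        assert(getOtherState(st) == 0);
    }
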
    static bool isStringState(State state) { return state >= State.kStringCStyle; }
    static bool isCommentState(State state) { return state == State.kBlockComment || state == State.kNestedComment; }

    static string[256] s_delimiters;
    static int s_nextDelimiter;

    static int getDelimiterIndex(string delim)
    {
        int idx = (s_nextDelimiter - 1) & 0xff;
        for( ; idx != s_nextDelimiter; idx = (idx - 1) & 0xff)
            if(delim == s_delimiters[idx])
                return idx;

        s_nextDelimiter = (s_nextDelimiter + 1) & 0xff;
        s_delimiters[idx] = delim;
        return idx;
    }

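    // Sketch of the delimiter registry (example input "EOS" is arbitrary): the
    // same delimiter string maps to the same slot, so its index fits into the
    // nesting bits of the packed state.
    version(unspecified) unittest
    {
        int i1 = getDelimiterIndex("EOS");
        int i2 = getDelimiterIndex("EOS");
        assert(i1 == i2);
    }
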
    int scanIdentifier(S)(S text, size_t startpos, ref size_t pos)
    {
        int pid;
        return scanIdentifier(text, startpos, pos, pid);
    }

    int scanIdentifier(S)(S text, size_t startpos, ref size_t pos, ref int pid)
    {
        while(pos < text.length)
        {
            auto nextpos = pos;
            dchar ch = decode(text, nextpos);
            if(!isIdentifierCharOrDigit(ch))
                break;
            pos = nextpos;
        }
        string ident = toUTF8(text[startpos .. pos]);

        if(findKeyword(ident, pid))
            return pid == TOK_is ? TokenCat.Operator : TokenCat.Keyword;
        if(findSpecial(ident, pid))
            return TokenCat.String;

        pid = TOK_Identifier;
        return TokenCat.Identifier;
    }

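    // Sketch of identifier classification: as in the calls from scan(), pos is
    // expected to sit just behind the first character. Keywords map to their
    // TOK_ id, everything else becomes TOK_Identifier.
    version(unspecified) unittest
    {
        Lexer lex;
        int id;
        size_t pos = 1;
        auto cat = lex.scanIdentifier("return;", 0, pos, id);
        assert(cat == TokenCat.Keyword && id == TOK_return && pos == 6);

        pos = 1;
        cat = lex.scanIdentifier("foo42 ", 0, pos, id);
        assert(cat == TokenCat.Identifier && id == TOK_Identifier && pos == 5);
    }
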
    static int scanOperator(S)(S text, size_t startpos, ref size_t pos, ref int pid)
    {
        size_t len;
        int id = parseOperator(text, startpos, len);
        if(id == TOK_error)
            return TokenCat.Text;

        pid = id;
        pos = startpos + len;
        return TokenCat.Operator;
    }

    static dchar trydecode(S)(S text, ref size_t pos)
    {
        if(pos >= text.length)
            return 0;
        dchar ch = decode(text, pos);
        return ch;
    }

    static void skipDigits(S)(S text, ref size_t pos, int base)
    {
        while(pos < text.length)
        {
            auto nextpos = pos;
            dchar ch = decode(text, nextpos);
            if(ch != '_')
            {
                if(base < 16 && (ch < '0' || ch >= '0' + base))
                    break;
                else if(base == 16 && !isHexDigit(ch))
                    break;
            }
            pos = nextpos;
        }
    }

    static int scanNumber(S)(S text, dchar ch, ref size_t pos)
    {
        int pid;
        return scanNumber(text, ch, pos, pid);
    }

    static int scanNumber(S)(S text, dchar ch, ref size_t pos, ref int pid)
    {
        // pos after first digit
        int base = 10;
        size_t nextpos = pos;
        if(ch == '.')
            goto L_float;

        if(ch == '0')
        {
            size_t prevpos = pos;
            ch = trydecode(text, pos);
            ch = toLower(ch);
            if(ch == 'b')
                base = 2;
            else if (ch == 'x')
                base = 16;
            else
            {
                base = 8;
                pos = prevpos;
            }
        }

        // pos now after prefix or first digit
        skipDigits(text, pos, base);
        // pos now after last digit of integer part

        nextpos = pos;
        ch = trydecode(text, nextpos);

        if((base == 10 && toLower(ch) == 'e') || (base == 16 && toLower(ch) == 'p'))
            goto L_exponent;
        if(base >= 8 && ch == '.') // ".." is the slice token
        {
            { // mute errors about goto skipping declaration
                size_t trypos = nextpos;
                dchar trych = trydecode(text, trypos);
                if (trych == '.')
                    goto L_integer;
                //if (isAlpha(trych) || trych == '_' || (p[1] & 0x80))
                //    goto done;
            }
            // float
            if(base < 10)
                base = 10;
L_float:
            pos = nextpos;
            skipDigits(text, pos, base);

            nextpos = pos;
            ch = trydecode(text, nextpos);
            if((base == 10 && toLower(ch) == 'e') || (base == 16 && toLower(ch) == 'p'))
            {
L_exponent:
                // exponent
                pos = nextpos;
                ch = trydecode(text, nextpos);
                if(ch == '-' || ch == '+')
                    pos = nextpos;
                skipDigits(text, pos, 10);
            }

            // suffix
            nextpos = pos;
            ch = trydecode(text, nextpos);
            if(ch == 'L' || toUpper(ch) == 'F')
            {
L_floatLiteral:
                pos = nextpos;
                ch = trydecode(text, nextpos);
            }
            if(ch == 'i')
L_complexLiteral:
                pos = nextpos;
            pid = TOK_FloatLiteral;
        }
        else
        {
            // check integer suffix
            if(ch == 'i')
                goto L_complexLiteral;
            if(toUpper(ch) == 'F')
                goto L_floatLiteral;

            if(toUpper(ch) == 'U')
            {
                pos = nextpos;
                ch = trydecode(text, nextpos);
                if(ch == 'L')
                    pos = nextpos;
            }
            else if (ch == 'L')
            {
                pos = nextpos;
                ch = trydecode(text, nextpos);
                if(ch == 'i')
                    goto L_complexLiteral;
                if(toUpper(ch) == 'U')
                    pos = nextpos;
            }
L_integer:
            pid = TOK_IntegerLiteral;
        }
        return TokenCat.Literal;
    }

    version(unspecified) unittest
    {
        int pid;
        size_t pos = 1;
        auto cat = scanNumber("0.0i", '0', pos, pid);
        assert(pid == TOK_FloatLiteral);
        pos = 1;
        cat = scanNumber("0.i", '0', pos, pid);
        assert(pid == TOK_IntegerLiteral);
    }

    static State scanBlockComment(S)(S text, ref size_t pos)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            while(ch == '*')
            {
                if (pos >= text.length)
                    return State.kBlockComment;
                ch = decode(text, pos);
                if(ch == '/')
                    return State.kWhite;
            }
        }
        return State.kBlockComment;
    }

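    // Sketch of the block comment scanner on example inputs: it consumes text
    // up to and including the terminating "*/" and reports whether the comment
    // is still open at the end of the text.
    version(unspecified) unittest
    {
        size_t pos = 0;
        assert(scanBlockComment("still a comment */ code", pos) == State.kWhite);
        assert(pos == 18); // just behind the closing "*/"

        pos = 0;
        assert(scanBlockComment("unterminated *", pos) == State.kBlockComment);
    }
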
    State scanNestedComment(S)(S text, size_t startpos, ref size_t pos, ref int nesting)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            while(ch == '/')
            {
                if (pos >= text.length)
                    return State.kNestedComment;
                ch = decode(text, pos);
                if(ch == '+')
                {
                    if(mSplitNestedComments && pos > startpos + 2)
                    {
                        pos -= 2;
                        return State.kNestedComment;
                    }
                    nesting++;
                    goto nextChar;
                }
            }
            while(ch == '+')
            {
                if (pos >= text.length)
                    return State.kNestedComment;
                ch = decode(text, pos);
                if(ch == '/')
                {
                    nesting--;
                    if(nesting == 0)
                        return State.kWhite;
                    if(mSplitNestedComments)
                        return State.kNestedComment;
                    break;
                }
            }
        nextChar:;
        }
        return State.kNestedComment;
    }

    static State scanStringPostFix(S)(S text, ref size_t pos)
    {
        size_t nextpos = pos;
        dchar ch = trydecode(text, nextpos);
        if(ch == 'c' || ch == 'w' || ch == 'd')
            pos = nextpos;
        return State.kWhite;
    }

    static State scanStringWysiwyg(S)(S text, ref size_t pos)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == '"')
                return scanStringPostFix(text, pos);
        }
        return State.kStringWysiwyg;
    }

    static State scanStringAltWysiwyg(S)(S text, ref size_t pos)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == '`')
                return scanStringPostFix(text, pos);
        }
        return State.kStringAltWysiwyg;
    }

    static State scanStringCStyle(S)(S text, ref size_t pos, dchar term)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == '\\')
            {
                if (pos >= text.length)
                    break;
                ch = decode(text, pos);
            }
            else if(ch == term)
                return scanStringPostFix(text, pos);
        }
        return State.kStringCStyle;
    }

    State startDelimiterString(S)(S text, ref size_t pos, ref int nesting)
    {
        import std.uni : isWhite;
        nesting = 1;

        auto startpos = pos;
        dchar ch = trydecode(text, pos);
        State s = State.kStringDelimited;
        if(ch == '[')
            s = State.kStringDelimitedNestedBracket;
        else if(ch == '(')
            s = State.kStringDelimitedNestedParen;
        else if(ch == '{')
            s = State.kStringDelimitedNestedBrace;
        else if(ch == '<')
            s = State.kStringDelimitedNestedAngle;
        else if(ch == 0 || isWhite(ch)) // bad delimiter, fallback to wysiwyg string
            s = State.kStringWysiwyg;
        else
        {
            if(isIdentifierChar(ch))
                scanIdentifier(text, startpos, pos);
            string delim = toUTF8(text[startpos .. pos]);
            nesting = getDelimiterIndex(delim);
        }
        return s;
    }

    State scanTokenString(S)(S text, ref size_t pos, ref int tokLevel)
    {
        int state = toState(State.kWhite, 0, 0, 0);
        int id = -1;
        while(pos < text.length && tokLevel > 0)
        {
            int type = scan(state, text, pos, id);
            if(id == TOK_lcurly)
                tokLevel++;
            else if(id == TOK_rcurly)
                tokLevel--;
        }
        return (tokLevel > 0 ? State.kStringToken : State.kWhite);
    }

    static bool isStartingComment(S)(S txt, ref size_t idx)
    {
        if(idx >= 0 && idx < txt.length-1 && txt[idx] == '/' && (txt[idx+1] == '*' || txt[idx+1] == '+'))
            return true;
        if((txt[idx] == '*' || txt[idx] == '+') && idx > 0 && txt[idx-1] == '/')
        {
            idx--;
            return true;
        }
        return false;
    }

    static bool isEndingComment(S)(S txt, ref size_t pos)
    {
        if(pos < txt.length && pos > 0 && txt[pos] == '/' && (txt[pos-1] == '*' || txt[pos-1] == '+'))
        {
            pos--;
            return true;
        }
        if(pos < txt.length-1 && pos >= 0 && (txt[pos] == '*' || txt[pos] == '+') && txt[pos+1] == '/')
            return true;
        return false;
    }

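    // Sketch of the comment boundary helpers: both accept an index on either
    // character of the two-character comment markers and step back to the first
    // one when needed, reporting whether a comment starts or ends there.
    version(unspecified) unittest
    {
        size_t idx = 0;
        assert(isStartingComment("/* text", idx) && idx == 0);
        idx = 1;
        assert(isStartingComment("/+ text", idx) && idx == 0); // stepped back to '/'
        size_t pos = 1;
        assert(isEndingComment("*/", pos) && pos == 0); // stepped back to '*'
    }
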
    bool isIdentifierChar(dchar ch)
    {
        if(mAllowDollarInIdentifiers && ch == '$')
            return true;
        return isAlpha(ch) || ch == '_' || ch == '@';
    }

    bool isIdentifierCharOrDigit(dchar ch)
    {
        return isIdentifierChar(ch) || isDigit(ch);
    }

    bool isIdentifier(S)(S text)
    {
        if(text.length == 0)
            return false;

        size_t pos;
        dchar ch = decode(text, pos);
        if(!isIdentifierChar(ch))
            return false;

        while(pos < text.length)
        {
            ch = decode(text, pos);
            if(!isIdentifierCharOrDigit(ch))
                return false;
        }
        return true;
    }

    static bool isInteger(S)(S text)
    {
        if(text.length == 0)
            return false;

        size_t pos;
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(!isDigit(ch))
                return false;
        }
        return true;
    }

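    // Sketch of the simple classification helpers; isIdentifier honours the
    // mAllowDollarInIdentifiers switch, so it needs an instance.
    version(unspecified) unittest
    {
        Lexer lex;
        assert(lex.isIdentifier("_foo42"));
        assert(!lex.isIdentifier("4foo"));
        assert(isInteger("123"));
        assert(!isInteger("12a"));
    }
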
    static bool isBracketPair(dchar ch1, dchar ch2)
    {
        switch(ch1)
        {
        case '{': return ch2 == '}';
        case '}': return ch2 == '{';
        case '(': return ch2 == ')';
        case ')': return ch2 == '(';
        case '[': return ch2 == ']';
        case ']': return ch2 == '[';
        default:  return false;
        }
    }

    static bool isOpeningBracket(dchar ch)
    {
        return ch == '[' || ch == '(' || ch == '{';
    }

    static bool isClosingBracket(dchar ch)
    {
        return ch == ']' || ch == ')' || ch == '}';
    }

    static dchar openingBracket(State s)
    {
        switch(s)
        {
        case State.kStringDelimitedNestedBracket: return '[';
        case State.kStringDelimitedNestedParen:   return '(';
        case State.kStringDelimitedNestedBrace:   return '{';
        case State.kStringDelimitedNestedAngle:   return '<';
        default: break;
        }
        assert(0);
    }

    static dchar closingBracket(State s)
    {
        switch(s)
        {
        case State.kStringDelimitedNestedBracket: return ']';
        case State.kStringDelimitedNestedParen:   return ')';
        case State.kStringDelimitedNestedBrace:   return '}';
        case State.kStringDelimitedNestedAngle:   return '>';
        default: break;
        }
        assert(0);
    }

    static bool isCommentOrSpace(S)(int type, S text)
    {
        return (type == TokenCat.Comment || (type == TokenCat.Text && isWhite(text[0])));
    }

    static State scanNestedDelimiterString(S)(S text, ref size_t pos, State s, ref int nesting)
    {
        dchar open  = openingBracket(s);
        dchar close = closingBracket(s);

        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == open)
                nesting++;
            else if(ch == close && nesting > 0)
                nesting--;
            else if(ch == '"' && nesting == 0)
                return scanStringPostFix(text, pos);
        }
        return s;
    }

    State scanDelimitedString(S)(S text, ref size_t pos, ref int delim)
    {
        string delimiter = s_delimiters[delim];

        while(pos < text.length)
        {
            auto startpos = pos;
            dchar ch = decode(text, pos);
            if(isIdentifierChar(ch))
                scanIdentifier(text, startpos, pos);
            string ident = toUTF8(text[startpos .. pos]);
            if(ident == delimiter)
            {
                ch = trydecode(text, pos);
                if(ch == '"')
                {
                    delim = 0; // reset delimiter id, it shadows nesting
                    return scanStringPostFix(text, pos);
                }
            }
        }
        return State.kStringDelimited;
    }

    int scan(S)(ref int state, in S text, ref size_t pos, ref int id)
    {
        State s = scanState(state);
        int nesting = nestingLevel(state);
        int tokLevel = tokenStringLevel(state);
        int otherState = getOtherState(state);

        int type = TokenCat.Text;
        size_t startpos = pos;
        dchar ch;

        id = TOK_Space;

        switch(s)
        {
        case State.kWhite:
            ch = decode(text, pos);
            if(ch == 'r' || ch == 'x' || ch == 'q')
            {
                size_t prevpos = pos;
                dchar nch = trydecode(text, pos);
                if(nch == '"' && ch == 'q')
                {
                    s = startDelimiterString(text, pos, nesting);
                    if(s == State.kStringDelimited)
                        goto case State.kStringDelimited;
                    else if(s == State.kStringWysiwyg)
                        goto case State.kStringWysiwyg;
                    else
                        goto case State.kStringDelimitedNestedBracket;
                }
                else if(tokLevel == 0 && ch == 'q' && nch == '{')
                {
                    type = TokenCat.String;
                    id = TOK_StringLiteral;
                    if(mTokenizeTokenString)
                    {
                        pos = prevpos;
                        s = State.kStringTokenFirst;
                    }
                    else
                    {
                        tokLevel = 1;
                        s = scanTokenString(text, pos, tokLevel);
                    }
                    break;
                }
                else if(nch == '"')
                {
                    goto case State.kStringWysiwyg;
                }
                else
                {
                    pos = prevpos;
                    type = scanIdentifier(text, startpos, pos, id);
                }
            }
            else if(isIdentifierChar(ch))
                type = scanIdentifier(text, startpos, pos, id);
            else if(isDigit(ch))
                type = scanNumber(text, ch, pos, id);
            else if (ch == '.')
            {
                size_t nextpos = pos;
                ch = trydecode(text, nextpos);
                if(isDigit(ch))
                    type = scanNumber(text, '.', pos, id);
                else
                    type = scanOperator(text, startpos, pos, id);
            }
            else if (ch == '/')
            {
                size_t prevpos = pos;
                ch = trydecode(text, pos);
                if (ch == '/')
                {
                    // line comment
                    type = TokenCat.Comment;
                    id = TOK_Comment;
                    while(pos < text.length && decode(text, pos) != '\n') {}
                }
                else if (ch == '*')
                {
                    s = scanBlockComment(text, pos);
                    type = TokenCat.Comment;
                    id = TOK_Comment;
                }
                else if (ch == '+')
                {
                    nesting = 1;
                    s = scanNestedComment(text, startpos, pos, nesting);
                    type = TokenCat.Comment;
                    id = TOK_Comment;
                }
                else
                {
                    // step back to position after '/'
                    pos = prevpos;
                    type = scanOperator(text, startpos, pos, id);
                }
            }
            else if (ch == '"')
                goto case State.kStringCStyle;

            else if (ch == '`')
                goto case State.kStringAltWysiwyg;

            else if (ch == '\'')
            {
                s = scanStringCStyle(text, pos, '\'');
                id = TOK_CharacterLiteral;
                type = TokenCat.String;
            }
            else if (ch == '#')
            {
                // display #! or #line as line comment
                type = TokenCat.Comment;
                id = TOK_Comment;
                while(pos < text.length && decode(text, pos) != '\n') {}
            }
            else
            {
                if (tokLevel > 0)
                {
                    if(ch == '{')
                        tokLevel++;
                    else if (ch == '}')
                        tokLevel--;
                    if(!isWhite(ch))
                        type = scanOperator(text, startpos, pos, id);
                    id = TOK_StringLiteral;
                }
                else if(!isWhite(ch))
                    type = scanOperator(text, startpos, pos, id);
            }
            break;

        case State.kStringTokenFirst:
            ch = decode(text, pos);
            assert(ch == '{');

            tokLevel = 1;
            type = TokenCat.Operator;
            id = TOK_StringLiteral;
            s = State.kWhite;
            break;

        case State.kStringToken:
            type = TokenCat.String;
            id = TOK_StringLiteral;
            s = scanTokenString(text, pos, tokLevel);
            break;

        case State.kBlockComment:
            s = scanBlockComment(text, pos);
            type = TokenCat.Comment;
            id = TOK_Comment;
            break;

        case State.kNestedComment:
            s = scanNestedComment(text, pos, pos, nesting);
            type = TokenCat.Comment;
            id = TOK_Comment;
            break;

        case State.kStringCStyle:
            s = scanStringCStyle(text, pos, '"');
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringWysiwyg:
            s = scanStringWysiwyg(text, pos);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringAltWysiwyg:
            s = scanStringAltWysiwyg(text, pos);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringDelimited:
            s = scanDelimitedString(text, pos, nesting);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringDelimitedNestedBracket:
        case State.kStringDelimitedNestedParen:
        case State.kStringDelimitedNestedBrace:
        case State.kStringDelimitedNestedAngle:
            s = scanNestedDelimiterString(text, pos, s, nesting);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        default:
            break;
        }
        state = toState(s, nesting, tokLevel, otherState);

        if(tokLevel > 0)
            id = TOK_StringLiteral;
        return type;
    }

    int scan(S)(ref int state, in S text, ref size_t pos)
    {
        int id;
        return scan(state, text, pos, id);
    }

    ///////////////////////////////////////////////////////////////
    TokenInfo[] ScanLine(S)(int iState, S text)
    {
        TokenInfo[] lineInfo;
        for(size_t pos = 0; pos < text.length; )
        {
            TokenInfo info;
            info.StartIndex = cast(int) pos;
            info.type = cast(TokenCat) scan(iState, text, pos, info.tokid);
            info.EndIndex = cast(int) pos;
            lineInfo ~= info;
        }
        return lineInfo;
    }
}

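// Minimal usage sketch for the scanner: ScanLine tokenizes one line, returning
// the category, token id and character range of every token. The initial state 0
// corresponds to State.kWhite with no nesting; for multi-line constructs the
// updated state would be passed on to the next line. The input line is just an
// example.
version(unspecified) unittest
{
    Lexer lex;
    auto tokens = lex.ScanLine(0, "int x = 1; // comment");
    assert(tokens.length > 0);
    assert(tokens[0].type == TokenCat.Keyword && tokens[0].tokid == TOK_int);
    assert(tokens[$ - 1].type == TokenCat.Comment);
}
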
///////////////////////////////////////////////////////////////

// converted int[string] to short[string] due to bug #2500
__gshared short[string] keywords_map; // maps to TOK enumerator
__gshared short[string] specials_map; // maps to TOK enumerator
alias AssociativeArray!(string, short) _wa1; // fully instantiate type info
alias AssociativeArray!(int, const(int)) _wa2; // fully instantiate type info

shared static this()
{
    foreach(i, s; keywords)
        keywords_map[s] = cast(short) (TOK_begin_Keywords + i);

    foreach(i, s; specials)
        specials_map[s] = cast(short) i;
}

bool findKeyword(string ident, ref int id)
{
    if(__ctfe)
    {
        // slow, but compiles
        foreach(i, k; keywords)
            if(k == ident)
            {
                id = cast(int) (TOK_begin_Keywords + i);
                return true;
            }
    }
    else if(auto pident = ident in keywords_map)
    {
        id = *pident;
        return true;
    }
    return false;
}

bool isKeyword(string ident)
{
    int id;
    return findKeyword(ident, id);
}

bool findSpecial(string ident, ref int id)
{
    if(__ctfe)
    {
        // slow, but compiles
        foreach(i, k; specials)
            if(k == ident)
            {
                id = TOK_StringLiteral;
                return true;
            }
    }
    else if(auto pident = ident in specials_map)
    {
        id = TOK_StringLiteral;
        return true;
    }
    return false;
}

const string[] keywords =
[
    "this",
    "super",
    "assert",
    "null",
    "true",
    "false",
    "cast",
    "new",
    "delete",
    "throw",
    "module",
    "pragma",
    "typeof",
    "typeid",
    "template",

    "void",
    "byte",
    "ubyte",
    "short",
    "ushort",
    "int",
    "uint",
    "long",
    "ulong",
    "cent",
    "ucent",
    "float",
    "double",
    "real",
    "bool",
    "char",
    "wchar",
    "dchar",
    "ifloat",
    "idouble",
    "ireal",

    "cfloat",
    "cdouble",
    "creal",

    "delegate",
    "function",

    "is",
    "if",
    "else",
    "while",
    "for",
    "do",
    "switch",
    "case",
    "default",
    "break",
    "continue",
    "synchronized",
    "return",
    "goto",
    "try",
    "catch",
    "finally",
    "with",
    "asm",
    "foreach",
    "foreach_reverse",
    "scope",

    "struct",
    "class",
    "interface",
    "union",
    "enum",
    "import",
    "mixin",
    "static",
    "final",
    "const",
    "typedef",
    "alias",
    "override",
    "abstract",
    "volatile",
    "debug",
    "deprecated",
    "in",
    "out",
    "inout",
    "lazy",
    "auto",

    "align",
    "extern",
    "private",
    "package",
    "protected",
    "public",
    "export",

    "body",
    "invariant",
    "unittest",
    "version",
    //{    "manifest",    TOKmanifest    },

    // Added after 1.0
    "ref",
    "macro",
    "pure",
    "nothrow",
    "__gshared",
    "__thread",
    "__traits",
    "__overloadset",
    "__parameters",
    "__argTypes",
    "__vector",

    "__FILE__",
    "__LINE__",
    "__FUNCTION__",
    "__PRETTY_FUNCTION__",
    "__MODULE__",

    "shared",
    "immutable",

    "@disable",
    "@property",
    "@nogc",
    "@safe",
    "@system",
    "@trusted",

];

// not listed as keywords, but "special tokens"
const string[] specials =
[
    "__DATE__",
    "__EOF__",
    "__TIME__",
    "__TIMESTAMP__",
    "__VENDOR__",
    "__VERSION__",
];

////////////////////////////////////////////////////////////////////////
enum
{
    TOK_begin_Generic,
    TOK_Space = TOK_begin_Generic,
    TOK_Comment,
    TOK_Identifier,
    TOK_IntegerLiteral,
    TOK_FloatLiteral,
    TOK_StringLiteral,
    TOK_CharacterLiteral,
    TOK_EOF,
    TOK_RECOVER,
    TOK_end_Generic
}

string genKeywordEnum(string kw)
{
    if(kw[0] == '@')
        kw = kw[1..$];
    return "TOK_" ~ kw;
}

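// Sketch of the name mangling used for the generated enum: a leading '@' is
// stripped, so "@property" becomes TOK_property.
version(unspecified) unittest
{
    static assert(genKeywordEnum("while") == "TOK_while");
    static assert(genKeywordEnum("@property") == "TOK_property");
}
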
string genKeywordsEnum(T)(const string[] kwords, T begin)
{
    string enums = "enum { TOK_begin_Keywords = " ~ to!string(begin) ~ ", ";
    bool first = true;
    foreach(kw; kwords)
    {
        enums ~= genKeywordEnum(kw);
        if(first)
        {
            first = false;
            enums ~= " = TOK_begin_Keywords";
        }
        enums ~= ",";
    }
    enums ~= "TOK_end_Keywords }";
    return enums;
}

mixin(genKeywordsEnum(keywords, "TOK_end_Generic"));

const string[2][] operators =
[
    [ "lcurly",           "{" ],
    [ "rcurly",           "}" ],
    [ "lparen",           "(" ],
    [ "rparen",           ")" ],
    [ "lbracket",         "[" ],
    [ "rbracket",         "]" ],
    [ "semicolon",        ";" ],
    [ "colon",            ":" ],
    [ "comma",            "," ],
    [ "dot",              "." ],

    // binary operators
    [ "xor",              "^" ],
    [ "lt",               "<" ],
    [ "gt",               ">" ],
    [ "le",               "<=" ],
    [ "ge",               ">=" ],
    [ "equal",            "==" ],
    [ "notequal",         "!=" ],
    [ "lambda",           "=>" ],

    [ "unord",            "!<>=" ],
    [ "ue",               "!<>" ],
    [ "lg",               "<>" ],
    [ "leg",              "<>=" ],
    [ "ule",              "!>" ],
    [ "ul",               "!>=" ],
    [ "uge",              "!<" ],
    [ "ug",               "!<=" ],
    [ "notcontains",      "!in" ],
    [ "notidentity",      "!is" ],

    [ "shl",              "<<" ],
    [ "shr",              ">>" ],
    [ "ushr",             ">>>" ],
    [ "add",              "+" ],
    [ "min",              "-" ],
    [ "mul",              "*" ],
    [ "div",              "/" ],
    [ "mod",              "%" ],
    [ "pow",              "^^" ],
    [ "and",              "&" ],
    [ "andand",           "&&" ],
    [ "or",               "|" ],
    [ "oror",             "||" ],
    [ "tilde",            "~" ],

    [ "assign",           "=" ],
    [ "xorass",           "^=" ],
    [ "addass",           "+=" ],
    [ "minass",           "-=" ],
    [ "mulass",           "*=" ],
    [ "divass",           "/=" ],
    [ "modass",           "%=" ],
    [ "powass",           "^^=" ],
    [ "shlass",           "<<=" ],
    [ "shrass",           ">>=" ],
    [ "ushrass",          ">>>=" ],
    [ "andass",           "&=" ],
    [ "orass",            "|=" ],
    [ "catass",           "~=" ],

    // end of binary operators

    [ "not",              "!" ],
    [ "dollar",           "$" ],
    [ "slice",            ".." ],
    [ "dotdotdot",        "..." ],
    [ "plusplus",         "++" ],
    [ "minusminus",       "--" ],
    [ "question",         "?" ],
/+
    [ "array",            "[]" ],
    // symbols with duplicate meaning
    [ "address",          "&" ],
    [ "star",             "*" ],
    [ "preplusplus",      "++" ],
    [ "preminusminus",    "--" ],
    [ "neg",              "-" ],
    [ "uadd",             "+" ],
    [ "cat",              "~" ],
    [ "identity",         "is" ],
    [ "plus",             "++" ],
    [ "minus",            "--" ],
+/
];

string genOperatorEnum(T)(const string[2][] ops, T begin)
{
    string enums = "enum { TOK_begin_Operators = " ~ to!string(begin) ~ ", ";
    bool first = true;
    for(int o = 0; o < ops.length; o++)
    {
        enums ~= "TOK_" ~ ops[o][0];
        if(first)
        {
            first = false;
            enums ~= " = TOK_begin_Operators";
        }
        enums ~= ",";
    }
    enums ~= "TOK_end_Operators }";
    return enums;
}

mixin(genOperatorEnum(operators, "TOK_end_Keywords"));

enum TOK_binaryOperatorFirst = TOK_xor;
enum TOK_binaryOperatorLast  = TOK_catass;
enum TOK_assignOperatorFirst = TOK_assign;
enum TOK_assignOperatorLast  = TOK_catass;
enum TOK_unorderedOperatorFirst = TOK_unord;
enum TOK_unorderedOperatorLast  = TOK_ug;

enum TOK_error = -1;

bool _stringEqual(string s1, string s2, int length)
{
    if(s1.length < length || s2.length < length)
        return false;
    for(int i = 0; i < length; i++)
        if(s1[i] != s2[i])
            return false;
    return true;
}

int[] sortedOperatorIndexArray()
{
    // create sorted list of operators
    int[] opIndex;
    for(int o = 0; o < operators.length; o++)
    {
        string op = operators[o][1];
        int p = 0;
        while(p < opIndex.length)
        {
            assert(op != operators[opIndex[p]][1], "duplicate operator " ~ op);
            if(op < operators[opIndex[p]][1])
                break;
            p++;
        }
        // array slicing does not work in CTFE?
        // opIndex ~= opIndex[0..p] ~ o ~ opIndex[p..$];
        int[] nIndex;
        for(int i = 0; i < p; i++)
            nIndex ~= opIndex[i];
        nIndex ~= o;
        for(int i = p; i < opIndex.length; i++)
            nIndex ~= opIndex[i];
        opIndex = nIndex;
    }
    return opIndex;
}

string[] sortedOperatorArray()
{
    string[] array;
    foreach(o; sortedOperatorIndexArray())
        array ~= operators[o][1];
    return array;
}

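// Sketch of the operator table preparation: the sorted array is strictly
// ascending with no duplicates, which the parser generator below relies on to
// build its longest-match switch cascade.
version(unspecified) unittest
{
    auto ops = sortedOperatorArray();
    assert(ops.length == operators.length);
    foreach(i; 1 .. ops.length)
        assert(ops[i - 1] < ops[i]);
}
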
string genOperatorParser(string getch)
{
    int[] opIndex = sortedOperatorIndexArray();

    int matchlen = 0;
    string indent = "";
    string[] defaults = [ "error" ];
    string txt = indent ~ "dchar ch;\n";
    for(int o = 0; o < opIndex.length; o++)
    {
        string op = operators[opIndex[o]][1];
        string nextop;
        if(o + 1 < opIndex.length)
            nextop = operators[opIndex[o+1]][1];

        while(op.length > matchlen)
        {
            if(matchlen > 0)
                txt ~= indent ~ "case '" ~ op[matchlen-1] ~ "':\n";
            indent ~= "  ";
            txt ~= indent ~ "ch = " ~ getch ~ ";\n";
            txt ~= indent ~ "switch(ch)\n";
            txt ~= indent ~ "{\n";
            indent ~= "  ";
            int len = (matchlen > 0 ? matchlen - 1 : 0);
            while(len > 0 && defaults[len] == defaults[len+1])
                len--;
            txt ~= indent ~ "default: len = " ~ to!string(len) ~ "; return TOK_" ~ defaults[$-1] ~ ";\n";
            //txt ~= indent ~ "case '" ~ op[matchlen] ~ "':\n";
            defaults ~= defaults[$-1];
            matchlen++;
        }
        if(nextop.length > matchlen && nextop[0..matchlen] == op)
        {
            if(matchlen > 0)
                txt ~= indent ~ "case '" ~ op[matchlen-1] ~ "':\n";
            indent ~= "  ";
            txt ~= indent ~ "ch = " ~ getch ~ ";\n";
            txt ~= indent ~ "switch(ch)\n";
            txt ~= indent ~ "{\n";
            indent ~= "  ";
            txt ~= indent ~ "default: len = " ~ to!string(matchlen) ~ "; return TOK_" ~ operators[opIndex[o]][0] ~ "; // " ~ op ~ "\n";
            defaults ~= operators[opIndex[o]][0];
            matchlen++;
        }
        else
        {
            string case_txt = "case '" ~ op[matchlen-1] ~ "':";
            if(isAlphaNum(op[matchlen-1]))
                case_txt ~= " ch = getch(); if(isAlphaNum(ch) || ch == '_') goto default;\n" ~ indent ~ "  ";
            txt ~= indent ~ case_txt ~ " len = " ~ to!string(matchlen) ~ "; return TOK_" ~ operators[opIndex[o]][0] ~ "; // " ~ op ~ "\n";

            while(nextop.length < matchlen || (matchlen > 0 && !_stringEqual(op, nextop, matchlen-1)))
            {
                matchlen--;
                indent = indent[0..$-2];
                txt ~= indent ~ "}\n";
                indent = indent[0..$-2];
                defaults = defaults[0..$-1];
            }
        }
    }
    return txt;
}

int parseOperator(S)(S txt, size_t pos, ref size_t len)
{
    dchar getch()
    {
        if(pos >= txt.length)
            return 0;
        return decode(txt, pos);
    }

    mixin(genOperatorParser("getch()"));
}

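// Minimal usage sketch for the generated operator parser: it performs a longest
// match starting at pos and reports the number of characters consumed; anything
// that is not in the operator table yields TOK_error.
version(unspecified) unittest
{
    size_t len;
    assert(parseOperator(">>>=", 0, len) == TOK_ushrass && len == 4);
    assert(parseOperator("=>", 0, len) == TOK_lambda && len == 2);
    assert(parseOperator("#", 0, len) == TOK_error);
}
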
////////////////////////////////////////////////////////////////////////
version(none)
{
    pragma(msg, genKeywordsEnum(keywords, "TOK_end_Generic"));
    pragma(msg, genOperatorEnum(operators, "TOK_end_Keywords"));
    pragma(msg, sortedOperatorArray());
    pragma(msg, genOperatorParser("getch()"));
}

string tokenString(int id)
{
    switch(id)
    {
        case TOK_Space:            return " ";
        case TOK_Comment:          return "/**/";
        case TOK_Identifier:       return "Identifier";
        case TOK_IntegerLiteral:   return "IntegerLiteral";
        case TOK_FloatLiteral:     return "FloatLiteral";
        case TOK_StringLiteral:    return "StringLiteral";
        case TOK_CharacterLiteral: return "CharacterLiteral";
        case TOK_EOF:              return "__EOF__";
        case TOK_RECOVER:          return "__RECOVER__";
        case TOK_begin_Keywords: .. case TOK_end_Keywords - 1:
            return keywords[id - TOK_begin_Keywords];
        case TOK_begin_Operators: .. case TOK_end_Operators - 1:
            return operators[id - TOK_begin_Operators][1];
        default:
            assert(false);
    }
}

string operatorName(int id)
{
    switch(id)
    {
        case TOK_begin_Operators: .. case TOK_end_Operators - 1:
            return operators[id - TOK_begin_Operators][0];
        default:
            assert(false);
    }
}

enum case_TOKs_BasicTypeX = q{
    case TOK_bool:
    case TOK_byte:
    case TOK_ubyte:
    case TOK_short:
    case TOK_ushort:
    case TOK_int:
    case TOK_uint:
    case TOK_long:
    case TOK_ulong:
    case TOK_char:
    case TOK_wchar:
    case TOK_dchar:
    case TOK_float:
    case TOK_double:
    case TOK_real:
    case TOK_ifloat:
    case TOK_idouble:
    case TOK_ireal:
    case TOK_cfloat:
    case TOK_cdouble:
    case TOK_creal:
    case TOK_void:
};

enum case_TOKs_TemplateSingleArgument = q{
    case TOK_Identifier:
    case TOK_CharacterLiteral:
    case TOK_StringLiteral:
    case TOK_IntegerLiteral:
    case TOK_FloatLiteral:
    case TOK_true:
    case TOK_false:
    case TOK_null:
    case TOK___FILE__:
    case TOK___LINE__:
}; // + case_TOKs_BasicTypeX;