hoshi-lang dev
Yet another programming language
Loading...
Searching...
No Matches
lexer.cpp
Go to the documentation of this file.
1//
2// Created by XIaokang00010 on 2023/1/24.
3//
4
5#include "lexer.hpp"
6
7namespace yoi {
8 lexer::lexer(std::wstringstream ss) : stream(std::move(ss)), line(0), col(0), curCh() {
9 getCh();
10 }
11
12 void lexer::getCh() {
13 start:
14 if (!stream) {
15 panic(line, col, "hoshi::lexer::getCh() - eof");
16 }
17 if (!stream.get(curCh) || stream.fail()) {
18 curCh = '\0';
19 }
20 if (curCh == L'\n') {
21 line++, col = 0;
22 } else if (curCh == L'\r') {
23 goto start;
24 } else {
25 col++;
26 }
27 }
28
30 if (curCh == '\0') {
32 }
33 while (curCh == ' ' or curCh == '\n' or curCh == '\t') getCh();
34 if (std::isalpha(curCh) or curCh == '_') {
35 return curToken = alphaStart();
36 } else if (std::isdigit(curCh)) {
37 return curToken = digitStart();
38 } else if (curCh == L'+') {
39 return curToken = plusStart();
40 } else if (curCh == L'-') {
41 return curToken = minusStart();
42 } else if (curCh == L'*') {
43 return curToken = asteriskStart();
44 } else if (curCh == L'/') {
45 return curToken = slashStart();
46 } else if (curCh == L'%') {
47 return curToken = percentSignStart();
48 } else if (curCh == L'!') {
49 return curToken = notStart();
50 } else if (curCh == L'=') {
51 return curToken = equalStart();
52 } else if (curCh == L'>') {
53 return curToken = greaterStart();
54 } else if (curCh == L'<') {
55 return curToken = lessStart();
56 } else if (curCh == L'"' or curCh == L'\'') {
57 return curToken = strStart();
58 } else if (curCh == L';') {
59 return curToken = semicolonStart();
60 } else if (curCh == L':') {
61 return curToken = colonStart();
62 } else if (curCh == L',') {
63 return curToken = commaStart();
64 } else if (curCh == L'.') {
65 return curToken = dotStart();
66 } else if (curCh == L'(') {
68 } else if (curCh == L')') {
70 } else if (curCh == L'[') {
71 return curToken = leftBracketStart();
72 } else if (curCh == L']') {
73 return curToken = rightBracketStart();
74 } else if (curCh == L'{') {
75 return curToken = leftBracesStart();
76 } else if (curCh == L'}') {
77 return curToken = rightBracesStart();
78 } else if (curCh == L'&') {
79 return curToken = andStart();
80 } else if (curCh == L'|') {
81 return curToken = orStart();
82 } else if (curCh == L'^') {
83 return curToken = xorStart();
84 } else if (curCh == L'#') {
85 return curToken = sharpStart();
86 } else if (curCh == L'!') {
87 return curToken = notStart();
88 } else if (curCh == L'\0') {
90 } else {
91 panic(line, col, "hoshi::lexer::scan() - undefined token");
92 return {};
93 }
94 }
95
98 wstr tempStr;
99 tempStr += curCh;
100 getCh();
101 while (isalpha(curCh) or isdigit(curCh) or curCh == L'_') {
102 tempStr += curCh;
103 getCh();
104 }
105
106 // 关键词处理
107 if (tempStr == L"return") {
108 tok.kind = token::tokenKind::kReturn;
109 } else if (tempStr == L"continue") {
111 } else if (tempStr == L"break") {
112 tok.kind = token::tokenKind::kBreak;
113 } else if (tempStr == L"for") {
114 tok.kind = token::tokenKind::kFor;
115 } else if (tempStr == L"forEach") {
117 } else if (tempStr == L"while") {
118 tok.kind = token::tokenKind::kWhile;
119 } else if (tempStr == L"func") {
120 tok.kind = token::tokenKind::kFunc;
121 } else if (tempStr == L"use") {
122 tok.kind = token::tokenKind::kUse;
123 } else if (tempStr == L"let") {
124 tok.kind = token::tokenKind::kLet;
125 } else if (tempStr == L"cast") {
126 tok.kind = token::tokenKind::kCast;
127 } else if (tempStr == L"in") {
128 tok.kind = token::tokenKind::kIn;
129 } else if (tempStr == L"if") {
130 tok.kind = token::tokenKind::kIf;
131 } else if (tempStr == L"else") {
132 tok.kind = token::tokenKind::kElse;
133 } else if (tempStr == L"elif") {
134 tok.kind = token::tokenKind::kElif;
135 } else if (tempStr == L"interface") {
137 } else if (tempStr == L"constructor") {
139 } else if (tempStr == L"finalizer") {
141 } else if (tempStr == L"struct") {
142 tok.kind = token::tokenKind::kStruct;
143 } else if (tempStr == L"impl") {
144 tok.kind = token::tokenKind::kImpl;
145 } else if (tempStr == L"null") {
146 tok.kind = token::tokenKind::kNull;
147 } else if (tempStr == L"import") {
148 tok.kind = token::tokenKind::kImport;
149 } else if (tempStr == L"export") {
150 tok.kind = token::tokenKind::kExport;
151 } else if (tempStr == L"as") {
152 tok.kind = token::tokenKind::kAs;
153 } else if (tempStr == L"from") {
154 tok.kind = token::tokenKind::kFrom;
155 } else if (tempStr == L"type_id") {
156 tok.kind = token::tokenKind::kTypeId;
157 } else if (tempStr == L"dyn_cast") {
159 } else if (tempStr == L"try") {
160 tok.kind = token::tokenKind::kTry;
161 } else if (tempStr == L"catch") {
162 tok.kind = token::tokenKind::kCatch;
163 } else if (tempStr == L"finally") {
165 } else if (tempStr == L"throw") {
166 tok.kind = token::tokenKind::kThrow;
167 } else if (tempStr == L"noffi") {
168 tok.kind = token::tokenKind::kNoFFI;
169 } else if (tempStr == L"always_inline") {
171 } else if (tempStr == L"new") {
172 tok.kind = token::tokenKind::kNew;
173 } else if (tempStr == L"interfaceof") {
175 } else if (tempStr == L"static") {
176 tok.kind = token::tokenKind::kStatic;
177 } else if (tempStr == L"intrinsic") {
179 } else if (tempStr == L"callable") {
181 } else if (tempStr == L"alias") {
182 tok.kind = token::tokenKind::kAlias;
183 } else if (tempStr == L"enum") {
184 tok.kind = token::tokenKind::kEnum;
185 } else if (tempStr == L"datastruct") {
187 } else if (tempStr == L"datafield") {
189 } else if (tempStr == L"generator") {
191 } else if (tempStr == L"yield") {
192 tok.kind = token::tokenKind::kYield;
193 } else if (tempStr == L"decltype") {
195 } else if (tempStr == L"concept") {
197 } else if (tempStr == L"satisfy") {
199 } else if (tempStr == L"operator") {
200 scan();
201 return operatorStart();
202 } else if (tempStr == L"true" or tempStr == L"false") {
203 tok.kind = token::tokenKind::boolean;
204 tok.basicVal.vBool = tempStr == L"true";
205 } else {
206 tok.strVal = tempStr;
207 }
208
209 return tok;
210 }
211
213 wchar strV = curCh;
215 getCh();
216 while (curCh != strV) {
217 if (curCh == '\\') {
218 getCh();
219 tok.strVal += '\\';
220 }
221 tok.strVal += curCh;
222 getCh();
223 }
224 getCh(); // skip "
225 std::wistringstream ss{tok.strVal};
226 tok.strVal = {};
227 parseString(ss, tok.strVal);
228 if (strV == L'\'' && tok.strVal.size() > 1)
229 panic(line, col, "lexer::strStart() - character literal length > 1");
230 return tok;
231 }
232
234 enum class MatchPattern {
235 hex,
236 oct,
237 dec,
238 bin
239 } matchPattern = MatchPattern::dec;
241 wstr tempStr;
242 if (curCh == '0') {
243 tempStr += curCh;
244 getCh();
245 switch (curCh) {
246 case 'x':
247 case 'X':
248 matchPattern = MatchPattern::hex;
249 getCh();
250 tempStr += curCh;
251 getCh();
252 break;
253 case 'b':
254 case 'B':
255 matchPattern = MatchPattern::bin;
256 getCh();
257 tempStr += curCh;
258 getCh();
259 break;
260 case '0':
261 case '1':
262 case '2':
263 case '3':
264 case '4':
265 case '5':
266 case '6':
267 case '7':
268 matchPattern = MatchPattern::oct;
269 tempStr += curCh;
270 getCh();
271 break;
272 case 'o':
273 case 'O':
274 matchPattern = MatchPattern::oct;
275 getCh();
276 tempStr += curCh;
277 getCh();
278 break;
279 default:
280 break;
281 }
282 }
283 while (isdigit(curCh) || (matchPattern == MatchPattern::hex && isxdigit(curCh))) {
284 tempStr += curCh;
285 getCh();
286 }
287 if (curCh == '.') {
288 yoi_assert(line, col, matchPattern == MatchPattern::dec, "lexer::digitStart() - invalid match pattern");
289 tok.kind = token::tokenKind::decimal;
290 tempStr += curCh;
291 getCh();
292 while (isdigit(curCh)) {
293 tempStr += curCh;
294 getCh();
295 }
296 }
297 if (tok.kind == token::tokenKind::integer) {
298 switch (curCh) {
299 case 'u':
300 case 'U':
301 getCh();
303 break;
304 case 's':
305 case 'S':
306 getCh();
308 break;
309 }
310 int base = 10;
311 switch (matchPattern) {
312 case MatchPattern::dec: {
313 base = 10;
314 break;
315 }
316 case MatchPattern::hex: {
317 base = 16;
318 break;
319 }
320 case MatchPattern::oct: {
321 base = 8;
322 break;
323 }
324 case MatchPattern::bin: {
325 base = 2;
326 break;
327 }
328 }
329 if (tok.kind == token::tokenKind::integer)
330 tok.basicVal.vInt = std::stoll(tempStr, nullptr, base);
331 else if (tok.kind == token::tokenKind::unsignedInt)
332 tok.basicVal.vUint = std::stoull(tempStr, nullptr, base);
333 else if (tok.kind == token::tokenKind::shortInt)
334 tok.basicVal.vShort = static_cast<int16_t>(std::stoi(tempStr, nullptr, base));
335 } else {
336 tok.basicVal.vDeci = std::stod(tempStr);
337 }
338 return tok;
339 }
340
343 getCh();
344 if (curCh == '=') {
346 getCh();
347 } else if (curCh == '-') {
349 getCh();
350 } else if (curCh == '>') {
351 tok.kind = token::tokenKind::toSign;
352 getCh();
353 }
354 return tok;
355 }
356
359 getCh();
360 if (curCh == '=') {
362 getCh();
363 } else if (curCh == '+') {
365 getCh();
366 }
367 return tok;
368 }
369
372 getCh();
373 if (curCh == '=') {
375 getCh();
376 }
377 return tok;
378 }
379
382 uint64_t startLine = line, startCol = col;
383 getCh();
384 if (curCh == '=') {
386 getCh();
387 } else if (curCh == '/') {
388 wstr commentText = L"//";
389 getCh();
390 while (curCh and curCh != '\n') {
391 commentText += curCh;
392 getCh();
393 }
394 comments.push_back({startLine, startCol, commentText, false});
395 return scan(); // 单行注释解析
396 } else if (curCh == '*') {
397 wstr commentText = L"/*";
398 getCh();
399 while (curCh) {
400 commentText += curCh;
401 if (curCh == '*') {
402 getCh();
403 if (curCh == '/') {
404 commentText += curCh;
405 getCh();
406 break;
407 }
408 } else {
409 getCh();
410 }
411 }
412 comments.push_back({startLine, startCol, commentText, true});
413 return scan(); // 多行注释解析
414 }
415 return tok;
416 }
417
420 getCh();
421 if (curCh == '=') {
423 getCh();
424 }
425 return tok;
426 }
427
430 getCh();
431 if (curCh == '=') {
432 tok.kind = token::tokenKind::equal;
433 getCh();
434 }
435 return tok;
436 }
437
440 getCh();
441 if (curCh == '=') {
443 getCh();
444 }
445 return tok;
446 }
447
450 getCh();
451 if (curCh == '=') {
453 getCh();
454 } else if (curCh == '<') {
456 getCh();
457 }
458 return tok;
459 }
460
463 getCh();
464 if (curCh == '=') {
466 getCh();
467 } else if (curCh == '>') {
469 getCh();
470 }
471 return tok;
472 }
473
479
482 getCh();
483 if (curCh == '=') {
485 getCh();
486 }
487 return tok;
488 }
489
495
497 getCh();
498 if (curCh == '.' && stream.peek() == '.') {
499 getCh(); getCh(); // skip ".."
501 }
502
504 return tok;
505 }
506
512
518
524
530
536
542
544 states.emplace_back(line, col, (int64_t) stream.tellg(), curCh, curToken);
545 }
546
548 stream.clear();
549 lexerState &state = states.back();
550 line = state.line, col = state.col, curCh = state.curCh, curToken = state.curToken;
551 stream.seekg(state.pos);
552 dropState();
553 }
554
556 if (!states.empty())
557 states.pop_back();
558 }
559
562 getCh();
563 if (curCh == '&') {
565 getCh();
566 }
567 return tok;
568 }
569
572 getCh();
573 if (curCh == '|') {
574 tok.kind = token::tokenKind::logicOr;
575 getCh();
576 }
577 return tok;
578 }
579
585
591
597
599
600 }
601
603
604 }
605
607
608 }
609
613
614
616
617 }
618
620 line(line), col(col), kind(kind), basicVal(), strVal() {
621
622 }
623
625 line(line), col(col), kind(kind), basicVal(basicVal), strVal() {
626
627 }
628
629 lexer::token::token(int64_t line, int64_t col, lexer::token::tokenKind kind, wstr strVal) :
630 line(line), col(col), kind(kind), basicVal(), strVal(std::move(strVal)) {
631
632 }
633
634
636
637 }
638
639 lexer::lexerState::lexerState(int64_t line, int64_t col, std::istream::pos_type pos, wchar curCh,
641 :
642 line(line), col(col), pos(pos), curCh(curCh), curToken(std::move(curToken)) {
643
644 }
645
647 yoi::wstr operatorId{L"operator"};
648 switch (curToken.kind) {
649 case token::tokenKind::plus: operatorId += L"+"; break;
650 case token::tokenKind::minus: operatorId += L"-"; break;
651 case token::tokenKind::asterisk: operatorId += L"*"; break;
652 case token::tokenKind::slash: operatorId += L"/"; break;
653 case token::tokenKind::percentSign: operatorId += L"%"; break;
654 case token::tokenKind::lessThan: operatorId += L"<"; break;
655 case token::tokenKind::greaterThan: operatorId += L">"; break;
656 case token::tokenKind::equal: operatorId += L"=="; break;
657 case token::tokenKind::notEqual: operatorId += L"!="; break;
658 case token::tokenKind::binaryAnd: operatorId += L"&"; break;
659 case token::tokenKind::binaryNot: operatorId += L"~"; break;
660 case token::tokenKind::binaryOr: operatorId += L"|"; break;
661 case token::tokenKind::binaryXor: operatorId += L"^"; break;
662 case token::tokenKind::binaryShiftLeft: operatorId += L"<<"; break;
663 case token::tokenKind::binaryShiftRight: operatorId += L">>"; break;
664 case token::tokenKind::incrementSign: operatorId += L"++"; break;
665 case token::tokenKind::decrementSign: operatorId += L"--"; break;
666 case token::tokenKind::directAssignSign: operatorId += L":="; break;
667 case token::tokenKind::additionAssignment: operatorId += L"+="; break;
668 case token::tokenKind::subtractionAssignment: operatorId += L"-="; break;
669 case token::tokenKind::multiplicationAssignment: operatorId += L"*="; break;
670 case token::tokenKind::divisionAssignment: operatorId += L"/="; break;
671 case token::tokenKind::reminderAssignment: operatorId += L"%="; break;
672 case token::tokenKind::assignSign: operatorId += L"="; break;
674 scan();
675 yoi_assert(curToken.kind == token::tokenKind::rightParentheses, line, col, "lexer::operatorStart() - invalid operator");
676 operatorId += L"()";
677 break;
678 }
680 scan();
681 yoi_assert(curToken.kind == token::tokenKind::rightBracket, line, col, "lexer::operatorStart() - invalid operator");
682 operatorId += L"[]";
683 break;
684 }
685 default: panic(line, col, "lexer::operatorStart() - unknown operator");
686 }
687 return curToken = lexer::token{line, col, token::tokenKind::identifier, std::move(operatorId)};
688 }
691} // namespace yoi
vec< lexerState > states
Definition lexer.hpp:170
token percentSignStart()
Definition lexer.cpp:418
token equalStart()
Definition lexer.cpp:428
token rightBracesStart()
Definition lexer.cpp:537
token scan()
Definition lexer.cpp:29
token semicolonStart()
Definition lexer.cpp:474
token andStart()
Definition lexer.cpp:560
token curToken
Definition lexer.hpp:172
lexer(std::wstringstream ss)
Definition lexer.cpp:8
token strStart()
Definition lexer.cpp:212
token greaterStart()
Definition lexer.cpp:461
token alphaStart()
Definition lexer.cpp:96
token xorStart()
Definition lexer.cpp:580
token leftParenthesesStart()
Definition lexer.cpp:507
std::wstringstream stream
Definition lexer.hpp:15
token asteriskStart()
Definition lexer.cpp:370
vec< Comment > comments
Definition lexer.hpp:23
token digitStart()
Definition lexer.cpp:233
token binaryNotStart()
Definition lexer.cpp:592
void getCh()
Definition lexer.cpp:12
token plusStart()
Definition lexer.cpp:357
token orStart()
Definition lexer.cpp:570
token slashStart()
Definition lexer.cpp:380
int64_t col
Definition lexer.hpp:175
token rightBracketStart()
Definition lexer.cpp:525
int64_t line
Definition lexer.hpp:175
token sharpStart()
Definition lexer.cpp:586
void saveState()
Definition lexer.cpp:543
token minusStart()
Definition lexer.cpp:341
token dotStart()
Definition lexer.cpp:496
void returnState()
Definition lexer.cpp:547
token leftBracesStart()
Definition lexer.cpp:531
token colonStart()
Definition lexer.cpp:480
wchar curCh
Definition lexer.hpp:173
token leftBracketStart()
Definition lexer.cpp:519
token notStart()
Definition lexer.cpp:438
token operatorStart()
Definition lexer.cpp:646
token commaStart()
Definition lexer.cpp:490
token lessStart()
Definition lexer.cpp:448
token rightParenthesesStart()
Definition lexer.cpp:513
void dropState()
Definition lexer.cpp:555
wstr::value_type wchar
Definition def.hpp:49
void parseString(std::wistream &input, wstr &value)
Definition def.cpp:25
void yoi_assert(bool condition, yoi::indexT line, yoi::indexT col, const std::string &msg)
Asserts that a condition is true and throws a runtime_error if it is false.
Definition def.cpp:171
std::wstring wstr
Definition def.hpp:48
void panic(yoi::indexT line, yoi::indexT col, const std::string &msg)
Definition def.cpp:131
std::istream::pos_type pos
Definition lexer.hpp:161
uint64_t col
Definition lexer.hpp:26
enum yoi::lexer::token::tokenKind kind
uint64_t line
Definition lexer.hpp:26
union yoi::lexer::token::vBasicValue basicVal