1use dada_util::Map;
2
3use dada_ir_ast::{
4 ast::{Identifier, LiteralKind},
5 diagnostic::{Diagnostic, Level},
6 span::{Anchor, Offset, Span},
7};
8
/// Interned text of a literal token (e.g. the digits of an integer literal
/// or the decoded contents of a string literal).
#[salsa::interned(debug)]
pub struct TokenText<'db> {
    #[return_ref]
    pub text: String,
}
16
/// A single token produced by [`tokenize`].
#[derive(Clone)]
pub struct Token<'input, 'db> {
    /// Source location of the token.
    pub span: Span<'db>,
    /// Highest-ranked category of text skipped immediately before this
    /// token, if any (see [`Skipped`]).
    pub skipped: Option<Skipped>,
    /// What kind of token this is.
    pub kind: TokenKind<'input, 'db>,
}
23
24impl std::fmt::Debug for Token<'_, '_> {
25 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
26 f.debug_struct("Token")
27 .field("span", &"...")
28 .field("skipped", &self.skipped)
29 .field("kind", &self.kind)
30 .finish()
31 }
32}
33
/// Category of text skipped before a token.
///
/// Declaration order matters: the derived `Ord` makes later variants outrank
/// earlier ones, and `Tokenizer::accumulate_skipped` keeps only the maximum
/// category seen since the last token.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Skipped {
    /// Whitespace other than newlines.
    Whitespace,

    /// At least one newline.
    Newline,

    /// A `#` comment.
    Comment,
}
46
/// The kinds of tokens produced by the tokenizer.
#[derive(Clone, Debug)]
pub enum TokenKind<'input, 'db> {
    /// An alphanumeric/`_` run that is not a keyword.
    Identifier(Identifier<'db>),

    /// A reserved word such as `fn` or `let`.
    Keyword(Keyword),

    /// A bracketed group; `text` is the interior text, excluding the
    /// delimiters themselves.
    Delimited {
        delimiter: Delimiter,
        text: &'input str,
    },

    /// A single operator character, e.g. `+`. Multi-character operators
    /// (see `operator::Op`) are presumably assembled from consecutive
    /// `OpChar` tokens by a later stage — TODO confirm.
    OpChar(char),

    /// An integer or string literal with its (interned) text.
    Literal(LiteralKind, TokenText<'db>),

    /// A lexing error carried as a token, so later stages can keep going.
    Error(Diagnostic),
}
70
/// Defines the keyword enum together with:
///
/// * a `Display` impl rendering the keyword in backticks (e.g. `` `fn` ``),
/// * a `STRINGS` table pairing each keyword's source text with its variant.
macro_rules! keywords {
    (pub enum $Keyword:ident {
        $($kw:ident = $kwstr:expr,)*
    }) => {
        #[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
        pub enum $Keyword {
            $($kw,)*
        }

        impl std::fmt::Display for $Keyword {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                let s = match self {
                    $(Self::$kw => $kwstr,)*
                };
                // Keywords are displayed in backticks, matching diagnostics.
                write!(f, "`{}`", s)
            }
        }

        impl $Keyword {
            /// `(source text, keyword)` pairs, in declaration order.
            const STRINGS: &'static [(&'static str, $Keyword)] = &[
                $(($kwstr, $Keyword::$kw),)*
            ];
        }
    }
}
96
// The reserved words of the language, each paired with its source text.
keywords! {
    pub enum Keyword {
        As = "as",
        Async = "async",
        Await = "await",
        Box = "box",
        Boxed = "boxed",
        Class = "class",
        Crate = "crate",
        Dyn = "dyn",
        Else = "else",
        Enum = "enum",
        Export = "export",
        False = "false",
        Fn = "fn",
        If = "if",
        Is = "is",
        Lent = "lent",
        Let = "let",
        Give = "give",
        Given = "given",
        Match = "match",
        Matches = "matches",
        Mod = "mod",
        Mut = "mut",
        My = "my",
        Our = "our",
        Owned = "owned",
        Perm = "perm",
        Place = "place",
        Pub = "pub",
        Ref = "ref",
        Return = "return",
        Self_ = "self",
        Share = "share",
        Shared = "shared",
        Struct = "struct",
        Tracked = "tracked",
        True = "true",
        Type = "type",
        Unique = "unique",
        Unsafe = "unsafe",
        Use = "use",
        Where = "where",
    }
}
143
144impl Keyword {
145 fn map() -> &'static Map<String, Keyword> {
146 static MAP: std::sync::OnceLock<Map<String, Keyword>> = std::sync::OnceLock::new();
147 MAP.get_or_init(|| {
148 let mut map = Map::default();
149 for (upper_str, kw) in Keyword::STRINGS {
150 map.insert(upper_str.to_string(), *kw);
151 }
152 map
153 })
154 }
155}
156
pub mod operator {
    /// An operator, stored as the sequence of characters that spell it
    /// (e.g. `->` is `['-', '>']`).
    #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
    pub struct Op(&'static [char]);

    impl std::ops::Deref for Op {
        type Target = [char];

        /// Expose the underlying character slice.
        fn deref(&self) -> &Self::Target {
            self.0
        }
    }

    impl std::fmt::Display for Op {
        /// Render the operator by writing its characters back-to-back.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            self.0.iter().try_for_each(|ch| write!(f, "{ch}"))
        }
    }

    pub const PLUS: Op = Op(&['+']);
    pub const MINUS: Op = Op(&['-']);
    pub const STAR: Op = Op(&['*']);
    pub const SLASH: Op = Op(&['/']);
    #[expect(dead_code)]
    pub const AND: Op = Op(&['&']);
    pub const ANDAND: Op = Op(&['&', '&']);
    #[expect(dead_code)]
    pub const PIPE: Op = Op(&['|']);
    pub const PIPEPIPE: Op = Op(&['|', '|']);
    pub const LESSTHAN: Op = Op(&['<']);
    pub const LESSTHANEQ: Op = Op(&['<', '=']);
    pub const GREATERTHAN: Op = Op(&['>']);
    pub const GREATERTHANEQ: Op = Op(&['>', '=']);
    pub const EQ: Op = Op(&['=']);
    pub const EQEQ: Op = Op(&['=', '=']);
    pub const ARROW: Op = Op(&['-', '>']);
    pub const DOT: Op = Op(&['.']);
    pub const COLON: Op = Op(&[':']);
    pub const BANG: Op = Op(&['!']);
    pub const COMMA: Op = Op(&[',']);
}
201
/// The three kinds of paired delimiters the tokenizer groups on.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub enum Delimiter {
    Parentheses,
    SquareBrackets,
    CurlyBraces,
}

impl Delimiter {
    /// The opening character, e.g. `(`.
    pub fn open_char(self) -> char {
        // `chars()` always yields exactly two ASCII characters: open, close.
        self.chars().chars().next().unwrap()
    }

    /// The closing character, e.g. `)`.
    pub fn close_char(self) -> char {
        self.chars().chars().nth(1).unwrap()
    }

    /// Both characters of the pair as a two-byte string, open then close.
    pub fn chars(self) -> &'static str {
        match self {
            Delimiter::Parentheses => "()",
            Delimiter::SquareBrackets => "[]",
            Delimiter::CurlyBraces => "{}",
        }
    }
}
234
/// Tokenize `input` into a flat list of tokens.
///
/// `input_offset` is the byte offset of `input` within the source identified
/// by `anchor`; all spans on the returned tokens are expressed relative to
/// that anchor (see `Tokenizer::span`).
pub fn tokenize<'input, 'db>(
    db: &'db dyn crate::Db,
    anchor: Anchor<'db>,
    input_offset: Offset,
    input: &'input str,
) -> Vec<Token<'input, 'db>> {
    Tokenizer {
        db,
        anchor,
        input,
        chars: input.char_indices().peekable(),
        tokens: vec![],
        kws: Keyword::map(),
        error_start: None,
        input_offset,
        skipped_accum: None,
    }
    .tokenize()
}
254
/// Internal state for a single tokenization pass; see [`tokenize`].
struct Tokenizer<'input, 'db> {
    db: &'db dyn crate::Db,
    /// Anchor the input belongs to; all spans reference it.
    anchor: Anchor<'db>,
    /// The text being tokenized.
    input: &'input str,
    /// Peekable `(byte_offset, char)` iterator over `input`.
    chars: CharIndices<'input>,
    /// Tokens produced so far.
    tokens: Vec<Token<'input, 'db>>,
    /// Shared, lazily-built keyword lookup table.
    kws: &'static Map<String, Keyword>,
    /// Byte offset of `input` within the anchored source.
    input_offset: Offset,
    /// Start of an in-progress run of unrecognized characters, if any;
    /// flushed as a single error token by `clear_accumulated`.
    error_start: Option<usize>,
    /// Highest-ranked category of text skipped since the last token.
    skipped_accum: Option<Skipped>,
}
266
267impl<'input, 'db> Tokenizer<'input, 'db> {
    /// Main loop: dispatch on each character, then flush any trailing
    /// unrecognized-character run before returning the token list.
    fn tokenize(mut self) -> Vec<Token<'input, 'db>> {
        while let Some((index, ch)) = self.chars.next() {
            match ch {
                '#' => self.comment(index),

                // Identifiers/keywords start with a letter or `_`.
                _ if ch.is_alphabetic() || ch == '_' => self.identifier(index, ch),

                // Bracketed groups become single `Delimited` tokens.
                '{' => self.delimited(index, Delimiter::CurlyBraces, '}'),
                '[' => self.delimited(index, Delimiter::SquareBrackets, ']'),
                '(' => self.delimited(index, Delimiter::Parentheses, ')'),

                _ if ch.is_ascii_digit() => self.integer(index, ch),

                '"' => self.string_literal(index),

                '\n' => {
                    self.accumulate_skipped(Skipped::Newline);
                }

                _ if ch.is_whitespace() => {
                    self.accumulate_skipped(Skipped::Whitespace);
                }

                _ if is_op_char(ch) => self.ops(index, ch),

                _ => {
                    // Unrecognized character: start an error run (or extend
                    // the current one); it is reported when the next
                    // recognizable token begins.
                    if self.error_start.is_none() {
                        self.error_start = Some(index);
                    }
                }
            }
        }

        // Flush an error run that reaches end of input; any pending skipped
        // category is irrelevant at EOF and is discarded.
        let _skipped = self.clear_accumulated(self.input.len());

        self.tokens
    }
316
317 fn accumulate_skipped(&mut self, skipped: Skipped) {
318 self.skipped_accum = std::cmp::max(self.skipped_accum, Some(skipped));
319 }
320
321 fn clear_accumulated(&mut self, index: usize) -> Option<Skipped> {
325 if let Some(start) = self.error_start {
326 self.error_start = None;
327
328 let span = self.span(start, index);
329 self.tokens.push(Token {
330 span,
331 skipped: None,
332 kind: TokenKind::Error(
333 Diagnostic::error(self.db, span, "unrecognized characters(s)").label(
334 self.db,
335 Level::Error,
336 span,
337 "I don't know how to interpret these characters",
338 ),
339 ),
340 });
341 }
342
343 self.skipped_accum.take()
344 }
345
    /// Build a span for the byte range `start..end` of `input`, translated
    /// into the anchored coordinate system via `input_offset`.
    fn span(&self, start: usize, end: usize) -> Span<'db> {
        assert!(end >= start);
        Span {
            anchor: self.anchor,
            start: self.input_offset + start,
            end: self.input_offset + end,
        }
    }
354
    /// Skip a `#` comment through the end of the line. The terminating
    /// newline is consumed here, so it never reaches the main loop and is
    /// not recorded as `Skipped::Newline`; the comment as a whole is
    /// recorded as `Skipped::Comment`.
    fn comment(&mut self, index: usize) {
        // Flush any pending error run ending at the `#`. Note that any
        // previously accumulated skip category is discarded here and
        // replaced by `Comment` (the highest-ranked category anyway).
        let _skipped = self.clear_accumulated(index);
        self.accumulate_skipped(Skipped::Comment);

        for (_index, ch) in &mut self.chars {
            if ch == '\n' {
                return;
            }
        }
    }
365
    /// Lex an identifier or keyword beginning at byte `start`, whose first
    /// character `ch` has already been consumed.
    ///
    /// Consumes subsequent alphanumeric/`_` characters, then emits either a
    /// `Keyword` token (if the text appears in the keyword table) or an
    /// `Identifier` token.
    fn identifier(&mut self, start: usize, ch: char) {
        let skipped = self.clear_accumulated(start);

        // `end` is one past the last byte of the identifier.
        let mut end = start + ch.len_utf8();

        while let Some(&(index, ch)) = self.chars.peek() {
            if ch.is_alphanumeric() || ch == '_' {
                end = index + ch.len_utf8();
                self.chars.next();
            } else {
                break;
            }
        }

        let span = self.span(start, end);

        let text = &self.input[start..end];
        if let Some(kw) = self.kws.get(text) {
            self.tokens.push(Token {
                span,
                skipped,
                kind: TokenKind::Keyword(*kw),
            });
        } else {
            let identifier = Identifier::new(self.db, text.to_string());
            self.tokens.push(Token {
                span,
                skipped,
                kind: TokenKind::Identifier(identifier),
            })
        }
    }
398
    /// Lex an integer literal beginning at byte `start`, whose first digit
    /// `ch` has already been consumed. Accepts ASCII digits and `_`
    /// separators; the raw text (separators included) is interned as the
    /// literal's text.
    fn integer(&mut self, start: usize, ch: char) {
        let skipped = self.clear_accumulated(start);

        // `end` is one past the last byte of the literal.
        let mut end = start + ch.len_utf8();

        while let Some(&(index, ch)) = self.chars.peek() {
            if ch.is_ascii_digit() || ch == '_' {
                end = index + ch.len_utf8();
                self.chars.next();
            } else {
                break;
            }
        }

        let span = self.span(start, end);

        let text = &self.input[start..end];
        let token_text = TokenText::new(self.db, text.to_string());
        self.tokens.push(Token {
            span,
            skipped,
            kind: TokenKind::Literal(LiteralKind::Integer, token_text),
        });
    }
423
    /// Process one escape sequence; called just after a `\` was consumed at
    /// byte offset `backslash_offset`. The decoded character is appended to
    /// `content`.
    ///
    /// Recognized escapes: `\"`, `\\`, `\n`, `\r`, `\t`, `\{`, `\}`. An
    /// unrecognized escape is kept verbatim (backslash included) and an
    /// error token is emitted; a `\` at end of input likewise keeps the
    /// backslash and reports an error.
    fn escape_sequence(&mut self, backslash_offset: usize, content: &mut String) {
        if let Some((index, escape)) = self.chars.next() {
            match escape {
                '"' => content.push('"'),
                '\\' => content.push('\\'),
                'n' => content.push('\n'),
                'r' => content.push('\r'),
                't' => content.push('\t'),
                '{' => content.push('{'),
                '}' => content.push('}'),
                _ => {
                    // Keep the invalid escape verbatim so the literal's text
                    // still reflects the source.
                    content.push('\\');
                    content.push(escape);

                    let span = self.span(index, index + escape.len_utf8());
                    self.tokens.push(Token {
                        span,
                        skipped: None,
                        kind: TokenKind::Error(Diagnostic::error(
                            self.db,
                            span,
                            format!("invalid escape `\\{escape}`"),
                        )),
                    });
                }
            }
        } else {
            // Input ended right after the backslash.
            content.push('\\');

            let span = self.span(backslash_offset, backslash_offset + '\\'.len_utf8());
            self.tokens.push(Token {
                span,
                skipped: None,
                kind: TokenKind::Error(Diagnostic::error(
                    self.db,
                    span,
                    "`\\` must be followed by an escape character",
                )),
            });
        }
    }
467
    /// Push a string-literal token covering `span`.
    ///
    /// `content` is the escape-processed text gathered while scanning, and
    /// is used as-is for ordinary single-line strings. For the other forms
    /// the text is re-derived from the raw input between the quotes
    /// (`quote_len` bytes on each side):
    ///
    /// * `raw == true`: the first character inside the quotes (the `\`
    ///   raw-string marker) is dropped, then escapes are processed.
    ///   NOTE(review): the newline that follows the marker is retained and
    ///   no dedent is applied on this path — confirm that is intended.
    /// * text starting with a newline: treated as a multiline literal,
    ///   dedented (see `dedent_multiline`), then escape-processed.
    fn emit_string_literal(
        &mut self,
        span: Span<'db>,
        skipped: Option<Skipped>,
        content: String,
        quote_len: usize,
        raw: bool,
    ) {
        // Recover the literal's interior from the input using the span.
        let raw_start = (span.start - self.input_offset).as_usize() + quote_len;
        let raw_end = (span.end - self.input_offset).as_usize() - quote_len;
        let raw_content = &self.input[raw_start..raw_end];

        let final_content = if raw {
            let after_marker = &raw_content[1..];
            process_escape_sequences(after_marker)
        } else if raw_content.starts_with('\n') {
            let dedented = dedent_multiline(raw_content);
            process_escape_sequences(&dedented)
        } else {
            content
        };

        let token_text = TokenText::new(self.db, final_content);
        self.tokens.push(Token {
            span,
            skipped,
            kind: TokenKind::Literal(LiteralKind::String, token_text),
        });
    }
513
    /// Emit a best-effort string literal running from `start` to the end of
    /// the input, followed by an error token with `message` over the same
    /// span. Emitting both lets later stages still see a literal token while
    /// the missing terminator is reported.
    fn emit_unterminated_string(
        &mut self,
        start: usize,
        skipped: Option<Skipped>,
        content: String,
        message: &str,
    ) {
        let span = self.span(start, self.input.len());
        let token_text = TokenText::new(self.db, content);
        self.tokens.push(Token {
            span,
            skipped,
            kind: TokenKind::Literal(LiteralKind::String, token_text),
        });
        self.tokens.push(Token {
            span,
            skipped: None,
            kind: TokenKind::Error(Diagnostic::error(self.db, span, message)),
        });
    }
537
    /// Lex a string literal whose opening `"` (at byte `start`) was already
    /// consumed. Handles four shapes:
    ///
    /// * `""` — empty string (unless a third `"` follows, making it a
    ///   triple-quote opener);
    /// * `"""…"""` — triple-quoted literal, delegated to
    ///   `triple_quoted_string_literal`;
    /// * `"\` immediately followed by a newline — "raw" literal; the `\`
    ///   marker is consumed here and skipped again when the final text is
    ///   extracted in `emit_string_literal`;
    /// * ordinary `"…"` with escape sequences.
    ///
    /// If the closing quote is never found, a best-effort literal plus an
    /// error token are emitted.
    fn string_literal(&mut self, start: usize) {
        let skipped = self.clear_accumulated(start);

        if let Some(&(_, '"')) = self.chars.peek() {
            self.chars.next();
            if let Some(&(_, '"')) = self.chars.peek() {
                self.chars.next();
                return self.triple_quoted_string_literal(start, skipped);
            }

            // Exactly two quotes: the empty string literal.
            self.emit_string_literal(
                self.span(start, start + 2),
                skipped,
                String::new(),
                1,
                false,
            );
            return;
        }

        // A `\` directly after the quote marks a raw string only if a
        // newline follows it; peek two characters ahead via a cloned
        // iterator so nothing is consumed on the non-raw path.
        let raw = if let Some(&(_, '\\')) = self.chars.peek() {
            let mut lookahead = self.chars.clone();
            lookahead.next();
            matches!(lookahead.next(), Some((_, '\n')))
        } else {
            false
        };

        if raw {
            // Consume the `\` marker.
            self.chars.next();
        }

        let mut processed_content = String::new();

        while let Some((end, ch)) = self.chars.next() {
            if ch == '"' {
                self.emit_string_literal(
                    self.span(start, end + ch.len_utf8()),
                    skipped,
                    processed_content,
                    1,
                    raw,
                );
                return;
            }

            if ch == '\\' {
                self.escape_sequence(end, &mut processed_content);
            } else {
                processed_content.push(ch);
            }
        }

        self.emit_unterminated_string(
            start,
            skipped,
            processed_content,
            "missing end quote for string",
        );
    }
608
    /// Lex the body of a triple-quoted (`"""`) string; the opening quotes
    /// have already been consumed.
    ///
    /// Quote characters are pushed into the content tentatively; when a
    /// third consecutive quote is seen, the two tentative quotes are popped
    /// and the literal is emitted. The final text is re-derived in
    /// `emit_string_literal` (`quote_len == 3`), which dedents when the
    /// content begins with a newline.
    fn triple_quoted_string_literal(&mut self, start: usize, skipped: Option<Skipped>) {
        let mut processed_content = String::new();

        while let Some((end, ch)) = self.chars.next() {
            if ch == '"' {
                // Tentatively part of the content; may be the terminator.
                processed_content.push('"');
                if let Some(&(_, '"')) = self.chars.peek() {
                    self.chars.next();
                    processed_content.push('"');
                    if let Some(&(third_idx, '"')) = self.chars.peek() {
                        self.chars.next();
                        // Three in a row: drop the two tentative quotes.
                        processed_content.pop();
                        processed_content.pop();
                        self.emit_string_literal(
                            self.span(start, third_idx + '"'.len_utf8()),
                            skipped,
                            processed_content,
                            3,
                            false,
                        );
                        return;
                    }
                }
            } else if ch == '\\' {
                self.escape_sequence(end, &mut processed_content);
            } else {
                processed_content.push(ch);
            }
        }

        self.emit_unterminated_string(
            start,
            skipped,
            processed_content,
            "missing end quotes for triple-quoted string",
        );
    }
653
    /// Lex a delimited group whose open delimiter (at byte `start`) was
    /// already consumed; `close` is the matching close character.
    ///
    /// Nested brackets of all three kinds are tracked with a stack, and a
    /// single `Delimited` token is emitted whose `text` excludes the outer
    /// delimiters. On a mismatched closer — or end of input — a
    /// "missing `close`" error is reported spanning from `start` to the end
    /// of the input.
    ///
    /// NOTE(review): string literals and comments inside the group are not
    /// treated specially here, so a bracket character inside a string could
    /// unbalance the stack — confirm this is handled acceptably upstream.
    fn delimited(&mut self, start: usize, delim: Delimiter, close: char) {
        let skipped = self.clear_accumulated(start);
        let mut close_stack = vec![close];

        while let Some((end, ch)) = self.chars.next() {
            match ch {
                '{' => close_stack.push('}'),
                '[' => close_stack.push(']'),
                '(' => close_stack.push(')'),
                '}' | ']' | ')' => {
                    if ch == *close_stack.last().unwrap() {
                        close_stack.pop();
                        if close_stack.is_empty() {
                            // All closers are 1 byte, so `end + 1` is the
                            // end of the group.
                            assert!(ch.len_utf8() == 1);
                            self.tokens.push(Token {
                                span: self.span(start, end + 1),
                                skipped,
                                kind: TokenKind::Delimited {
                                    delimiter: delim,
                                    text: &self.input[start + 1..end],
                                },
                            });
                            return;
                        }
                    } else {
                        // Mismatched closer: bail out and report below.
                        break;
                    }
                }
                _ => {}
            }
        }

        let end = self.input.len();
        let span = self.span(start, end);
        self.tokens.push(Token {
            span,
            skipped: None,
            kind: TokenKind::Error(Diagnostic::error(
                self.db,
                span,
                format!("missing `{close}`"),
            )),
        });
    }
703
704 fn ops(&mut self, start: usize, ch: char) {
705 let skipped = self.clear_accumulated(start);
706 self.tokens.push(Token {
707 span: self.span(start, start + ch.len_utf8()),
708 skipped,
709 kind: TokenKind::OpChar(ch),
710 });
711 }
712}
713
/// Is `ch` one of the punctuation characters that can appear in an operator?
pub fn is_op_char(ch: char) -> bool {
    // The full set of operator characters recognized by the tokenizer.
    const OP_CHARS: &str = "+-*/%=!<>&|:,.;?";
    OP_CHARS.contains(ch)
}
734
/// Peekable `(byte_offset, char)` iterator over the input text.
type CharIndices<'input> = std::iter::Peekable<std::str::CharIndices<'input>>;
736
/// Dedent the contents of a multiline string literal.
///
/// `raw` is the text between the quotes and begins with a newline (the
/// caller checks this). The leading newline and the final line (which holds
/// only the indentation before the closing quotes) are removed, then the
/// longest common leading-whitespace prefix of the remaining lines is
/// stripped.
///
/// Fixes over the previous version:
/// * lines containing only whitespace (e.g. a "blank" line with trailing
///   spaces) no longer count toward the common prefix, so they cannot
///   defeat the dedent; such lines are emptied in the output;
/// * slicing uses checked `str::get`, so a common prefix measured in bytes
///   can never panic on a non-char-boundary (possible with multibyte
///   whitespace).
fn dedent_multiline(raw: &str) -> String {
    // Drop the newline that immediately follows the opening quotes.
    let content = raw.strip_prefix('\n').unwrap_or(raw);

    // Drop the final line: it is only the indentation of the closing quotes.
    let content = match content.rfind('\n') {
        Some(pos) => &content[..pos],
        // Single-line content: nothing to dedent against.
        None => return content.to_string(),
    };

    let lines: Vec<&str> = content.split('\n').collect();

    // Common indentation (in bytes) over lines with any non-whitespace text.
    let common_prefix = lines
        .iter()
        .filter(|line| !line.trim_start().is_empty())
        .map(|line| line.len() - line.trim_start().len())
        .min()
        .unwrap_or(0);

    lines
        .iter()
        // Whitespace-only lines shorter than the prefix become empty.
        .map(|line| line.get(common_prefix..).unwrap_or(""))
        .collect::<Vec<_>>()
        .join("\n")
}
790
/// Resolve backslash escape sequences in `raw`, returning the decoded text.
///
/// Recognized escapes: `\"`, `\\`, `\n`, `\r`, `\t`, `\{`, `\}`. Any other
/// escape — and a trailing lone backslash — is kept verbatim; invalid
/// escapes are diagnosed earlier, during scanning.
fn process_escape_sequences(raw: &str) -> String {
    // Map a recognized escape character to its replacement.
    let decode = |esc: char| -> Option<char> {
        match esc {
            '"' => Some('"'),
            '\\' => Some('\\'),
            'n' => Some('\n'),
            'r' => Some('\r'),
            't' => Some('\t'),
            '{' => Some('{'),
            '}' => Some('}'),
            _ => None,
        }
    };

    let mut out = String::with_capacity(raw.len());
    let mut rest = raw.chars();

    while let Some(ch) = rest.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }
        match rest.next() {
            Some(esc) => match decode(esc) {
                Some(decoded) => out.push(decoded),
                None => {
                    // Unknown escape: keep it verbatim.
                    out.push('\\');
                    out.push(esc);
                }
            },
            // Trailing lone backslash: keep it.
            None => out.push('\\'),
        }
    }

    out
}