Skip to main content

rml/
cst_js.rs

1//! JavaScript ↔ `.lino` CST converter (issue #138).
2//!
3//! Token-level lossless converter for JavaScript source. Produces a
4//! `lino-cst.js.*` flat CST whose round-trip is byte-faithful:
5//! `print_js(&parse_js(src)) == src`. Mirrors `js/src/cst-js.mjs`
6//! line for line.
7
8use crate::cst::{dialects::JS, print_cst, CstNode};
9
10/// Parse JavaScript source into a `lino-cst.js.*` CST.
11pub fn parse_js(src: &str) -> CstNode {
12    let children = tokenise(src);
13    CstNode::list(format!("{}.program", JS), children)
14}
15
/// Print a JS CST back to source.
///
/// Forwards `node` to `print_cst` and returns its result unchanged; no
/// JavaScript-specific processing happens here.
pub fn print_js(node: &CstNode) -> String {
    print_cst(node)
}
20
/// Category of the most recently emitted node, used to decide whether a `/`
/// may start a regular-expression literal.
///
/// `Debug`, `PartialEq` and `Eq` are derived so values can be compared and
/// printed in assertions and diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum LastKind {
    /// Nothing has been recorded yet.
    None,
    /// The last node was whitespace or a comment.
    Trivia,
    /// The last node was an identifier-shaped token (keywords included).
    Ident,
    /// The last node was a single punctuation character.
    Punct,
    /// The last node was a string, template, regexp or numeric literal.
    Other,
}
29
30fn tokenise(src: &str) -> Vec<CstNode> {
31    let chars: Vec<char> = src.chars().collect();
32    let mut out: Vec<CstNode> = Vec::new();
33    let mut i = 0usize;
34    let mut last_kind = LastKind::None;
35    let mut last_text: String = String::new();
36
37    if chars.len() >= 2 && chars[0] == '#' && chars[1] == '!' {
38        let mut j = 0usize;
39        while j < chars.len() && chars[j] != '\n' {
40            j += 1;
41        }
42        out.push(CstNode::trivia(
43            chars[..j].iter().collect::<String>(),
44            Some(&format!("{}.hashbang", JS)),
45        ));
46        i = j;
47    }
48
49    while i < chars.len() {
50        let c = chars[i];
51
52        if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
53            let mut j = i;
54            while j < chars.len()
55                && (chars[j] == ' ' || chars[j] == '\t' || chars[j] == '\r' || chars[j] == '\n')
56            {
57                j += 1;
58            }
59            out.push(CstNode::trivia(
60                chars[i..j].iter().collect::<String>(),
61                Some(&format!("{}.whitespace", JS)),
62            ));
63            i = j;
64            last_kind = LastKind::Trivia;
65            continue;
66        }
67
68        if c == '/' && chars.get(i + 1) == Some(&'/') {
69            let mut j = i + 2;
70            while j < chars.len() && chars[j] != '\n' {
71                j += 1;
72            }
73            out.push(CstNode::trivia(
74                chars[i..j].iter().collect::<String>(),
75                Some(&format!("{}.comment.line", JS)),
76            ));
77            i = j;
78            last_kind = LastKind::Trivia;
79            continue;
80        }
81
82        if c == '/' && chars.get(i + 1) == Some(&'*') {
83            let j = scan_block_comment(&chars, i);
84            out.push(CstNode::trivia(
85                chars[i..j].iter().collect::<String>(),
86                Some(&format!("{}.comment.block", JS)),
87            ));
88            i = j;
89            last_kind = LastKind::Trivia;
90            continue;
91        }
92
93        if c == '"' || c == '\'' {
94            let j = scan_string(&chars, i + 1, c);
95            let text: String = chars[i..j].iter().collect();
96            out.push(CstNode::token(
97                text.clone(),
98                Some(&format!("{}.string_literal", JS)),
99            ));
100            i = j;
101            last_kind = LastKind::Other;
102            last_text = text;
103            continue;
104        }
105
106        if c == '`' {
107            let j = scan_template(&chars, i);
108            let text: String = chars[i..j].iter().collect();
109            out.push(CstNode::token(
110                text.clone(),
111                Some(&format!("{}.template_literal", JS)),
112            ));
113            i = j;
114            last_kind = LastKind::Other;
115            last_text = text;
116            continue;
117        }
118
119        if c == '/' && can_be_regex(last_kind, &last_text) {
120            let j = scan_regex(&chars, i);
121            if j > i + 1 {
122                let text: String = chars[i..j].iter().collect();
123                out.push(CstNode::token(
124                    text.clone(),
125                    Some(&format!("{}.regexp_literal", JS)),
126                ));
127                i = j;
128                last_kind = LastKind::Other;
129                last_text = text;
130                continue;
131            }
132        }
133
134        if c.is_ascii_digit()
135            || (c == '.' && chars.get(i + 1).map(|x| x.is_ascii_digit()).unwrap_or(false))
136        {
137            let j = scan_number(&chars, i);
138            let text: String = chars[i..j].iter().collect();
139            out.push(CstNode::token(
140                text.clone(),
141                Some(&format!("{}.numeric_literal", JS)),
142            ));
143            i = j;
144            last_kind = LastKind::Other;
145            last_text = text;
146            continue;
147        }
148
149        if is_ident_start(c) {
150            let mut j = i + 1;
151            while j < chars.len() && is_ident_continue(chars[j]) {
152                j += 1;
153            }
154            let text: String = chars[i..j].iter().collect();
155            out.push(CstNode::token(
156                text.clone(),
157                Some(&format!("{}.ident", JS)),
158            ));
159            i = j;
160            last_kind = LastKind::Ident;
161            last_text = text;
162            continue;
163        }
164
165        let text = c.to_string();
166        out.push(CstNode::token(
167            text.clone(),
168            Some(&format!("{}.punct", JS)),
169        ));
170        i += 1;
171        last_kind = LastKind::Punct;
172        last_text = text;
173    }
174
175    out
176}
177
/// Find the end of a `/* ... */` comment whose `/` sits at index `i`.
///
/// The search for `*/` begins two positions after `i`, so the `*` of the
/// opener cannot double as the `*` of the closer. Returns the index just past
/// the closing `/`, or `chars.len()` when no closer exists.
fn scan_block_comment(chars: &[char], i: usize) -> usize {
    let body_start = i + 2;
    chars
        .get(body_start..)
        .and_then(|body| body.windows(2).position(|pair| pair[0] == '*' && pair[1] == '/'))
        .map_or(chars.len(), |offset| body_start + offset + 2)
}
188
/// Find the end of a quoted string literal.
///
/// `j` is the index of the first character *after* the opening quote and
/// `quote` is that opening quote character.
///
/// Returns:
/// * the index just past the matching closing quote, or
/// * the index of the first unescaped `'\n'` when `quote` is `"` or `'`
///   (the newline itself is not consumed), or
/// * `chars.len()` when the input ends first.
///
/// A backslash skips the character that follows it. When the backslash is the
/// last character of the input that skip lands one past the end, so the final
/// result is clamped to `chars.len()`; callers can slice with the returned
/// index without going out of bounds.
fn scan_string(chars: &[char], mut j: usize, quote: char) -> usize {
    while j < chars.len() {
        match chars[j] {
            // Escape: step over the backslash and whatever follows it.
            '\\' => j += 2,
            '\n' if matches!(quote, '"' | '\'') => return j,
            c if c == quote => return j + 1,
            _ => j += 1,
        }
    }
    j.min(chars.len())
}
206
207fn scan_template(chars: &[char], i: usize) -> usize {
208    let mut j = i + 1;
209    while j < chars.len() {
210        let c = chars[j];
211        if c == '\\' {
212            j += 2;
213            continue;
214        }
215        if c == '`' {
216            return j + 1;
217        }
218        if c == '$' && chars.get(j + 1) == Some(&'{') {
219            j += 2;
220            let mut depth = 1;
221            while j < chars.len() && depth > 0 {
222                let k = chars[j];
223                if k == '{' {
224                    depth += 1;
225                } else if k == '}' {
226                    depth -= 1;
227                } else if k == '"' || k == '\'' {
228                    j = scan_string(chars, j + 1, k).saturating_sub(1);
229                } else if k == '`' {
230                    j = scan_template(chars, j).saturating_sub(1);
231                } else if k == '/' && chars.get(j + 1) == Some(&'/') {
232                    while j < chars.len() && chars[j] != '\n' {
233                        j += 1;
234                    }
235                    continue;
236                } else if k == '/' && chars.get(j + 1) == Some(&'*') {
237                    j = scan_block_comment(chars, j);
238                    continue;
239                }
240                j += 1;
241            }
242            continue;
243        }
244        j += 1;
245    }
246    j
247}
248
/// Try to scan a regular-expression literal whose opening `/` is at index `i`.
///
/// Returns:
/// * the index just past the closing `/` and any ASCII-alphabetic flag
///   characters that follow it, or
/// * `i + 1` when an unescaped newline is reached first, which the caller
///   treats as "not a regexp", or
/// * `chars.len()` when the input ends before either happens.
///
/// A `/` between `[` and `]` does not close the literal. A backslash skips the
/// following character; when the backslash is the last character of the input
/// that skip lands one past the end, so the final result is clamped to
/// `chars.len()` to keep it usable as a slice bound.
fn scan_regex(chars: &[char], i: usize) -> usize {
    let len = chars.len();
    let mut j = i + 1;
    let mut in_class = false;
    while j < len {
        match chars[j] {
            '\\' => {
                j += 2;
                continue;
            }
            '[' => in_class = true,
            ']' => in_class = false,
            '/' if !in_class => {
                j += 1;
                // Flags such as `g`, `i`, `m` directly follow the closing slash.
                while j < len && chars[j].is_ascii_alphabetic() {
                    j += 1;
                }
                return j;
            }
            '\n' => return i + 1,
            _ => {}
        }
        j += 1;
    }
    j.min(len)
}
275
/// Find the end of a numeric literal that starts at index `i`.
///
/// Recognised shapes:
/// * `0x`/`0X` followed by hex digits, `0o`/`0O` followed by octal digits,
///   `0b`/`0B` followed by binary digits;
/// * otherwise decimal digits, an optional `.` with more digits, and an
///   optional `e`/`E` exponent with an optional sign.
///
/// `_` is accepted wherever a digit is, and a single trailing `n` is consumed
/// after any of the shapes. Returns the index just past the literal.
fn scan_number(chars: &[char], i: usize) -> usize {
    /// Index of the first character at or after `from` rejected by `accept`.
    fn run_end(chars: &[char], from: usize, accept: fn(char) -> bool) -> usize {
        let taken = chars
            .get(from..)
            .map_or(0, |rest| rest.iter().take_while(|&&ch| accept(ch)).count());
        from + taken
    }

    fn is_decimal(ch: char) -> bool {
        ch.is_ascii_digit() || ch == '_'
    }

    // The radix marker only counts when the literal starts with `0`.
    let marker = if chars.get(i) == Some(&'0') {
        chars.get(i + 1).copied()
    } else {
        None
    };

    let mut end = match marker {
        Some('x') | Some('X') => run_end(chars, i + 2, |ch| ch.is_ascii_hexdigit() || ch == '_'),
        Some('o') | Some('O') => run_end(chars, i + 2, |ch| matches!(ch, '0'..='7' | '_')),
        Some('b') | Some('B') => run_end(chars, i + 2, |ch| matches!(ch, '0' | '1' | '_')),
        _ => {
            let mut pos = run_end(chars, i, is_decimal);
            if chars.get(pos) == Some(&'.') {
                pos = run_end(chars, pos + 1, is_decimal);
            }
            if matches!(chars.get(pos), Some('e') | Some('E')) {
                pos += 1;
                if matches!(chars.get(pos), Some('+') | Some('-')) {
                    pos += 1;
                }
                pos = run_end(chars, pos, is_decimal);
            }
            pos
        }
    };

    if chars.get(end) == Some(&'n') {
        end += 1;
    }
    end
}
318
319const REGEX_PRECEDING_KEYWORDS: &[&str] = &[
320    "return",
321    "typeof",
322    "instanceof",
323    "in",
324    "of",
325    "do",
326    "else",
327    "throw",
328    "new",
329    "delete",
330    "void",
331    "await",
332    "yield",
333    "case",
334];
335
336fn can_be_regex(last_kind: LastKind, last_text: &str) -> bool {
337    match last_kind {
338        LastKind::None | LastKind::Trivia => true,
339        LastKind::Ident => REGEX_PRECEDING_KEYWORDS.iter().any(|k| *k == last_text),
340        LastKind::Punct => !(last_text == ")" || last_text == "]"),
341        LastKind::Other => false,
342    }
343}
344
/// Whether `c` may begin an identifier token: `_`, `$`, an ASCII letter, or
/// any character outside the ASCII range.
fn is_ident_start(c: char) -> bool {
    match c {
        '_' | '$' | 'a'..='z' | 'A'..='Z' => true,
        other => !other.is_ascii(),
    }
}
348
/// Whether `c` may continue an identifier token: `_`, `$`, an ASCII letter or
/// digit, or any character outside the ASCII range.
fn is_ident_continue(c: char) -> bool {
    match c {
        '_' | '$' | 'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        other => !other.is_ascii(),
    }
}
352
#[cfg(test)]
mod tests {
    use super::{parse_js, print_js};

    /// Parse `src`, print the resulting CST, and require the output to equal
    /// the input exactly.
    fn assert_round_trip(src: &str) {
        assert_eq!(print_js(&parse_js(src)), src, "round-trip mismatch");
    }

    #[test]
    fn empty_string() {
        assert_round_trip("");
    }

    #[test]
    fn simple_const() {
        assert_round_trip("const x = 1;\n");
    }

    #[test]
    fn comments_and_strings() {
        // Line comment, block comment, and a string with an escaped quote.
        assert_round_trip("// hi\n/* block */ const s = 'a\\'b';\n");
    }

    #[test]
    fn template_with_nested_expr() {
        // A template literal nested inside another template's substitution.
        assert_round_trip("const t = `nested ${`inner ${1+2}`} done`;\n");
    }

    #[test]
    fn regex_after_assignment() {
        // Regexp with an escaped slash and a flag.
        assert_round_trip("const r = /foo\\/bar/g;\n");
    }

    #[test]
    fn divide_after_paren_is_not_regex() {
        assert_round_trip("const x = (1)/2;\n");
    }

    #[test]
    fn bigint_and_hex() {
        assert_round_trip("const n = 0xff_ff;\nconst b = 0b1010n;\nconst f = 3.14e-2;\n");
    }

    #[test]
    fn hashbang_preserved() {
        assert_round_trip("#!/usr/bin/env node\nconsole.log(\"hi\");\n");
    }
}