1use crate::cst::{dialects::JS, print_cst, CstNode};
9
10pub fn parse_js(src: &str) -> CstNode {
12 let children = tokenise(src);
13 CstNode::list(format!("{}.program", JS), children)
14}
15
16pub fn print_js(node: &CstNode) -> String {
18 print_cst(node)
19}
20
/// Coarse classification of the previously emitted node.
///
/// The tokeniser keeps one of these as running state and consults it when it meets a `/`,
/// to decide whether a regular-expression literal may start there.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum LastKind {
    /// Nothing has been recorded yet.
    None,
    /// Whitespace or a comment.
    Trivia,
    /// An identifier or keyword.
    Ident,
    /// A single punctuation character.
    Punct,
    /// Any other token: string, template, regex or numeric literal.
    Other,
}
29
30fn tokenise(src: &str) -> Vec<CstNode> {
31 let chars: Vec<char> = src.chars().collect();
32 let mut out: Vec<CstNode> = Vec::new();
33 let mut i = 0usize;
34 let mut last_kind = LastKind::None;
35 let mut last_text: String = String::new();
36
37 if chars.len() >= 2 && chars[0] == '#' && chars[1] == '!' {
38 let mut j = 0usize;
39 while j < chars.len() && chars[j] != '\n' {
40 j += 1;
41 }
42 out.push(CstNode::trivia(
43 chars[..j].iter().collect::<String>(),
44 Some(&format!("{}.hashbang", JS)),
45 ));
46 i = j;
47 }
48
49 while i < chars.len() {
50 let c = chars[i];
51
52 if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
53 let mut j = i;
54 while j < chars.len()
55 && (chars[j] == ' ' || chars[j] == '\t' || chars[j] == '\r' || chars[j] == '\n')
56 {
57 j += 1;
58 }
59 out.push(CstNode::trivia(
60 chars[i..j].iter().collect::<String>(),
61 Some(&format!("{}.whitespace", JS)),
62 ));
63 i = j;
64 last_kind = LastKind::Trivia;
65 continue;
66 }
67
68 if c == '/' && chars.get(i + 1) == Some(&'/') {
69 let mut j = i + 2;
70 while j < chars.len() && chars[j] != '\n' {
71 j += 1;
72 }
73 out.push(CstNode::trivia(
74 chars[i..j].iter().collect::<String>(),
75 Some(&format!("{}.comment.line", JS)),
76 ));
77 i = j;
78 last_kind = LastKind::Trivia;
79 continue;
80 }
81
82 if c == '/' && chars.get(i + 1) == Some(&'*') {
83 let j = scan_block_comment(&chars, i);
84 out.push(CstNode::trivia(
85 chars[i..j].iter().collect::<String>(),
86 Some(&format!("{}.comment.block", JS)),
87 ));
88 i = j;
89 last_kind = LastKind::Trivia;
90 continue;
91 }
92
93 if c == '"' || c == '\'' {
94 let j = scan_string(&chars, i + 1, c);
95 let text: String = chars[i..j].iter().collect();
96 out.push(CstNode::token(
97 text.clone(),
98 Some(&format!("{}.string_literal", JS)),
99 ));
100 i = j;
101 last_kind = LastKind::Other;
102 last_text = text;
103 continue;
104 }
105
106 if c == '`' {
107 let j = scan_template(&chars, i);
108 let text: String = chars[i..j].iter().collect();
109 out.push(CstNode::token(
110 text.clone(),
111 Some(&format!("{}.template_literal", JS)),
112 ));
113 i = j;
114 last_kind = LastKind::Other;
115 last_text = text;
116 continue;
117 }
118
119 if c == '/' && can_be_regex(last_kind, &last_text) {
120 let j = scan_regex(&chars, i);
121 if j > i + 1 {
122 let text: String = chars[i..j].iter().collect();
123 out.push(CstNode::token(
124 text.clone(),
125 Some(&format!("{}.regexp_literal", JS)),
126 ));
127 i = j;
128 last_kind = LastKind::Other;
129 last_text = text;
130 continue;
131 }
132 }
133
134 if c.is_ascii_digit()
135 || (c == '.' && chars.get(i + 1).map(|x| x.is_ascii_digit()).unwrap_or(false))
136 {
137 let j = scan_number(&chars, i);
138 let text: String = chars[i..j].iter().collect();
139 out.push(CstNode::token(
140 text.clone(),
141 Some(&format!("{}.numeric_literal", JS)),
142 ));
143 i = j;
144 last_kind = LastKind::Other;
145 last_text = text;
146 continue;
147 }
148
149 if is_ident_start(c) {
150 let mut j = i + 1;
151 while j < chars.len() && is_ident_continue(chars[j]) {
152 j += 1;
153 }
154 let text: String = chars[i..j].iter().collect();
155 out.push(CstNode::token(
156 text.clone(),
157 Some(&format!("{}.ident", JS)),
158 ));
159 i = j;
160 last_kind = LastKind::Ident;
161 last_text = text;
162 continue;
163 }
164
165 let text = c.to_string();
166 out.push(CstNode::token(
167 text.clone(),
168 Some(&format!("{}.punct", JS)),
169 ));
170 i += 1;
171 last_kind = LastKind::Punct;
172 last_text = text;
173 }
174
175 out
176}
177
/// Finds the end of a `/* ... */` comment whose `/` is at index `i`.
///
/// Returns the index just past the closing `*/`, or `chars.len()` when the comment is
/// never closed. The search starts after the opening `/*`, so `/*/` is not treated as a
/// complete comment.
fn scan_block_comment(chars: &[char], i: usize) -> usize {
    let body_start = i + 2;
    (body_start..chars.len())
        .find(|&pos| chars[pos] == '*' && chars.get(pos + 1) == Some(&'/'))
        .map_or(chars.len(), |star| star + 2)
}
188
/// Finds the end of a quoted string literal.
///
/// `j` is the index of the first character *after* the opening quote and `quote` is the
/// quote character that opened the literal.
///
/// Returns:
/// * the index just past the closing quote when the literal is terminated;
/// * the index of the newline when a `'`/`"` string hits an unescaped `\n` first (the
///   newline is not part of the literal);
/// * `chars.len()` when the input ends first.
///
/// The result is never greater than `chars.len()`, so it is always safe to use as the end
/// of a slice of `chars`.
fn scan_string(chars: &[char], mut j: usize, quote: char) -> usize {
    while j < chars.len() {
        let c = chars[j];
        if c == '\\' {
            // Skip the escaped character. A backslash followed by CR LF is one line
            // continuation, so both line-break characters are skipped together.
            let crlf = chars.get(j + 1) == Some(&'\r') && chars.get(j + 2) == Some(&'\n');
            j += if crlf { 3 } else { 2 };
            continue;
        }
        if c == '\n' && (quote == '"' || quote == '\'') {
            return j;
        }
        if c == quote {
            return j + 1;
        }
        j += 1;
    }
    // A trailing backslash makes the skip above step past the end; clamp it back.
    j.min(chars.len())
}
206
207fn scan_template(chars: &[char], i: usize) -> usize {
208 let mut j = i + 1;
209 while j < chars.len() {
210 let c = chars[j];
211 if c == '\\' {
212 j += 2;
213 continue;
214 }
215 if c == '`' {
216 return j + 1;
217 }
218 if c == '$' && chars.get(j + 1) == Some(&'{') {
219 j += 2;
220 let mut depth = 1;
221 while j < chars.len() && depth > 0 {
222 let k = chars[j];
223 if k == '{' {
224 depth += 1;
225 } else if k == '}' {
226 depth -= 1;
227 } else if k == '"' || k == '\'' {
228 j = scan_string(chars, j + 1, k).saturating_sub(1);
229 } else if k == '`' {
230 j = scan_template(chars, j).saturating_sub(1);
231 } else if k == '/' && chars.get(j + 1) == Some(&'/') {
232 while j < chars.len() && chars[j] != '\n' {
233 j += 1;
234 }
235 continue;
236 } else if k == '/' && chars.get(j + 1) == Some(&'*') {
237 j = scan_block_comment(chars, j);
238 continue;
239 }
240 j += 1;
241 }
242 continue;
243 }
244 j += 1;
245 }
246 j
247}
248
/// Tries to scan a regular-expression literal whose opening `/` is at index `i`.
///
/// On success, returns the index just past the closing `/` and any ASCII-letter flags that
/// follow it. A `/` inside a `[...]` character class does not close the literal, and
/// backslash escapes are skipped.
///
/// Returns `i + 1` to signal "not a regex" when no closing `/` is found on the same line:
/// a newline (escaped or not) or the end of input is reached first. Callers detect this by
/// checking that the result is greater than `i + 1`.
fn scan_regex(chars: &[char], i: usize) -> usize {
    let not_a_regex = i + 1;
    let mut j = i + 1;
    let mut in_class = false;
    while j < chars.len() {
        match chars[j] {
            '\\' => match chars.get(j + 1) {
                // A regex body cannot contain a line break, even after a backslash, and a
                // backslash at the very end leaves the literal unterminated.
                None | Some('\n') => return not_a_regex,
                Some(_) => {
                    j += 2;
                    continue;
                }
            },
            '\n' => return not_a_regex,
            '[' => in_class = true,
            ']' => in_class = false,
            '/' if !in_class => {
                j += 1;
                while j < chars.len() && chars[j].is_ascii_alphabetic() {
                    j += 1;
                }
                return j;
            }
            _ => {}
        }
        j += 1;
    }
    // Ran out of input without a closing `/`.
    not_a_regex
}
275
/// Finds the end of a numeric literal that starts at index `i`.
///
/// Recognised shapes:
/// * `0x`/`0X`, `0o`/`0O` and `0b`/`0B` prefixed integers;
/// * decimal literals with an optional fraction and an optional exponent (`e`/`E`,
///   optional sign, digits);
/// * `_` separators anywhere digits are accepted;
/// * one optional trailing `n` after any of the above.
///
/// Returns the index just past the last character consumed. No validation is performed,
/// so an exponent marker without digits is still consumed.
fn scan_number(chars: &[char], i: usize) -> usize {
    /// Returns the index of the first character at or after `from` that `accept` rejects.
    fn run_end(chars: &[char], from: usize, accept: fn(char) -> bool) -> usize {
        let rest = chars.get(from..).unwrap_or(&[]);
        from + rest.iter().position(|&c| !accept(c)).unwrap_or(rest.len())
    }

    let decimal: fn(char) -> bool = |c: char| c.is_ascii_digit() || c == '_';
    let leading_zero = chars.get(i) == Some(&'0');

    let mut end = match chars.get(i + 1) {
        Some('x') | Some('X') if leading_zero => {
            run_end(chars, i + 2, |c: char| c.is_ascii_hexdigit() || c == '_')
        }
        Some('o') | Some('O') if leading_zero => {
            run_end(chars, i + 2, |c: char| matches!(c, '0'..='7' | '_'))
        }
        Some('b') | Some('B') if leading_zero => {
            run_end(chars, i + 2, |c: char| matches!(c, '0' | '1' | '_'))
        }
        _ => {
            // Integer part, then optional fraction, then optional exponent.
            let mut pos = run_end(chars, i, decimal);
            if chars.get(pos) == Some(&'.') {
                pos = run_end(chars, pos + 1, decimal);
            }
            if matches!(chars.get(pos), Some('e') | Some('E')) {
                pos += 1;
                if matches!(chars.get(pos), Some('+') | Some('-')) {
                    pos += 1;
                }
                pos = run_end(chars, pos, decimal);
            }
            pos
        }
    };

    // Optional trailing `n` suffix.
    if chars.get(end) == Some(&'n') {
        end += 1;
    }
    end
}
318
319const REGEX_PRECEDING_KEYWORDS: &[&str] = &[
320 "return",
321 "typeof",
322 "instanceof",
323 "in",
324 "of",
325 "do",
326 "else",
327 "throw",
328 "new",
329 "delete",
330 "void",
331 "await",
332 "yield",
333 "case",
334];
335
336fn can_be_regex(last_kind: LastKind, last_text: &str) -> bool {
337 match last_kind {
338 LastKind::None | LastKind::Trivia => true,
339 LastKind::Ident => REGEX_PRECEDING_KEYWORDS.iter().any(|k| *k == last_text),
340 LastKind::Punct => !(last_text == ")" || last_text == "]"),
341 LastKind::Other => false,
342 }
343}
344
/// Reports whether `c` may begin an identifier.
///
/// Accepts `_`, `$`, ASCII letters, and every non-ASCII character.
fn is_ident_start(c: char) -> bool {
    matches!(c, '_' | '$' | 'a'..='z' | 'A'..='Z') || !c.is_ascii()
}
348
/// Reports whether `c` may appear after the first character of an identifier.
///
/// Accepts `_`, `$`, ASCII letters and digits, and every non-ASCII character.
fn is_ident_continue(c: char) -> bool {
    matches!(c, '_' | '$' | 'a'..='z' | 'A'..='Z' | '0'..='9') || !c.is_ascii()
}
352
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src`, prints the resulting tree, and requires the output to equal the input.
    fn assert_round_trip(src: &str) {
        assert_eq!(print_js(&parse_js(src)), src, "round-trip mismatch");
    }

    /// Empty input.
    #[test]
    fn empty_string() {
        assert_round_trip("");
    }

    /// A plain declaration: identifiers, punctuation, a number and whitespace.
    #[test]
    fn simple_const() {
        assert_round_trip("const x = 1;\n");
    }

    /// Line comment, block comment, and a string containing an escaped quote.
    #[test]
    fn comments_and_strings() {
        assert_round_trip("// hi\n/* block */ const s = 'a\\'b';\n");
    }

    /// A template literal whose substitution contains another template literal.
    #[test]
    fn template_with_nested_expr() {
        assert_round_trip("const t = `nested ${`inner ${1+2}`} done`;\n");
    }

    /// A regex with an escaped slash and a flag, following `=`.
    #[test]
    fn regex_after_assignment() {
        assert_round_trip("const r = /foo\\/bar/g;\n");
    }

    /// A `/` directly after `)` is division.
    #[test]
    fn divide_after_paren_is_not_regex() {
        assert_round_trip("const x = (1)/2;\n");
    }

    /// Hex with separators, a binary literal with an `n` suffix, and a signed exponent.
    #[test]
    fn bigint_and_hex() {
        assert_round_trip("const n = 0xff_ff;\nconst b = 0b1010n;\nconst f = 3.14e-2;\n");
    }

    /// A leading `#!` line.
    #[test]
    fn hashbang_preserved() {
        assert_round_trip("#!/usr/bin/env node\nconsole.log(\"hi\");\n");
    }
}