Skip to main content

rml/
cst.rs

1//! Universal lossless CST infrastructure for issue #138.
2//!
3//! This module mirrors `js/src/cst.mjs`: three node kinds (`list`, `token`,
4//! `trivia`), a content-agnostic round-trip printer, and `.lino`
5//! serialisation/deserialisation helpers. Used by `cst_rust`, `cst_js`,
6//! `cst_lean` and `cst_rocq` to express their token streams.
7
8use std::fmt::Write;
9
10/// The four host-language dialect tag prefixes plus the shared dialect.
11pub mod dialects {
12    pub const RUST: &str = "lino-cst.rust";
13    pub const JS: &str = "lino-cst.js";
14    pub const LEAN: &str = "lino-cst.lean";
15    pub const ROCQ: &str = "lino-cst.rocq";
16    pub const SHARED: &str = "lino-cst.shared";
17}
18
19/// A CST node. Three kinds: `List` (with optional `open`/`close` delimiters),
20/// `Token` (significant lexeme), `Trivia` (whitespace or comment).
21#[derive(Debug, Clone, PartialEq, Eq)]
22pub enum CstNode {
23    /// A list node with an optional dialect tag and optional open/close
24    /// delimiter strings (e.g. `(`, `)`).
25    List {
26        tag: Option<String>,
27        open: Option<String>,
28        close: Option<String>,
29        children: Vec<CstNode>,
30    },
31    /// A non-trivia lexeme. `text` holds the original source bytes.
32    Token { tag: Option<String>, text: String },
33    /// Whitespace or comment trivia. `text` holds the original source bytes.
34    Trivia { tag: Option<String>, text: String },
35}
36
37impl CstNode {
38    /// Construct a `List` node.
39    pub fn list(tag: impl Into<String>, children: Vec<CstNode>) -> Self {
40        CstNode::List {
41            tag: Some(tag.into()),
42            open: None,
43            close: None,
44            children,
45        }
46    }
47
48    /// Construct an untagged `List` node with custom open/close delimiters.
49    pub fn list_with_delims(
50        tag: Option<String>,
51        open: Option<String>,
52        close: Option<String>,
53        children: Vec<CstNode>,
54    ) -> Self {
55        CstNode::List {
56            tag,
57            open,
58            close,
59            children,
60        }
61    }
62
63    /// Construct a `Token` leaf.
64    pub fn token(text: impl Into<String>, tag: Option<&str>) -> Self {
65        CstNode::Token {
66            tag: tag.map(String::from),
67            text: text.into(),
68        }
69    }
70
71    /// Construct a `Trivia` leaf.
72    pub fn trivia(text: impl Into<String>, tag: Option<&str>) -> Self {
73        CstNode::Trivia {
74            tag: tag.map(String::from),
75            text: text.into(),
76        }
77    }
78
79    /// True if this node is a list.
80    pub fn is_list(&self) -> bool {
81        matches!(self, CstNode::List { .. })
82    }
83
84    /// The dialect tag, if any.
85    pub fn tag(&self) -> Option<&str> {
86        match self {
87            CstNode::List { tag, .. } => tag.as_deref(),
88            CstNode::Token { tag, .. } => tag.as_deref(),
89            CstNode::Trivia { tag, .. } => tag.as_deref(),
90        }
91    }
92
93    /// The textual content of a leaf node. Returns `""` for lists.
94    pub fn text(&self) -> &str {
95        match self {
96            CstNode::Token { text, .. } => text,
97            CstNode::Trivia { text, .. } => text,
98            CstNode::List { .. } => "",
99        }
100    }
101
102    /// Iterate every leaf (token/trivia) of this subtree in document order.
103    pub fn leaves(&self) -> Vec<&CstNode> {
104        let mut out = Vec::new();
105        collect_leaves(self, &mut out);
106        out
107    }
108}
109
110fn collect_leaves<'a>(node: &'a CstNode, out: &mut Vec<&'a CstNode>) {
111    match node {
112        CstNode::List { children, .. } => {
113            for c in children {
114                collect_leaves(c, out);
115            }
116        }
117        _ => out.push(node),
118    }
119}
120
121/// Print a CST node back to its byte-faithful source form.
122pub fn print_cst(node: &CstNode) -> String {
123    let mut out = String::new();
124    emit(node, &mut out);
125    out
126}
127
128fn emit(node: &CstNode, out: &mut String) {
129    match node {
130        CstNode::Token { text, .. } | CstNode::Trivia { text, .. } => {
131            out.push_str(text);
132        }
133        CstNode::List {
134            children,
135            open,
136            close,
137            ..
138        } => {
139            if let Some(o) = open {
140                out.push_str(o);
141            }
142            for c in children {
143                emit(c, out);
144            }
145            if let Some(c) = close {
146                out.push_str(c);
147            }
148        }
149    }
150}
151
152/// Serialise a CST node into a `.lino` S-expression matching the format
153/// produced by `js/src/cst.mjs`'s `cstToLino`.
154pub fn cst_to_lino(node: &CstNode) -> String {
155    let mut out = String::new();
156    write_lino(node, &mut out);
157    out
158}
159
160fn write_lino(node: &CstNode, out: &mut String) {
161    match node {
162        CstNode::Token { text, .. } => {
163            let _ = write!(out, "(lino-cst.token {})", escape_text(text));
164        }
165        CstNode::Trivia { text, .. } => {
166            let _ = write!(out, "(lino-cst.trivia {})", escape_text(text));
167        }
168        CstNode::List {
169            tag,
170            open,
171            close,
172            children,
173        } => {
174            out.push('(');
175            out.push_str("lino-cst.list");
176            if let Some(t) = tag {
177                out.push(' ');
178                out.push_str(t);
179            }
180            if let Some(o) = open {
181                let _ = write!(out, " (open {})", escape_text(o));
182            }
183            if let Some(c) = close {
184                let _ = write!(out, " (close {})", escape_text(c));
185            }
186            for c in children {
187                out.push(' ');
188                write_lino(c, out);
189            }
190            out.push(')');
191        }
192    }
193}
194
195fn escape_text(text: &str) -> String {
196    let mut out = String::with_capacity(text.len() + 2);
197    out.push('"');
198    for ch in text.chars() {
199        match ch {
200            '"' => out.push_str("\\\""),
201            '\\' => out.push_str("\\\\"),
202            '\n' => out.push_str("\\n"),
203            '\r' => out.push_str("\\r"),
204            '\t' => out.push_str("\\t"),
205            '\x08' => out.push_str("\\b"),
206            '\x0C' => out.push_str("\\f"),
207            c if (c as u32) < 0x20 => {
208                let _ = write!(out, "\\u{:04x}", c as u32);
209            }
210            c => out.push(c),
211        }
212    }
213    out.push('"');
214    out
215}
216
217/// Parse the `.lino` S-expression produced by `cst_to_lino` back into a
218/// `CstNode`. Inverse of `cst_to_lino`.
219pub fn lino_to_cst(src: &str) -> Result<CstNode, String> {
220    let tokens = tokenise_lino_cst(src)?;
221    let mut idx = 0usize;
222    let node = parse_node(&tokens, &mut idx)?;
223    Ok(node)
224}
225
226fn parse_node(tokens: &[String], idx: &mut usize) -> Result<CstNode, String> {
227    expect(tokens, idx, "(")?;
228    let head = eat(tokens, idx)?;
229    match head.as_str() {
230        "lino-cst.token" => {
231            let lit = eat(tokens, idx)?;
232            expect(tokens, idx, ")")?;
233            Ok(CstNode::token(unescape_text(&lit)?, None))
234        }
235        "lino-cst.trivia" => {
236            let lit = eat(tokens, idx)?;
237            expect(tokens, idx, ")")?;
238            Ok(CstNode::trivia(unescape_text(&lit)?, None))
239        }
240        "lino-cst.list" => {
241            let mut tag: Option<String> = None;
242            let mut open: Option<String> = None;
243            let mut close: Option<String> = None;
244            let mut children: Vec<CstNode> = Vec::new();
245            while peek(tokens, *idx)? != ")" {
246                if peek(tokens, *idx)? == "(" {
247                    let lookahead = peek(tokens, *idx + 1)?;
248                    if lookahead == "open" {
249                        *idx += 2;
250                        let lit = eat(tokens, idx)?;
251                        expect(tokens, idx, ")")?;
252                        open = Some(unescape_text(&lit)?);
253                        continue;
254                    }
255                    if lookahead == "close" {
256                        *idx += 2;
257                        let lit = eat(tokens, idx)?;
258                        expect(tokens, idx, ")")?;
259                        close = Some(unescape_text(&lit)?);
260                        continue;
261                    }
262                    children.push(parse_node(tokens, idx)?);
263                } else {
264                    if tag.is_some() {
265                        return Err(format!("unexpected token {:?}", peek(tokens, *idx)?));
266                    }
267                    tag = Some(eat(tokens, idx)?);
268                }
269            }
270            expect(tokens, idx, ")")?;
271            Ok(CstNode::list_with_delims(tag, open, close, children))
272        }
273        other => Err(format!("unknown CST tag: {}", other)),
274    }
275}
276
277fn peek(tokens: &[String], idx: usize) -> Result<&str, String> {
278    tokens
279        .get(idx)
280        .map(String::as_str)
281        .ok_or_else(|| "unexpected end of input".to_string())
282}
283
284fn eat(tokens: &[String], idx: &mut usize) -> Result<String, String> {
285    let s = tokens
286        .get(*idx)
287        .cloned()
288        .ok_or_else(|| "unexpected end of input".to_string())?;
289    *idx += 1;
290    Ok(s)
291}
292
293fn expect(tokens: &[String], idx: &mut usize, t: &str) -> Result<(), String> {
294    let got = eat(tokens, idx)?;
295    if got != t {
296        return Err(format!("expected {:?}, got {:?}", t, got));
297    }
298    Ok(())
299}
300
301fn tokenise_lino_cst(src: &str) -> Result<Vec<String>, String> {
302    let bytes: Vec<char> = src.chars().collect();
303    let mut out = Vec::new();
304    let mut i = 0usize;
305    while i < bytes.len() {
306        let c = bytes[i];
307        if c == '(' || c == ')' {
308            out.push(c.to_string());
309            i += 1;
310            continue;
311        }
312        if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
313            i += 1;
314            continue;
315        }
316        if c == '"' {
317            let mut j = i + 1;
318            while j < bytes.len() {
319                if bytes[j] == '\\' {
320                    j += 2;
321                    continue;
322                }
323                if bytes[j] == '"' {
324                    break;
325                }
326                j += 1;
327            }
328            if j >= bytes.len() {
329                return Err("unterminated string literal".to_string());
330            }
331            out.push(bytes[i..=j].iter().collect());
332            i = j + 1;
333            continue;
334        }
335        let mut j = i;
336        while j < bytes.len() && !" \t\n\r()".contains(bytes[j]) {
337            j += 1;
338        }
339        out.push(bytes[i..j].iter().collect());
340        i = j;
341    }
342    Ok(out)
343}
344
345fn unescape_text(literal: &str) -> Result<String, String> {
346    let chars: Vec<char> = literal.chars().collect();
347    if chars.first() != Some(&'"') || chars.last() != Some(&'"') {
348        return Err(format!("not a string literal: {}", literal));
349    }
350    let mut out = String::new();
351    let mut i = 1;
352    while i < chars.len() - 1 {
353        let c = chars[i];
354        if c == '\\' {
355            i += 1;
356            if i >= chars.len() - 1 {
357                return Err("trailing backslash".to_string());
358            }
359            let esc = chars[i];
360            match esc {
361                '"' => out.push('"'),
362                '\\' => out.push('\\'),
363                '/' => out.push('/'),
364                'n' => out.push('\n'),
365                'r' => out.push('\r'),
366                't' => out.push('\t'),
367                'b' => out.push('\x08'),
368                'f' => out.push('\x0C'),
369                'u' => {
370                    if i + 4 >= chars.len() {
371                        return Err("bad \\u escape".to_string());
372                    }
373                    let hex: String = chars[i + 1..=i + 4].iter().collect();
374                    let cp = u32::from_str_radix(&hex, 16)
375                        .map_err(|e| format!("bad \\u escape: {}", e))?;
376                    if let Some(ch) = char::from_u32(cp) {
377                        out.push(ch);
378                    }
379                    i += 4;
380                }
381                other => return Err(format!("unknown escape: \\{}", other)),
382            }
383            i += 1;
384        } else {
385            out.push(c);
386            i += 1;
387        }
388    }
389    Ok(out)
390}
391
392/// Convenience: return a clone of `node`. Provided for parity with the JS
393/// module which exports `cloneCst` explicitly.
394pub fn clone_cst(node: &CstNode) -> CstNode {
395    node.clone()
396}
397
398#[cfg(test)]
399mod tests {
400    use super::*;
401
402    #[test]
403    fn prints_list_with_open_close() {
404        let n = CstNode::list_with_delims(
405            Some("demo".into()),
406            Some("(".into()),
407            Some(")".into()),
408            vec![CstNode::token("a", None), CstNode::token("b", None)],
409        );
410        assert_eq!(print_cst(&n), "(ab)");
411    }
412
413    #[test]
414    fn lino_round_trip_for_token_trivia_list() {
415        let n = CstNode::list(
416            "lino-cst.rust.fn",
417            vec![
418                CstNode::token("fn ", Some("kw")),
419                CstNode::token("foo", Some("ident")),
420                CstNode::trivia(" ", Some("ws")),
421            ],
422        );
423        let s = cst_to_lino(&n);
424        let back = lino_to_cst(&s).unwrap();
425        assert_eq!(print_cst(&n), print_cst(&back));
426    }
427}