Source: cst.mjs

// Universal lossless CST infrastructure for issue #138.
//
// This module implements the `.lino` concrete syntax tree (CST) model
// described in `docs/case-studies/issue-138/cst-model.md`. It is the shared
// core used by every host-language converter (Rust, JavaScript, Lean 4, Rocq):
// each converter produces a tree of `CstNode` values, which can be printed
// back to bytes via `printCst()`, giving a byte-faithful round-trip.
//
// Three node kinds, exactly as the model specifies:
//
// - `list`   — a list node tagged with a dialect-specific symbol (e.g.
//              `lino-cst.rust.fn`); children are emitted in order with the
//              `open` and `close` delimiters if present.
// - `token`  — a leaf carrying a single non-trivia lexeme; its `text` field is
//              the original source bytes.
// - `trivia` — a leaf carrying whitespace or a comment; `text` is the original
//              source bytes. Trivia attaches to the *following* non-trivia
//              leaf by convention (Roslyn/libsyntax style).
//
// The tree is intentionally minimal: each converter owns the choice of tags
// and the placement of trivia. The printer is content-agnostic — it simply
// concatenates `text` in document order.

/** @typedef {Object} CstListNode
 *  @property {'list'} kind
 *  @property {string|null} tag
 *  @property {string|null} open
 *  @property {string|null} close
 *  @property {CstNode[]} children
 */
/** @typedef {Object} CstTokenNode
 *  @property {'token'} kind
 *  @property {string|null} tag
 *  @property {string} text
 */
/** @typedef {Object} CstTriviaNode
 *  @property {'trivia'} kind
 *  @property {string|null} tag
 *  @property {string} text
 */
/** @typedef {CstListNode|CstTokenNode|CstTriviaNode} CstNode */
/** @typedef {Object} CstListOptions
 *  @property {string|null} [open]
 *  @property {string|null} [close]
 */

/**
 * Construct a `list` CST node.
 *
 * @param {string|null} tag dialect-specific symbol, e.g. `lino-cst.rust.fn`.
 * @param {CstNode[]} children child nodes in document order.
 * @param {CstListOptions} [opts]
 *   optional `open`/`close` delimiter strings to emit around the children.
 * @returns {CstListNode}
 */
export function list(tag, children = [], opts = {}) {
  return {
    kind: 'list',
    tag: tag === undefined ? null : tag,
    open: opts.open ?? null,
    close: opts.close ?? null,
    children: children.slice(),
  };
}

/**
 * Construct a `token` CST leaf.
 *
 * @param {string} text original source bytes for the lexeme.
 * @param {string|null} [tag] optional dialect-specific symbol.
 * @returns {CstTokenNode}
 */
export function token(text, tag = null) {
  return { kind: 'token', tag, text: String(text) };
}

/**
 * Construct a `trivia` CST leaf (whitespace or comment).
 *
 * @param {string} text original source bytes.
 * @param {string|null} [tag] optional categorisation, e.g. `comment.line`.
 * @returns {CstTriviaNode}
 */
export function trivia(text, tag = null) {
  return { kind: 'trivia', tag, text: String(text) };
}

/**
 * Print a CST node back to its original byte-for-byte source representation.
 *
 * The walk is the five-line algorithm from the model document:
 *   - for `token` and `trivia` nodes, emit `node.text`.
 *   - for `list` nodes, emit `open`, then children in order, then `close`.
 *
 * @param {CstNode} node tree to print.
 * @returns {string} reconstructed source.
 */
export function printCst(node) {
  const out = [];
  emit(node, out);
  return out.join('');
}

function emit(node, out) {
  if (!node) return;
  if (node.kind === 'token' || node.kind === 'trivia') {
    out.push(node.text);
    return;
  }
  if (node.kind === 'list') {
    if (node.open) out.push(node.open);
    for (const child of node.children) emit(child, out);
    if (node.close) out.push(node.close);
    return;
  }
  throw new TypeError(`Unknown CST node kind: ${node && node.kind}`);
}

/**
 * Iterate every leaf (`token` or `trivia`) of a CST in document order.
 *
 * @param {CstNode} node tree to walk.
 * @returns {Iterable<CstTokenNode|CstTriviaNode>}
 */
export function* leaves(node) {
  if (!node) return;
  if (node.kind === 'token' || node.kind === 'trivia') {
    yield node;
    return;
  }
  if (node.kind === 'list') {
    for (const child of node.children) yield* leaves(child);
  }
}

/**
 * Serialise a CST node into a `.lino` S-expression suitable for embedding in
 * `.lino` files or for round-trip testing. The serialisation is the literal
 * shape documented in `docs/case-studies/issue-138/cst-model.md`:
 *
 *   `(lino-cst.list <tag> <child> <child> ...)`
 *   `(lino-cst.token "<text>")`
 *   `(lino-cst.trivia "<text>")`
 *
 * String contents are escaped using JSON rules so that the result is a valid
 * LiNo expression. `parseCstLino` is the exact inverse.
 *
 * @param {CstNode} node tree to serialise.
 * @returns {string}
 */
export function cstToLino(node) {
  if (!node) return '';
  if (node.kind === 'token') return `(lino-cst.token ${escapeText(node.text)})`;
  if (node.kind === 'trivia') return `(lino-cst.trivia ${escapeText(node.text)})`;
  if (node.kind === 'list') {
    const parts = ['lino-cst.list'];
    if (node.tag) parts.push(node.tag);
    if (node.open !== null && node.open !== undefined) parts.push(`(open ${escapeText(node.open)})`);
    if (node.close !== null && node.close !== undefined) parts.push(`(close ${escapeText(node.close)})`);
    for (const child of node.children) parts.push(cstToLino(child));
    return `(${parts.join(' ')})`;
  }
  throw new TypeError(`Unknown CST node kind: ${node && node.kind}`);
}

function escapeText(text) {
  return JSON.stringify(String(text));
}

function unescapeText(literal) {
  return JSON.parse(literal);
}

/**
 * Parse the `.lino` S-expression produced by `cstToLino()` back into a
 * `CstNode` tree. Inverse of `cstToLino`.
 *
 * @param {string} src `.lino` S-expression.
 * @returns {CstNode}
 */
export function linoToCst(src) {
  const tokens = tokeniseLinoCst(String(src));
  let i = 0;
  function peek() { return tokens[i]; }
  function eat() { return tokens[i++]; }
  function expect(t) {
    const got = eat();
    if (got !== t) throw new SyntaxError(`expected ${JSON.stringify(t)}, got ${JSON.stringify(got)}`);
  }
  function parseNode() {
    expect('(');
    const head = eat();
    if (head === 'lino-cst.token') {
      const lit = eat();
      expect(')');
      return token(unescapeText(lit));
    }
    if (head === 'lino-cst.trivia') {
      const lit = eat();
      expect(')');
      return trivia(unescapeText(lit));
    }
    if (head === 'lino-cst.list') {
      let tag = null;
      let open = null;
      let close = null;
      const children = [];
      while (peek() !== ')') {
        if (peek() === '(') {
          const lookahead = tokens[i + 1];
          if (lookahead === 'open') {
            eat(); eat();
            open = unescapeText(eat());
            expect(')');
            continue;
          }
          if (lookahead === 'close') {
            eat(); eat();
            close = unescapeText(eat());
            expect(')');
            continue;
          }
          children.push(parseNode());
        } else {
          if (tag !== null) throw new SyntaxError(`unexpected token ${peek()}`);
          tag = eat();
        }
      }
      expect(')');
      return list(tag, children, { open, close });
    }
    throw new SyntaxError(`unknown CST tag: ${head}`);
  }
  return parseNode();
}

function tokeniseLinoCst(src) {
  const out = [];
  let i = 0;
  while (i < src.length) {
    const c = src[i];
    if (c === '(' || c === ')') {
      out.push(c);
      i++;
      continue;
    }
    if (c === ' ' || c === '\t' || c === '\n' || c === '\r') {
      i++;
      continue;
    }
    if (c === '"') {
      let j = i + 1;
      while (j < src.length) {
        if (src[j] === '\\') { j += 2; continue; }
        if (src[j] === '"') break;
        j++;
      }
      if (j >= src.length) throw new SyntaxError('unterminated string literal');
      out.push(src.substring(i, j + 1));
      i = j + 1;
      continue;
    }
    let j = i;
    while (j < src.length && !' \t\n\r()'.includes(src[j])) j++;
    out.push(src.substring(i, j));
    i = j;
  }
  return out;
}

/**
 * The four host-language dialect tag prefixes plus the shared dialect.
 * Exported so that external tooling can refer to them symbolically.
 */
export const DIALECTS = Object.freeze({
  rust: 'lino-cst.rust',
  js: 'lino-cst.js',
  lean: 'lino-cst.lean',
  rocq: 'lino-cst.rocq',
  shared: 'lino-cst.shared',
});

/**
 * Compute and return a structural copy of a CST node. Used by translators that
 * need to mutate without altering the input.
 *
 * @param {CstNode} node
 * @returns {CstNode}
 */
export function cloneCst(node) {
  if (!node) return node;
  if (node.kind === 'token' || node.kind === 'trivia') {
    return { ...node };
  }
  return {
    kind: 'list',
    tag: node.tag,
    open: node.open,
    close: node.close,
    children: node.children.map(cloneCst),
  };
}