// JSDoc source listing: cst-js.mjs

// JavaScript ↔ `.lino` CST converter (issue #138).
//
// Token-level lossless converter for JavaScript source. Produces a
// `lino-cst.js.*` flat CST. Round-trip is byte-faithful:
// `printJs(parseJs(src)) === src`.
//
// Scope: this converter operates at the token-stream level. It recognises:
//
//   - line comments (`// ...`)
//   - block comments (`/* ... */`)
//   - whitespace (spaces, tabs, CR, LF)
//   - hashbang on the first line (`#! ...`)
//   - string literals (`"..."`, `'...'`)
//   - template literals (``` `...${...}...` ```), with nested expressions
//   - numeric literals (decimal, hex, oct, bin, BigInt)
//   - regexp literals (best-effort, using a context heuristic)
//   - identifiers (Unicode XID-friendly)
//   - punctuation
//
// Regex disambiguation is the only non-trivial piece: a `/` is treated as the
// start of a regex when the preceding significant token cannot end an
// expression (e.g. `=`, `(`, `,`, `return`, etc.), so the `/` cannot be a
// division operator. We use the standard heuristic.

import { list, token, trivia, DIALECTS } from './cst.mjs';

// Tag prefix for the JavaScript dialect, read from the imported DIALECTS
// table; the node tags below are built as `${JS}.<kind>`.
const JS = DIALECTS.js;

/**
 * Convert JavaScript source text into a flat `lino-cst.js.*` CST.
 *
 * The input is coerced with `String(...)`, tokenised, and the resulting nodes
 * become the children of a single list node tagged `${JS}.program`.
 *
 * @param {string} src JavaScript source text.
 * @returns {CstNode} Root list node holding every token and trivia node.
 */
export function parseJs(src) {
  const text = String(src);
  const children = tokeniseJs(text);
  return list(`${JS}.program`, children);
}

/**
 * Serialise a JS CST back into source text.
 *
 * `emit` collects the text fragments of `node` in document order; the
 * fragments are then concatenated without any separator.
 *
 * @param {CstNode} node Node to print (normally the root returned by `parseJs`).
 * @returns {string} The concatenated source text.
 */
export function printJs(node) {
  const fragments = [];
  emit(node, fragments);
  return fragments.join('');
}

/**
 * Append the source text of `node` to `out`, depth-first.
 *
 * Token and trivia nodes contribute their `text`. List nodes contribute their
 * optional `open` text, then every child in order, then their optional
 * `close` text. Falsy nodes and nodes of any other kind contribute nothing.
 *
 * @param {CstNode|null|undefined} node Node to serialise.
 * @param {string[]} out Accumulator that receives the text fragments.
 * @returns {void}
 */
function emit(node, out) {
  if (!node) return;
  switch (node.kind) {
    case 'token':
    case 'trivia':
      out.push(node.text);
      return;
    case 'list': {
      if (node.open) out.push(node.open);
      for (const item of node.children) emit(item, out);
      if (node.close) out.push(node.close);
      return;
    }
    default:
      // Unknown node kinds are silently skipped.
      return;
  }
}

// ASCII characters accepted as the first character of an identifier.
const ID_START = /[A-Za-z_$]/;
// ASCII characters accepted after the first character of an identifier.
const ID_CONT = /[A-Za-z0-9_$]/;
// A single ASCII decimal digit.
const DIGIT = /[0-9]/;
// A single ASCII hexadecimal digit.
const HEX = /[0-9A-Fa-f]/;

// Identifier texts after which `canBeRegex` treats a following `/` as the
// start of a regexp literal rather than as a division operator.
const REGEX_PRECEDING_KEYWORDS = new Set([
  'return', 'typeof', 'instanceof', 'in', 'of', 'do', 'else', 'throw',
  'new', 'delete', 'void', 'await', 'yield', 'case',
]);

/**
 * Split JavaScript source into a flat, ordered array of CST nodes.
 *
 * Every character of `src` ends up in exactly one node, so concatenating the
 * node texts reproduces the input. Whitespace, comments and a leading
 * hashbang become trivia nodes; everything else becomes a token node. The
 * checks run in a fixed order (whitespace, comments, strings, templates,
 * regexps, numbers, identifiers) and any character that matches none of them
 * is emitted as a one-character punctuation token.
 *
 * @param {string} src JavaScript source text.
 * @returns {CstNode[]} Token and trivia nodes in source order.
 */
function tokeniseJs(src) {
  const nodes = [];
  const total = src.length;
  let pos = 0;
  // Most recent non-trivia token; consulted by the regex heuristic.
  let prevToken = null;

  // Emit src[pos, end) as trivia and move the cursor to `end`.
  const addTrivia = (end, tag) => {
    nodes.push(trivia(src.substring(pos, end), tag));
    pos = end;
  };

  // Emit src[pos, end) as a significant token and move the cursor to `end`.
  const addToken = (end, tag) => {
    const node = token(src.substring(pos, end), tag);
    nodes.push(node);
    prevToken = node;
    pos = end;
  };

  const isBlank = (ch) => ch === ' ' || ch === '\t' || ch === '\r' || ch === '\n';

  // A hashbang is only recognised at the very start of the source; it runs
  // up to (not including) the first newline.
  if (src.startsWith('#!')) {
    const eol = src.indexOf('\n');
    addTrivia(eol === -1 ? total : eol, `${JS}.hashbang`);
  }

  while (pos < total) {
    const ch = src[pos];
    const next = src[pos + 1];

    // Run of spaces, tabs, CRs and LFs.
    if (isBlank(ch)) {
      let end = pos + 1;
      while (end < total && isBlank(src[end])) end++;
      addTrivia(end, `${JS}.whitespace`);
      continue;
    }

    // Line comment: up to, but not including, the newline.
    if (ch === '/' && next === '/') {
      const eol = src.indexOf('\n', pos + 2);
      addTrivia(eol === -1 ? total : eol, `${JS}.comment.line`);
      continue;
    }

    // Block comment: through the closing `*/`, or to end of input.
    if (ch === '/' && next === '*') {
      const close = src.indexOf('*/', pos + 2);
      addTrivia(close === -1 ? total : close + 2, `${JS}.comment.block`);
      continue;
    }

    if (ch === '"' || ch === "'") {
      addToken(scanString(src, pos + 1, ch), `${JS}.string_literal`);
      continue;
    }

    if (ch === '`') {
      addToken(scanTemplate(src, pos), `${JS}.template_literal`);
      continue;
    }

    // A `/` that is not a comment is a regex only when the previous token
    // allows it and the scanner actually consumed more than the slash;
    // otherwise it falls through and is emitted as punctuation.
    if (ch === '/' && canBeRegex(prevToken)) {
      const end = scanRegex(src, pos);
      if (end > pos + 1) {
        addToken(end, `${JS}.regexp_literal`);
        continue;
      }
    }

    // Numbers start with a digit, or with `.` immediately followed by one.
    if (DIGIT.test(ch) || (ch === '.' && DIGIT.test(next))) {
      addToken(scanNumber(src, pos), `${JS}.numeric_literal`);
      continue;
    }

    if (ID_START.test(ch) || isUnicodeIdStart(ch)) {
      let end = pos + 1;
      while (end < total && (ID_CONT.test(src[end]) || isUnicodeIdContinue(src[end]))) end++;
      addToken(end, `${JS}.ident`);
      continue;
    }

    // Anything else is a single-character punctuation token.
    addToken(pos + 1, `${JS}.punct`);
  }

  return nodes;
}

/**
 * Find the end of a quoted string literal.
 *
 * Backslash escapes are skipped as a unit, including the two-character CRLF
 * sequence of a line continuation. For `"` and `'` literals an unescaped line
 * terminator (LF or CR) means the literal is unterminated, and scanning stops
 * there so the rest of the file is still tokenised sensibly.
 *
 * @param {string} src Full source text.
 * @param {number} j Index of the first character after the opening quote.
 * @param {string} quote The opening quote character.
 * @returns {number} Index just past the closing quote; the index of the line
 *   terminator when the literal is unterminated on its line; otherwise
 *   `src.length`. The result never exceeds `src.length`.
 */
function scanString(src, j, quote) {
  const end = src.length;
  const singleLine = quote === '"' || quote === "'";
  while (j < end) {
    const c = src[j];
    if (c === '\\') {
      // An escaped CRLF is one line continuation, so skip all three units;
      // otherwise the LF would be mistaken for an unterminated literal.
      j += src[j + 1] === '\r' && src[j + 2] === '\n' ? 3 : 2;
      continue;
    }
    if (singleLine && (c === '\n' || c === '\r')) {
      // Unterminated; stop at end of line for safety.
      return j;
    }
    if (c === quote) return j + 1;
    j++;
  }
  // A trailing backslash can push `j` past the end; clamp it.
  return Math.min(j, end);
}

/**
 * Find the end of a template literal, including nested `${ ... }` parts.
 *
 * Inside an interpolation, braces are counted and quoted strings, nested
 * template literals, line comments and block comments are skipped so that
 * braces inside them do not affect the count.
 *
 * @param {string} src Full source text.
 * @param {number} i Index of the opening backtick.
 * @returns {number} Index just past the closing backtick, or the position
 *   where scanning ran out of input if the literal is unterminated.
 */
function scanTemplate(src, i) {
  const limit = src.length;

  // Skip one `${ ... }` body. `start` is the index just after `${`; the
  // result is the index just after the matching `}` (or where input ran out).
  const skipInterpolation = (start) => {
    let at = start;
    let open = 1;
    while (at < limit && open > 0) {
      const ch = src[at];
      const following = src[at + 1];

      if (ch === '/' && following === '/') {
        // Line comment: jump to the newline, which the next pass steps over.
        const eol = src.indexOf('\n', at);
        at = eol === -1 ? limit : eol;
        continue;
      }
      if (ch === '/' && following === '*') {
        const close = src.indexOf('*/', at + 2);
        at = close === -1 ? limit : close + 2;
        continue;
      }

      switch (ch) {
        case '{':
          open += 1;
          break;
        case '}':
          open -= 1;
          break;
        case '"':
        case "'":
          // Land on the literal's last character; the step below moves past it.
          at = scanString(src, at + 1, ch) - 1;
          break;
        case '`':
          at = scanTemplate(src, at) - 1;
          break;
        default:
          break;
      }
      at += 1;
    }
    return at;
  };

  let pos = i + 1;
  while (pos < limit) {
    const ch = src[pos];
    if (ch === '`') return pos + 1;
    if (ch === '\\') {
      // Escape sequence: skip the backslash and the escaped character.
      pos += 2;
    } else if (ch === '$' && src[pos + 1] === '{') {
      pos = skipInterpolation(pos + 2);
    } else {
      pos += 1;
    }
  }
  return pos;
}

/**
 * Try to scan a regexp literal starting at the `/` at index `i`.
 *
 * A `/` inside a character class (`[...]`) does not terminate the literal,
 * and a backslash escapes the following character. A regexp literal cannot
 * contain a line terminator, even an escaped one, and must be closed by an
 * unescaped `/`. If either rule is violated the candidate is rejected, which
 * lets the caller treat the `/` as ordinary punctuation instead of swallowing
 * the rest of the line or file.
 *
 * @param {string} src Full source text.
 * @param {number} i Index of the opening `/`.
 * @returns {number} Index just past the closing `/` and any trailing ASCII
 *   letter flags, or `i + 1` when no well-formed literal starts at `i`.
 */
function scanRegex(src, i) {
  const end = src.length;
  const notARegex = i + 1;
  let j = i + 1;
  let inClass = false;
  while (j < end) {
    const c = src[j];
    if (c === '\n' || c === '\r') return notARegex;
    if (c === '\\') {
      const escaped = src[j + 1];
      // A backslash at end of input or before a line terminator cannot be
      // part of a regexp literal.
      if (escaped === undefined || escaped === '\n' || escaped === '\r') {
        return notARegex;
      }
      j += 2;
      continue;
    }
    if (c === '[') {
      inClass = true;
    } else if (c === ']') {
      inClass = false;
    } else if (c === '/' && !inClass) {
      j++;
      // Flags
      while (j < end && /[a-zA-Z]/.test(src[j])) j++;
      return j;
    }
    j++;
  }
  // Reached end of input without a closing `/`: not a regexp literal.
  return notARegex;
}

/**
 * Find the end of a numeric literal starting at index `i`.
 *
 * Recognises `0x`/`0o`/`0b` prefixed forms (either letter case) and decimal
 * forms with an optional fraction and exponent. `_` separators are accepted
 * anywhere digits are, and one trailing `n` (BigInt suffix) is consumed.
 *
 * @param {string} src Full source text.
 * @param {number} i Index of the first character of the literal.
 * @returns {number} Index just past the literal.
 */
function scanNumber(src, i) {
  const limit = src.length;
  let pos = i;

  // Advance `pos` over every consecutive character accepted by `accept`.
  const consume = (accept) => {
    while (pos < limit && accept(src[pos])) pos += 1;
  };
  const isDecimalPart = (ch) => DIGIT.test(ch) || ch === '_';

  // Radix letter following a leading `0`, if any.
  const radix = src[pos] === '0' ? src[pos + 1] : undefined;

  switch (radix) {
    case 'x':
    case 'X':
      pos += 2;
      consume((ch) => HEX.test(ch) || ch === '_');
      break;
    case 'o':
    case 'O':
      pos += 2;
      consume((ch) => /[0-7_]/.test(ch));
      break;
    case 'b':
    case 'B':
      pos += 2;
      consume((ch) => /[01_]/.test(ch));
      break;
    default: {
      // Integer part, then optional fraction, then optional exponent.
      consume(isDecimalPart);
      if (src[pos] === '.') {
        pos += 1;
        consume(isDecimalPart);
      }
      const marker = src[pos];
      if (marker === 'e' || marker === 'E') {
        pos += 1;
        const sign = src[pos];
        if (sign === '+' || sign === '-') pos += 1;
        consume(isDecimalPart);
      }
      break;
    }
  }

  // BigInt suffix
  if (src[pos] === 'n') pos += 1;
  return pos;
}

/**
 * Decide whether a `/` following `prev` may start a regexp literal.
 *
 * - No previous token, or a trivia node: yes.
 * - Identifier: only when its text is in `REGEX_PRECEDING_KEYWORDS`.
 * - Punctuation: yes, unless its text is `)` or `]`.
 * - Any other token (literals): no.
 *
 * @param {CstNode|null|undefined} prev Previous significant token, if any.
 * @returns {boolean} `true` when a regexp literal may start here.
 */
function canBeRegex(prev) {
  if (!prev || prev.kind === 'trivia') return true;

  const { tag, text } = prev;
  if (tag === `${JS}.ident`) {
    return REGEX_PRECEDING_KEYWORDS.has(text);
  }
  if (tag !== `${JS}.punct`) {
    return false;
  }
  // After most punctuation, `/` starts a regex; `)` and `]` end an
  // expression, so there it is treated as division.
  const endsExpression = ')]'.includes(text);
  return !endsExpression;
}

/**
 * Loose test for a non-ASCII identifier character: any character whose first
 * code point is above U+007F qualifies.
 *
 * @param {string|undefined} c Single character (may be empty or undefined).
 * @returns {boolean} `true` for a non-ASCII character, `false` otherwise
 *   (including for empty or missing input).
 */
function isUnicodeIdStart(c) {
  const LAST_ASCII = 0x7f;
  return c ? c.codePointAt(0) > LAST_ASCII : false;
}

/**
 * Loose test for a non-ASCII identifier continuation character. Uses exactly
 * the same rule as `isUnicodeIdStart`.
 *
 * @param {string|undefined} c Single character (may be empty or undefined).
 * @returns {boolean} Whatever `isUnicodeIdStart` returns for `c`.
 */
function isUnicodeIdContinue(c) {
  const accepted = isUnicodeIdStart(c);
  return accepted;
}