Skip to main content

meta_language/document_formatting/
document.rs

1//! Language-free concept-layer document tree and Markdown/HTML converters.
2//!
3//! A [`FormattingDocument`] is the language-free concept layer: a tree of
4//! blocks and inline spans tagged with their shared concept ids. Markdown and
5//! HTML each parse *into* this layer and render *out of* it through the seeded
6//! per-format templates, so a Markdown document using bold/italic/heading/
7//! list/link round-trips to HTML and back through one concept ontology.
8
9use std::collections::BTreeMap;
10
11use super::DocumentFormatInstance;
12use crate::link_network::LinkNetwork;
13
/// Inline-level node of the concept layer: either raw text or a formatted span.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum InlineNode {
    /// A run of literal, unformatted text.
    Text(String),
    /// A span produced by an inline formatting concept (for example `strong`,
    /// `emphasis` or `hyperlink`) that wraps further inline content.
    Wrapped {
        /// Concept id naming the inline formatting applied to the span.
        concept: String,
        /// Attribute name/value pairs attached to the span, such as the `href`
        /// of a hyperlink; empty when the concept carries none.
        attributes: BTreeMap<String, String>,
        /// Inline nodes nested inside the span, in document order.
        children: Vec<Self>,
    },
}
30
31/// A block in the language-free concept layer.
32#[derive(Clone, Debug, PartialEq, Eq)]
33pub enum BlockNode {
34    /// Section heading carrying a level.
35    Heading {
36        /// Heading level from 1 (most significant) to 6.
37        level: u8,
38        /// Inline heading content.
39        children: Vec<InlineNode>,
40    },
41    /// Paragraph of running text.
42    Paragraph {
43        /// Inline paragraph content.
44        children: Vec<InlineNode>,
45    },
46    /// Bullet or ordered list.
47    List {
48        /// `bullet-list` or `ordered-list`.
49        concept: String,
50        /// Inline content of each list item.
51        items: Vec<Vec<InlineNode>>,
52    },
53}
54
55impl BlockNode {
56    /// The concept id this block maps onto in the shared ontology.
57    #[must_use]
58    pub fn concept_id(&self) -> &str {
59        match self {
60            Self::Heading { .. } => "heading",
61            Self::Paragraph { .. } => "paragraph",
62            Self::List { concept, .. } => concept,
63        }
64    }
65}
66
67/// A language-free document in the concept layer.
68#[derive(Clone, Debug, Default, PartialEq, Eq)]
69pub struct FormattingDocument {
70    /// Ordered blocks making up the document.
71    pub blocks: Vec<BlockNode>,
72}
73
74impl LinkNetwork {
75    /// Renders a language-free [`FormattingDocument`] (the concept layer) into
76    /// `language` surface syntax using the seeded per-format templates.
77    #[must_use]
78    pub fn render_markup_document(&self, language: &str, document: &FormattingDocument) -> String {
79        if language == "PDF" {
80            return super::render_pdf_document(document);
81        }
82        if language == "DOCX" {
83            return super::render_docx_document(document);
84        }
85        if language == "txt" {
86            return render_txt_document(document);
87        }
88        let block_separator = if language == "Markdown" { "\n\n" } else { "\n" };
89        document
90            .blocks
91            .iter()
92            .map(|block| self.render_block(language, block))
93            .collect::<Vec<_>>()
94            .join(block_separator)
95    }
96
97    fn render_block(&self, language: &str, block: &BlockNode) -> String {
98        match block {
99            BlockNode::Heading { level, children } => {
100                let mut instance =
101                    DocumentFormatInstance::from_content(self.render_inline(language, children));
102                instance.level = Some(*level);
103                self.render_document_format("heading", language, &instance)
104                    .unwrap_or_default()
105            }
106            BlockNode::Paragraph { children } => {
107                let instance =
108                    DocumentFormatInstance::from_content(self.render_inline(language, children));
109                self.render_document_format("paragraph", language, &instance)
110                    .unwrap_or_default()
111            }
112            BlockNode::List { concept, items } => {
113                let item_separator = if language == "Markdown" { "\n" } else { "" };
114                let rendered_items = items
115                    .iter()
116                    .map(|item| {
117                        let instance = DocumentFormatInstance::from_content(
118                            self.render_inline(language, item),
119                        );
120                        self.render_document_format("list-item", language, &instance)
121                            .unwrap_or_default()
122                    })
123                    .collect::<Vec<_>>()
124                    .join(item_separator);
125                let instance = DocumentFormatInstance::from_content(rendered_items);
126                self.render_document_format(concept, language, &instance)
127                    .unwrap_or_default()
128            }
129        }
130    }
131
132    fn render_inline(&self, language: &str, nodes: &[InlineNode]) -> String {
133        let mut output = String::new();
134        for node in nodes {
135            match node {
136                InlineNode::Text(text) => output.push_str(&escape_text(language, text)),
137                InlineNode::Wrapped {
138                    concept,
139                    attributes,
140                    children,
141                } => {
142                    let instance = DocumentFormatInstance {
143                        content: self.render_inline(language, children),
144                        level: None,
145                        attributes: attributes.clone(),
146                    };
147                    if let Some(rendered) =
148                        self.render_document_format(concept, language, &instance)
149                    {
150                        output.push_str(&rendered);
151                    }
152                }
153            }
154        }
155        output
156    }
157
158    /// Parses `text` written in `source_language` into the language-free concept
159    /// layer, then renders it as `target_language` surface syntax.
160    ///
161    /// This is the cross-format reconstruction substrate: a Markdown document
162    /// using bold/italic/heading/list/link round-trips to HTML and back through
163    /// the shared concept ontology.
164    #[must_use]
165    pub fn translate_markup_document(
166        &self,
167        source_language: &str,
168        target_language: &str,
169        text: &str,
170    ) -> Option<String> {
171        let document = parse_markup_document(source_language, text)?;
172        Some(self.render_markup_document(target_language, &document))
173    }
174}
175
176/// Parses `text` written in `language` into the language-free concept layer.
177///
178/// Supports the `Markdown` and `HTML` markup targets for the founding
179/// bold/italic/heading/list/link feature set.
180#[must_use]
181pub fn parse_markup_document(language: &str, text: &str) -> Option<FormattingDocument> {
182    match language {
183        "Markdown" => Some(parse_markdown_document(text)),
184        "HTML" => Some(parse_html_document(text)),
185        "PDF" => Some(super::parse_pdf_document(text)),
186        "DOCX" => Some(super::parse_docx_document(text)),
187        "txt" => Some(parse_txt_document(text)),
188        _ => None,
189    }
190}
191
192/// Parses plain `txt` into the concept layer.
193///
194/// `txt` carries no formatting markup, so each blank-line-separated run of text
195/// becomes one [`BlockNode::Paragraph`] of literal text. This is the lossy
196/// fallback target: a document reconstructed as `txt` keeps its prose but drops
197/// heading levels, list structure, and inline styling (see the per-format
198/// fidelity matrix and [`super::document_format_profile`]).
199fn parse_txt_document(text: &str) -> FormattingDocument {
200    let mut blocks = Vec::new();
201    let mut group: Vec<&str> = Vec::new();
202
203    for line in text.lines() {
204        if line.trim().is_empty() {
205            flush_txt_block(&mut blocks, &group);
206            group.clear();
207        } else {
208            group.push(line);
209        }
210    }
211    flush_txt_block(&mut blocks, &group);
212
213    FormattingDocument { blocks }
214}
215
216fn flush_txt_block(blocks: &mut Vec<BlockNode>, lines: &[&str]) {
217    if lines.is_empty() {
218        return;
219    }
220    blocks.push(BlockNode::Paragraph {
221        children: vec![InlineNode::Text(lines.join(" "))],
222    });
223}
224
225/// Renders the concept layer into plain `txt`, flattening every formatting
226/// concept to its text content.
227fn render_txt_document(document: &FormattingDocument) -> String {
228    document
229        .blocks
230        .iter()
231        .map(render_txt_block)
232        .collect::<Vec<_>>()
233        .join("\n\n")
234}
235
236fn render_txt_block(block: &BlockNode) -> String {
237    match block {
238        BlockNode::Heading { children, .. } | BlockNode::Paragraph { children } => {
239            flatten_inline_text(children)
240        }
241        BlockNode::List { concept, items } => {
242            let ordered = concept == "ordered-list";
243            items
244                .iter()
245                .enumerate()
246                .map(|(index, item)| {
247                    let marker = if ordered {
248                        format!("{}. ", index + 1)
249                    } else {
250                        "- ".to_string()
251                    };
252                    format!("{marker}{}", flatten_inline_text(item))
253                })
254                .collect::<Vec<_>>()
255                .join("\n")
256        }
257    }
258}
259
260/// Concatenates the text content of inline nodes, discarding all formatting.
261fn flatten_inline_text(nodes: &[InlineNode]) -> String {
262    let mut output = String::new();
263    for node in nodes {
264        match node {
265            InlineNode::Text(text) => output.push_str(text),
266            InlineNode::Wrapped { children, .. } => {
267                output.push_str(&flatten_inline_text(children));
268            }
269        }
270    }
271    output
272}
273
274fn parse_markdown_document(text: &str) -> FormattingDocument {
275    let mut blocks = Vec::new();
276    let mut group: Vec<&str> = Vec::new();
277
278    for line in text.lines() {
279        if line.trim().is_empty() {
280            flush_markdown_block(&mut blocks, &group);
281            group.clear();
282        } else {
283            group.push(line);
284        }
285    }
286    flush_markdown_block(&mut blocks, &group);
287
288    FormattingDocument { blocks }
289}
290
291fn flush_markdown_block(blocks: &mut Vec<BlockNode>, lines: &[&str]) {
292    if lines.is_empty() {
293        return;
294    }
295
296    if lines.iter().all(|line| line.starts_with("- ")) {
297        let items = lines
298            .iter()
299            .map(|line| parse_inline_markdown(&line[2..]))
300            .collect();
301        blocks.push(BlockNode::List {
302            concept: "bullet-list".to_string(),
303            items,
304        });
305        return;
306    }
307
308    if let [line] = lines {
309        let hashes = line
310            .chars()
311            .take_while(|character| *character == '#')
312            .count();
313        if (1..=6).contains(&hashes) && line[hashes..].starts_with(' ') {
314            let level = u8::try_from(hashes).expect("heading level within 1..=6");
315            blocks.push(BlockNode::Heading {
316                level,
317                children: parse_inline_markdown(&line[hashes + 1..]),
318            });
319            return;
320        }
321    }
322
323    blocks.push(BlockNode::Paragraph {
324        children: parse_inline_markdown(&lines.join(" ")),
325    });
326}
327
328fn parse_inline_markdown(input: &str) -> Vec<InlineNode> {
329    let mut nodes = Vec::new();
330    let mut text = String::new();
331    let mut cursor = 0usize;
332
333    while cursor < input.len() {
334        let rest = &input[cursor..];
335        if let Some(inner_len) = wrapped_span(rest, "**", "**") {
336            flush_text(&mut nodes, &mut text);
337            let inner = &rest[2..2 + inner_len];
338            nodes.push(wrapped("strong", parse_inline_markdown(inner)));
339            cursor += 4 + inner_len;
340        } else if let Some(inner_len) = wrapped_span(rest, "*", "*") {
341            flush_text(&mut nodes, &mut text);
342            let inner = &rest[1..=inner_len];
343            nodes.push(wrapped("emphasis", parse_inline_markdown(inner)));
344            cursor += 2 + inner_len;
345        } else if let Some((text_inner, href, consumed)) = markdown_link(rest) {
346            flush_text(&mut nodes, &mut text);
347            nodes.push(hyperlink(href, parse_inline_markdown(text_inner)));
348            cursor += consumed;
349        } else {
350            let character = rest.chars().next().expect("non-empty remainder");
351            text.push(character);
352            cursor += character.len_utf8();
353        }
354    }
355
356    flush_text(&mut nodes, &mut text);
357    nodes
358}
359
/// Measures the content of a span that starts at the beginning of `rest`.
///
/// Returns the byte length of the text between a leading `open` and the first
/// following `close`, or `None` when `rest` does not start with `open` or no
/// `close` follows. A single `*` opener immediately followed by another `*` is
/// rejected so that `**` is left for the strong matcher.
fn wrapped_span(rest: &str, open: &str, close: &str) -> Option<usize> {
    let body = rest.strip_prefix(open)?;
    let looks_like_strong = open == "*" && body.starts_with('*');
    if looks_like_strong {
        None
    } else {
        body.find(close)
    }
}
369
/// Matches a Markdown inline link `[text](href)` at the start of `rest`.
///
/// Returns the link text, the href and the number of bytes the whole link
/// occupies, or `None` when `rest` does not start with a well-formed link.
///
/// The closing `]` is the one that balances the opening `[`, and it must be
/// followed immediately by `(`; a bracketed aside such as `[1]` therefore never
/// swallows the text up to a later, unrelated link. The href ends at the `)`
/// that balances the opening `(`, so URLs containing balanced parentheses are
/// kept whole.
fn markdown_link(rest: &str) -> Option<(&str, &str, usize)> {
    let body = rest.strip_prefix('[')?;
    let text_end = balanced_close(body, b'[', b']')?;
    let after = body[text_end + 1..].strip_prefix('(')?;
    let href_end = balanced_close(after, b'(', b')')?;
    // `[` + text + `](` + href + `)`
    let consumed = 1 + text_end + 2 + href_end + 1;
    Some((&body[..text_end], &after[..href_end], consumed))
}

/// Finds the byte index of the `close` delimiter that balances an opener
/// sitting just before `input`, skipping nested `open`/`close` pairs.
///
/// Both delimiters are ASCII, so the returned index is a char boundary.
fn balanced_close(input: &str, open: u8, close: u8) -> Option<usize> {
    let mut depth = 0usize;
    for (index, byte) in input.bytes().enumerate() {
        if byte == open {
            depth += 1;
        } else if byte == close {
            if depth == 0 {
                return Some(index);
            }
            depth -= 1;
        }
    }
    None
}
380
381fn parse_html_document(text: &str) -> FormattingDocument {
382    let mut blocks = Vec::new();
383
384    for raw_line in text.lines() {
385        let line = raw_line.trim();
386        if line.is_empty() {
387            continue;
388        }
389
390        if let Some(block) = parse_html_heading(line)
391            .or_else(|| parse_html_list(line))
392            .or_else(|| parse_html_paragraph(line))
393        {
394            blocks.push(block);
395        }
396    }
397
398    FormattingDocument { blocks }
399}
400
401fn parse_html_heading(line: &str) -> Option<BlockNode> {
402    let after_marker = line.strip_prefix("<h")?;
403    let digit = after_marker.chars().next()?;
404    let level = u8::try_from(digit.to_digit(10)?)
405        .ok()
406        .filter(|value| (1..=6).contains(value))?;
407    let open = format!("<h{level}>");
408    let close = format!("</h{level}>");
409    let inner = line.strip_prefix(&open)?.strip_suffix(&close)?;
410    Some(BlockNode::Heading {
411        level,
412        children: parse_inline_html(inner),
413    })
414}
415
416fn parse_html_list(line: &str) -> Option<BlockNode> {
417    let (concept, inner) = if let Some(inner) = wrapped_inner(line, "<ul>", "</ul>") {
418        ("bullet-list", inner)
419    } else if let Some(inner) = wrapped_inner(line, "<ol>", "</ol>") {
420        ("ordered-list", inner)
421    } else {
422        return None;
423    };
424
425    let mut items = Vec::new();
426    let mut rest = inner;
427    while let Some(start) = rest.find("<li>") {
428        let after = &rest[start + 4..];
429        let end = after.find("</li>")?;
430        items.push(parse_inline_html(&after[..end]));
431        rest = &after[end + 5..];
432    }
433
434    Some(BlockNode::List {
435        concept: concept.to_string(),
436        items,
437    })
438}
439
440fn parse_html_paragraph(line: &str) -> Option<BlockNode> {
441    let inner = wrapped_inner(line, "<p>", "</p>")?;
442    Some(BlockNode::Paragraph {
443        children: parse_inline_html(inner),
444    })
445}
446
447fn parse_inline_html(input: &str) -> Vec<InlineNode> {
448    let mut nodes = Vec::new();
449    let mut text = String::new();
450    let mut cursor = 0usize;
451
452    while cursor < input.len() {
453        let rest = &input[cursor..];
454        if let Some((inner, consumed)) = html_tag_span(rest, "strong") {
455            flush_html_text(&mut nodes, &mut text);
456            nodes.push(wrapped("strong", parse_inline_html(inner)));
457            cursor += consumed;
458        } else if let Some((inner, consumed)) = html_tag_span(rest, "em") {
459            flush_html_text(&mut nodes, &mut text);
460            nodes.push(wrapped("emphasis", parse_inline_html(inner)));
461            cursor += consumed;
462        } else if let Some((href, inner, consumed)) = html_anchor(rest) {
463            flush_html_text(&mut nodes, &mut text);
464            nodes.push(hyperlink(href, parse_inline_html(inner)));
465            cursor += consumed;
466        } else {
467            let character = rest.chars().next().expect("non-empty remainder");
468            text.push(character);
469            cursor += character.len_utf8();
470        }
471    }
472
473    flush_html_text(&mut nodes, &mut text);
474    nodes
475}
476
/// Matches an attribute-free `<tag>…</tag>` element at the start of `rest`.
///
/// Returns the text between the opening tag and the first closing tag, plus
/// the total number of bytes the element occupies; `None` when `rest` does not
/// start with `<tag>` or no `</tag>` follows.
fn html_tag_span<'a>(rest: &'a str, tag: &str) -> Option<(&'a str, usize)> {
    let opening = format!("<{tag}>");
    let closing = format!("</{tag}>");
    let (inner, _) = rest
        .strip_prefix(opening.as_str())?
        .split_once(closing.as_str())?;
    Some((inner, opening.len() + inner.len() + closing.len()))
}
488
/// Matches an `<a href="…">…</a>` element at the start of `rest`.
///
/// Only anchors whose sole attribute is a double-quoted `href` are accepted.
/// Returns the href, the text up to the first `</a>`, and the total number of
/// bytes the element occupies.
fn html_anchor(rest: &str) -> Option<(&str, &str, usize)> {
    const OPEN: &str = "<a href=\"";
    const CLOSE: &str = "</a>";

    let (href, after_quote) = rest.strip_prefix(OPEN)?.split_once('"')?;
    // The closing quote must be followed directly by `>`.
    let (inner, _) = after_quote.strip_prefix('>')?.split_once(CLOSE)?;
    let consumed = OPEN.len() + href.len() + "\">".len() + inner.len() + CLOSE.len();
    Some((href, inner, consumed))
}
500
/// Returns what lies between a leading `open` and a trailing `close` in
/// `input`, or `None` when either delimiter is missing.
fn wrapped_inner<'a>(input: &'a str, open: &str, close: &str) -> Option<&'a str> {
    let without_open = input.strip_prefix(open)?;
    without_open.strip_suffix(close)
}
504
505fn wrapped(concept: &str, children: Vec<InlineNode>) -> InlineNode {
506    InlineNode::Wrapped {
507        concept: concept.to_string(),
508        attributes: BTreeMap::new(),
509        children,
510    }
511}
512
513fn hyperlink(href: &str, children: Vec<InlineNode>) -> InlineNode {
514    let mut attributes = BTreeMap::new();
515    attributes.insert("href".to_string(), href.to_string());
516    InlineNode::Wrapped {
517        concept: "hyperlink".to_string(),
518        attributes,
519        children,
520    }
521}
522
523fn flush_text(nodes: &mut Vec<InlineNode>, text: &mut String) {
524    if !text.is_empty() {
525        nodes.push(InlineNode::Text(std::mem::take(text)));
526    }
527}
528
529fn flush_html_text(nodes: &mut Vec<InlineNode>, text: &mut String) {
530    if !text.is_empty() {
531        nodes.push(InlineNode::Text(unescape_text(&std::mem::take(text))));
532    }
533}
534
/// Escapes literal text for output in `language`.
///
/// For `HTML`, `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`.
/// Text for every other language is returned unchanged.
fn escape_text(language: &str, text: &str) -> String {
    if language != "HTML" {
        return text.to_owned();
    }

    let mut escaped = String::with_capacity(text.len());
    for character in text.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
544
/// Decodes the HTML entities `&lt;`, `&gt;` and `&amp;` back to `<`, `>` and
/// `&`.
///
/// Each entity is decoded exactly once, so text that was escaped twice
/// (`&amp;lt;`) comes back escaped once (`&lt;`). All other text, including
/// other entities, passes through unchanged.
fn unescape_text(text: &str) -> String {
    const ENTITIES: [(&str, char); 3] = [("&lt;", '<'), ("&gt;", '>'), ("&amp;", '&')];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(first) = rest.chars().next() {
        let entity_here = ENTITIES
            .iter()
            .find_map(|&(entity, symbol)| rest.strip_prefix(entity).map(|tail| (symbol, tail)));
        match entity_here {
            Some((symbol, tail)) => {
                decoded.push(symbol);
                rest = tail;
            }
            None => {
                decoded.push(first);
                rest = &rest[first.len_utf8()..];
            }
        }
    }
    decoded
}