Skip to main content

meta_language/document_formatting/
mod.rs

1//! Shared, language-free document-formatting concept ontology.
2//!
3//! Documents in different container formats (Markdown, HTML, and — through the
4//! issues that build on this substrate — PDF and DOCX) express the *same*
5//! formatting concepts with different surface syntax. A Markdown `**bold**`, an
6//! HTML `<strong>bold</strong>`, and a DOCX run with the `<w:b/>` property all
7//! denote one language-free `strong` concept.
8//!
9//! This module seeds that concept set into a [`LinkNetwork`] with per-format
10//! syntax mappings, and provides data-driven resolution and reconstruction so
11//! the *same* concept link round-trips across formats. The per-format mapping
12//! is stored as a small template string whose `{}` placeholder marks the
13//! formatted content and whose `{name}` placeholders mark named attributes
14//! (`{href}`, `{src}`, `{lang}`) or the heading level (`{markers}` for the
15//! Markdown `#` run, `{level}` for the HTML digit).
16
17mod document;
18mod docx;
19mod opc;
20mod pdf;
21mod profile;
22
23pub use document::{parse_markup_document, BlockNode, FormattingDocument, InlineNode};
24pub use docx::{docx_profile_is_recognized, parse_docx_document, render_docx_document};
25pub use opc::{docx_package_is_recognized, parse_docx_package, render_docx_package};
26pub use pdf::{parse_pdf_document, pdf_profile_is_recognized, render_pdf_document};
27pub use profile::{
28    canonical_document_format, document_format_profile, CROSS_FORMAT_CONCEPTS, DOCUMENT_FORMATS,
29};
30
31use std::collections::BTreeMap;
32
33use crate::link_network::{LinkId, LinkNetwork, LinkType};
34
/// A single document-formatting concept with its per-format templates.
///
/// Values of this type are the rows of [`DOCUMENT_FORMATTING_CONCEPTS`], the
/// static table that seeding, resolution, and rendering all read from.
struct FormattingConcept {
    /// Concept identifier (for example `strong`); `template_for` looks rows up
    /// by this value and it is also the term interned into the network.
    id: &'static str,
    /// Human-readable definition handed to the network when the concept is
    /// seeded.
    definition: &'static str,
    /// `(language, template)` pairs. The template uses `{}` for content and
    /// `{name}` for named holes; see the module documentation.
    templates: &'static [(&'static str, &'static str)],
}
43
/// The shared document-formatting concept set required by issue #83.
///
/// Every row pairs a concept id with one template per supported language
/// (`"Markdown"` and `"HTML"`). Row order matters: resolution walks this table
/// front to back after the inline priority list, so an earlier row wins when
/// two templates both match a fragment.
///
/// Several Markdown templates are a bare `{}` (paragraph, the list and table
/// containers). A bare `{}` matches any fragment, so those rows act as
/// catch-alls on the Markdown side.
const DOCUMENT_FORMATTING_CONCEPTS: &[FormattingConcept] = &[
    // --- inline concepts ---
    FormattingConcept {
        id: "emphasis",
        definition: "Inline emphasis (italic) applied to a text fragment.",
        templates: &[("Markdown", "*{}*"), ("HTML", "<em>{}</em>")],
    },
    FormattingConcept {
        id: "strong",
        definition: "Inline strong importance (bold) applied to a text fragment.",
        templates: &[("Markdown", "**{}**"), ("HTML", "<strong>{}</strong>")],
    },
    FormattingConcept {
        id: "strikethrough",
        definition: "Inline strikethrough (deleted) text fragment.",
        templates: &[("Markdown", "~~{}~~"), ("HTML", "<del>{}</del>")],
    },
    FormattingConcept {
        id: "inline-code",
        definition: "Inline monospaced code span.",
        templates: &[("Markdown", "`{}`"), ("HTML", "<code>{}</code>")],
    },
    // Hyperlink and image carry a named attribute hole in addition to `{}`.
    FormattingConcept {
        id: "hyperlink",
        definition: "Inline hyperlink wrapping text and targeting a destination.",
        templates: &[
            ("Markdown", "[{}]({href})"),
            ("HTML", "<a href=\"{href}\">{}</a>"),
        ],
    },
    FormattingConcept {
        id: "image",
        definition: "Inline image with alternative text and a source reference.",
        templates: &[
            ("Markdown", "![{}]({src})"),
            ("HTML", "<img src=\"{src}\" alt=\"{}\" />"),
        ],
    },
    // Hole-free template: the whole fragment must equal the literal.
    FormattingConcept {
        id: "line-break",
        definition: "Explicit inline line break inside a block.",
        templates: &[("Markdown", "  \n"), ("HTML", "<br />")],
    },
    // --- block concepts ---
    // The heading level is spelled `{markers}` (a run of `#`) in Markdown and
    // `{level}` (a decimal number, repeated in the closing tag) in HTML.
    FormattingConcept {
        id: "heading",
        definition: "Section heading carrying a level from 1 (most significant) downward.",
        templates: &[
            ("Markdown", "{markers} {}"),
            ("HTML", "<h{level}>{}</h{level}>"),
        ],
    },
    FormattingConcept {
        id: "paragraph",
        definition: "Block of running text.",
        templates: &[("Markdown", "{}"), ("HTML", "<p>{}</p>")],
    },
    FormattingConcept {
        id: "blockquote",
        definition: "Quoted block set off from the surrounding text.",
        templates: &[
            ("Markdown", "> {}"),
            ("HTML", "<blockquote>{}</blockquote>"),
        ],
    },
    FormattingConcept {
        id: "bullet-list",
        definition: "Unordered list container.",
        templates: &[("Markdown", "{}"), ("HTML", "<ul>{}</ul>")],
    },
    FormattingConcept {
        id: "ordered-list",
        definition: "Ordered list container.",
        templates: &[("Markdown", "{}"), ("HTML", "<ol>{}</ol>")],
    },
    FormattingConcept {
        id: "list-item",
        definition: "Single item within a list.",
        templates: &[("Markdown", "- {}"), ("HTML", "<li>{}</li>")],
    },
    FormattingConcept {
        id: "code-block",
        definition: "Fenced block of preformatted code carrying an optional language.",
        templates: &[
            ("Markdown", "```{lang}\n{}\n```"),
            (
                "HTML",
                "<pre><code class=\"language-{lang}\">{}</code></pre>",
            ),
        ],
    },
    FormattingConcept {
        id: "thematic-break",
        definition: "Thematic break (horizontal rule) between sections.",
        templates: &[("Markdown", "---"), ("HTML", "<hr />")],
    },
    FormattingConcept {
        id: "table",
        definition: "Tabular data container.",
        templates: &[("Markdown", "{}"), ("HTML", "<table>{}</table>")],
    },
    FormattingConcept {
        id: "table-row",
        definition: "Row within a table.",
        templates: &[("Markdown", "{}"), ("HTML", "<tr>{}</tr>")],
    },
    FormattingConcept {
        id: "table-cell",
        definition: "Cell within a table row.",
        templates: &[("Markdown", "{}"), ("HTML", "<td>{}</td>")],
    },
];
157
/// Counts produced by seeding the document-formatting concept set.
///
/// Both fields are private; read them through the accessor methods.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct DocumentFormattingSeedReport {
    concepts: usize,
    syntax_mappings: usize,
}

impl DocumentFormattingSeedReport {
    /// How many language-free formatting concepts were seeded.
    #[must_use]
    pub const fn concepts(self) -> usize {
        let Self { concepts, .. } = self;
        concepts
    }

    /// How many per-format syntax mappings were attached to those concepts.
    #[must_use]
    pub const fn syntax_mappings(self) -> usize {
        let Self {
            syntax_mappings, ..
        } = self;
        syntax_mappings
    }
}
178
/// A formatting fragment resolved to its language-free concept.
///
/// Produced by `LinkNetwork::resolve_document_format`; the `content`, `level`,
/// and `attributes` fields carry the values captured from the fragment by the
/// matching template.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DocumentFormatMatch {
    /// Exact concept id (for example `strong`).
    pub concept: String,
    /// Concept link in the network the fragment maps onto.
    pub link: LinkId,
    /// Formatted content captured from the `{}` placeholder.
    pub content: String,
    /// Heading level, when the concept carries one.
    pub level: Option<u8>,
    /// Named attribute captures (`href`, `src`, `lang`), keyed by hole name.
    pub attributes: BTreeMap<String, String>,
}
193
/// The values needed to render one concept into a target format.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct DocumentFormatInstance {
    /// Text substituted for the `{}` placeholder.
    pub content: String,
    /// Heading level; `None` for concepts that do not carry one.
    pub level: Option<u8>,
    /// Values for named holes, keyed by hole name (`href`, `src`, `lang`).
    pub attributes: BTreeMap<String, String>,
}

impl DocumentFormatInstance {
    /// Builds an instance holding `content`, with no level and no attributes.
    #[must_use]
    pub fn from_content(content: impl Into<String>) -> Self {
        Self {
            content: content.into(),
            ..Self::default()
        }
    }

    /// Borrows the value stored for the named attribute, if any.
    fn attribute(&self, name: &str) -> Option<&str> {
        let value = self.attributes.get(name)?;
        Some(value.as_str())
    }
}
220
/// A single segment of a compiled template.
///
/// `compile_template` turns a template string into an ordered list of these.
enum Segment {
    /// Text that must appear verbatim (copied as-is when rendering, required
    /// as a prefix when matching).
    Literal(String),
    /// A placeholder whose value is supplied when rendering or captured when
    /// matching.
    Hole(Hole),
}
226
/// The kind of value a template hole carries.
///
/// `Markers` and `Level` are two spellings of the same heading level: both
/// read from and write to the `level` field of a `DocumentFormatInstance`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Hole {
    /// Formatted content (`{}`).
    Content,
    /// Markdown heading markers rendered as a run of `#`.
    Markers,
    /// HTML heading level rendered as a decimal digit.
    Level,
    /// Named attribute hole such as `href`, `src`, or `lang`; the payload is
    /// the key used in the instance's attribute map.
    Attribute(&'static str),
}
239
240fn hole_for_name(name: &str) -> Hole {
241    match name {
242        "" => Hole::Content,
243        "markers" => Hole::Markers,
244        "level" => Hole::Level,
245        "href" => Hole::Attribute("href"),
246        "src" => Hole::Attribute("src"),
247        "lang" => Hole::Attribute("lang"),
248        other => panic!("unsupported document-formatting template hole {{{other}}}"),
249    }
250}
251
252/// Compiles a template string into ordered literal and hole segments.
253fn compile_template(template: &str) -> Vec<Segment> {
254    let mut segments = Vec::new();
255    let mut literal = String::new();
256    let mut chars = template.chars();
257
258    while let Some(character) = chars.next() {
259        if character == '{' {
260            if !literal.is_empty() {
261                segments.push(Segment::Literal(std::mem::take(&mut literal)));
262            }
263            let mut name = String::new();
264            for inner in chars.by_ref() {
265                if inner == '}' {
266                    break;
267                }
268                name.push(inner);
269            }
270            segments.push(Segment::Hole(hole_for_name(&name)));
271        } else {
272            literal.push(character);
273        }
274    }
275
276    if !literal.is_empty() {
277        segments.push(Segment::Literal(literal));
278    }
279
280    segments
281}
282
283/// Renders a compiled template from an instance, or `None` when a required
284/// attribute is missing.
285fn render_segments(segments: &[Segment], instance: &DocumentFormatInstance) -> Option<String> {
286    let mut output = String::new();
287    for segment in segments {
288        match segment {
289            Segment::Literal(text) => output.push_str(text),
290            Segment::Hole(Hole::Content) => output.push_str(&instance.content),
291            Segment::Hole(Hole::Markers) => {
292                let level = instance.level.unwrap_or(1).max(1);
293                output.push_str(&"#".repeat(usize::from(level)));
294            }
295            Segment::Hole(Hole::Level) => {
296                output.push_str(&instance.level.unwrap_or(1).max(1).to_string());
297            }
298            Segment::Hole(Hole::Attribute(name)) => output.push_str(instance.attribute(name)?),
299        }
300    }
301    Some(output)
302}
303
304/// Matches a fragment against a compiled template, capturing hole values.
305fn match_segments(segments: &[Segment], fragment: &str) -> Option<DocumentFormatInstance> {
306    let mut instance = DocumentFormatInstance::default();
307    let mut cursor = 0usize;
308
309    for (index, segment) in segments.iter().enumerate() {
310        match segment {
311            Segment::Literal(text) => {
312                if !fragment[cursor..].starts_with(text.as_str()) {
313                    return None;
314                }
315                cursor += text.len();
316            }
317            Segment::Hole(hole) => {
318                let rest = &fragment[cursor..];
319                let captured = match segments.get(index + 1) {
320                    Some(Segment::Literal(next)) => {
321                        let relative = rest.find(next.as_str())?;
322                        &rest[..relative]
323                    }
324                    // Two holes cannot sit next to each other in our templates.
325                    Some(Segment::Hole(_)) => return None,
326                    None => rest,
327                };
328                store_capture(&mut instance, *hole, captured)?;
329                cursor += captured.len();
330            }
331        }
332    }
333
334    if cursor == fragment.len() {
335        Some(instance)
336    } else {
337        None
338    }
339}
340
341fn store_capture(instance: &mut DocumentFormatInstance, hole: Hole, captured: &str) -> Option<()> {
342    match hole {
343        Hole::Content => instance.content = captured.to_string(),
344        Hole::Markers => {
345            if captured.is_empty() || !captured.bytes().all(|byte| byte == b'#') {
346                return None;
347            }
348            let level = u8::try_from(captured.len()).ok()?;
349            assign_level(instance, level)?;
350        }
351        Hole::Level => {
352            let level: u8 = captured.parse().ok()?;
353            assign_level(instance, level)?;
354        }
355        Hole::Attribute(name) => {
356            instance
357                .attributes
358                .insert(name.to_string(), captured.to_string());
359        }
360    }
361    Some(())
362}
363
364/// Stores a heading level, rejecting a fragment whose repeated level holes
365/// disagree (for example an HTML `<h2>...</h1>`).
366fn assign_level(instance: &mut DocumentFormatInstance, level: u8) -> Option<()> {
367    match instance.level {
368        Some(existing) if existing != level => None,
369        _ => {
370            instance.level = Some(level);
371            Some(())
372        }
373    }
374}
375
376fn template_for(concept: &str, language: &str) -> Option<&'static str> {
377    DOCUMENT_FORMATTING_CONCEPTS
378        .iter()
379        .find(|entry| entry.id == concept)?
380        .templates
381        .iter()
382        .find(|(lang, _)| *lang == language)
383        .map(|(_, template)| *template)
384}
385
/// Inline concepts are resolved most-specific-first so that, for example, an
/// image is not mistaken for a hyperlink and bold is not mistaken for italic.
///
/// Resolution tries these ids, in this order, before walking the rest of
/// [`DOCUMENT_FORMATTING_CONCEPTS`]. Every id listed here must also exist in
/// that table, otherwise it has no template and is skipped.
const INLINE_RESOLUTION_ORDER: &[&str] = &[
    "image",
    "hyperlink",
    "strong",
    "emphasis",
    "strikethrough",
    "inline-code",
];
396
397impl LinkNetwork {
398    /// Seeds the shared document-formatting concept set with per-format syntax
399    /// mappings.
400    ///
401    /// Each concept becomes a language-free [`LinkType::Concept`] link, and each
402    /// `(language, template)` pair becomes a semantic syntax mapping so the same
403    /// concept reconstructs as `**…**` in Markdown, `<strong>…</strong>` in
404    /// HTML, and so on.
405    pub fn seed_document_formatting_concepts(&mut self) -> DocumentFormattingSeedReport {
406        let mut syntax_mappings = 0;
407        for concept in DOCUMENT_FORMATTING_CONCEPTS {
408            let concept_link = self.intern_concept(concept.id, Some(concept.definition));
409            for (language, template) in concept.templates {
410                self.insert_concept_syntax_mapping(
411                    concept_link,
412                    concept.id,
413                    language,
414                    template,
415                    true,
416                );
417                syntax_mappings += 1;
418            }
419        }
420
421        DocumentFormattingSeedReport {
422            concepts: DOCUMENT_FORMATTING_CONCEPTS.len(),
423            syntax_mappings,
424        }
425    }
426
427    /// Resolves a formatting `fragment` written in `language` to the shared,
428    /// language-free concept it denotes.
429    ///
430    /// Both Markdown `**bold**` and HTML `<strong>bold</strong>` resolve to the
431    /// one seeded `strong` concept link. Returns `None` when the fragment is not
432    /// a known formatting construct or the concept set has not been seeded.
433    #[must_use]
434    pub fn resolve_document_format(
435        &self,
436        language: &str,
437        fragment: &str,
438    ) -> Option<DocumentFormatMatch> {
439        for concept in INLINE_RESOLUTION_ORDER
440            .iter()
441            .copied()
442            .chain(DOCUMENT_FORMATTING_CONCEPTS.iter().map(|entry| entry.id))
443        {
444            let Some(template) = template_for(concept, language) else {
445                continue;
446            };
447            let segments = compile_template(template);
448            if let Some(instance) = match_segments(&segments, fragment) {
449                let Some(link) = self.find_term(concept) else {
450                    continue;
451                };
452                return Some(DocumentFormatMatch {
453                    concept: concept.to_string(),
454                    link,
455                    content: instance.content,
456                    level: instance.level,
457                    attributes: instance.attributes,
458                });
459            }
460        }
461        None
462    }
463
464    /// Renders a concept instance into `language` surface syntax.
465    ///
466    /// Returns `None` when the concept has no template for the language or a
467    /// required attribute is missing from the instance.
468    #[must_use]
469    pub fn render_document_format(
470        &self,
471        concept: &str,
472        language: &str,
473        instance: &DocumentFormatInstance,
474    ) -> Option<String> {
475        let template = template_for(concept, language)?;
476        render_segments(&compile_template(template), instance)
477    }
478
479    /// Translates a single formatting fragment from `source_language` to
480    /// `target_language` through the shared concept layer.
481    #[must_use]
482    pub fn translate_document_format(
483        &self,
484        source_language: &str,
485        target_language: &str,
486        fragment: &str,
487    ) -> Option<String> {
488        let resolved = self.resolve_document_format(source_language, fragment)?;
489        let instance = DocumentFormatInstance {
490            content: resolved.content,
491            level: resolved.level,
492            attributes: resolved.attributes,
493        };
494        self.render_document_format(&resolved.concept, target_language, &instance)
495    }
496
497    /// Returns the seeded concept link for a formatting concept id, when present.
498    #[must_use]
499    pub fn document_formatting_concept(&self, concept: &str) -> Option<LinkId> {
500        let _ = template_for(concept, "Markdown")?;
501        self.find_term(concept)
502            .filter(|link| self.is_concept_link(*link))
503    }
504
505    fn is_concept_link(&self, link: LinkId) -> bool {
506        self.link(link)
507            .is_some_and(|link| link.metadata().link_type() == Some(LinkType::Concept))
508    }
509}