Skip to main content

meta_language/document_formatting/
docx.rs

1//! OOXML (`WordprocessingML`) `word/document.xml` content layer for the shared
2//! document-formatting concept tree.
3//!
4//! DOCX is an OPC (ZIP) package of OOXML parts; this module models its primary
5//! `word/document.xml` part — the text content layer that carries the document
6//! structure. It renders a language-free [`FormattingDocument`] into the OOXML
7//! body markup and parses that markup back into the same concept tree, so the
8//! tree round-trips DOCX ⇄ Markdown/HTML/PDF through the shared ontology
9//! (issue #83). The binary OPC packaging is assembled in [`super::opc`].
10//!
11//! # Representation
12//!
13//! Block role is carried by paragraph properties (`<w:pPr>`):
14//!
15//! - Heading level `n`: `<w:pStyle w:val="Heading{n}"/>` (`n` = 1..6).
16//! - Paragraph: neither `pStyle` nor `numPr`.
17//! - List item: `<w:numPr><w:ilvl w:val="0"/><w:numId w:val="{id}"/></w:numPr>`
18//!   with `id` = 1 for a bullet list and 2 for an ordered list. OOXML has no
19//!   list container element, so consecutive items sharing a `numId` group into
20//!   one [`BlockNode::List`].
21//!
22//! Inline style is carried by run properties (`<w:rPr>`):
23//!
24//! - Regular run: bare `<w:r><w:t>…</w:t></w:r>`.
25//! - Strong (bold): `<w:rPr><w:b/></w:rPr>` → the `strong` concept.
26//! - Emphasis (italic): `<w:rPr><w:i/></w:rPr>` → the `emphasis` concept.
27//!
28//! See `docs/docx-fidelity.md` for the full round-trip fidelity matrix.
29
30use super::document::{BlockNode, FormattingDocument, InlineNode};
31
/// Inline style carried by a single run.
///
/// A run has exactly one style in this model, so nested styles are not
/// combined: the innermost recognized concept decides the style when
/// rendering, and bold takes precedence over italic when parsing.
#[derive(Clone, Copy, PartialEq, Eq)]
enum RunStyle {
    /// Unstyled text; rendered as a run without `<w:rPr>`.
    Regular,
    /// Bold text; rendered with `<w:b/>` and mapped to the `strong` concept.
    Strong,
    /// Italic text; rendered with `<w:i/>` and mapped to the `emphasis` concept.
    Emphasis,
}
39
40impl RunStyle {
41    /// Wraps content text in the inline concept node for this style.
42    fn wrap(self, text: String) -> InlineNode {
43        match self {
44            Self::Regular => InlineNode::Text(text),
45            Self::Strong => wrapped("strong", text),
46            Self::Emphasis => wrapped("emphasis", text),
47        }
48    }
49}
50
51fn wrapped(concept: &str, text: String) -> InlineNode {
52    InlineNode::Wrapped {
53        concept: concept.to_string(),
54        attributes: std::collections::BTreeMap::new(),
55        children: vec![InlineNode::Text(text)],
56    }
57}
58
/// XML declaration emitted as the first line of the rendered part.
const HEADER: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
/// Opening `<w:document>` / `<w:body>` tags, binding the `w:` prefix to the
/// WordprocessingML main namespace.
const BODY_OPEN: &str =
    "<w:document xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"><w:body>";
/// Empty section properties followed by the closing body/document tags.
const BODY_CLOSE: &str = "<w:sectPr/></w:body></w:document>\n";

/// `numId` written for items of every list whose concept is not `ordered-list`.
const BULLET_NUM_ID: &str = "1";
/// `numId` written for `ordered-list` items; the parser maps exactly this
/// value back to the `ordered-list` concept.
const ORDERED_NUM_ID: &str = "2";
66
67// --- rendering -------------------------------------------------------------
68
69/// Renders a language-free [`FormattingDocument`] into OOXML
70/// `word/document.xml` markup in the documented profile.
71#[must_use]
72pub fn render_docx_document(document: &FormattingDocument) -> String {
73    let mut output = String::from(HEADER);
74    output.push_str(BODY_OPEN);
75    for block in &document.blocks {
76        render_block(&mut output, block);
77    }
78    output.push_str(BODY_CLOSE);
79    output
80}
81
82fn render_block(output: &mut String, block: &BlockNode) {
83    match block {
84        BlockNode::Heading { level, children } => {
85            let level = (*level).clamp(1, 6);
86            output.push_str("<w:p><w:pPr><w:pStyle w:val=\"Heading");
87            output.push_str(&level.to_string());
88            output.push_str("\"/></w:pPr>");
89            render_runs(output, children);
90            output.push_str("</w:p>");
91        }
92        BlockNode::Paragraph { children } => {
93            output.push_str("<w:p>");
94            render_runs(output, children);
95            output.push_str("</w:p>");
96        }
97        BlockNode::List { concept, items } => {
98            let num_id = if concept == "ordered-list" {
99                ORDERED_NUM_ID
100            } else {
101                BULLET_NUM_ID
102            };
103            for item in items {
104                output.push_str("<w:p><w:pPr><w:numPr><w:ilvl w:val=\"0\"/><w:numId w:val=\"");
105                output.push_str(num_id);
106                output.push_str("\"/></w:numPr></w:pPr>");
107                render_runs(output, item);
108                output.push_str("</w:p>");
109            }
110        }
111    }
112}
113
114fn render_runs(output: &mut String, nodes: &[InlineNode]) {
115    let mut runs = Vec::new();
116    flatten_runs(nodes, RunStyle::Regular, &mut runs);
117    merge_adjacent_runs(&mut runs);
118    for (style, text) in runs {
119        if text.is_empty() {
120            continue;
121        }
122        output.push_str("<w:r>");
123        match style {
124            RunStyle::Strong => output.push_str("<w:rPr><w:b/></w:rPr>"),
125            RunStyle::Emphasis => output.push_str("<w:rPr><w:i/></w:rPr>"),
126            RunStyle::Regular => {}
127        }
128        output.push_str("<w:t xml:space=\"preserve\">");
129        output.push_str(&escape_xml(&text));
130        output.push_str("</w:t></w:r>");
131    }
132}
133
134fn flatten_runs(nodes: &[InlineNode], style: RunStyle, runs: &mut Vec<(RunStyle, String)>) {
135    for node in nodes {
136        match node {
137            InlineNode::Text(text) => runs.push((style, text.clone())),
138            InlineNode::Wrapped {
139                concept, children, ..
140            } => {
141                let child_style = match concept.as_str() {
142                    "strong" => RunStyle::Strong,
143                    "emphasis" => RunStyle::Emphasis,
144                    // Unsupported inline concepts (hyperlink, image, …) keep the
145                    // surrounding style; their text is preserved but unstyled.
146                    _ => style,
147                };
148                flatten_runs(children, child_style, runs);
149            }
150        }
151    }
152}
153
154fn merge_adjacent_runs(runs: &mut Vec<(RunStyle, String)>) {
155    let mut merged: Vec<(RunStyle, String)> = Vec::with_capacity(runs.len());
156    for (style, text) in runs.drain(..) {
157        if let Some(last) = merged.last_mut() {
158            if last.0 == style {
159                last.1.push_str(&text);
160                continue;
161            }
162        }
163        merged.push((style, text));
164    }
165    *runs = merged;
166}
167
/// Escapes the characters that are markup-significant in XML text content
/// (`&`, `<`, `>`); every other character is copied through unchanged.
fn escape_xml(text: &str) -> String {
    text.chars()
        .fold(String::with_capacity(text.len()), |mut escaped, character| {
            match character {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                _ => escaped.push(character),
            }
            escaped
        })
}
180
181// --- parsing ---------------------------------------------------------------
182
183/// Parses OOXML `word/document.xml` markup in the documented profile back into
184/// the language-free concept layer.
185///
186/// Returns an empty document when no recognizable `<w:p>` paragraphs are
187/// present, so out-of-profile XML degrades gracefully rather than producing a
188/// corrupt tree.
189#[must_use]
190pub fn parse_docx_document(text: &str) -> FormattingDocument {
191    FormattingDocument {
192        blocks: parse_blocks(text),
193    }
194}
195
196/// Whether `text` is OOXML `document.xml` carrying at least one recognized block.
197#[must_use]
198pub fn docx_profile_is_recognized(text: &str) -> bool {
199    !parse_docx_document(text).blocks.is_empty()
200}
201
/// A pending run of consecutive list-item paragraphs that belong to the same
/// list concept, accumulated until a non-matching paragraph flushes it.
struct PendingList {
    /// List concept name (`ordered-list` or `bullet-list`).
    concept: String,
    /// Inline content of each item collected so far, in document order.
    items: Vec<Vec<InlineNode>>,
}
207
208fn parse_blocks(text: &str) -> Vec<BlockNode> {
209    let mut blocks = Vec::new();
210    let mut pending: Option<PendingList> = None;
211
212    for paragraph in paragraphs(text) {
213        if let Some(level) = heading_level(paragraph) {
214            flush_pending(&mut blocks, &mut pending);
215            blocks.push(BlockNode::Heading {
216                level,
217                children: parse_runs(paragraph),
218            });
219        } else if let Some(num_id) = list_num_id(paragraph) {
220            let concept = if num_id == ORDERED_NUM_ID {
221                "ordered-list"
222            } else {
223                "bullet-list"
224            };
225            let item = parse_runs(paragraph);
226            match pending.as_mut() {
227                Some(list) if list.concept == concept => list.items.push(item),
228                _ => {
229                    flush_pending(&mut blocks, &mut pending);
230                    pending = Some(PendingList {
231                        concept: concept.to_string(),
232                        items: vec![item],
233                    });
234                }
235            }
236        } else {
237            flush_pending(&mut blocks, &mut pending);
238            blocks.push(BlockNode::Paragraph {
239                children: parse_runs(paragraph),
240            });
241        }
242    }
243
244    flush_pending(&mut blocks, &mut pending);
245    blocks
246}
247
248fn flush_pending(blocks: &mut Vec<BlockNode>, pending: &mut Option<PendingList>) {
249    if let Some(list) = pending.take() {
250        blocks.push(BlockNode::List {
251            concept: list.concept,
252            items: list.items,
253        });
254    }
255}
256
257/// Yields the inner markup of each `<w:p>…</w:p>` paragraph in document order.
258fn paragraphs(text: &str) -> Vec<&str> {
259    let mut found = Vec::new();
260    let mut rest = text;
261    while let Some(inner) = next_element(&mut rest, "w:p") {
262        found.push(inner);
263    }
264    found
265}
266
267/// Heading level from a `<w:pStyle w:val="Heading{n}"/>` paragraph property.
268fn heading_level(paragraph: &str) -> Option<u8> {
269    let value = attribute_value(paragraph, "<w:pStyle", "w:val")?;
270    let digits = value
271        .strip_prefix("Heading")
272        .or_else(|| value.strip_prefix("heading "))?;
273    let level: u8 = digits.trim().parse().ok()?;
274    (1..=6).contains(&level).then_some(level)
275}
276
277/// The `numId` of a list-item paragraph, when present.
278fn list_num_id(paragraph: &str) -> Option<String> {
279    attribute_value(paragraph, "<w:numId", "w:val").map(str::to_string)
280}
281
282fn parse_runs(paragraph: &str) -> Vec<InlineNode> {
283    let mut runs: Vec<(RunStyle, String)> = Vec::new();
284    let mut rest = paragraph;
285    while let Some(run) = next_element(&mut rest, "w:r") {
286        let style = if has_toggle(run, "b") {
287            RunStyle::Strong
288        } else if has_toggle(run, "i") {
289            RunStyle::Emphasis
290        } else {
291            RunStyle::Regular
292        };
293        let text = run_text(run);
294        if !text.is_empty() {
295            runs.push((style, text));
296        }
297    }
298    merge_adjacent_runs(&mut runs);
299    runs.into_iter()
300        .map(|(style, text)| style.wrap(text))
301        .collect()
302}
303
304/// Concatenated text of every `<w:t>…</w:t>` element inside a run.
305fn run_text(run: &str) -> String {
306    let mut text = String::new();
307    let mut rest = run;
308    while let Some(inner) = next_element(&mut rest, "w:t") {
309        text.push_str(&unescape_xml(inner));
310    }
311    text
312}
313
314/// Whether a run carries an enabled `<w:{tag}>` toggle property (for example
315/// `<w:b/>`), honoring an explicit `w:val="false"`/`"0"`/`"none"` disable.
316fn has_toggle(run: &str, tag: &str) -> bool {
317    let needle = format!("<w:{tag}");
318    let mut rest = run;
319    while let Some(index) = rest.find(&needle) {
320        let after = &rest[index + needle.len()..];
321        // Reject longer element names such as `<w:bCs` when matching `<w:b`.
322        match after.chars().next() {
323            Some('>' | '/' | ' ') => {
324                let tag_end = after.find('>').unwrap_or(after.len());
325                let attributes = &after[..tag_end];
326                if !toggle_disabled(attributes) {
327                    return true;
328                }
329                rest = &after[tag_end..];
330            }
331            _ => rest = after,
332        }
333    }
334    false
335}
336
337fn toggle_disabled(attributes: &str) -> bool {
338    attribute_value(attributes, "", "w:val")
339        .is_some_and(|value| matches!(value, "false" | "0" | "off" | "none"))
340}
341
/// Reads the `attribute` value from the first `tag` element in `text`. When
/// `tag` is empty the lookup is performed against `text` directly.
///
/// The lookup is confined to the start tag (from `tag` up to and including the
/// next `>`). Both XML quoting styles are accepted — `name="value"` and
/// `name='value'` — and when both occur the one that appears first wins.
/// Returns `None` when the tag or the attribute is absent, or the value has no
/// closing quote.
fn attribute_value<'a>(text: &'a str, tag: &str, attribute: &str) -> Option<&'a str> {
    let scope = if tag.is_empty() {
        text
    } else {
        let opening = &text[text.find(tag)?..];
        match opening.find('>') {
            Some(close) => &opening[..=close],
            None => opening,
        }
    };

    // Locate the value span for each quoting style and keep the earliest one.
    let (start, end) = ['"', '\'']
        .iter()
        .filter_map(|&quote| {
            let needle = format!("{attribute}={quote}");
            let start = scope.find(&needle)? + needle.len();
            let end = scope[start..].find(quote)? + start;
            Some((start, end))
        })
        .min_by_key(|&(start, _)| start)?;
    Some(&scope[start..end])
}
358
/// Consumes the next `<{tag}>…</{tag}>` element from `rest`, advancing `rest`
/// past it and returning the inner markup. Self-closing `<{tag}/>` elements are
/// consumed as well and reported as an empty string.
///
/// Returns `None` when no further element is found, or when a start tag is
/// unterminated or has no matching close tag. The first `</{tag}>` after the
/// start tag is taken as its end, so nested elements of the same name are not
/// supported.
fn next_element<'a>(rest: &mut &'a str, tag: &str) -> Option<&'a str> {
    let open = format!("<{tag}");
    let close = format!("</{tag}>");
    loop {
        let haystack: &'a str = *rest;
        let index = haystack.find(&open)?;
        let after = &haystack[index + open.len()..];
        // The element name must end here: reject longer names (for example
        // `<w:pPr` when seeking `<w:p`). Any XML whitespace (space, tab,
        // newline, carriage return) may separate the name from its attributes.
        let name_ends_here = matches!(
            after.chars().next(),
            Some('>' | '/' | ' ' | '\t' | '\n' | '\r')
        );
        if !name_ends_here {
            *rest = after;
            continue;
        }
        let tag_end = after.find('>')?;
        let body = &after[tag_end + 1..];
        if after[..tag_end].ends_with('/') {
            // Self-closing element: no inner content.
            *rest = body;
            return Some("");
        }
        let close_index = body.find(&close)?;
        *rest = &body[close_index + close.len()..];
        return Some(&body[..close_index]);
    }
}
387
/// Decodes XML references in text content.
///
/// Handles the five predefined entities (`&lt;`, `&gt;`, `&quot;`, `&apos;`,
/// `&amp;`) and numeric character references in decimal (`&#65;`) or
/// hexadecimal (`&#x41;`) form. The text is decoded in a single left-to-right
/// pass, so the output of one reference is never decoded again (`&amp;lt;`
/// yields `&lt;`). Anything that is not a well-formed, known reference — a
/// bare `&`, an unknown entity name, or a code point that is not a valid
/// non-NUL character — is copied through unchanged.
fn unescape_xml(text: &str) -> String {
    /// Resolves the text between `&` and `;` to the character it denotes.
    fn decode_reference(name: &str) -> Option<char> {
        match name {
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "amp" => Some('&'),
            _ => {
                let (digits, radix) = match name.strip_prefix("#x") {
                    Some(hex) => (hex, 16),
                    None => (name.strip_prefix('#')?, 10),
                };
                // Check the digits explicitly: the integer parser would also
                // accept a leading sign, which is not valid in a reference.
                if digits.is_empty() || !digits.chars().all(|digit| digit.is_digit(radix)) {
                    return None;
                }
                let code = u32::from_str_radix(digits, radix).ok()?;
                char::from_u32(code).filter(|&decoded| decoded != '\0')
            }
        }
    }

    let mut output = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(ampersand) = rest.find('&') {
        output.push_str(&rest[..ampersand]);
        let body = &rest[ampersand + 1..];
        // A reference ends at the first `;`; meeting another `&` first means
        // this ampersand is literal. Stopping there keeps the scan linear.
        let decoded = body
            .find(|character: char| character == ';' || character == '&')
            .filter(|&end| body[end..].starts_with(';'))
            .and_then(|end| decode_reference(&body[..end]).map(|character| (character, end)));
        match decoded {
            Some((character, end)) => {
                output.push(character);
                rest = &body[end + 1..];
            }
            None => {
                output.push('&');
                rest = body;
            }
        }
    }
    output.push_str(rest);
    output
}