meta_language/document_formatting/
docx.rs1use super::document::{BlockNode, FormattingDocument, InlineNode};
31
/// Character formatting carried by a single run of text.
///
/// Only one style applies to a run at a time; there is no combined
/// bold-and-italic variant.
///
/// `Debug` is derived so styles (and `(RunStyle, String)` run tuples) can be
/// printed in diagnostics and compared with `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RunStyle {
    /// Plain text with no run properties.
    Regular,
    /// Bold text (`<w:b/>` / the `strong` concept).
    Strong,
    /// Italic text (`<w:i/>` / the `emphasis` concept).
    Emphasis,
}
39
40impl RunStyle {
41 fn wrap(self, text: String) -> InlineNode {
43 match self {
44 Self::Regular => InlineNode::Text(text),
45 Self::Strong => wrapped("strong", text),
46 Self::Emphasis => wrapped("emphasis", text),
47 }
48 }
49}
50
51fn wrapped(concept: &str, text: String) -> InlineNode {
52 InlineNode::Wrapped {
53 concept: concept.to_string(),
54 attributes: std::collections::BTreeMap::new(),
55 children: vec![InlineNode::Text(text)],
56 }
57}
58
/// XML declaration written at the very start of every rendered document.
const HEADER: &str = concat!(
    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#,
    "\n"
);
/// Opening `<w:document>` tag (declaring the `w:` namespace) followed by the
/// opening `<w:body>` tag.
const BODY_OPEN: &str = concat!(
    r#"<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">"#,
    "<w:body>"
);
/// Empty section properties plus the tags that close the body and document.
const BODY_CLOSE: &str = concat!("<w:sectPr/>", "</w:body>", "</w:document>", "\n");

/// `w:numId` value the renderer writes for every list that is not an
/// ordered list.
const BULLET_NUM_ID: &str = "1";
/// `w:numId` value written for ordered lists; the parser treats exactly this
/// value as "ordered" and every other value as a bullet list.
const ORDERED_NUM_ID: &str = "2";
66
67#[must_use]
72pub fn render_docx_document(document: &FormattingDocument) -> String {
73 let mut output = String::from(HEADER);
74 output.push_str(BODY_OPEN);
75 for block in &document.blocks {
76 render_block(&mut output, block);
77 }
78 output.push_str(BODY_CLOSE);
79 output
80}
81
82fn render_block(output: &mut String, block: &BlockNode) {
83 match block {
84 BlockNode::Heading { level, children } => {
85 let level = (*level).clamp(1, 6);
86 output.push_str("<w:p><w:pPr><w:pStyle w:val=\"Heading");
87 output.push_str(&level.to_string());
88 output.push_str("\"/></w:pPr>");
89 render_runs(output, children);
90 output.push_str("</w:p>");
91 }
92 BlockNode::Paragraph { children } => {
93 output.push_str("<w:p>");
94 render_runs(output, children);
95 output.push_str("</w:p>");
96 }
97 BlockNode::List { concept, items } => {
98 let num_id = if concept == "ordered-list" {
99 ORDERED_NUM_ID
100 } else {
101 BULLET_NUM_ID
102 };
103 for item in items {
104 output.push_str("<w:p><w:pPr><w:numPr><w:ilvl w:val=\"0\"/><w:numId w:val=\"");
105 output.push_str(num_id);
106 output.push_str("\"/></w:numPr></w:pPr>");
107 render_runs(output, item);
108 output.push_str("</w:p>");
109 }
110 }
111 }
112}
113
114fn render_runs(output: &mut String, nodes: &[InlineNode]) {
115 let mut runs = Vec::new();
116 flatten_runs(nodes, RunStyle::Regular, &mut runs);
117 merge_adjacent_runs(&mut runs);
118 for (style, text) in runs {
119 if text.is_empty() {
120 continue;
121 }
122 output.push_str("<w:r>");
123 match style {
124 RunStyle::Strong => output.push_str("<w:rPr><w:b/></w:rPr>"),
125 RunStyle::Emphasis => output.push_str("<w:rPr><w:i/></w:rPr>"),
126 RunStyle::Regular => {}
127 }
128 output.push_str("<w:t xml:space=\"preserve\">");
129 output.push_str(&escape_xml(&text));
130 output.push_str("</w:t></w:r>");
131 }
132}
133
134fn flatten_runs(nodes: &[InlineNode], style: RunStyle, runs: &mut Vec<(RunStyle, String)>) {
135 for node in nodes {
136 match node {
137 InlineNode::Text(text) => runs.push((style, text.clone())),
138 InlineNode::Wrapped {
139 concept, children, ..
140 } => {
141 let child_style = match concept.as_str() {
142 "strong" => RunStyle::Strong,
143 "emphasis" => RunStyle::Emphasis,
144 _ => style,
147 };
148 flatten_runs(children, child_style, runs);
149 }
150 }
151 }
152}
153
154fn merge_adjacent_runs(runs: &mut Vec<(RunStyle, String)>) {
155 let mut merged: Vec<(RunStyle, String)> = Vec::with_capacity(runs.len());
156 for (style, text) in runs.drain(..) {
157 if let Some(last) = merged.last_mut() {
158 if last.0 == style {
159 last.1.push_str(&text);
160 continue;
161 }
162 }
163 merged.push((style, text));
164 }
165 *runs = merged;
166}
167
/// Escapes `text` for use as XML character data.
///
/// `&`, `<` and `>` are replaced by the predefined entities `&amp;`, `&lt;`
/// and `&gt;`; every other character is copied through unchanged. Quotes are
/// left alone because the result is only written as element content, never
/// inside an attribute value.
fn escape_xml(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for character in text.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
180
181#[must_use]
190pub fn parse_docx_document(text: &str) -> FormattingDocument {
191 FormattingDocument {
192 blocks: parse_blocks(text),
193 }
194}
195
196#[must_use]
198pub fn docx_profile_is_recognized(text: &str) -> bool {
199 !parse_docx_document(text).blocks.is_empty()
200}
201
202struct PendingList {
204 concept: String,
205 items: Vec<Vec<InlineNode>>,
206}
207
208fn parse_blocks(text: &str) -> Vec<BlockNode> {
209 let mut blocks = Vec::new();
210 let mut pending: Option<PendingList> = None;
211
212 for paragraph in paragraphs(text) {
213 if let Some(level) = heading_level(paragraph) {
214 flush_pending(&mut blocks, &mut pending);
215 blocks.push(BlockNode::Heading {
216 level,
217 children: parse_runs(paragraph),
218 });
219 } else if let Some(num_id) = list_num_id(paragraph) {
220 let concept = if num_id == ORDERED_NUM_ID {
221 "ordered-list"
222 } else {
223 "bullet-list"
224 };
225 let item = parse_runs(paragraph);
226 match pending.as_mut() {
227 Some(list) if list.concept == concept => list.items.push(item),
228 _ => {
229 flush_pending(&mut blocks, &mut pending);
230 pending = Some(PendingList {
231 concept: concept.to_string(),
232 items: vec![item],
233 });
234 }
235 }
236 } else {
237 flush_pending(&mut blocks, &mut pending);
238 blocks.push(BlockNode::Paragraph {
239 children: parse_runs(paragraph),
240 });
241 }
242 }
243
244 flush_pending(&mut blocks, &mut pending);
245 blocks
246}
247
248fn flush_pending(blocks: &mut Vec<BlockNode>, pending: &mut Option<PendingList>) {
249 if let Some(list) = pending.take() {
250 blocks.push(BlockNode::List {
251 concept: list.concept,
252 items: list.items,
253 });
254 }
255}
256
257fn paragraphs(text: &str) -> Vec<&str> {
259 let mut found = Vec::new();
260 let mut rest = text;
261 while let Some(inner) = next_element(&mut rest, "w:p") {
262 found.push(inner);
263 }
264 found
265}
266
267fn heading_level(paragraph: &str) -> Option<u8> {
269 let value = attribute_value(paragraph, "<w:pStyle", "w:val")?;
270 let digits = value
271 .strip_prefix("Heading")
272 .or_else(|| value.strip_prefix("heading "))?;
273 let level: u8 = digits.trim().parse().ok()?;
274 (1..=6).contains(&level).then_some(level)
275}
276
277fn list_num_id(paragraph: &str) -> Option<String> {
279 attribute_value(paragraph, "<w:numId", "w:val").map(str::to_string)
280}
281
282fn parse_runs(paragraph: &str) -> Vec<InlineNode> {
283 let mut runs: Vec<(RunStyle, String)> = Vec::new();
284 let mut rest = paragraph;
285 while let Some(run) = next_element(&mut rest, "w:r") {
286 let style = if has_toggle(run, "b") {
287 RunStyle::Strong
288 } else if has_toggle(run, "i") {
289 RunStyle::Emphasis
290 } else {
291 RunStyle::Regular
292 };
293 let text = run_text(run);
294 if !text.is_empty() {
295 runs.push((style, text));
296 }
297 }
298 merge_adjacent_runs(&mut runs);
299 runs.into_iter()
300 .map(|(style, text)| style.wrap(text))
301 .collect()
302}
303
304fn run_text(run: &str) -> String {
306 let mut text = String::new();
307 let mut rest = run;
308 while let Some(inner) = next_element(&mut rest, "w:t") {
309 text.push_str(&unescape_xml(inner));
310 }
311 text
312}
313
314fn has_toggle(run: &str, tag: &str) -> bool {
317 let needle = format!("<w:{tag}");
318 let mut rest = run;
319 while let Some(index) = rest.find(&needle) {
320 let after = &rest[index + needle.len()..];
321 match after.chars().next() {
323 Some('>' | '/' | ' ') => {
324 let tag_end = after.find('>').unwrap_or(after.len());
325 let attributes = &after[..tag_end];
326 if !toggle_disabled(attributes) {
327 return true;
328 }
329 rest = &after[tag_end..];
330 }
331 _ => rest = after,
332 }
333 }
334 false
335}
336
337fn toggle_disabled(attributes: &str) -> bool {
338 attribute_value(attributes, "", "w:val")
339 .is_some_and(|value| matches!(value, "false" | "0" | "off" | "none"))
340}
341
/// Finds the double-quoted value of `attribute` in `text`.
///
/// When `tag` is non-empty the search is restricted to the first occurrence
/// of `tag` up to and including its closing `>` (or to the end of `text` if
/// the tag is never closed). When `tag` is empty the whole of `text` is
/// searched.
///
/// Returns `None` if the tag or attribute is absent or the value's closing
/// quote is missing.
fn attribute_value<'a>(text: &'a str, tag: &str, attribute: &str) -> Option<&'a str> {
    let scope = match tag {
        "" => text,
        _ => {
            let from_tag = &text[text.find(tag)?..];
            match from_tag.find('>') {
                Some(close) => &from_tag[..=close],
                None => from_tag,
            }
        }
    };
    let marker = format!("{attribute}=\"");
    let (_, after_marker) = scope.split_once(marker.as_str())?;
    let (value, _) = after_marker.split_once('"')?;
    Some(value)
}
358
/// Extracts the next `<tag>` element from `rest` and advances `rest` past it.
///
/// Returns the element's inner content, or `""` for a self-closing element.
/// Only exact tag names match: the character after `<tag` must be `>`, `/` or
/// a space, so `w:p` does not match `<w:pPr>`; such look-alikes are skipped.
///
/// Returns `None` when no further opening tag exists, or when the matching
/// opening tag has no `>` or no `</tag>` after it.
fn next_element<'a>(rest: &mut &'a str, tag: &str) -> Option<&'a str> {
    let opener = format!("<{tag}");
    let closer = format!("</{tag}>");

    while let Some(position) = rest.find(&opener) {
        let tail = &rest[position + opener.len()..];
        if !matches!(tail.chars().next(), Some('>' | '/' | ' ')) {
            // Longer tag name sharing our prefix; resume the search after it.
            *rest = tail;
            continue;
        }

        let (attributes, after_open) = tail.split_once('>')?;
        if attributes.ends_with('/') {
            // Self-closing element: there is no body to return.
            *rest = after_open;
            return Some("");
        }

        let (inner, after_close) = after_open.split_once(closer.as_str())?;
        *rest = after_close;
        return Some(inner);
    }
    None
}
387
/// Decodes the five predefined XML entities in `text`.
///
/// `&lt;`, `&gt;`, `&quot;`, `&apos;` and `&amp;` become `<`, `>`, `"`, `'`
/// and `&`. Any other text, including unrecognised entities, is left as is.
fn unescape_xml(text: &str) -> String {
    // `&amp;` must be decoded last: decoding it first would turn an escaped
    // entity such as `&amp;lt;` into `&lt;` and then wrongly into `<`.
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}