1mod document;
18mod docx;
19mod opc;
20mod pdf;
21mod profile;
22
23pub use document::{parse_markup_document, BlockNode, FormattingDocument, InlineNode};
24pub use docx::{docx_profile_is_recognized, parse_docx_document, render_docx_document};
25pub use opc::{docx_package_is_recognized, parse_docx_package, render_docx_package};
26pub use pdf::{parse_pdf_document, pdf_profile_is_recognized, render_pdf_document};
27pub use profile::{
28 canonical_document_format, document_format_profile, CROSS_FORMAT_CONCEPTS, DOCUMENT_FORMATS,
29};
30
31use std::collections::BTreeMap;
32
33use crate::link_network::{LinkId, LinkNetwork, LinkType};
34
/// One entry of the built-in formatting table: a concept plus its per-language syntax.
struct FormattingConcept {
    /// Identifier of the concept, e.g. `"emphasis"`; used both as the lookup key for
    /// templates and as the term interned into the link network.
    id: &'static str,
    /// Human-readable definition stored alongside the concept when it is interned.
    definition: &'static str,
    /// `(language, template)` pairs. A template is literal text with `{}` marking the
    /// content hole and `{name}` marking a named hole (`markers`, `level`, `href`, ...).
    templates: &'static [(&'static str, &'static str)],
}
43
44const DOCUMENT_FORMATTING_CONCEPTS: &[FormattingConcept] = &[
46 FormattingConcept {
48 id: "emphasis",
49 definition: "Inline emphasis (italic) applied to a text fragment.",
50 templates: &[("Markdown", "*{}*"), ("HTML", "<em>{}</em>")],
51 },
52 FormattingConcept {
53 id: "strong",
54 definition: "Inline strong importance (bold) applied to a text fragment.",
55 templates: &[("Markdown", "**{}**"), ("HTML", "<strong>{}</strong>")],
56 },
57 FormattingConcept {
58 id: "strikethrough",
59 definition: "Inline strikethrough (deleted) text fragment.",
60 templates: &[("Markdown", "~~{}~~"), ("HTML", "<del>{}</del>")],
61 },
62 FormattingConcept {
63 id: "inline-code",
64 definition: "Inline monospaced code span.",
65 templates: &[("Markdown", "`{}`"), ("HTML", "<code>{}</code>")],
66 },
67 FormattingConcept {
68 id: "hyperlink",
69 definition: "Inline hyperlink wrapping text and targeting a destination.",
70 templates: &[
71 ("Markdown", "[{}]({href})"),
72 ("HTML", "<a href=\"{href}\">{}</a>"),
73 ],
74 },
75 FormattingConcept {
76 id: "image",
77 definition: "Inline image with alternative text and a source reference.",
78 templates: &[
79 ("Markdown", ""),
80 ("HTML", "<img src=\"{src}\" alt=\"{}\" />"),
81 ],
82 },
83 FormattingConcept {
84 id: "line-break",
85 definition: "Explicit inline line break inside a block.",
86 templates: &[("Markdown", " \n"), ("HTML", "<br />")],
87 },
88 FormattingConcept {
90 id: "heading",
91 definition: "Section heading carrying a level from 1 (most significant) downward.",
92 templates: &[
93 ("Markdown", "{markers} {}"),
94 ("HTML", "<h{level}>{}</h{level}>"),
95 ],
96 },
97 FormattingConcept {
98 id: "paragraph",
99 definition: "Block of running text.",
100 templates: &[("Markdown", "{}"), ("HTML", "<p>{}</p>")],
101 },
102 FormattingConcept {
103 id: "blockquote",
104 definition: "Quoted block set off from the surrounding text.",
105 templates: &[
106 ("Markdown", "> {}"),
107 ("HTML", "<blockquote>{}</blockquote>"),
108 ],
109 },
110 FormattingConcept {
111 id: "bullet-list",
112 definition: "Unordered list container.",
113 templates: &[("Markdown", "{}"), ("HTML", "<ul>{}</ul>")],
114 },
115 FormattingConcept {
116 id: "ordered-list",
117 definition: "Ordered list container.",
118 templates: &[("Markdown", "{}"), ("HTML", "<ol>{}</ol>")],
119 },
120 FormattingConcept {
121 id: "list-item",
122 definition: "Single item within a list.",
123 templates: &[("Markdown", "- {}"), ("HTML", "<li>{}</li>")],
124 },
125 FormattingConcept {
126 id: "code-block",
127 definition: "Fenced block of preformatted code carrying an optional language.",
128 templates: &[
129 ("Markdown", "```{lang}\n{}\n```"),
130 (
131 "HTML",
132 "<pre><code class=\"language-{lang}\">{}</code></pre>",
133 ),
134 ],
135 },
136 FormattingConcept {
137 id: "thematic-break",
138 definition: "Thematic break (horizontal rule) between sections.",
139 templates: &[("Markdown", "---"), ("HTML", "<hr />")],
140 },
141 FormattingConcept {
142 id: "table",
143 definition: "Tabular data container.",
144 templates: &[("Markdown", "{}"), ("HTML", "<table>{}</table>")],
145 },
146 FormattingConcept {
147 id: "table-row",
148 definition: "Row within a table.",
149 templates: &[("Markdown", "{}"), ("HTML", "<tr>{}</tr>")],
150 },
151 FormattingConcept {
152 id: "table-cell",
153 definition: "Cell within a table row.",
154 templates: &[("Markdown", "{}"), ("HTML", "<td>{}</td>")],
155 },
156];
157
/// Counts reported by a seeding pass over the built-in formatting concepts.
#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)]
pub struct DocumentFormattingSeedReport {
    /// Number of concepts covered by the pass.
    concepts: usize,
    /// Number of `(language, template)` mappings covered by the pass.
    syntax_mappings: usize,
}

impl DocumentFormattingSeedReport {
    /// Returns the number of concepts recorded in this report.
    #[must_use]
    pub const fn concepts(self) -> usize {
        let Self { concepts, .. } = self;
        concepts
    }

    /// Returns the number of syntax mappings recorded in this report.
    #[must_use]
    pub const fn syntax_mappings(self) -> usize {
        let Self {
            syntax_mappings, ..
        } = self;
        syntax_mappings
    }
}
178
179#[derive(Clone, Debug, PartialEq, Eq)]
181pub struct DocumentFormatMatch {
182 pub concept: String,
184 pub link: LinkId,
186 pub content: String,
188 pub level: Option<u8>,
190 pub attributes: BTreeMap<String, String>,
192}
193
/// Language-independent payload of one formatting concept occurrence.
///
/// This is what a template is filled from when rendering and what matching a fragment
/// against a template produces.
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub struct DocumentFormatInstance {
    /// Text substituted for the `{}` content hole.
    pub content: String,
    /// Optional level used by the `{markers}` and `{level}` holes.
    pub level: Option<u8>,
    /// Values for named attribute holes, keyed by hole name.
    pub attributes: BTreeMap<String, String>,
}

impl DocumentFormatInstance {
    /// Builds an instance carrying only `content`, with no level and no attributes.
    #[must_use]
    pub fn from_content(content: impl Into<String>) -> Self {
        Self {
            content: content.into(),
            ..Self::default()
        }
    }

    /// Looks up the attribute called `name`, borrowing its value when present.
    fn attribute(&self, name: &str) -> Option<&str> {
        let value = self.attributes.get(name)?;
        Some(value.as_str())
    }
}
220
221enum Segment {
223 Literal(String),
224 Hole(Hole),
225}
226
/// Kind of placeholder that can appear inside a template.
#[derive(Copy, Clone, Eq, PartialEq)]
enum Hole {
    /// `{}`: the instance's content text.
    Content,
    /// `{markers}`: the level written as a run of `#` characters.
    Markers,
    /// `{level}`: the level written as a decimal number.
    Level,
    /// `{name}`: a named attribute of the instance.
    Attribute(&'static str),
}
239
240fn hole_for_name(name: &str) -> Hole {
241 match name {
242 "" => Hole::Content,
243 "markers" => Hole::Markers,
244 "level" => Hole::Level,
245 "href" => Hole::Attribute("href"),
246 "src" => Hole::Attribute("src"),
247 "lang" => Hole::Attribute("lang"),
248 other => panic!("unsupported document-formatting template hole {{{other}}}"),
249 }
250}
251
252fn compile_template(template: &str) -> Vec<Segment> {
254 let mut segments = Vec::new();
255 let mut literal = String::new();
256 let mut chars = template.chars();
257
258 while let Some(character) = chars.next() {
259 if character == '{' {
260 if !literal.is_empty() {
261 segments.push(Segment::Literal(std::mem::take(&mut literal)));
262 }
263 let mut name = String::new();
264 for inner in chars.by_ref() {
265 if inner == '}' {
266 break;
267 }
268 name.push(inner);
269 }
270 segments.push(Segment::Hole(hole_for_name(&name)));
271 } else {
272 literal.push(character);
273 }
274 }
275
276 if !literal.is_empty() {
277 segments.push(Segment::Literal(literal));
278 }
279
280 segments
281}
282
283fn render_segments(segments: &[Segment], instance: &DocumentFormatInstance) -> Option<String> {
286 let mut output = String::new();
287 for segment in segments {
288 match segment {
289 Segment::Literal(text) => output.push_str(text),
290 Segment::Hole(Hole::Content) => output.push_str(&instance.content),
291 Segment::Hole(Hole::Markers) => {
292 let level = instance.level.unwrap_or(1).max(1);
293 output.push_str(&"#".repeat(usize::from(level)));
294 }
295 Segment::Hole(Hole::Level) => {
296 output.push_str(&instance.level.unwrap_or(1).max(1).to_string());
297 }
298 Segment::Hole(Hole::Attribute(name)) => output.push_str(instance.attribute(name)?),
299 }
300 }
301 Some(output)
302}
303
304fn match_segments(segments: &[Segment], fragment: &str) -> Option<DocumentFormatInstance> {
306 let mut instance = DocumentFormatInstance::default();
307 let mut cursor = 0usize;
308
309 for (index, segment) in segments.iter().enumerate() {
310 match segment {
311 Segment::Literal(text) => {
312 if !fragment[cursor..].starts_with(text.as_str()) {
313 return None;
314 }
315 cursor += text.len();
316 }
317 Segment::Hole(hole) => {
318 let rest = &fragment[cursor..];
319 let captured = match segments.get(index + 1) {
320 Some(Segment::Literal(next)) => {
321 let relative = rest.find(next.as_str())?;
322 &rest[..relative]
323 }
324 Some(Segment::Hole(_)) => return None,
326 None => rest,
327 };
328 store_capture(&mut instance, *hole, captured)?;
329 cursor += captured.len();
330 }
331 }
332 }
333
334 if cursor == fragment.len() {
335 Some(instance)
336 } else {
337 None
338 }
339}
340
341fn store_capture(instance: &mut DocumentFormatInstance, hole: Hole, captured: &str) -> Option<()> {
342 match hole {
343 Hole::Content => instance.content = captured.to_string(),
344 Hole::Markers => {
345 if captured.is_empty() || !captured.bytes().all(|byte| byte == b'#') {
346 return None;
347 }
348 let level = u8::try_from(captured.len()).ok()?;
349 assign_level(instance, level)?;
350 }
351 Hole::Level => {
352 let level: u8 = captured.parse().ok()?;
353 assign_level(instance, level)?;
354 }
355 Hole::Attribute(name) => {
356 instance
357 .attributes
358 .insert(name.to_string(), captured.to_string());
359 }
360 }
361 Some(())
362}
363
364fn assign_level(instance: &mut DocumentFormatInstance, level: u8) -> Option<()> {
367 match instance.level {
368 Some(existing) if existing != level => None,
369 _ => {
370 instance.level = Some(level);
371 Some(())
372 }
373 }
374}
375
376fn template_for(concept: &str, language: &str) -> Option<&'static str> {
377 DOCUMENT_FORMATTING_CONCEPTS
378 .iter()
379 .find(|entry| entry.id == concept)?
380 .templates
381 .iter()
382 .find(|(lang, _)| *lang == language)
383 .map(|(_, template)| *template)
384}
385
/// Concept identifiers tried first, in this order, when resolving a fragment.
///
/// Fragment resolution walks this list before the built-in concept table, so these inline
/// concepts take precedence over every other entry.
const INLINE_RESOLUTION_ORDER: &[&str] = &[
    "image",
    "hyperlink",
    "strong",
    "emphasis",
    "strikethrough",
    "inline-code",
];
396
397impl LinkNetwork {
398 pub fn seed_document_formatting_concepts(&mut self) -> DocumentFormattingSeedReport {
406 let mut syntax_mappings = 0;
407 for concept in DOCUMENT_FORMATTING_CONCEPTS {
408 let concept_link = self.intern_concept(concept.id, Some(concept.definition));
409 for (language, template) in concept.templates {
410 self.insert_concept_syntax_mapping(
411 concept_link,
412 concept.id,
413 language,
414 template,
415 true,
416 );
417 syntax_mappings += 1;
418 }
419 }
420
421 DocumentFormattingSeedReport {
422 concepts: DOCUMENT_FORMATTING_CONCEPTS.len(),
423 syntax_mappings,
424 }
425 }
426
427 #[must_use]
434 pub fn resolve_document_format(
435 &self,
436 language: &str,
437 fragment: &str,
438 ) -> Option<DocumentFormatMatch> {
439 for concept in INLINE_RESOLUTION_ORDER
440 .iter()
441 .copied()
442 .chain(DOCUMENT_FORMATTING_CONCEPTS.iter().map(|entry| entry.id))
443 {
444 let Some(template) = template_for(concept, language) else {
445 continue;
446 };
447 let segments = compile_template(template);
448 if let Some(instance) = match_segments(&segments, fragment) {
449 let Some(link) = self.find_term(concept) else {
450 continue;
451 };
452 return Some(DocumentFormatMatch {
453 concept: concept.to_string(),
454 link,
455 content: instance.content,
456 level: instance.level,
457 attributes: instance.attributes,
458 });
459 }
460 }
461 None
462 }
463
464 #[must_use]
469 pub fn render_document_format(
470 &self,
471 concept: &str,
472 language: &str,
473 instance: &DocumentFormatInstance,
474 ) -> Option<String> {
475 let template = template_for(concept, language)?;
476 render_segments(&compile_template(template), instance)
477 }
478
479 #[must_use]
482 pub fn translate_document_format(
483 &self,
484 source_language: &str,
485 target_language: &str,
486 fragment: &str,
487 ) -> Option<String> {
488 let resolved = self.resolve_document_format(source_language, fragment)?;
489 let instance = DocumentFormatInstance {
490 content: resolved.content,
491 level: resolved.level,
492 attributes: resolved.attributes,
493 };
494 self.render_document_format(&resolved.concept, target_language, &instance)
495 }
496
497 #[must_use]
499 pub fn document_formatting_concept(&self, concept: &str) -> Option<LinkId> {
500 let _ = template_for(concept, "Markdown")?;
501 self.find_term(concept)
502 .filter(|link| self.is_concept_link(*link))
503 }
504
505 fn is_concept_link(&self, link: LinkId) -> bool {
506 self.link(link)
507 .is_some_and(|link| link.metadata().link_type() == Some(LinkType::Concept))
508 }
509}