// meta_language/document_formatting/profile.rs

1//! Per-format capability profiles for cross-format document reconstruction.
2//!
3//! Each supported document format (`txt`, `Markdown`, `HTML`, `PDF`, `DOCX`)
4//! exposes a [`LanguageProfile`] over the shared, language-free formatting
5//! concept ontology (issue #83). The profile records which formatting concepts
6//! the format can represent natively and, for every concept it cannot, the
7//! documented lossy fallback applied when the concept is encountered. This is
8//! the per-target fidelity report required by issue #86: a document parsed from
9//! one format reconstructs into any other when the source uses only concepts
10//! both formats support, and unsupported concepts degrade through a declared
11//! fallback rather than silent data loss.
12
13use crate::language_profile::LanguageProfile;
14use crate::link_network::LinkType;
15
/// Canonical names of the document formats handled by the cross-format
/// reconstruction layer, listed in a fixed order.
///
/// Any entry may act as the source or as the target format of
/// `reconstruct_text_as`.
pub const DOCUMENT_FORMATS: &[&str] = &["txt", "Markdown", "HTML", "PDF", "DOCX"];
19
/// Names of the shared formatting concepts that cross-format fidelity is
/// reported against.
///
/// A format profile is expected to classify every entry here, either as
/// natively supported or as degraded through a documented lossy fallback.
pub const CROSS_FORMAT_CONCEPTS: &[&str] = &[
    "heading",
    "paragraph",
    "bullet-list",
    "ordered-list",
    "list-item",
    "strong",
    "emphasis",
    "hyperlink",
];
33
34/// Returns the capability profile for a document `format`, or `None` when the
35/// format is not one of the cross-format reconstruction targets.
36///
37/// The returned [`LanguageProfile`] lists the formatting concepts the format
38/// represents natively (via [`LanguageProfile::supports_concept`]) and the
39/// documented fallback for every concept it cannot
40/// (via [`LanguageProfile::concept_fallback`] / [`LanguageProfile::fallbacks`]).
41#[must_use]
42pub fn document_format_profile(format: &str) -> Option<LanguageProfile> {
43    let canonical = canonical_document_format(format)?;
44    let profile = base_profile(canonical);
45    Some(match canonical {
46        "txt" => txt_profile(profile),
47        "Markdown" => markdown_profile(profile),
48        "HTML" => html_profile(profile),
49        "PDF" => pdf_profile(profile),
50        "DOCX" => docx_profile(profile),
51        _ => unreachable!("canonical_document_format only yields known formats"),
52    })
53}
54
/// Maps a format/language label onto its canonical [`DOCUMENT_FORMATS`] name.
///
/// Labels are compared ASCII case-insensitively against a fixed alias table,
/// so `md` and `MARKDOWN` both yield `Markdown`, and `plain-text` yields
/// `txt`. The label is compared as given (it is not trimmed); a label that
/// matches no alias yields `None`.
#[must_use]
pub fn canonical_document_format(format: &str) -> Option<&'static str> {
    // Each row pairs a canonical name with every alias accepted for it.
    const ALIASES: &[(&str, &[&str])] = &[
        ("txt", &["txt", "text", "plain-text", "plaintext"]),
        ("Markdown", &["markdown", "md"]),
        ("HTML", &["html", "htm"]),
        ("PDF", &["pdf"]),
        ("DOCX", &["docx"]),
    ];

    ALIASES
        .iter()
        .find(|(_, aliases)| aliases.iter().any(|alias| format.eq_ignore_ascii_case(alias)))
        .map(|(canonical, _)| *canonical)
}
70
71fn base_profile(canonical: &str) -> LanguageProfile {
72    LanguageProfile::new(canonical, canonical)
73        .with_link_type(LinkType::Document)
74        .with_link_type(LinkType::Concept)
75        .with_link_type(LinkType::Token)
76}
77
78fn with_supported<'a>(
79    mut profile: LanguageProfile,
80    concepts: impl IntoIterator<Item = &'a str>,
81) -> LanguageProfile {
82    for concept in concepts {
83        profile = profile.with_concept(concept);
84    }
85    profile
86}
87
88fn txt_profile(profile: LanguageProfile) -> LanguageProfile {
89    with_supported(profile, ["paragraph"])
90        .with_concept_fallback(
91            "heading",
92            "flattened to a plain paragraph (heading level dropped)",
93        )
94        .with_concept_fallback(
95            "bullet-list",
96            "flattened to plain lines with a `- ` marker per item",
97        )
98        .with_concept_fallback(
99            "ordered-list",
100            "flattened to plain lines with a `N. ` marker per item",
101        )
102        .with_concept_fallback("list-item", "rendered as a single plain line")
103        .with_concept_fallback("strong", "rendered as unstyled plain text")
104        .with_concept_fallback("emphasis", "rendered as unstyled plain text")
105        .with_concept_fallback("hyperlink", "rendered as its visible text (URL dropped)")
106}
107
108fn markdown_profile(profile: LanguageProfile) -> LanguageProfile {
109    with_supported(
110        profile,
111        [
112            "heading",
113            "paragraph",
114            "bullet-list",
115            "list-item",
116            "strong",
117            "emphasis",
118            "hyperlink",
119        ],
120    )
121    .with_concept_fallback(
122        "ordered-list",
123        "rendered with bullet `- ` markers (ordering not preserved by the Markdown profile)",
124    )
125}
126
127fn html_profile(profile: LanguageProfile) -> LanguageProfile {
128    // HTML represents every cross-format concept natively.
129    with_supported(profile, CROSS_FORMAT_CONCEPTS.iter().copied())
130}
131
132fn pdf_profile(profile: LanguageProfile) -> LanguageProfile {
133    with_supported(
134        profile,
135        [
136            "heading",
137            "paragraph",
138            "bullet-list",
139            "ordered-list",
140            "list-item",
141            "strong",
142            "emphasis",
143        ],
144    )
145    .with_concept_fallback(
146        "hyperlink",
147        "rendered as its visible text, unstyled (URL dropped)",
148    )
149}
150
151fn docx_profile(profile: LanguageProfile) -> LanguageProfile {
152    with_supported(
153        profile,
154        [
155            "heading",
156            "paragraph",
157            "bullet-list",
158            "ordered-list",
159            "list-item",
160            "strong",
161            "emphasis",
162        ],
163    )
164    .with_concept_fallback(
165        "hyperlink",
166        "rendered as its visible text, unstyled (URL dropped)",
167    )
168}