Skip to main content

meta_language/document_formatting/
pdf.rs

1//! A documented, text-only PDF profile for the document-formatting concept layer.
2//!
3//! PDF is a binary container. A faithful, general PDF reader (with stream
4//! compression, embedded fonts, content reflow, and scanned-image recovery) is
5//! out of scope for this crate. Instead this module defines a **constrained,
6//! self-describing PDF profile**: an uncompressed, single-page, ASCII PDF whose
7//! content stream carries the document structure with standard PDF operators
8//! plus a small, documented marked-content convention. Documents produced by
9//! [`render_pdf_document`] are valid PDFs (correct `xref`, object offsets, and
10//! stream `Length`) that open in conformant viewers, and [`parse_pdf_document`]
11//! recovers the same [`FormattingDocument`] concept tree they were built from.
12//!
13//! # Representation
14//!
15//! Block role is encoded with marked content:
16//!
17//! - `/H1 BDC … EMC` … `/H6 BDC … EMC` — headings carrying their level.
18//! - `/P BDC … EMC` — paragraphs.
19//! - `/UL BDC … EMC` / `/OL BDC … EMC` — bullet / ordered lists, each holding
20//!   `/LI BDC … EMC` items.
21//!
22//! Inline style is encoded with the font resource selected before each shown
23//! string:
24//!
25//! - `/F1` — regular text (`emphasis`/`strong` absent).
26//! - `/F2` — strong (bold), mapped to the `strong` concept.
27//! - `/F3` — emphasis (italic), mapped to the `emphasis` concept.
28//!
29//! Each run is one `/Fn size Tf` selector followed by one `(text) Tj` show, so
30//! parsing and rendering are exact inverses for the supported feature set.
31//!
32//! See `docs/pdf-fidelity.md` for the full round-trip fidelity matrix.
33
34use std::fmt::Write as _;
35
36use super::document::{BlockNode, FormattingDocument, InlineNode};
37
/// Inline style carried by a single shown text run.
///
/// Each variant corresponds to one font resource of the rendered page, so a
/// run's style can be recovered from the font selected before it is shown.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RunStyle {
    /// Unstyled text.
    Regular,
    /// Bold text, mapped to the `strong` concept.
    Strong,
    /// Italic text, mapped to the `emphasis` concept.
    Emphasis,
}
45
46impl RunStyle {
47    /// The font resource name used for this style in the rendered content stream.
48    const fn font(self) -> &'static str {
49        match self {
50            Self::Regular => "F1",
51            Self::Strong => "F2",
52            Self::Emphasis => "F3",
53        }
54    }
55
56    /// Resolves a font resource name back to the inline style it encodes.
57    fn from_font(font: &str) -> Option<Self> {
58        match font {
59            "F1" => Some(Self::Regular),
60            "F2" => Some(Self::Strong),
61            "F3" => Some(Self::Emphasis),
62            _ => None,
63        }
64    }
65
66    /// Wraps content text in the inline concept node for this style.
67    fn wrap(self, text: String) -> InlineNode {
68        match self {
69            Self::Regular => InlineNode::Text(text),
70            Self::Strong => InlineNode::Wrapped {
71                concept: "strong".to_string(),
72                attributes: std::collections::BTreeMap::new(),
73                children: vec![InlineNode::Text(text)],
74            },
75            Self::Emphasis => InlineNode::Wrapped {
76                concept: "emphasis".to_string(),
77                attributes: std::collections::BTreeMap::new(),
78                children: vec![InlineNode::Text(text)],
79            },
80        }
81    }
82}
83
84/// Renders a language-free [`FormattingDocument`] into a valid, uncompressed
85/// PDF in the documented text profile.
86#[must_use]
87pub fn render_pdf_document(document: &FormattingDocument) -> String {
88    let content = render_content_stream(document);
89    assemble_pdf(&content)
90}
91
92/// Parses a PDF written in the documented text profile back into the
93/// language-free concept layer.
94///
95/// Returns an empty document when no recognizable profile content stream is
96/// present, so general (out-of-profile) PDFs degrade gracefully rather than
97/// producing a corrupt tree.
98#[must_use]
99pub fn parse_pdf_document(text: &str) -> FormattingDocument {
100    let Some(content) = content_stream(text) else {
101        return FormattingDocument::default();
102    };
103    FormattingDocument {
104        blocks: parse_content_stream(content),
105    }
106}
107
108/// Whether `text` is a PDF in this profile carrying at least one recognized block.
109#[must_use]
110pub fn pdf_profile_is_recognized(text: &str) -> bool {
111    !parse_pdf_document(text).blocks.is_empty()
112}
113
114// --- rendering -------------------------------------------------------------
115
/// Baseline of the first rendered text object, in PDF points measured from
/// the page bottom. Coordinates are visual only; parsing ignores them.
const TOP_BASELINE: i32 = 720;
/// Vertical step, in PDF points, by which the baseline is lowered after each
/// rendered text object. Like the baseline itself, it is visual only.
const LINE_STEP: i32 = 22;
120
/// Font size used for a heading of the given level.
///
/// Levels 1 through 5 map to decreasing sizes; every other value (including
/// level 6 and out-of-range levels) uses the smallest heading size, 13.
const fn heading_size(level: u8) -> u8 {
    const SIZES: [u8; 5] = [24, 20, 18, 16, 14];
    match level {
        // The arm guarantees 1 <= level <= 5, so the index is in bounds.
        1..=5 => SIZES[(level - 1) as usize],
        _ => 13,
    }
}
131
/// Font size operand used for paragraph and list-item text.
const BODY_SIZE: u8 = 12;
133
134fn render_content_stream(document: &FormattingDocument) -> String {
135    let mut output = String::new();
136    let mut baseline = TOP_BASELINE;
137    for block in &document.blocks {
138        render_block(&mut output, block, &mut baseline);
139    }
140    output
141}
142
143fn render_block(output: &mut String, block: &BlockNode, baseline: &mut i32) {
144    match block {
145        BlockNode::Heading { level, children } => {
146            let level = (*level).clamp(1, 6);
147            let _ = writeln!(output, "/H{level} BDC");
148            render_text_object(output, children, heading_size(level), None, baseline);
149            output.push_str("EMC\n");
150        }
151        BlockNode::Paragraph { children } => {
152            output.push_str("/P BDC\n");
153            render_text_object(output, children, BODY_SIZE, None, baseline);
154            output.push_str("EMC\n");
155        }
156        BlockNode::List { concept, items } => {
157            let ordered = concept == "ordered-list";
158            output.push_str(if ordered { "/OL BDC\n" } else { "/UL BDC\n" });
159            for (index, item) in items.iter().enumerate() {
160                let marker = if ordered {
161                    format!("{}. ", index + 1)
162                } else {
163                    "- ".to_string()
164                };
165                output.push_str("/LI BDC\n");
166                render_text_object(output, item, BODY_SIZE, Some(&marker), baseline);
167                output.push_str("EMC\n");
168            }
169            output.push_str("EMC\n");
170        }
171    }
172}
173
174fn render_text_object(
175    output: &mut String,
176    nodes: &[InlineNode],
177    size: u8,
178    marker: Option<&str>,
179    baseline: &mut i32,
180) {
181    let mut runs = Vec::new();
182    if let Some(marker) = marker {
183        runs.push((RunStyle::Regular, marker.to_string()));
184    }
185    flatten_runs(nodes, RunStyle::Regular, &mut runs);
186    merge_adjacent_runs(&mut runs);
187
188    output.push_str("BT\n");
189    let _ = writeln!(output, "72 {baseline} Td");
190    for (style, text) in runs {
191        let _ = writeln!(output, "/{} {size} Tf", style.font());
192        let _ = writeln!(output, "({}) Tj", escape_pdf_string(&text));
193    }
194    output.push_str("ET\n");
195    *baseline -= LINE_STEP;
196}
197
198fn flatten_runs(nodes: &[InlineNode], style: RunStyle, runs: &mut Vec<(RunStyle, String)>) {
199    for node in nodes {
200        match node {
201            InlineNode::Text(text) => runs.push((style, text.clone())),
202            InlineNode::Wrapped {
203                concept, children, ..
204            } => {
205                let child_style = match concept.as_str() {
206                    "strong" => RunStyle::Strong,
207                    "emphasis" => RunStyle::Emphasis,
208                    // Unsupported inline concepts (hyperlink, image, …) keep the
209                    // surrounding style; their text is preserved but unstyled.
210                    _ => style,
211                };
212                flatten_runs(children, child_style, runs);
213            }
214        }
215    }
216}
217
218fn merge_adjacent_runs(runs: &mut Vec<(RunStyle, String)>) {
219    let mut merged: Vec<(RunStyle, String)> = Vec::with_capacity(runs.len());
220    for (style, text) in runs.drain(..) {
221        if let Some(last) = merged.last_mut() {
222            if last.0 == style {
223                last.1.push_str(&text);
224                continue;
225            }
226        }
227        merged.push((style, text));
228    }
229    *runs = merged;
230}
231
/// Escapes `text` for use inside a PDF literal string `( … )`.
///
/// Backslashes and parentheses are backslash-escaped. Line feeds and carriage
/// returns are written as the standard `\n` / `\r` escape sequences instead of
/// raw bytes, so a shown string always stays on a single content-stream line
/// and text can never be mistaken for an operator line of its own.
fn escape_pdf_string(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for character in text.chars() {
        match character {
            '\\' => escaped.push_str("\\\\"),
            '(' => escaped.push_str("\\("),
            ')' => escaped.push_str("\\)"),
            // A raw end-of-line would split the `(…) Tj` operator across lines.
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            other => escaped.push(other),
        }
    }
    escaped
}
244
/// Wraps an already-rendered content stream in a complete seven-object PDF.
///
/// Objects 1–6 are the fixed catalog, page tree, page, and three Helvetica
/// font dictionaries; object 7 is the content stream, whose `/Length` is the
/// byte length of `content`. The byte offset of each object is recorded as
/// the body is written, so the `xref` entries and the `startxref` pointer
/// refer to exact positions in the returned string.
fn assemble_pdf(content: &str) -> String {
    const FIXED_OBJECTS: [&str; 6] = [
        "<< /Type /Catalog /Pages 2 0 R >>",
        "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
            /Resources << /Font << /F1 4 0 R /F2 5 0 R /F3 6 0 R >> >> /Contents 7 0 R >>",
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica-Bold >>",
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica-Oblique >>",
    ];
    let stream_object = format!(
        "<< /Length {} >>\nstream\n{content}endstream",
        content.len()
    );

    let mut pdf = String::from("%PDF-1.7\n");
    let mut offsets = Vec::with_capacity(FIXED_OBJECTS.len() + 1);
    let bodies = FIXED_OBJECTS
        .iter()
        .copied()
        .chain(std::iter::once(stream_object.as_str()));
    for (number, object) in (1_usize..).zip(bodies) {
        offsets.push(pdf.len());
        let _ = writeln!(pdf, "{number} 0 obj\n{object}\nendobj");
    }

    // The table holds one entry per object plus the leading free entry for
    // object 0.
    let size = offsets.len() + 1;
    let xref_offset = pdf.len();
    let _ = writeln!(pdf, "xref\n0 {size}\n0000000000 65535 f ");
    for offset in &offsets {
        let _ = writeln!(pdf, "{offset:010} 00000 n ");
    }
    let _ = writeln!(
        pdf,
        "trailer\n<< /Size {size} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF"
    );
    pdf
}
283
284// --- parsing ---------------------------------------------------------------
285
/// Extracts the body of the first stream object in `text`.
///
/// The body starts right after the first `stream` keyword line. When the
/// preceding dictionary declares a direct `/Length` and exactly that many
/// bytes are followed by `endstream`, the declared length is used, so shown
/// text that itself contains the word `endstream` does not truncate the
/// stream. Otherwise (no length, an indirect or inconsistent length) the body
/// ends at the first `endstream` after the keyword.
///
/// Returns `None` when no `stream` / `endstream` pair is present.
fn content_stream(text: &str) -> Option<&str> {
    const KEYWORD: &str = "stream\n";
    let keyword_at = text.find(KEYWORD)?;
    let start = keyword_at + KEYWORD.len();

    if let Some(end) = declared_stream_length(&text[..keyword_at])
        .and_then(|length| start.checked_add(length))
    {
        // `get` rejects out-of-range ends and ends that split a character.
        if let (Some(body), Some(tail)) = (text.get(start..end), text.get(end..)) {
            let tail = tail.trim_start_matches(|c: char| c == '\r' || c == '\n');
            if tail.starts_with("endstream") {
                return Some(body);
            }
        }
    }

    let end = text[start..].find("endstream")? + start;
    Some(&text[start..end])
}

/// Reads the direct integer value of the last `/Length` key in `header`, the
/// text preceding a `stream` keyword.
///
/// Returns `None` when there is no such key, when an `endobj` lies between the
/// key and the keyword (the key belongs to an earlier object), or when the key
/// is not followed by decimal digits.
fn declared_stream_length(header: &str) -> Option<usize> {
    const KEY: &str = "/Length";
    let after_key = &header[header.rfind(KEY)? + KEY.len()..];
    if after_key.contains("endobj") {
        return None;
    }
    let value = after_key.trim_start();
    let digits_end = value
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(value.len());
    value[..digits_end].parse().ok()
}
292
/// A block context opened by a marked-content `BDC` operator.
///
/// Contexts are kept on a stack while the content stream is scanned and are
/// turned into blocks (or list items) when the matching `EMC` is reached.
enum Context {
    /// An open `/H1`…`/H6` sequence carrying its heading level.
    Heading(u8),
    /// An open `/P` sequence.
    Paragraph,
    /// An open `/OL` or `/UL` sequence.
    List {
        /// `true` for `/OL`, `false` for `/UL`.
        ordered: bool,
        /// Inline content of the items closed so far, in document order.
        items: Vec<Vec<InlineNode>>,
    },
    /// An open `/LI` sequence; on close its content is appended to the
    /// enclosing list context.
    ListItem,
}
303
304fn parse_content_stream(content: &str) -> Vec<BlockNode> {
305    let mut blocks = Vec::new();
306    let mut stack: Vec<Context> = Vec::new();
307    let mut runs: Vec<(RunStyle, String)> = Vec::new();
308    let mut font = RunStyle::Regular;
309
310    for line in content.lines() {
311        let line = line.trim();
312        if let Some(level) = heading_marker(line) {
313            stack.push(Context::Heading(level));
314            runs.clear();
315        } else if line == "/P BDC" {
316            stack.push(Context::Paragraph);
317            runs.clear();
318        } else if line == "/UL BDC" || line == "/OL BDC" {
319            stack.push(Context::List {
320                ordered: line == "/OL BDC",
321                items: Vec::new(),
322            });
323        } else if line == "/LI BDC" {
324            stack.push(Context::ListItem);
325            runs.clear();
326        } else if let Some(style) = font_selector(line) {
327            font = style;
328        } else if let Some(text) = show_string(line) {
329            runs.push((font, text));
330        } else if line == "EMC" {
331            close_context(&mut stack, &mut blocks, &mut runs);
332        }
333        // `BT`, `ET`, `Td`, and any other operators carry no structure.
334    }
335
336    blocks
337}
338
339fn close_context(
340    stack: &mut Vec<Context>,
341    blocks: &mut Vec<BlockNode>,
342    runs: &mut Vec<(RunStyle, String)>,
343) {
344    let Some(context) = stack.pop() else {
345        return;
346    };
347    match context {
348        Context::Heading(level) => {
349            blocks.push(BlockNode::Heading {
350                level,
351                children: runs_to_inline(std::mem::take(runs), false),
352            });
353        }
354        Context::Paragraph => {
355            blocks.push(BlockNode::Paragraph {
356                children: runs_to_inline(std::mem::take(runs), false),
357            });
358        }
359        Context::ListItem => {
360            let children = runs_to_inline(std::mem::take(runs), true);
361            if let Some(Context::List { items, .. }) = stack.last_mut() {
362                items.push(children);
363            }
364        }
365        Context::List { ordered, items } => {
366            blocks.push(BlockNode::List {
367                concept: if ordered {
368                    "ordered-list"
369                } else {
370                    "bullet-list"
371                }
372                .to_string(),
373                items,
374            });
375        }
376    }
377}
378
/// Recognizes a `/Hn BDC` line and returns the heading level `n`.
///
/// Returns `None` unless the text between `/H` and ` BDC` parses as an integer
/// in `1..=6`.
fn heading_marker(line: &str) -> Option<u8> {
    line.strip_prefix("/H")
        .and_then(|rest| rest.strip_suffix(" BDC"))
        .and_then(|digits| digits.parse::<u8>().ok())
        .filter(|level| (1..=6).contains(level))
}
384
385fn font_selector(line: &str) -> Option<RunStyle> {
386    let rest = line.strip_prefix('/')?.strip_suffix(" Tf")?;
387    let font = rest.split_whitespace().next()?;
388    RunStyle::from_font(font)
389}
390
391fn show_string(line: &str) -> Option<String> {
392    let body = line.strip_prefix('(')?;
393    let inner = body.strip_suffix(") Tj")?;
394    Some(unescape_pdf_string(inner))
395}
396
/// Decodes the backslash escapes of a PDF literal string body.
///
/// Supports the standard escape sequences: `\n`, `\r`, `\t`, `\b`, `\f`,
/// one- to three-digit octal character codes (`\ddd`), and a backslash before
/// any other character, which stands for that character itself (this covers
/// `\\`, `\(` and `\)`). A lone trailing backslash is kept verbatim.
///
/// Octal codes are reduced to a single byte and mapped to the character with
/// that code point.
fn unescape_pdf_string(text: &str) -> String {
    let mut output = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(character) = chars.next() {
        if character != '\\' {
            output.push(character);
            continue;
        }
        match chars.next() {
            Some('n') => output.push('\n'),
            Some('r') => output.push('\r'),
            Some('t') => output.push('\t'),
            Some('b') => output.push('\u{8}'),
            Some('f') => output.push('\u{c}'),
            Some(first @ '0'..='7') => {
                let mut code = first.to_digit(8).unwrap_or(0);
                // Up to two further octal digits belong to the same escape.
                for _ in 0..2 {
                    match chars.peek().and_then(|next| next.to_digit(8)) {
                        Some(digit) => {
                            code = code * 8 + digit;
                            chars.next();
                        }
                        None => break,
                    }
                }
                // Three octal digits can exceed one byte; the high-order
                // overflow is discarded, so the masked cast cannot truncate.
                output.push(char::from((code & 0xFF) as u8));
            }
            Some(escaped) => output.push(escaped),
            None => output.push('\\'),
        }
    }
    output
}
412
413fn runs_to_inline(runs: Vec<(RunStyle, String)>, strip_list_marker: bool) -> Vec<InlineNode> {
414    let mut runs = runs;
415    if strip_list_marker {
416        strip_marker(&mut runs);
417    }
418    merge_adjacent_runs(&mut runs);
419    runs.into_iter()
420        .filter(|(_, text)| !text.is_empty())
421        .map(|(style, text)| style.wrap(text))
422        .collect()
423}
424
425/// Removes a leading `- ` or `N. ` list marker from the first text run.
426fn strip_marker(runs: &mut [(RunStyle, String)]) {
427    let Some((style, text)) = runs.first_mut() else {
428        return;
429    };
430    if *style != RunStyle::Regular {
431        return;
432    }
433    if let Some(rest) = text.strip_prefix("- ") {
434        *text = rest.to_string();
435        return;
436    }
437    if let Some(dot) = text.find(". ") {
438        if text[..dot]
439            .chars()
440            .all(|character| character.is_ascii_digit())
441            && dot > 0
442        {
443            *text = text[dot + 2..].to_string();
444        }
445    }
446}