meta_language/document_formatting/
pdf.rs1use std::fmt::Write as _;
35
36use super::document::{BlockNode, FormattingDocument, InlineNode};
37
/// Typographic style of one text run in the PDF content stream.
///
/// Each variant maps to one font resource name (`F1`, `F2`, `F3`) that is
/// selected with the `Tf` operator before the run is shown.
///
/// `Debug` is derived so runs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RunStyle {
    /// Plain text with no inline wrapper.
    Regular,
    /// Text carried by a `strong` inline wrapper.
    Strong,
    /// Text carried by an `emphasis` inline wrapper.
    Emphasis,
}
45
46impl RunStyle {
47 const fn font(self) -> &'static str {
49 match self {
50 Self::Regular => "F1",
51 Self::Strong => "F2",
52 Self::Emphasis => "F3",
53 }
54 }
55
56 fn from_font(font: &str) -> Option<Self> {
58 match font {
59 "F1" => Some(Self::Regular),
60 "F2" => Some(Self::Strong),
61 "F3" => Some(Self::Emphasis),
62 _ => None,
63 }
64 }
65
66 fn wrap(self, text: String) -> InlineNode {
68 match self {
69 Self::Regular => InlineNode::Text(text),
70 Self::Strong => InlineNode::Wrapped {
71 concept: "strong".to_string(),
72 attributes: std::collections::BTreeMap::new(),
73 children: vec![InlineNode::Text(text)],
74 },
75 Self::Emphasis => InlineNode::Wrapped {
76 concept: "emphasis".to_string(),
77 attributes: std::collections::BTreeMap::new(),
78 children: vec![InlineNode::Text(text)],
79 },
80 }
81 }
82}
83
84#[must_use]
87pub fn render_pdf_document(document: &FormattingDocument) -> String {
88 let content = render_content_stream(document);
89 assemble_pdf(&content)
90}
91
92#[must_use]
99pub fn parse_pdf_document(text: &str) -> FormattingDocument {
100 let Some(content) = content_stream(text) else {
101 return FormattingDocument::default();
102 };
103 FormattingDocument {
104 blocks: parse_content_stream(content),
105 }
106}
107
108#[must_use]
110pub fn pdf_profile_is_recognized(text: &str) -> bool {
111 !parse_pdf_document(text).blocks.is_empty()
112}
113
/// Baseline used for the first text object; written as the second operand
/// of that object's `Td` operator.
const TOP_BASELINE: i32 = 720;
/// Amount subtracted from the baseline after each text object is emitted.
const LINE_STEP: i32 = 22;
120
/// Returns the font size used for a heading of the given `level`.
///
/// Level 1 is 24; levels 2 through 5 step down by two from 20 to 14; every
/// other level uses 13.
const fn heading_size(level: u8) -> u8 {
    match level {
        1 => 24,
        // 2 -> 20, 3 -> 18, 4 -> 16, 5 -> 14.
        2..=5 => 24 - 2 * level,
        _ => 13,
    }
}
131
/// Font size passed to `Tf` for paragraph and list-item text.
const BODY_SIZE: u8 = 12;
133
134fn render_content_stream(document: &FormattingDocument) -> String {
135 let mut output = String::new();
136 let mut baseline = TOP_BASELINE;
137 for block in &document.blocks {
138 render_block(&mut output, block, &mut baseline);
139 }
140 output
141}
142
143fn render_block(output: &mut String, block: &BlockNode, baseline: &mut i32) {
144 match block {
145 BlockNode::Heading { level, children } => {
146 let level = (*level).clamp(1, 6);
147 let _ = writeln!(output, "/H{level} BDC");
148 render_text_object(output, children, heading_size(level), None, baseline);
149 output.push_str("EMC\n");
150 }
151 BlockNode::Paragraph { children } => {
152 output.push_str("/P BDC\n");
153 render_text_object(output, children, BODY_SIZE, None, baseline);
154 output.push_str("EMC\n");
155 }
156 BlockNode::List { concept, items } => {
157 let ordered = concept == "ordered-list";
158 output.push_str(if ordered { "/OL BDC\n" } else { "/UL BDC\n" });
159 for (index, item) in items.iter().enumerate() {
160 let marker = if ordered {
161 format!("{}. ", index + 1)
162 } else {
163 "- ".to_string()
164 };
165 output.push_str("/LI BDC\n");
166 render_text_object(output, item, BODY_SIZE, Some(&marker), baseline);
167 output.push_str("EMC\n");
168 }
169 output.push_str("EMC\n");
170 }
171 }
172}
173
174fn render_text_object(
175 output: &mut String,
176 nodes: &[InlineNode],
177 size: u8,
178 marker: Option<&str>,
179 baseline: &mut i32,
180) {
181 let mut runs = Vec::new();
182 if let Some(marker) = marker {
183 runs.push((RunStyle::Regular, marker.to_string()));
184 }
185 flatten_runs(nodes, RunStyle::Regular, &mut runs);
186 merge_adjacent_runs(&mut runs);
187
188 output.push_str("BT\n");
189 let _ = writeln!(output, "72 {baseline} Td");
190 for (style, text) in runs {
191 let _ = writeln!(output, "/{} {size} Tf", style.font());
192 let _ = writeln!(output, "({}) Tj", escape_pdf_string(&text));
193 }
194 output.push_str("ET\n");
195 *baseline -= LINE_STEP;
196}
197
198fn flatten_runs(nodes: &[InlineNode], style: RunStyle, runs: &mut Vec<(RunStyle, String)>) {
199 for node in nodes {
200 match node {
201 InlineNode::Text(text) => runs.push((style, text.clone())),
202 InlineNode::Wrapped {
203 concept, children, ..
204 } => {
205 let child_style = match concept.as_str() {
206 "strong" => RunStyle::Strong,
207 "emphasis" => RunStyle::Emphasis,
208 _ => style,
211 };
212 flatten_runs(children, child_style, runs);
213 }
214 }
215 }
216}
217
218fn merge_adjacent_runs(runs: &mut Vec<(RunStyle, String)>) {
219 let mut merged: Vec<(RunStyle, String)> = Vec::with_capacity(runs.len());
220 for (style, text) in runs.drain(..) {
221 if let Some(last) = merged.last_mut() {
222 if last.0 == style {
223 last.1.push_str(&text);
224 continue;
225 }
226 }
227 merged.push((style, text));
228 }
229 *runs = merged;
230}
231
/// Escapes `text` for use inside a PDF literal string (`( … )`).
///
/// Backslashes and parentheses are prefixed with a backslash. A line feed is
/// written as the two-character escape `\n`: the content stream is emitted
/// and parsed one operator per line, so a raw line feed inside a string
/// would split the `Tj` operator across lines and let the text masquerade
/// as stream operators.
///
/// All other characters are copied through unchanged.
fn escape_pdf_string(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for character in text.chars() {
        match character {
            '\\' => escaped.push_str("\\\\"),
            '(' => escaped.push_str("\\("),
            ')' => escaped.push_str("\\)"),
            // Keep every literal string on a single physical line.
            '\n' => escaped.push_str("\\n"),
            other => escaped.push(other),
        }
    }
    escaped
}
244
/// Wraps a content stream in a complete PDF file.
///
/// The file has seven objects: catalog, page tree, one 612×792 page, three
/// Helvetica font resources (`F1` regular, `F2` bold, `F3` oblique) and the
/// stream, whose `/Length` is the byte length of `content`. They are
/// followed by a cross-reference table of byte offsets and a trailer whose
/// `startxref` points at that table.
fn assemble_pdf(content: &str) -> String {
    let stream_object = format!(
        "<< /Length {} >>\nstream\n{content}endstream",
        content.len()
    );
    let objects: [&str; 7] = [
        "<< /Type /Catalog /Pages 2 0 R >>",
        "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        concat!(
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] ",
            "/Resources << /Font << /F1 4 0 R /F2 5 0 R /F3 6 0 R >> >> /Contents 7 0 R >>"
        ),
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica-Bold >>",
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica-Oblique >>",
        &stream_object,
    ];

    let mut pdf = String::from("%PDF-1.7\n");
    let mut object_offsets = Vec::with_capacity(objects.len());
    // Object numbers are one-based; record where each object starts.
    for (number, object) in (1..).zip(objects.iter()) {
        object_offsets.push(pdf.len());
        let _ = writeln!(pdf, "{number} 0 obj\n{object}\nendobj");
    }

    let xref_offset = pdf.len();
    // One extra entry for the free-list head (object 0).
    let entry_count = objects.len() + 1;
    let _ = writeln!(pdf, "xref\n0 {entry_count}\n0000000000 65535 f ");
    for offset in &object_offsets {
        let _ = writeln!(pdf, "{offset:010} 00000 n ");
    }
    let _ = writeln!(pdf, "trailer\n<< /Size {entry_count} /Root 1 0 R >>");
    let _ = writeln!(pdf, "startxref\n{xref_offset}\n%%EOF");
    pdf
}
283
/// Extracts the body of the first stream object in `text`.
///
/// The body starts right after the first `stream` keyword line. Its end is
/// found in two steps:
///
/// 1. If a `/Length` entry precedes the keyword and that many bytes lead
///    exactly to `endstream` (optionally after line breaks), the declared
///    length is trusted. This keeps the body intact when the shown text
///    itself contains the word `endstream`.
/// 2. Otherwise the body ends at the first `endstream` after the start.
///
/// Returns `None` when no `stream` keyword line is found, or when the
/// fallback scan finds no `endstream`.
fn content_stream(text: &str) -> Option<&str> {
    const STREAM_KEYWORD: &str = "stream\n";
    const END_KEYWORD: &str = "endstream";

    let keyword = text.find(STREAM_KEYWORD)?;
    let start = keyword + STREAM_KEYWORD.len();

    let by_declared_length = declared_stream_length(&text[..keyword])
        .and_then(|length| start.checked_add(length))
        .and_then(|end| {
            // `get` rejects out-of-range ends and ends inside a UTF-8 sequence.
            let body = text.get(start..end)?;
            let terminated = text[end..]
                .trim_start_matches(|c: char| c == '\r' || c == '\n')
                .starts_with(END_KEYWORD);
            if terminated {
                Some(body)
            } else {
                None
            }
        });

    by_declared_length.or_else(|| {
        let end = text[start..].find(END_KEYWORD)? + start;
        Some(&text[start..end])
    })
}

/// Reads the integer value of the last `/Length` entry in `dictionary`.
///
/// Returns `None` when there is no such entry or it is not followed by
/// decimal digits.
fn declared_stream_length(dictionary: &str) -> Option<usize> {
    const KEY: &str = "/Length";
    let value = dictionary[dictionary.rfind(KEY)? + KEY.len()..].trim_start();
    let digits_end = value
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(value.len());
    value[..digits_end].parse().ok()
}
292
/// Marked-content scope that is open while a content stream is being parsed.
enum Context {
    /// Opened by a `/H<level> BDC` line; carries the heading level.
    Heading(u8),
    /// Opened by a `/P BDC` line.
    Paragraph,
    /// Opened by a `/UL BDC` or `/OL BDC` line.
    List {
        /// `true` when the list was opened with `/OL BDC`.
        ordered: bool,
        /// Items collected so far; one entry is appended each time a list
        /// item directly inside this list is closed.
        items: Vec<Vec<InlineNode>>,
    },
    /// Opened by a `/LI BDC` line.
    ListItem,
}
303
304fn parse_content_stream(content: &str) -> Vec<BlockNode> {
305 let mut blocks = Vec::new();
306 let mut stack: Vec<Context> = Vec::new();
307 let mut runs: Vec<(RunStyle, String)> = Vec::new();
308 let mut font = RunStyle::Regular;
309
310 for line in content.lines() {
311 let line = line.trim();
312 if let Some(level) = heading_marker(line) {
313 stack.push(Context::Heading(level));
314 runs.clear();
315 } else if line == "/P BDC" {
316 stack.push(Context::Paragraph);
317 runs.clear();
318 } else if line == "/UL BDC" || line == "/OL BDC" {
319 stack.push(Context::List {
320 ordered: line == "/OL BDC",
321 items: Vec::new(),
322 });
323 } else if line == "/LI BDC" {
324 stack.push(Context::ListItem);
325 runs.clear();
326 } else if let Some(style) = font_selector(line) {
327 font = style;
328 } else if let Some(text) = show_string(line) {
329 runs.push((font, text));
330 } else if line == "EMC" {
331 close_context(&mut stack, &mut blocks, &mut runs);
332 }
333 }
335
336 blocks
337}
338
339fn close_context(
340 stack: &mut Vec<Context>,
341 blocks: &mut Vec<BlockNode>,
342 runs: &mut Vec<(RunStyle, String)>,
343) {
344 let Some(context) = stack.pop() else {
345 return;
346 };
347 match context {
348 Context::Heading(level) => {
349 blocks.push(BlockNode::Heading {
350 level,
351 children: runs_to_inline(std::mem::take(runs), false),
352 });
353 }
354 Context::Paragraph => {
355 blocks.push(BlockNode::Paragraph {
356 children: runs_to_inline(std::mem::take(runs), false),
357 });
358 }
359 Context::ListItem => {
360 let children = runs_to_inline(std::mem::take(runs), true);
361 if let Some(Context::List { items, .. }) = stack.last_mut() {
362 items.push(children);
363 }
364 }
365 Context::List { ordered, items } => {
366 blocks.push(BlockNode::List {
367 concept: if ordered {
368 "ordered-list"
369 } else {
370 "bullet-list"
371 }
372 .to_string(),
373 items,
374 });
375 }
376 }
377}
378
/// Recognises a `/H<level> BDC` line and returns its level.
///
/// Returns `None` unless the text between `/H` and ` BDC` parses as a `u8`
/// in the range `1..=6`.
fn heading_marker(line: &str) -> Option<u8> {
    let digits = line
        .strip_prefix("/H")
        .and_then(|rest| rest.strip_suffix(" BDC"))?;
    match digits.parse::<u8>() {
        Ok(level @ 1..=6) => Some(level),
        _ => None,
    }
}
384
385fn font_selector(line: &str) -> Option<RunStyle> {
386 let rest = line.strip_prefix('/')?.strip_suffix(" Tf")?;
387 let font = rest.split_whitespace().next()?;
388 RunStyle::from_font(font)
389}
390
391fn show_string(line: &str) -> Option<String> {
392 let body = line.strip_prefix('(')?;
393 let inner = body.strip_suffix(") Tj")?;
394 Some(unescape_pdf_string(inner))
395}
396
/// Decodes the backslash escapes of a PDF literal string.
///
/// The named escapes `\n`, `\r`, `\t`, `\b` and `\f` become the control
/// characters they stand for. Any other escaped character, including `\\`,
/// `\(` and `\)`, stands for itself. A lone backslash at the very end of
/// `text` is kept as a backslash.
///
/// Octal escapes (`\ddd`) are not interpreted; the digit after the backslash
/// is copied through like any other escaped character.
fn unescape_pdf_string(text: &str) -> String {
    let mut output = String::with_capacity(text.len());
    let mut chars = text.chars();
    while let Some(character) = chars.next() {
        if character != '\\' {
            output.push(character);
            continue;
        }
        match chars.next() {
            Some('n') => output.push('\n'),
            Some('r') => output.push('\r'),
            Some('t') => output.push('\t'),
            Some('b') => output.push('\u{8}'),
            Some('f') => output.push('\u{c}'),
            Some(escaped) => output.push(escaped),
            // Dangling backslash: nothing to escape, keep it verbatim.
            None => output.push('\\'),
        }
    }
    output
}
412
413fn runs_to_inline(runs: Vec<(RunStyle, String)>, strip_list_marker: bool) -> Vec<InlineNode> {
414 let mut runs = runs;
415 if strip_list_marker {
416 strip_marker(&mut runs);
417 }
418 merge_adjacent_runs(&mut runs);
419 runs.into_iter()
420 .filter(|(_, text)| !text.is_empty())
421 .map(|(style, text)| style.wrap(text))
422 .collect()
423}
424
425fn strip_marker(runs: &mut [(RunStyle, String)]) {
427 let Some((style, text)) = runs.first_mut() else {
428 return;
429 };
430 if *style != RunStyle::Regular {
431 return;
432 }
433 if let Some(rest) = text.strip_prefix("- ") {
434 *text = rest.to_string();
435 return;
436 }
437 if let Some(dot) = text.find(". ") {
438 if text[..dot]
439 .chars()
440 .all(|character| character.is_ascii_digit())
441 && dot > 0
442 {
443 *text = text[dot + 2..].to_string();
444 }
445 }
446}