Skip to main content

meta_language/document_formatting/
opc.rs

1//! OPC (Open Packaging Conventions) binary layer for real `.docx` packages.
2//!
3//! A DOCX file is a ZIP container of OOXML parts. The text content layer in
4//! [`super::docx`] models the primary `word/document.xml` part; this module
5//! wraps that part — together with the fixed packaging parts a conformant reader
6//! requires — into a valid ZIP archive, and reads `word/document.xml` back out
7//! of such an archive.
8//!
9//! # Constrained profile
10//!
11//! To keep the crate dependency-free, packages are written as **stored**
12//! (uncompressed) ZIP entries with a self-implemented CRC-32, and read back the
13//! same way. A package produced by [`render_docx_package`] opens in conformant
14//! word processors; an arbitrary externally-authored `.docx` that uses DEFLATE
15//! compression is out of profile for [`parse_docx_package`] (its
16//! `word/document.xml` is not extracted). See `docs/docx-fidelity.md`.
17
18use super::document::FormattingDocument;
19use super::docx::{parse_docx_document, render_docx_document};
20
21const DOCUMENT_PART: &str = "word/document.xml";
22
23// --- fixed packaging parts -------------------------------------------------
24
/// Fixed body of the package's `[Content_Types].xml` part.
///
/// Declares default content types for the `rels` and `xml` extensions and
/// explicit overrides for the three WordprocessingML parts this module writes:
/// `/word/document.xml`, `/word/styles.xml` and `/word/numbering.xml`.
const CONTENT_TYPES: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<Types xmlns=\"http://schemas.openxmlformats.org/package/2006/content-types\">\
<Default Extension=\"rels\" ContentType=\"application/vnd.openxmlformats-package.relationships+xml\"/>\
<Default Extension=\"xml\" ContentType=\"application/xml\"/>\
<Override PartName=\"/word/document.xml\" ContentType=\"application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml\"/>\
<Override PartName=\"/word/styles.xml\" ContentType=\"application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml\"/>\
<Override PartName=\"/word/numbering.xml\" ContentType=\"application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml\"/>\
</Types>\n";
33
/// Fixed body of the package-level relationships part (`_rels/.rels`).
///
/// Carries a single `officeDocument` relationship (`rId1`) whose target is
/// `word/document.xml`, i.e. it names the main document part of the package.
const ROOT_RELS: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<Relationships xmlns=\"http://schemas.openxmlformats.org/package/2006/relationships\">\
<Relationship Id=\"rId1\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument\" Target=\"word/document.xml\"/>\
</Relationships>\n";
38
/// Fixed body of the document-level relationships part
/// (`word/_rels/document.xml.rels`).
///
/// Links the main document to its two companion parts: `rId1` is the `styles`
/// relationship targeting `styles.xml`, `rId2` is the `numbering` relationship
/// targeting `numbering.xml` (targets are relative to the `word/` folder).
const DOCUMENT_RELS: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<Relationships xmlns=\"http://schemas.openxmlformats.org/package/2006/relationships\">\
<Relationship Id=\"rId1\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles\" Target=\"styles.xml\"/>\
<Relationship Id=\"rId2\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering\" Target=\"numbering.xml\"/>\
</Relationships>\n";
44
/// Fixed body of the `word/styles.xml` part.
///
/// Defines six paragraph styles with ids `Heading1` through `Heading6`, named
/// `heading 1` through `heading 6`, each carrying the matching zero-based
/// outline level (`0` through `5`). No other styles are declared.
const STYLES: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<w:styles xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\">\
<w:style w:type=\"paragraph\" w:styleId=\"Heading1\"><w:name w:val=\"heading 1\"/><w:pPr><w:outlineLvl w:val=\"0\"/></w:pPr></w:style>\
<w:style w:type=\"paragraph\" w:styleId=\"Heading2\"><w:name w:val=\"heading 2\"/><w:pPr><w:outlineLvl w:val=\"1\"/></w:pPr></w:style>\
<w:style w:type=\"paragraph\" w:styleId=\"Heading3\"><w:name w:val=\"heading 3\"/><w:pPr><w:outlineLvl w:val=\"2\"/></w:pPr></w:style>\
<w:style w:type=\"paragraph\" w:styleId=\"Heading4\"><w:name w:val=\"heading 4\"/><w:pPr><w:outlineLvl w:val=\"3\"/></w:pPr></w:style>\
<w:style w:type=\"paragraph\" w:styleId=\"Heading5\"><w:name w:val=\"heading 5\"/><w:pPr><w:outlineLvl w:val=\"4\"/></w:pPr></w:style>\
<w:style w:type=\"paragraph\" w:styleId=\"Heading6\"><w:name w:val=\"heading 6\"/><w:pPr><w:outlineLvl w:val=\"5\"/></w:pPr></w:style>\
</w:styles>\n";
54
/// Fixed body of the `word/numbering.xml` part.
///
/// Declares two single-level abstract numbering definitions and one concrete
/// numbering instance for each:
///
/// * `numId` 1 → abstract definition 0: `bullet` format with level text
///   U+2022 (written below as the Rust escape `\u{2022}`).
/// * `numId` 2 → abstract definition 1: `decimal` format with level text `%1.`.
const NUMBERING: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<w:numbering xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\">\
<w:abstractNum w:abstractNumId=\"0\"><w:lvl w:ilvl=\"0\"><w:numFmt w:val=\"bullet\"/><w:lvlText w:val=\"\u{2022}\"/></w:lvl></w:abstractNum>\
<w:abstractNum w:abstractNumId=\"1\"><w:lvl w:ilvl=\"0\"><w:numFmt w:val=\"decimal\"/><w:lvlText w:val=\"%1.\"/></w:lvl></w:abstractNum>\
<w:num w:numId=\"1\"><w:abstractNumId w:val=\"0\"/></w:num>\
<w:num w:numId=\"2\"><w:abstractNumId w:val=\"1\"/></w:num>\
</w:numbering>\n";
62
63// --- public API ------------------------------------------------------------
64
65/// Renders a language-free [`FormattingDocument`] into a valid `.docx` (OPC ZIP)
66/// package in the documented stored-entry profile.
67#[must_use]
68pub fn render_docx_package(document: &FormattingDocument) -> Vec<u8> {
69    let document_xml = render_docx_document(document);
70    let entries: [(&str, &[u8]); 6] = [
71        ("[Content_Types].xml", CONTENT_TYPES.as_bytes()),
72        ("_rels/.rels", ROOT_RELS.as_bytes()),
73        (DOCUMENT_PART, document_xml.as_bytes()),
74        ("word/_rels/document.xml.rels", DOCUMENT_RELS.as_bytes()),
75        ("word/styles.xml", STYLES.as_bytes()),
76        ("word/numbering.xml", NUMBERING.as_bytes()),
77    ];
78    write_zip(&entries)
79}
80
81/// Parses the `word/document.xml` part of a stored-profile `.docx` package back
82/// into the language-free concept layer.
83///
84/// Returns an empty document when the package has no extractable
85/// `word/document.xml` (for example a DEFLATE-compressed external file), so
86/// out-of-profile packages degrade gracefully rather than panicking.
87#[must_use]
88pub fn parse_docx_package(bytes: &[u8]) -> FormattingDocument {
89    read_zip_entry(bytes, DOCUMENT_PART)
90        .and_then(|data| String::from_utf8(data).ok())
91        .map_or_else(FormattingDocument::default, |document_xml| {
92            parse_docx_document(&document_xml)
93        })
94}
95
96/// Whether `bytes` is a stored-profile `.docx` package carrying at least one
97/// recognized block in its `word/document.xml` part.
98#[must_use]
99pub fn docx_package_is_recognized(bytes: &[u8]) -> bool {
100    !parse_docx_package(bytes).blocks.is_empty()
101}
102
103// --- ZIP writer (stored entries) -------------------------------------------
104
/// Signature that opens every local file header; written little-endian it
/// appears in the archive as the bytes `PK\x03\x04`.
const LOCAL_HEADER_SIGNATURE: u32 = 0x0403_4b50;
/// Signature that opens every central directory header (`PK\x01\x02` in the
/// archive).
const CENTRAL_HEADER_SIGNATURE: u32 = 0x0201_4b50;
/// Signature that opens the end-of-central-directory record (`PK\x05\x06` in
/// the archive).
const END_OF_CENTRAL_SIGNATURE: u32 = 0x0605_4b50;
/// Value written into the "version made by" and "version needed" header
/// fields. NOTE(review): presumably 20 encodes ZIP specification version 2.0 —
/// confirm against the ZIP APPNOTE.
const VERSION: u16 = 20;
109
110fn write_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
111    let mut output = Vec::new();
112    let mut central = Vec::new();
113    let mut offsets = Vec::with_capacity(entries.len());
114
115    for (name, data) in entries {
116        offsets.push(u32::try_from(output.len()).expect("archive offset fits in u32"));
117        let crc = crc32(data);
118        let size = u32::try_from(data.len()).expect("entry length fits in u32");
119        let name_len = u16::try_from(name.len()).expect("name length fits in u16");
120
121        // Local file header.
122        push_u32(&mut output, LOCAL_HEADER_SIGNATURE);
123        push_u16(&mut output, VERSION);
124        push_u16(&mut output, 0); // general purpose flags
125        push_u16(&mut output, 0); // method: stored
126        push_u16(&mut output, 0); // mod time
127        push_u16(&mut output, 0); // mod date
128        push_u32(&mut output, crc);
129        push_u32(&mut output, size); // compressed size
130        push_u32(&mut output, size); // uncompressed size
131        push_u16(&mut output, name_len);
132        push_u16(&mut output, 0); // extra length
133        output.extend_from_slice(name.as_bytes());
134        output.extend_from_slice(data);
135
136        // Central directory header (deferred).
137        push_u32(&mut central, CENTRAL_HEADER_SIGNATURE);
138        push_u16(&mut central, VERSION); // version made by
139        push_u16(&mut central, VERSION); // version needed
140        push_u16(&mut central, 0);
141        push_u16(&mut central, 0); // method: stored
142        push_u16(&mut central, 0);
143        push_u16(&mut central, 0);
144        push_u32(&mut central, crc);
145        push_u32(&mut central, size);
146        push_u32(&mut central, size);
147        push_u16(&mut central, name_len);
148        push_u16(&mut central, 0); // extra
149        push_u16(&mut central, 0); // comment
150        push_u16(&mut central, 0); // disk number
151        push_u16(&mut central, 0); // internal attrs
152        push_u32(&mut central, 0); // external attrs
153        push_u32(&mut central, *offsets.last().expect("offset pushed above"));
154        central.extend_from_slice(name.as_bytes());
155    }
156
157    let central_offset = u32::try_from(output.len()).expect("central offset fits in u32");
158    let central_size = u32::try_from(central.len()).expect("central size fits in u32");
159    let count = u16::try_from(entries.len()).expect("entry count fits in u16");
160    output.extend_from_slice(&central);
161
162    // End of central directory record.
163    push_u32(&mut output, END_OF_CENTRAL_SIGNATURE);
164    push_u16(&mut output, 0); // disk number
165    push_u16(&mut output, 0); // disk with central dir
166    push_u16(&mut output, count);
167    push_u16(&mut output, count);
168    push_u32(&mut output, central_size);
169    push_u32(&mut output, central_offset);
170    push_u16(&mut output, 0); // comment length
171
172    output
173}
174
175// --- ZIP reader (stored entries) -------------------------------------------
176
177/// Extracts the stored data of `name` by scanning local file headers.
178fn read_zip_entry(bytes: &[u8], name: &str) -> Option<Vec<u8>> {
179    let mut cursor = 0usize;
180    while cursor + 30 <= bytes.len() {
181        if read_u32(bytes, cursor)? != LOCAL_HEADER_SIGNATURE {
182            break;
183        }
184        let method = read_u16(bytes, cursor + 8)?;
185        let compressed = read_u32(bytes, cursor + 18)? as usize;
186        let name_len = read_u16(bytes, cursor + 26)? as usize;
187        let extra_len = read_u16(bytes, cursor + 28)? as usize;
188        let name_start = cursor + 30;
189        let data_start = name_start + name_len + extra_len;
190        let data_end = data_start + compressed;
191        if data_end > bytes.len() {
192            break;
193        }
194        let entry_name = bytes.get(name_start..name_start + name_len)?;
195        if entry_name == name.as_bytes() && method == 0 {
196            return Some(bytes[data_start..data_end].to_vec());
197        }
198        cursor = data_end;
199    }
200    None
201}
202
203// --- little-endian helpers -------------------------------------------------
204
/// Appends `value` to `buffer` as two bytes, least significant byte first.
fn push_u16(buffer: &mut Vec<u8>, value: u16) {
    let encoded = value.to_le_bytes();
    buffer.extend(encoded);
}
208
/// Appends `value` to `buffer` as four bytes, least significant byte first.
fn push_u32(buffer: &mut Vec<u8>, value: u32) {
    let encoded = value.to_le_bytes();
    buffer.extend(encoded);
}
212
/// Reads a little-endian `u16` starting at `offset`.
///
/// Returns `None` when fewer than two bytes are available at `offset`. The
/// end of the range is computed with checked arithmetic, so an `offset` close
/// to `usize::MAX` also yields `None` instead of overflowing.
fn read_u16(bytes: &[u8], offset: usize) -> Option<u16> {
    let end = offset.checked_add(2)?;
    match bytes.get(offset..end)? {
        &[low, high] => Some(u16::from_le_bytes([low, high])),
        _ => None,
    }
}
217
/// Reads a little-endian `u32` starting at `offset`.
///
/// Returns `None` when fewer than four bytes are available at `offset`. The
/// end of the range is computed with checked arithmetic, so an `offset` close
/// to `usize::MAX` also yields `None` instead of overflowing.
fn read_u32(bytes: &[u8], offset: usize) -> Option<u32> {
    let end = offset.checked_add(4)?;
    match bytes.get(offset..end)? {
        &[b0, b1, b2, b3] => Some(u32::from_le_bytes([b0, b1, b2, b3])),
        _ => None,
    }
}
222
223// --- CRC-32 (IEEE 802.3, polynomial 0xEDB88320) ----------------------------
224
/// Computes the CRC-32 checksum of `data` bit by bit, without a lookup table.
///
/// Uses the reflected polynomial `0xEDB88320`, an all-ones initial register
/// and a final bitwise inversion.
fn crc32(data: &[u8]) -> u32 {
    const POLYNOMIAL: u32 = 0xEDB8_8320;

    let register = data.iter().fold(0xFFFF_FFFF_u32, |state, &byte| {
        // Mix the byte into the low bits, then shift out eight bits, folding
        // in the polynomial whenever the bit that falls off is set.
        (0..8).fold(state ^ u32::from(byte), |bits, _| {
            if bits & 1 == 1 {
                (bits >> 1) ^ POLYNOMIAL
            } else {
                bits >> 1
            }
        })
    });
    !register
}