// meta_language/natural_language_grammar.rs

1use crate::link_flags::LinkFlags;
2use crate::link_network::{LinkId, LinkMetadata, LinkNetwork, LinkType};
3use crate::source::{ByteRange, Point, SourceSpan};
4
/// A starter grammar fixture for one natural-language target: one sentence
/// expected to pass and one expected to fail, plus provenance notes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NaturalLanguageGrammarFixture {
    /// Name of the natural language the sentence pair belongs to.
    language: &'static str,
    /// Sentence expected to parse without grammar-recovery links.
    grammatical_source: &'static str,
    /// Sentence expected to parse with recoverable grammar-recovery links.
    ungrammatical_source: &'static str,
    /// Origin of the fixture and its vocabulary, including license notes.
    provenance: &'static str,
}

impl NaturalLanguageGrammarFixture {
    /// Returns the natural-language target this fixture pair covers.
    #[must_use]
    pub const fn language(&self) -> &'static str {
        self.language
    }

    /// Returns the source text that should parse without grammar-recovery links.
    #[must_use]
    pub const fn grammatical_source(&self) -> &'static str {
        self.grammatical_source
    }

    /// Returns the source text that should parse with recoverable
    /// grammar-recovery links.
    #[must_use]
    pub const fn ungrammatical_source(&self) -> &'static str {
        self.ungrammatical_source
    }

    /// Returns the fixture and vocabulary provenance, including license notes.
    #[must_use]
    pub const fn provenance(&self) -> &'static str {
        self.provenance
    }
}
39
/// Provenance note attached to every starter fixture: authorship, license,
/// and the tag vocabulary the annotations borrow their names from.
const STARTER_GRAMMAR_PROVENANCE: &str = concat!(
    "repo-authored starter pass/fail sentence; license: Unlicense; ",
    "morphosyntax tag names use Universal Dependencies v2 UPOS/UFeats/deprel vocabulary; ",
    "no UD treebank sentence data imported",
);
44
45/// Executable natural-language grammar fixture pairs for each target language.
46pub const NATURAL_LANGUAGE_GRAMMAR_FIXTURES: &[NaturalLanguageGrammarFixture] = &[
47    NaturalLanguageGrammarFixture {
48        language: "English",
49        grammatical_source: "Hawaii is a state.\n",
50        ungrammatical_source: "Hawaii are a state.\n",
51        provenance: STARTER_GRAMMAR_PROVENANCE,
52    },
53    NaturalLanguageGrammarFixture {
54        language: "Mandarin Chinese",
55        grammatical_source: "你好。\n",
56        ungrammatical_source: "你好 的。\n",
57        provenance: STARTER_GRAMMAR_PROVENANCE,
58    },
59    NaturalLanguageGrammarFixture {
60        language: "Hindi",
61        grammatical_source: "नमस्ते।\n",
62        ungrammatical_source: "नमस्ते है।\n",
63        provenance: STARTER_GRAMMAR_PROVENANCE,
64    },
65    NaturalLanguageGrammarFixture {
66        language: "Spanish",
67        grammatical_source: "Hawaii es un estado.\n",
68        ungrammatical_source: "Hawaii son un estado.\n",
69        provenance: STARTER_GRAMMAR_PROVENANCE,
70    },
71    NaturalLanguageGrammarFixture {
72        language: "Modern Standard Arabic",
73        grammatical_source: "مرحبا.\n",
74        ungrammatical_source: "مرحبا هو.\n",
75        provenance: STARTER_GRAMMAR_PROVENANCE,
76    },
77    NaturalLanguageGrammarFixture {
78        language: "French",
79        grammatical_source: "Hawaii est un etat.\n",
80        ungrammatical_source: "Hawaii sont un etat.\n",
81        provenance: STARTER_GRAMMAR_PROVENANCE,
82    },
83    NaturalLanguageGrammarFixture {
84        language: "Bengali",
85        grammatical_source: "নমস্কার।\n",
86        ungrammatical_source: "নমস্কার আছে।\n",
87        provenance: STARTER_GRAMMAR_PROVENANCE,
88    },
89    NaturalLanguageGrammarFixture {
90        language: "Portuguese",
91        grammatical_source: "Hawaii e um estado.\n",
92        ungrammatical_source: "Hawaii sao um estado.\n",
93        provenance: STARTER_GRAMMAR_PROVENANCE,
94    },
95    NaturalLanguageGrammarFixture {
96        language: "Russian",
97        grammatical_source: "Гавайи это штат.\n",
98        ungrammatical_source: "Гавайи это штаты.\n",
99        provenance: STARTER_GRAMMAR_PROVENANCE,
100    },
101    NaturalLanguageGrammarFixture {
102        language: "Urdu",
103        grammatical_source: "سلام۔\n",
104        ungrammatical_source: "سلام ہے۔\n",
105        provenance: STARTER_GRAMMAR_PROVENANCE,
106    },
107];
108
109#[derive(Clone, Debug, PartialEq, Eq)]
110struct GrammarToken {
111    text: String,
112    range: ByteRange,
113}
114
/// Morphosyntactic analysis recorded for one surface form.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct MorphAnalysis {
    /// Part-of-speech tag, emitted as a `upos:<tag>` link term.
    upos: &'static str,
    /// `Name=Value` feature strings, each emitted as a `ufeat:<feature>` link term.
    features: &'static [&'static str],
    /// Dependency relation, emitted as a `deprel:<relation>` link term.
    deprel: &'static str,
}
121
122#[derive(Clone, Copy, Debug, PartialEq, Eq)]
123struct LexiconEntry {
124    language: &'static str,
125    surface: &'static str,
126    analysis: MorphAnalysis,
127}
128
/// The single token sequence accepted as grammatical for one language.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SentenceGrammar {
    /// Language the accepted sequence belongs to.
    language: &'static str,
    /// Token surfaces, in order, that make up the accepted sentence.
    accepted: &'static [&'static str],
}
134
// Shared feature lists referenced by the lexicon and the punctuation analysis.

/// Feature list for forms that carry no features.
const NO_FEATURES: &[&str] = &[];

/// Grammatical number: singular.
const NUMBER_SING: &[&str] = &["Number=Sing"];

/// Grammatical number: plural.
const NUMBER_PLUR: &[&str] = &["Number=Plur"];

/// Features of an indefinite article.
const INDEFINITE_ARTICLE: &[&str] = &["Definite=Ind", "PronType=Art"];

/// Features of a third-person singular, present indicative, finite form.
const AUX_SINGULAR: &[&str] = &[
    "Mood=Ind",
    "Number=Sing",
    "Person=3",
    "Tense=Pres",
    "VerbForm=Fin",
];

/// Features of a third-person plural, present indicative, finite form.
const AUX_PLURAL: &[&str] = &[
    "Mood=Ind",
    "Number=Plur",
    "Person=3",
    "Tense=Pres",
    "VerbForm=Fin",
];

/// Features of a third-person singular personal pronoun.
const PRON_SINGULAR: &[&str] = &["Number=Sing", "Person=3", "PronType=Prs"];
154
155const PUNCT_ANALYSIS: MorphAnalysis = MorphAnalysis {
156    upos: "PUNCT",
157    features: NO_FEATURES,
158    deprel: "punct",
159};
160
161const SENTENCE_GRAMMARS: &[SentenceGrammar] = &[
162    SentenceGrammar {
163        language: "English",
164        accepted: &["Hawaii", "is", "a", "state", "."],
165    },
166    SentenceGrammar {
167        language: "Mandarin Chinese",
168        accepted: &["你好", "\u{3002}"],
169    },
170    SentenceGrammar {
171        language: "Hindi",
172        accepted: &["नमस्ते", "\u{0964}"],
173    },
174    SentenceGrammar {
175        language: "Spanish",
176        accepted: &["Hawaii", "es", "un", "estado", "."],
177    },
178    SentenceGrammar {
179        language: "Modern Standard Arabic",
180        accepted: &["مرحبا", "."],
181    },
182    SentenceGrammar {
183        language: "French",
184        accepted: &["Hawaii", "est", "un", "etat", "."],
185    },
186    SentenceGrammar {
187        language: "Bengali",
188        accepted: &["নমস্কার", "\u{0964}"],
189    },
190    SentenceGrammar {
191        language: "Portuguese",
192        accepted: &["Hawaii", "e", "um", "estado", "."],
193    },
194    SentenceGrammar {
195        language: "Russian",
196        accepted: &["Гавайи", "это", "штат", "."],
197    },
198    SentenceGrammar {
199        language: "Urdu",
200        accepted: &["سلام", "\u{06d4}"],
201    },
202];
203
204const LEXICON: &[LexiconEntry] = &[
205    LexiconEntry {
206        language: "English",
207        surface: "Hawaii",
208        analysis: MorphAnalysis {
209            upos: "PROPN",
210            features: NUMBER_SING,
211            deprel: "nsubj",
212        },
213    },
214    LexiconEntry {
215        language: "English",
216        surface: "is",
217        analysis: MorphAnalysis {
218            upos: "AUX",
219            features: AUX_SINGULAR,
220            deprel: "cop",
221        },
222    },
223    LexiconEntry {
224        language: "English",
225        surface: "are",
226        analysis: MorphAnalysis {
227            upos: "AUX",
228            features: AUX_PLURAL,
229            deprel: "cop",
230        },
231    },
232    LexiconEntry {
233        language: "English",
234        surface: "a",
235        analysis: MorphAnalysis {
236            upos: "DET",
237            features: INDEFINITE_ARTICLE,
238            deprel: "det",
239        },
240    },
241    LexiconEntry {
242        language: "English",
243        surface: "state",
244        analysis: MorphAnalysis {
245            upos: "NOUN",
246            features: NUMBER_SING,
247            deprel: "root",
248        },
249    },
250    LexiconEntry {
251        language: "Mandarin Chinese",
252        surface: "你好",
253        analysis: MorphAnalysis {
254            upos: "INTJ",
255            features: NO_FEATURES,
256            deprel: "root",
257        },
258    },
259    LexiconEntry {
260        language: "Mandarin Chinese",
261        surface: "的",
262        analysis: MorphAnalysis {
263            upos: "PART",
264            features: NO_FEATURES,
265            deprel: "mark",
266        },
267    },
268    LexiconEntry {
269        language: "Hindi",
270        surface: "नमस्ते",
271        analysis: MorphAnalysis {
272            upos: "INTJ",
273            features: NO_FEATURES,
274            deprel: "root",
275        },
276    },
277    LexiconEntry {
278        language: "Hindi",
279        surface: "है",
280        analysis: MorphAnalysis {
281            upos: "AUX",
282            features: AUX_SINGULAR,
283            deprel: "cop",
284        },
285    },
286    LexiconEntry {
287        language: "Spanish",
288        surface: "Hawaii",
289        analysis: MorphAnalysis {
290            upos: "PROPN",
291            features: NUMBER_SING,
292            deprel: "nsubj",
293        },
294    },
295    LexiconEntry {
296        language: "Spanish",
297        surface: "es",
298        analysis: MorphAnalysis {
299            upos: "AUX",
300            features: AUX_SINGULAR,
301            deprel: "cop",
302        },
303    },
304    LexiconEntry {
305        language: "Spanish",
306        surface: "son",
307        analysis: MorphAnalysis {
308            upos: "AUX",
309            features: AUX_PLURAL,
310            deprel: "cop",
311        },
312    },
313    LexiconEntry {
314        language: "Spanish",
315        surface: "un",
316        analysis: MorphAnalysis {
317            upos: "DET",
318            features: INDEFINITE_ARTICLE,
319            deprel: "det",
320        },
321    },
322    LexiconEntry {
323        language: "Spanish",
324        surface: "estado",
325        analysis: MorphAnalysis {
326            upos: "NOUN",
327            features: NUMBER_SING,
328            deprel: "root",
329        },
330    },
331    LexiconEntry {
332        language: "Modern Standard Arabic",
333        surface: "مرحبا",
334        analysis: MorphAnalysis {
335            upos: "INTJ",
336            features: NO_FEATURES,
337            deprel: "root",
338        },
339    },
340    LexiconEntry {
341        language: "Modern Standard Arabic",
342        surface: "هو",
343        analysis: MorphAnalysis {
344            upos: "PRON",
345            features: PRON_SINGULAR,
346            deprel: "dep",
347        },
348    },
349    LexiconEntry {
350        language: "French",
351        surface: "Hawaii",
352        analysis: MorphAnalysis {
353            upos: "PROPN",
354            features: NUMBER_SING,
355            deprel: "nsubj",
356        },
357    },
358    LexiconEntry {
359        language: "French",
360        surface: "est",
361        analysis: MorphAnalysis {
362            upos: "AUX",
363            features: AUX_SINGULAR,
364            deprel: "cop",
365        },
366    },
367    LexiconEntry {
368        language: "French",
369        surface: "sont",
370        analysis: MorphAnalysis {
371            upos: "AUX",
372            features: AUX_PLURAL,
373            deprel: "cop",
374        },
375    },
376    LexiconEntry {
377        language: "French",
378        surface: "un",
379        analysis: MorphAnalysis {
380            upos: "DET",
381            features: INDEFINITE_ARTICLE,
382            deprel: "det",
383        },
384    },
385    LexiconEntry {
386        language: "French",
387        surface: "etat",
388        analysis: MorphAnalysis {
389            upos: "NOUN",
390            features: NUMBER_SING,
391            deprel: "root",
392        },
393    },
394    LexiconEntry {
395        language: "Bengali",
396        surface: "নমস্কার",
397        analysis: MorphAnalysis {
398            upos: "INTJ",
399            features: NO_FEATURES,
400            deprel: "root",
401        },
402    },
403    LexiconEntry {
404        language: "Bengali",
405        surface: "আছে",
406        analysis: MorphAnalysis {
407            upos: "AUX",
408            features: AUX_SINGULAR,
409            deprel: "cop",
410        },
411    },
412    LexiconEntry {
413        language: "Portuguese",
414        surface: "Hawaii",
415        analysis: MorphAnalysis {
416            upos: "PROPN",
417            features: NUMBER_SING,
418            deprel: "nsubj",
419        },
420    },
421    LexiconEntry {
422        language: "Portuguese",
423        surface: "e",
424        analysis: MorphAnalysis {
425            upos: "AUX",
426            features: AUX_SINGULAR,
427            deprel: "cop",
428        },
429    },
430    LexiconEntry {
431        language: "Portuguese",
432        surface: "sao",
433        analysis: MorphAnalysis {
434            upos: "AUX",
435            features: AUX_PLURAL,
436            deprel: "cop",
437        },
438    },
439    LexiconEntry {
440        language: "Portuguese",
441        surface: "um",
442        analysis: MorphAnalysis {
443            upos: "DET",
444            features: INDEFINITE_ARTICLE,
445            deprel: "det",
446        },
447    },
448    LexiconEntry {
449        language: "Portuguese",
450        surface: "estado",
451        analysis: MorphAnalysis {
452            upos: "NOUN",
453            features: NUMBER_SING,
454            deprel: "root",
455        },
456    },
457    LexiconEntry {
458        language: "Russian",
459        surface: "Гавайи",
460        analysis: MorphAnalysis {
461            upos: "PROPN",
462            features: NUMBER_SING,
463            deprel: "nsubj",
464        },
465    },
466    LexiconEntry {
467        language: "Russian",
468        surface: "это",
469        analysis: MorphAnalysis {
470            upos: "PRON",
471            features: PRON_SINGULAR,
472            deprel: "cop",
473        },
474    },
475    LexiconEntry {
476        language: "Russian",
477        surface: "штат",
478        analysis: MorphAnalysis {
479            upos: "NOUN",
480            features: NUMBER_SING,
481            deprel: "root",
482        },
483    },
484    LexiconEntry {
485        language: "Russian",
486        surface: "штаты",
487        analysis: MorphAnalysis {
488            upos: "NOUN",
489            features: NUMBER_PLUR,
490            deprel: "root",
491        },
492    },
493    LexiconEntry {
494        language: "Urdu",
495        surface: "سلام",
496        analysis: MorphAnalysis {
497            upos: "INTJ",
498            features: NO_FEATURES,
499            deprel: "root",
500        },
501    },
502    LexiconEntry {
503        language: "Urdu",
504        surface: "ہے",
505        analysis: MorphAnalysis {
506            upos: "AUX",
507            features: AUX_SINGULAR,
508            deprel: "cop",
509        },
510    },
511];
512
513pub fn annotate_morphosyntax(
514    network: &mut LinkNetwork,
515    region: LinkId,
516    text: &str,
517    language: &str,
518    span: SourceSpan,
519) {
520    let grammar_tokens = grammar_tokens(text);
521    if grammar_tokens.is_empty() {
522        return;
523    }
524
525    let sentence_is_accepted = sentence_grammar(language)
526        .is_some_and(|grammar| token_surfaces_match(&grammar_tokens, grammar.accepted));
527    let should_report_errors =
528        is_registered_grammar_fixture(language, text) && !sentence_is_accepted;
529    let sentence_flags = if sentence_is_accepted {
530        LinkFlags::clean()
531    } else if should_report_errors {
532        LinkFlags::containing_error()
533    } else {
534        LinkFlags::clean()
535    };
536    let sentence = network.insert_link(
537        [region],
538        LinkMetadata::new()
539            .with_link_type(LinkType::Syntax)
540            .with_named(true)
541            .with_term("natural-language:sentence")
542            .with_language(language)
543            .with_span(span)
544            .with_flags(sentence_flags),
545    );
546
547    for token in &grammar_tokens {
548        let token_span = span_for_range(text, token.range.start(), token.range.end());
549        let form = network.insert_link(
550            [sentence],
551            LinkMetadata::new()
552                .with_link_type(LinkType::Syntax)
553                .with_named(true)
554                .with_term(format!("form:{}", token.text))
555                .with_language(language)
556                .with_span(token_span),
557        );
558
559        if let Some(analysis) = morphology_for(language, &token.text) {
560            insert_upos_link(network, form, language, token_span, analysis.upos);
561            for feature in analysis.features {
562                insert_ufeat_link(network, form, language, token_span, feature);
563            }
564            insert_deprel_link(
565                network,
566                sentence,
567                form,
568                language,
569                token_span,
570                analysis.deprel,
571            );
572        } else if should_report_errors {
573            insert_error_link(
574                network,
575                [sentence, form],
576                "natural-language:error:unknown-token",
577                language,
578                token_span,
579            );
580        }
581    }
582
583    if should_report_errors {
584        insert_error_link(
585            network,
586            [sentence],
587            "natural-language:error:grammar",
588            language,
589            span,
590        );
591    }
592}
593
594fn insert_upos_link(
595    network: &mut LinkNetwork,
596    form: LinkId,
597    language: &str,
598    span: SourceSpan,
599    upos: &str,
600) -> LinkId {
601    network.insert_link(
602        [form],
603        LinkMetadata::new()
604            .with_link_type(LinkType::Syntax)
605            .with_named(true)
606            .with_term(format!("upos:{upos}"))
607            .with_language(language)
608            .with_span(span),
609    )
610}
611
612fn insert_ufeat_link(
613    network: &mut LinkNetwork,
614    form: LinkId,
615    language: &str,
616    span: SourceSpan,
617    feature: &str,
618) -> LinkId {
619    network.insert_link(
620        [form],
621        LinkMetadata::new()
622            .with_link_type(LinkType::Syntax)
623            .with_named(true)
624            .with_term(format!("ufeat:{feature}"))
625            .with_language(language)
626            .with_span(span),
627    )
628}
629
630fn insert_deprel_link(
631    network: &mut LinkNetwork,
632    sentence: LinkId,
633    form: LinkId,
634    language: &str,
635    span: SourceSpan,
636    deprel: &str,
637) -> LinkId {
638    network.insert_link(
639        [sentence, form],
640        LinkMetadata::new()
641            .with_link_type(LinkType::Syntax)
642            .with_named(true)
643            .with_term(format!("deprel:{deprel}"))
644            .with_language(language)
645            .with_span(span),
646    )
647}
648
649fn insert_error_link<const N: usize>(
650    network: &mut LinkNetwork,
651    references: [LinkId; N],
652    term: &'static str,
653    language: &str,
654    span: SourceSpan,
655) -> LinkId {
656    network.insert_link(
657        references,
658        LinkMetadata::new()
659            .with_link_type(LinkType::Syntax)
660            .with_named(true)
661            .with_term(term)
662            .with_language(language)
663            .with_span(span)
664            .with_flags(LinkFlags::error()),
665    )
666}
667
668fn sentence_grammar(language: &str) -> Option<SentenceGrammar> {
669    SENTENCE_GRAMMARS
670        .iter()
671        .find(|grammar| grammar.language == language)
672        .copied()
673}
674
675fn token_surfaces_match(tokens: &[GrammarToken], expected: &[&str]) -> bool {
676    tokens.len() == expected.len()
677        && tokens
678            .iter()
679            .zip(expected.iter().copied())
680            .all(|(token, expected)| token.text == expected)
681}
682
683fn morphology_for(language: &str, surface: &str) -> Option<MorphAnalysis> {
684    if is_sentence_punctuation_token(surface) {
685        return Some(PUNCT_ANALYSIS);
686    }
687
688    LEXICON
689        .iter()
690        .find(|entry| entry.language == language && entry.surface == surface)
691        .map(|entry| entry.analysis)
692}
693
694fn is_registered_grammar_fixture(language: &str, text: &str) -> bool {
695    NATURAL_LANGUAGE_GRAMMAR_FIXTURES
696        .iter()
697        .filter(|fixture| fixture.language == language)
698        .any(|fixture| text == fixture.grammatical_source || text == fixture.ungrammatical_source)
699}
700
701fn grammar_tokens(text: &str) -> Vec<GrammarToken> {
702    let mut tokens = Vec::new();
703    let mut word_start = None;
704
705    for (index, character) in text.char_indices() {
706        if character.is_whitespace() {
707            push_pending_word_token(&mut tokens, text, &mut word_start, index);
708        } else if is_sentence_punctuation(character) {
709            push_pending_word_token(&mut tokens, text, &mut word_start, index);
710            tokens.push(GrammarToken {
711                text: character.to_string(),
712                range: ByteRange::new(index, index + character.len_utf8()),
713            });
714        } else if word_start.is_none() {
715            word_start = Some(index);
716        }
717    }
718
719    push_pending_word_token(&mut tokens, text, &mut word_start, text.len());
720    tokens
721}
722
723fn push_pending_word_token(
724    tokens: &mut Vec<GrammarToken>,
725    text: &str,
726    word_start: &mut Option<usize>,
727    end: usize,
728) {
729    let Some(start) = word_start.take() else {
730        return;
731    };
732    if start == end {
733        return;
734    }
735    tokens.push(GrammarToken {
736        text: text[start..end].to_string(),
737        range: ByteRange::new(start, end),
738    });
739}
740
741fn is_sentence_punctuation_token(surface: &str) -> bool {
742    let mut characters = surface.chars();
743    let Some(character) = characters.next() else {
744        return false;
745    };
746    characters.next().is_none() && is_sentence_punctuation(character)
747}
748
/// Returns `true` for the characters treated as sentence punctuation:
/// `.`, `!`, `?`, U+0964, U+3002, and U+06D4.
const fn is_sentence_punctuation(character: char) -> bool {
    /// U+0964 DEVANAGARI DANDA.
    const DANDA: char = '\u{0964}';
    /// U+3002 IDEOGRAPHIC FULL STOP.
    const IDEOGRAPHIC_FULL_STOP: char = '\u{3002}';
    /// U+06D4 ARABIC FULL STOP.
    const ARABIC_FULL_STOP: char = '\u{06d4}';

    matches!(
        character,
        '.' | '!' | '?' | DANDA | IDEOGRAPHIC_FULL_STOP | ARABIC_FULL_STOP
    )
}
755
756fn span_for_range(text: &str, start: usize, end: usize) -> SourceSpan {
757    SourceSpan::new(
758        ByteRange::new(start, end),
759        point_at_byte(text, start),
760        point_at_byte(text, end),
761    )
762}
763
764fn point_at_byte(text: &str, byte: usize) -> Point {
765    let mut row = 0;
766    let mut column = 0;
767
768    for (index, character) in text.char_indices() {
769        if index >= byte {
770            break;
771        }
772        if character == '\n' {
773            row += 1;
774            column = 0;
775        } else {
776            column += 1;
777        }
778    }
779
780    Point::new(row, column)
781}