1use crate::link_flags::LinkFlags;
2use crate::link_network::{LinkId, LinkMetadata, LinkNetwork, LinkType};
3use crate::source::{ByteRange, Point, SourceSpan};
4
/// A pair of sample sentences for one language: one the starter grammar
/// accepts and one it does not, plus a note on where the pair came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NaturalLanguageGrammarFixture {
    /// Language name; compared by string equality when looking up fixtures.
    language: &'static str,
    /// Source text of the grammatical sample.
    grammatical_source: &'static str,
    /// Source text of the ungrammatical sample.
    ungrammatical_source: &'static str,
    /// Free-form provenance / licensing note for the sample pair.
    provenance: &'static str,
}

impl NaturalLanguageGrammarFixture {
    /// Returns the language name this fixture belongs to.
    #[must_use]
    pub const fn language(&self) -> &'static str {
        self.language
    }

    /// Returns the source text of the grammatical sample.
    #[must_use]
    pub const fn grammatical_source(&self) -> &'static str {
        self.grammatical_source
    }

    /// Returns the source text of the ungrammatical sample.
    #[must_use]
    pub const fn ungrammatical_source(&self) -> &'static str {
        self.ungrammatical_source
    }

    /// Returns the provenance note attached to this fixture.
    #[must_use]
    pub const fn provenance(&self) -> &'static str {
        self.provenance
    }
}
39
/// Provenance note shared by every starter fixture in this file.
const STARTER_GRAMMAR_PROVENANCE: &str = concat!(
    "repo-authored starter pass/fail sentence; license: Unlicense; ",
    "morphosyntax tag names use Universal Dependencies v2 UPOS/UFeats/deprel vocabulary; ",
    "no UD treebank sentence data imported",
);
44
45pub const NATURAL_LANGUAGE_GRAMMAR_FIXTURES: &[NaturalLanguageGrammarFixture] = &[
47 NaturalLanguageGrammarFixture {
48 language: "English",
49 grammatical_source: "Hawaii is a state.\n",
50 ungrammatical_source: "Hawaii are a state.\n",
51 provenance: STARTER_GRAMMAR_PROVENANCE,
52 },
53 NaturalLanguageGrammarFixture {
54 language: "Mandarin Chinese",
55 grammatical_source: "你好。\n",
56 ungrammatical_source: "你好 的。\n",
57 provenance: STARTER_GRAMMAR_PROVENANCE,
58 },
59 NaturalLanguageGrammarFixture {
60 language: "Hindi",
61 grammatical_source: "नमस्ते।\n",
62 ungrammatical_source: "नमस्ते है।\n",
63 provenance: STARTER_GRAMMAR_PROVENANCE,
64 },
65 NaturalLanguageGrammarFixture {
66 language: "Spanish",
67 grammatical_source: "Hawaii es un estado.\n",
68 ungrammatical_source: "Hawaii son un estado.\n",
69 provenance: STARTER_GRAMMAR_PROVENANCE,
70 },
71 NaturalLanguageGrammarFixture {
72 language: "Modern Standard Arabic",
73 grammatical_source: "مرحبا.\n",
74 ungrammatical_source: "مرحبا هو.\n",
75 provenance: STARTER_GRAMMAR_PROVENANCE,
76 },
77 NaturalLanguageGrammarFixture {
78 language: "French",
79 grammatical_source: "Hawaii est un etat.\n",
80 ungrammatical_source: "Hawaii sont un etat.\n",
81 provenance: STARTER_GRAMMAR_PROVENANCE,
82 },
83 NaturalLanguageGrammarFixture {
84 language: "Bengali",
85 grammatical_source: "নমস্কার।\n",
86 ungrammatical_source: "নমস্কার আছে।\n",
87 provenance: STARTER_GRAMMAR_PROVENANCE,
88 },
89 NaturalLanguageGrammarFixture {
90 language: "Portuguese",
91 grammatical_source: "Hawaii e um estado.\n",
92 ungrammatical_source: "Hawaii sao um estado.\n",
93 provenance: STARTER_GRAMMAR_PROVENANCE,
94 },
95 NaturalLanguageGrammarFixture {
96 language: "Russian",
97 grammatical_source: "Гавайи это штат.\n",
98 ungrammatical_source: "Гавайи это штаты.\n",
99 provenance: STARTER_GRAMMAR_PROVENANCE,
100 },
101 NaturalLanguageGrammarFixture {
102 language: "Urdu",
103 grammatical_source: "سلام۔\n",
104 ungrammatical_source: "سلام ہے۔\n",
105 provenance: STARTER_GRAMMAR_PROVENANCE,
106 },
107];
108
109#[derive(Clone, Debug, PartialEq, Eq)]
110struct GrammarToken {
111 text: String,
112 range: ByteRange,
113}
114
/// Morphosyntactic tags recorded for a single surface form.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct MorphAnalysis {
    /// Part-of-speech tag, emitted as a `upos:` term.
    upos: &'static str,
    /// `Name=Value` feature strings, each emitted as a `ufeat:` term.
    features: &'static [&'static str],
    /// Dependency relation label, emitted as a `deprel:` term.
    deprel: &'static str,
}
121
122#[derive(Clone, Copy, Debug, PartialEq, Eq)]
123struct LexiconEntry {
124 language: &'static str,
125 surface: &'static str,
126 analysis: MorphAnalysis,
127}
128
/// The single token sequence accepted as grammatical for a language.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SentenceGrammar {
    /// Language name the grammar applies to.
    language: &'static str,
    /// Token surfaces, in order, that make up the accepted sentence.
    accepted: &'static [&'static str],
}
134
/// Feature list for forms that carry no morphological features.
const NO_FEATURES: &[&str] = &[];

/// Feature list for singular nominals.
const NUMBER_SING: &[&str] = &["Number=Sing"];

/// Feature list for plural nominals.
const NUMBER_PLUR: &[&str] = &["Number=Plur"];

/// Feature list for indefinite articles.
const INDEFINITE_ARTICLE: &[&str] = &["Definite=Ind", "PronType=Art"];

/// Feature list for third-person singular present indicative finite forms.
const AUX_SINGULAR: &[&str] = &[
    "Mood=Ind",
    "Number=Sing",
    "Person=3",
    "Tense=Pres",
    "VerbForm=Fin",
];

/// Feature list for third-person plural present indicative finite forms.
const AUX_PLURAL: &[&str] = &[
    "Mood=Ind",
    "Number=Plur",
    "Person=3",
    "Tense=Pres",
    "VerbForm=Fin",
];

/// Feature list for third-person singular personal pronouns.
const PRON_SINGULAR: &[&str] = &["Number=Sing", "Person=3", "PronType=Prs"];
154
155const PUNCT_ANALYSIS: MorphAnalysis = MorphAnalysis {
156 upos: "PUNCT",
157 features: NO_FEATURES,
158 deprel: "punct",
159};
160
161const SENTENCE_GRAMMARS: &[SentenceGrammar] = &[
162 SentenceGrammar {
163 language: "English",
164 accepted: &["Hawaii", "is", "a", "state", "."],
165 },
166 SentenceGrammar {
167 language: "Mandarin Chinese",
168 accepted: &["你好", "\u{3002}"],
169 },
170 SentenceGrammar {
171 language: "Hindi",
172 accepted: &["नमस्ते", "\u{0964}"],
173 },
174 SentenceGrammar {
175 language: "Spanish",
176 accepted: &["Hawaii", "es", "un", "estado", "."],
177 },
178 SentenceGrammar {
179 language: "Modern Standard Arabic",
180 accepted: &["مرحبا", "."],
181 },
182 SentenceGrammar {
183 language: "French",
184 accepted: &["Hawaii", "est", "un", "etat", "."],
185 },
186 SentenceGrammar {
187 language: "Bengali",
188 accepted: &["নমস্কার", "\u{0964}"],
189 },
190 SentenceGrammar {
191 language: "Portuguese",
192 accepted: &["Hawaii", "e", "um", "estado", "."],
193 },
194 SentenceGrammar {
195 language: "Russian",
196 accepted: &["Гавайи", "это", "штат", "."],
197 },
198 SentenceGrammar {
199 language: "Urdu",
200 accepted: &["سلام", "\u{06d4}"],
201 },
202];
203
204const LEXICON: &[LexiconEntry] = &[
205 LexiconEntry {
206 language: "English",
207 surface: "Hawaii",
208 analysis: MorphAnalysis {
209 upos: "PROPN",
210 features: NUMBER_SING,
211 deprel: "nsubj",
212 },
213 },
214 LexiconEntry {
215 language: "English",
216 surface: "is",
217 analysis: MorphAnalysis {
218 upos: "AUX",
219 features: AUX_SINGULAR,
220 deprel: "cop",
221 },
222 },
223 LexiconEntry {
224 language: "English",
225 surface: "are",
226 analysis: MorphAnalysis {
227 upos: "AUX",
228 features: AUX_PLURAL,
229 deprel: "cop",
230 },
231 },
232 LexiconEntry {
233 language: "English",
234 surface: "a",
235 analysis: MorphAnalysis {
236 upos: "DET",
237 features: INDEFINITE_ARTICLE,
238 deprel: "det",
239 },
240 },
241 LexiconEntry {
242 language: "English",
243 surface: "state",
244 analysis: MorphAnalysis {
245 upos: "NOUN",
246 features: NUMBER_SING,
247 deprel: "root",
248 },
249 },
250 LexiconEntry {
251 language: "Mandarin Chinese",
252 surface: "你好",
253 analysis: MorphAnalysis {
254 upos: "INTJ",
255 features: NO_FEATURES,
256 deprel: "root",
257 },
258 },
259 LexiconEntry {
260 language: "Mandarin Chinese",
261 surface: "的",
262 analysis: MorphAnalysis {
263 upos: "PART",
264 features: NO_FEATURES,
265 deprel: "mark",
266 },
267 },
268 LexiconEntry {
269 language: "Hindi",
270 surface: "नमस्ते",
271 analysis: MorphAnalysis {
272 upos: "INTJ",
273 features: NO_FEATURES,
274 deprel: "root",
275 },
276 },
277 LexiconEntry {
278 language: "Hindi",
279 surface: "है",
280 analysis: MorphAnalysis {
281 upos: "AUX",
282 features: AUX_SINGULAR,
283 deprel: "cop",
284 },
285 },
286 LexiconEntry {
287 language: "Spanish",
288 surface: "Hawaii",
289 analysis: MorphAnalysis {
290 upos: "PROPN",
291 features: NUMBER_SING,
292 deprel: "nsubj",
293 },
294 },
295 LexiconEntry {
296 language: "Spanish",
297 surface: "es",
298 analysis: MorphAnalysis {
299 upos: "AUX",
300 features: AUX_SINGULAR,
301 deprel: "cop",
302 },
303 },
304 LexiconEntry {
305 language: "Spanish",
306 surface: "son",
307 analysis: MorphAnalysis {
308 upos: "AUX",
309 features: AUX_PLURAL,
310 deprel: "cop",
311 },
312 },
313 LexiconEntry {
314 language: "Spanish",
315 surface: "un",
316 analysis: MorphAnalysis {
317 upos: "DET",
318 features: INDEFINITE_ARTICLE,
319 deprel: "det",
320 },
321 },
322 LexiconEntry {
323 language: "Spanish",
324 surface: "estado",
325 analysis: MorphAnalysis {
326 upos: "NOUN",
327 features: NUMBER_SING,
328 deprel: "root",
329 },
330 },
331 LexiconEntry {
332 language: "Modern Standard Arabic",
333 surface: "مرحبا",
334 analysis: MorphAnalysis {
335 upos: "INTJ",
336 features: NO_FEATURES,
337 deprel: "root",
338 },
339 },
340 LexiconEntry {
341 language: "Modern Standard Arabic",
342 surface: "هو",
343 analysis: MorphAnalysis {
344 upos: "PRON",
345 features: PRON_SINGULAR,
346 deprel: "dep",
347 },
348 },
349 LexiconEntry {
350 language: "French",
351 surface: "Hawaii",
352 analysis: MorphAnalysis {
353 upos: "PROPN",
354 features: NUMBER_SING,
355 deprel: "nsubj",
356 },
357 },
358 LexiconEntry {
359 language: "French",
360 surface: "est",
361 analysis: MorphAnalysis {
362 upos: "AUX",
363 features: AUX_SINGULAR,
364 deprel: "cop",
365 },
366 },
367 LexiconEntry {
368 language: "French",
369 surface: "sont",
370 analysis: MorphAnalysis {
371 upos: "AUX",
372 features: AUX_PLURAL,
373 deprel: "cop",
374 },
375 },
376 LexiconEntry {
377 language: "French",
378 surface: "un",
379 analysis: MorphAnalysis {
380 upos: "DET",
381 features: INDEFINITE_ARTICLE,
382 deprel: "det",
383 },
384 },
385 LexiconEntry {
386 language: "French",
387 surface: "etat",
388 analysis: MorphAnalysis {
389 upos: "NOUN",
390 features: NUMBER_SING,
391 deprel: "root",
392 },
393 },
394 LexiconEntry {
395 language: "Bengali",
396 surface: "নমস্কার",
397 analysis: MorphAnalysis {
398 upos: "INTJ",
399 features: NO_FEATURES,
400 deprel: "root",
401 },
402 },
403 LexiconEntry {
404 language: "Bengali",
405 surface: "আছে",
406 analysis: MorphAnalysis {
407 upos: "AUX",
408 features: AUX_SINGULAR,
409 deprel: "cop",
410 },
411 },
412 LexiconEntry {
413 language: "Portuguese",
414 surface: "Hawaii",
415 analysis: MorphAnalysis {
416 upos: "PROPN",
417 features: NUMBER_SING,
418 deprel: "nsubj",
419 },
420 },
421 LexiconEntry {
422 language: "Portuguese",
423 surface: "e",
424 analysis: MorphAnalysis {
425 upos: "AUX",
426 features: AUX_SINGULAR,
427 deprel: "cop",
428 },
429 },
430 LexiconEntry {
431 language: "Portuguese",
432 surface: "sao",
433 analysis: MorphAnalysis {
434 upos: "AUX",
435 features: AUX_PLURAL,
436 deprel: "cop",
437 },
438 },
439 LexiconEntry {
440 language: "Portuguese",
441 surface: "um",
442 analysis: MorphAnalysis {
443 upos: "DET",
444 features: INDEFINITE_ARTICLE,
445 deprel: "det",
446 },
447 },
448 LexiconEntry {
449 language: "Portuguese",
450 surface: "estado",
451 analysis: MorphAnalysis {
452 upos: "NOUN",
453 features: NUMBER_SING,
454 deprel: "root",
455 },
456 },
457 LexiconEntry {
458 language: "Russian",
459 surface: "Гавайи",
460 analysis: MorphAnalysis {
461 upos: "PROPN",
462 features: NUMBER_SING,
463 deprel: "nsubj",
464 },
465 },
466 LexiconEntry {
467 language: "Russian",
468 surface: "это",
469 analysis: MorphAnalysis {
470 upos: "PRON",
471 features: PRON_SINGULAR,
472 deprel: "cop",
473 },
474 },
475 LexiconEntry {
476 language: "Russian",
477 surface: "штат",
478 analysis: MorphAnalysis {
479 upos: "NOUN",
480 features: NUMBER_SING,
481 deprel: "root",
482 },
483 },
484 LexiconEntry {
485 language: "Russian",
486 surface: "штаты",
487 analysis: MorphAnalysis {
488 upos: "NOUN",
489 features: NUMBER_PLUR,
490 deprel: "root",
491 },
492 },
493 LexiconEntry {
494 language: "Urdu",
495 surface: "سلام",
496 analysis: MorphAnalysis {
497 upos: "INTJ",
498 features: NO_FEATURES,
499 deprel: "root",
500 },
501 },
502 LexiconEntry {
503 language: "Urdu",
504 surface: "ہے",
505 analysis: MorphAnalysis {
506 upos: "AUX",
507 features: AUX_SINGULAR,
508 deprel: "cop",
509 },
510 },
511];
512
513pub fn annotate_morphosyntax(
514 network: &mut LinkNetwork,
515 region: LinkId,
516 text: &str,
517 language: &str,
518 span: SourceSpan,
519) {
520 let grammar_tokens = grammar_tokens(text);
521 if grammar_tokens.is_empty() {
522 return;
523 }
524
525 let sentence_is_accepted = sentence_grammar(language)
526 .is_some_and(|grammar| token_surfaces_match(&grammar_tokens, grammar.accepted));
527 let should_report_errors =
528 is_registered_grammar_fixture(language, text) && !sentence_is_accepted;
529 let sentence_flags = if sentence_is_accepted {
530 LinkFlags::clean()
531 } else if should_report_errors {
532 LinkFlags::containing_error()
533 } else {
534 LinkFlags::clean()
535 };
536 let sentence = network.insert_link(
537 [region],
538 LinkMetadata::new()
539 .with_link_type(LinkType::Syntax)
540 .with_named(true)
541 .with_term("natural-language:sentence")
542 .with_language(language)
543 .with_span(span)
544 .with_flags(sentence_flags),
545 );
546
547 for token in &grammar_tokens {
548 let token_span = span_for_range(text, token.range.start(), token.range.end());
549 let form = network.insert_link(
550 [sentence],
551 LinkMetadata::new()
552 .with_link_type(LinkType::Syntax)
553 .with_named(true)
554 .with_term(format!("form:{}", token.text))
555 .with_language(language)
556 .with_span(token_span),
557 );
558
559 if let Some(analysis) = morphology_for(language, &token.text) {
560 insert_upos_link(network, form, language, token_span, analysis.upos);
561 for feature in analysis.features {
562 insert_ufeat_link(network, form, language, token_span, feature);
563 }
564 insert_deprel_link(
565 network,
566 sentence,
567 form,
568 language,
569 token_span,
570 analysis.deprel,
571 );
572 } else if should_report_errors {
573 insert_error_link(
574 network,
575 [sentence, form],
576 "natural-language:error:unknown-token",
577 language,
578 token_span,
579 );
580 }
581 }
582
583 if should_report_errors {
584 insert_error_link(
585 network,
586 [sentence],
587 "natural-language:error:grammar",
588 language,
589 span,
590 );
591 }
592}
593
594fn insert_upos_link(
595 network: &mut LinkNetwork,
596 form: LinkId,
597 language: &str,
598 span: SourceSpan,
599 upos: &str,
600) -> LinkId {
601 network.insert_link(
602 [form],
603 LinkMetadata::new()
604 .with_link_type(LinkType::Syntax)
605 .with_named(true)
606 .with_term(format!("upos:{upos}"))
607 .with_language(language)
608 .with_span(span),
609 )
610}
611
612fn insert_ufeat_link(
613 network: &mut LinkNetwork,
614 form: LinkId,
615 language: &str,
616 span: SourceSpan,
617 feature: &str,
618) -> LinkId {
619 network.insert_link(
620 [form],
621 LinkMetadata::new()
622 .with_link_type(LinkType::Syntax)
623 .with_named(true)
624 .with_term(format!("ufeat:{feature}"))
625 .with_language(language)
626 .with_span(span),
627 )
628}
629
630fn insert_deprel_link(
631 network: &mut LinkNetwork,
632 sentence: LinkId,
633 form: LinkId,
634 language: &str,
635 span: SourceSpan,
636 deprel: &str,
637) -> LinkId {
638 network.insert_link(
639 [sentence, form],
640 LinkMetadata::new()
641 .with_link_type(LinkType::Syntax)
642 .with_named(true)
643 .with_term(format!("deprel:{deprel}"))
644 .with_language(language)
645 .with_span(span),
646 )
647}
648
649fn insert_error_link<const N: usize>(
650 network: &mut LinkNetwork,
651 references: [LinkId; N],
652 term: &'static str,
653 language: &str,
654 span: SourceSpan,
655) -> LinkId {
656 network.insert_link(
657 references,
658 LinkMetadata::new()
659 .with_link_type(LinkType::Syntax)
660 .with_named(true)
661 .with_term(term)
662 .with_language(language)
663 .with_span(span)
664 .with_flags(LinkFlags::error()),
665 )
666}
667
668fn sentence_grammar(language: &str) -> Option<SentenceGrammar> {
669 SENTENCE_GRAMMARS
670 .iter()
671 .find(|grammar| grammar.language == language)
672 .copied()
673}
674
675fn token_surfaces_match(tokens: &[GrammarToken], expected: &[&str]) -> bool {
676 tokens.len() == expected.len()
677 && tokens
678 .iter()
679 .zip(expected.iter().copied())
680 .all(|(token, expected)| token.text == expected)
681}
682
683fn morphology_for(language: &str, surface: &str) -> Option<MorphAnalysis> {
684 if is_sentence_punctuation_token(surface) {
685 return Some(PUNCT_ANALYSIS);
686 }
687
688 LEXICON
689 .iter()
690 .find(|entry| entry.language == language && entry.surface == surface)
691 .map(|entry| entry.analysis)
692}
693
694fn is_registered_grammar_fixture(language: &str, text: &str) -> bool {
695 NATURAL_LANGUAGE_GRAMMAR_FIXTURES
696 .iter()
697 .filter(|fixture| fixture.language == language)
698 .any(|fixture| text == fixture.grammatical_source || text == fixture.ungrammatical_source)
699}
700
701fn grammar_tokens(text: &str) -> Vec<GrammarToken> {
702 let mut tokens = Vec::new();
703 let mut word_start = None;
704
705 for (index, character) in text.char_indices() {
706 if character.is_whitespace() {
707 push_pending_word_token(&mut tokens, text, &mut word_start, index);
708 } else if is_sentence_punctuation(character) {
709 push_pending_word_token(&mut tokens, text, &mut word_start, index);
710 tokens.push(GrammarToken {
711 text: character.to_string(),
712 range: ByteRange::new(index, index + character.len_utf8()),
713 });
714 } else if word_start.is_none() {
715 word_start = Some(index);
716 }
717 }
718
719 push_pending_word_token(&mut tokens, text, &mut word_start, text.len());
720 tokens
721}
722
723fn push_pending_word_token(
724 tokens: &mut Vec<GrammarToken>,
725 text: &str,
726 word_start: &mut Option<usize>,
727 end: usize,
728) {
729 let Some(start) = word_start.take() else {
730 return;
731 };
732 if start == end {
733 return;
734 }
735 tokens.push(GrammarToken {
736 text: text[start..end].to_string(),
737 range: ByteRange::new(start, end),
738 });
739}
740
741fn is_sentence_punctuation_token(surface: &str) -> bool {
742 let mut characters = surface.chars();
743 let Some(character) = characters.next() else {
744 return false;
745 };
746 characters.next().is_none() && is_sentence_punctuation(character)
747}
748
/// Returns `true` for the characters treated as sentence punctuation:
/// `.`, `!`, `?`, U+0964, U+3002 and U+06D4.
const fn is_sentence_punctuation(character: char) -> bool {
    /// U+0964 DEVANAGARI DANDA.
    const DANDA: char = '\u{0964}';
    /// U+3002 IDEOGRAPHIC FULL STOP.
    const IDEOGRAPHIC_FULL_STOP: char = '\u{3002}';
    /// U+06D4 ARABIC FULL STOP.
    const ARABIC_FULL_STOP: char = '\u{06d4}';

    matches!(
        character,
        '.' | '!' | '?' | DANDA | IDEOGRAPHIC_FULL_STOP | ARABIC_FULL_STOP
    )
}
755
756fn span_for_range(text: &str, start: usize, end: usize) -> SourceSpan {
757 SourceSpan::new(
758 ByteRange::new(start, end),
759 point_at_byte(text, start),
760 point_at_byte(text, end),
761 )
762}
763
764fn point_at_byte(text: &str, byte: usize) -> Point {
765 let mut row = 0;
766 let mut column = 0;
767
768 for (index, character) in text.char_indices() {
769 if index >= byte {
770 break;
771 }
772 if character == '\n' {
773 row += 1;
774 column = 0;
775 } else {
776 column += 1;
777 }
778 }
779
780 Point::new(row, column)
781}