dailp/
morpheme.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
use crate::*;
use async_graphql::FieldResult;
use serde::{Deserialize, Serialize};
use sqlx::postgres::{PgHasArrayType, PgTypeInfo};
use std::borrow::Cow;

/// A single unit of meaning and its corresponding English gloss.
///
/// Serialized with camelCase field names. The `system` and `gloss_id` fields
/// are marked `#[serde(skip)]`, so they never appear in the serialized form.
#[derive(Serialize, Clone, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct WordSegment {
    /// Which Cherokee representation system is this segment written with?
    ///
    /// When set, [`WordSegment::get_morpheme`] converts `morpheme` with this
    /// orthography; when `None`, the morpheme is returned unchanged.
    #[serde(skip)]
    pub system: Option<CherokeeOrthography>,
    /// Source language representation of this segment.
    pub morpheme: String,
    /// Target language representation of this segment.
    pub gloss: String,
    /// Database ID for the associated gloss, which may be shared by several
    /// morphemes in the same document.
    #[serde(skip)]
    pub gloss_id: Option<Uuid>,
    /// What kind of thing is this segment?
    ///
    /// This field determines what character should separate this segment from
    /// the previous one when reconstituting a full segmentation string (see
    /// [`WordSegment::get_previous_separator`]).
    pub role: WordSegmentRole,
    /// Optional glossary entry for this segment which gives further information,
    /// like a definition and example usages.
    pub matching_tag: Option<MorphemeTag>,
}

/// The kind of segment that a particular sequence of characters in a morphemic
/// segmentations represent.
// NOTE: this enum derives `async_graphql::Enum`, which publishes the `///`
// comments on the enum and its variants as GraphQL descriptions. Editing that
// text changes the exposed schema documentation, so it is left as-is here.
//
// Per `WordSegment::get_previous_separator`, the separator named on each
// variant is the one written *before* a segment that has that role.
#[derive(
    Clone, Copy, Debug, Serialize, Deserialize, sqlx::Type, async_graphql::Enum, PartialEq, Eq,
)]
// Maps this enum onto the database type named `word_segment_role`.
#[sqlx(type_name = "word_segment_role")]
pub enum WordSegmentRole {
    /// Separated by a hyphen '-'
    Morpheme,
    /// Separated by an equals sign '='
    Clitic,
    /// Separated by a colon ':'
    Modifier,
}

// Lets arrays of `WordSegmentRole` be used with sqlx by borrowing the array
// type information of `&str` instead of declaring a dedicated array type.
// NOTE(review): presumably this means role arrays must be passed to and read
// from Postgres as text arrays (e.g. with a cast in the query) — confirm
// against the queries that bind or decode `Vec<WordSegmentRole>`.
impl PgHasArrayType for WordSegmentRole {
    /// Reports the same array type as `&str`.
    fn array_type_info() -> PgTypeInfo {
        <&str as PgHasArrayType>::array_type_info()
    }

    /// Accepts exactly the array types that `&str` is compatible with.
    fn array_compatible(ty: &PgTypeInfo) -> bool {
        <&str as PgHasArrayType>::array_compatible(ty)
    }
}

impl WordSegment {
    /// Make a new morpheme segment
    pub fn new(morpheme: String, gloss: String, role: Option<WordSegmentRole>) -> Self {
        Self {
            system: None,
            morpheme,
            gloss,
            // FIXME Shortcut to keep this function the same while allowing
            // migration code to create this data structure.
            gloss_id: None,
            role: role.unwrap_or(WordSegmentRole::Morpheme),
            matching_tag: None,
        }
    }

    /// Parse all segments from a raw interlinear morphemic segmentation.
    /// The first argument is the segmented source, while the second argument is
    /// the target language gloss of each segment.
    pub fn parse_many(morpheme_layer: &str, gloss_layer: &str) -> Option<Vec<Self>> {
        let (_, result) = parse_gloss_layers(morpheme_layer, gloss_layer).ok()?;
        Some(result)
    }

    /// The separator that should follow this segment, based on the type of the
    /// next segment.
    pub fn get_previous_separator(&self) -> &str {
        use WordSegmentRole::*;
        match self.role {
            Morpheme => "-",
            Clitic => "=",
            Modifier => ":",
        }
    }

    /// Build a string of the morpheme gloss line, used in interlinear gloss
    /// text (IGT).
    pub fn gloss_layer<'a>(segments: impl IntoIterator<Item = &'a WordSegment>) -> String {
        use itertools::Itertools;
        segments
            .into_iter()
            .enumerate()
            .flat_map(|(index, s)| {
                vec![
                    if index > 0 {
                        s.get_previous_separator()
                    } else {
                        ""
                    },
                    &*s.gloss,
                ]
            })
            .join("")
    }

    /// Convert the source representation of this segment into the given
    /// phonemic writing system.
    pub fn get_morpheme(&self) -> Cow<'_, str> {
        match self.system {
            Some(orthography) => Cow::Owned(orthography.convert(&self.morpheme)),
            _ => Cow::Borrowed(&*self.morpheme),
        }
    }
}

// GraphQL resolvers for `WordSegment`. The `///` comments on the methods are
// published as field descriptions in the schema, so they are part of the
// public API surface.
#[async_graphql::Object]
impl WordSegment {
    /// Phonemic representation of the morpheme
    async fn morpheme(&self) -> Cow<'_, str> {
        self.get_morpheme()
    }

    /// English gloss in standard DAILP format that refers to a lexical item
    async fn gloss(&self) -> &str {
        &self.gloss
    }

    /// What kind of thing is this segment?
    async fn role(&self) -> WordSegmentRole {
        self.role
    }

    /// This field determines what character should separate this segment from
    /// the previous one when reconstituting the full segmentation string.
    async fn previous_separator(&self) -> &str {
        self.get_previous_separator()
    }

    /// If this morpheme represents a functional tag that we have further
    /// information on, this is the corresponding database entry.
    async fn matching_tag(
        &self,
        context: &async_graphql::Context<'_>,
    ) -> FieldResult<Option<MorphemeTag>> {
        use async_graphql::dataloader::*;
        match (&self.matching_tag, self.gloss_id) {
            // A tag already attached to this segment wins; no lookup needed.
            (Some(tag), _) => Ok(Some(tag.clone())),
            // Otherwise resolve the tag through the data loader, keyed by the
            // gloss ID and the writing system (TAOC when none is set).
            (None, Some(gloss_id)) => {
                let loader = context.data::<DataLoader<Database>>()?;
                let system = self.system.unwrap_or(CherokeeOrthography::Taoc);
                let tag = loader.load_one(TagForMorpheme(gloss_id, system)).await?;
                Ok(tag)
            }
            // No attached tag and no gloss ID: nothing to look up.
            (None, None) => Ok(None),
        }
    }
}

/// A single unit of meaning and its gloss which can be edited.
// NOTE: this struct derives `async_graphql::InputObject`, which publishes the
// `///` comments on the struct and its fields as GraphQL descriptions. That
// text is therefore left untouched here.
#[derive(async_graphql::InputObject)]
pub struct MorphemeSegmentUpdate {
    /// Which Cherokee representation system is this segment written with?
    pub system: Option<CherokeeOrthography>,
    /// Source language representation of this segment.
    pub morpheme: String,
    /// Target language representation of this segment.
    pub gloss: String,
    /// This field determines what character should separate this segment from
    /// the next one when reconstituting the full segmentation string.
    // NOTE(review): `WordSegment::get_previous_separator` uses a segment's role
    // to pick the separator written *before* that segment, and the
    // `previousSeparator` field of `WordSegment` is documented the same way.
    // Confirm whether "next" above is intended for this input type.
    pub role: WordSegmentRole,
}