Skip to content

Commit e14cdfb

Browse files
Add code for preparing the new prompt
1 parent 4443d1e commit e14cdfb

4 files changed

Lines changed: 495 additions & 0 deletions

File tree

backend/mnemorai/constants/languages.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,156 @@
55
# Load the G2P language table from the JSON file named by the
# G2P.LANGUAGE_JSON configuration entry. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's locale encoding.
with open(config.get("G2P").get("LANGUAGE_JSON"), encoding="utf-8") as f:
    G2P_LANGCODES = json.load(f)
# Inverse lookup table: every (key, value) pair of G2P_LANGCODES swapped.
G2P_LANGUAGES: dict = {value: key for key, value in G2P_LANGCODES.items()}
8+
9+
# Display names keyed by "<language>-<script>[-<variant>]" codes.
# NOTE(review): presumably mirrors the supported-language table of the Epitran
# library -- confirm against the installed Epitran version.
# NOTE(review): a few names end in "*" or "†" (the cmn-* entries, deu-Latn-np,
# fra-Latn-np); these look like footnote markers carried over from the source
# table -- confirm whether they belong in user-facing names.
# Several codes share one display name (e.g. jpn-Hrgn / jpn-Hira), so
# inverting this mapping would silently drop entries.
EPITRAN_LANGCODES = {
    "aar-Latn": "Afar",
    "afr-Latn": "Afrikaans",
    "aii-Syrc": "Assyrian Neo-Aramaic",
    "amh-Ethi": "Amharic",
    "amh-Ethi-pp": "Amharic (more phonetic)",
    "amh-Ethi-red": "Amharic (reduced)",
    "ara-Arab": "Literary Arabic",
    "ava-Cyrl": "Avaric",
    "aze-Cyrl": "Azerbaijani (Cyrillic)",
    "aze-Latn": "Azerbaijani",
    "ben-Beng": "Bengali",
    "ben-Beng-red": "Bengali (reduced)",
    "ben-Beng-east": "Eastern Bengali",
    "bho-Deva": "Bhojpuri",
    "bxk-Latn": "Bukusu",
    "cat-Latn": "Catalan",
    "ceb-Latn": "Cebuano",
    "ces-Latn": "Czech",
    "cjy-Latn": "Jin (Wiktionary)",
    "ckb-Arab": "Sorani",
    "cmn-Hans": "Mandarin (Simplified)*",
    "cmn-Hant": "Mandarin (Traditional)*",
    "cmn-Latn": "Mandarin (Pinyin)*",
    "csb-Latn": "Kashubian",
    "deu-Latn": "German",
    "deu-Latn-np": "German†",
    "deu-Latn-nar": "German (more phonetic)",
    "eng-Latn": "English",
    "epo-Latn": "Esperanto",
    "est-Latn": "Estonian",
    "fas-Arab": "Farsi (Perso-Arabic)",
    "fin-Latn": "Finnish",
    "fra-Latn": "French",
    "fra-Latn-np": "French†",
    "fra-Latn-p": "French (more phonetic)",
    "ful-Latn": "Fulah",
    "gan-Latn": "Gan (Wiktionary)",
    "glg-Latn": "Galician",
    "got-Goth": "Gothic",
    "got-Latn": "Gothic (Latin)",
    "hak-Latn": "Hakka (pha̍k-fa-sṳ)",
    "hat-Latn-bab": "Haitian (Latin-Babel)",
    "hau-Latn": "Hausa",
    "hin-Deva": "Hindi",
    "hmn-Latn": "Hmong",
    "hrv-Latn": "Croatian",
    "hsn-Latn": "Xiang (Wiktionary)",
    "hun-Latn": "Hungarian",
    "ilo-Latn": "Ilocano",
    "ind-Latn": "Indonesian",
    "ita-Latn": "Italian",
    "jam-Latn": "Jamaican",
    "jav-Latn": "Javanese",
    "jpn-Hrgn": "Japanese (Hiragana)",
    "jpn-Hrgn-red": "Japanese (Hiragana, reduced)",
    "jpn-Ktkn": "Japanese (Katakana)",
    "jpn-Ktkn-red": "Japanese (Katakana, reduced)",
    "jpn-Jpan": "Japanese (Hiragana, Katakana, Kanji)",
    "jpn-Hira": "Japanese (Hiragana)",
    "jpn-Hira-red": "Japanese (Hiragana, reduced)",
    "jpn-Kana": "Japanese (Katakana)",
    "jpn-Kana-red": "Japanese (Katakana, reduced)",
    "kat-Geor": "Georgian",
    "kaz-Cyrl": "Kazakh (Cyrillic)",
    "kaz-Cyrl-bab": "Kazakh (Cyrillic—Babel)",
    "kaz-Latn": "Kazakh (Latin)",
    "kbd-Cyrl": "Kabardian",
    "khm-Khmr": "Khmer",
    "kin-Latn": "Kinyarwanda",
    "kir-Arab": "Kyrgyz (Perso-Arabic)",
    "kir-Cyrl": "Kyrgyz (Cyrillic)",
    "kir-Latn": "Kyrgyz (Latin)",
    "kmr-Latn": "Kurmanji",
    "kmr-Latn-red": "Kurmanji (reduced)",
    "kor-Hang": "Korean",
    "lao-Laoo": "Lao",
    "lao-Laoo-prereform": "Lao (Before spelling reform)",
    "lav-Latn": "Latvian",
    "lez-Cyrl": "Lezgian",
    "lij-Latn": "Ligurian",
    "lit-Latn": "Lithuanian",
    "lsm-Latn": "Saamia",
    "ltc-Latn-bax": "Middle Chinese (Baxter and Sagart 2014)",
    "lug-Latn": "Ganda / Luganda",
    "mal-Mlym": "Malayalam",
    "mar-Deva": "Marathi",
    "mlt-Latn": "Maltese",
    "mon-Cyrl-bab": "Mongolian (Cyrillic)",
    "mri-Latn": "Maori",
    "msa-Latn": "Malay",
    "mya-Mymr": "Burmese",
    "nan-Latn": "Hokkien (pe̍h-oē-jī)",
    "nan-Latn-tl": "Hokkien (Tâi-lô)",
    "nld-Latn": "Dutch",
    "nya-Latn": "Chichewa",
    "ood-Latn-alv": "Tohono O'odham (Alvarez-Hale)",
    "ood-Latn-sax": "Tohono O'odham (Saxton)",
    "ori-Orya": "Odia",
    "orm-Latn": "Oromo",
    "pan-Guru": "Punjabi (Eastern)",
    "pol-Latn": "Polish",
    "por-Latn": "Portuguese",
    "quy-Latn": "Ayacucho Quechua / Quechua Chanka",
    "ron-Latn": "Romanian",
    "run-Latn": "Rundi",
    "rus-Cyrl": "Russian",
    "sag-Latn": "Sango",
    "sin-Sinh": "Sinhala",
    "slv-Latn": "Slovene / Slovenian",
    "sna-Latn": "Shona",
    "som-Latn": "Somali",
    "spa-Latn": "Spanish",
    "spa-Latn-eu": "Spanish (Iberian)",
    "sqi-Latn": "Albanian",
    "sro-Latn": "Sardinian (Campidanese)",
    "srp-Latn": "Serbian (Latin)",
    "srp-Cyrl": "Serbian (Cyrillic)",
    "swa-Latn": "Swahili",
    "swa-Latn-red": "Swahili (reduced)",
    "swe-Latn": "Swedish",
    "tam-Taml": "Tamil",
    "tam-Taml-red": "Tamil (reduced)",
    "tel-Telu": "Telugu",
    "tgk-Cyrl": "Tajik",
    "tgl-Latn": "Tagalog",
    "tgl-Latn-red": "Tagalog (reduced)",
    "tha-Thai": "Thai",
    "tir-Ethi": "Tigrinya",
    "tir-Ethi-pp": "Tigrinya (more phonemic)",
    "tir-Ethi-red": "Tigrinya (reduced)",
    "tok-Latn": "Toki Pona",
    "tpi-Latn": "Tok Pisin",
    "tuk-Cyrl": "Turkmen (Cyrillic)",
    "tuk-Latn": "Turkmen (Latin)",
    "tur-Latn": "Turkish (Latin)",
    "tur-Latn-bab": "Turkish (Latin—Babel)",
    "tur-Latn-red": "Turkish (reduced)",
    "ukr-Cyrl": "Ukrainian",
    "urd-Arab": "Urdu",
    "uig-Arab": "Uyghur (Perso-Arabic)",
    "uzb-Cyrl": "Uzbek (Cyrillic)",
    "uzb-Latn": "Uzbek (Latin)",
    "vie-Latn": "Vietnamese",
    "wuu-Latn": "Shanghainese Wu (Wiktionary)",
    "xho-Latn": "Xhosa",
    "yor-Latn": "Yoruba",
    "yue-Latn": "Cantonese (Jyutping)",
    "yue-Hant": "Cantonese (Character)",
    "zha-Latn": "Zhuang",
    "zul-Latn": "Zulu",
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from collections import Counter
2+
from itertools import chain
3+
4+
from nltk import bigrams, download
5+
from nltk.corpus import brown, gutenberg, reuters, webtext
6+
from wordfreq import zipf_frequency
7+
8+
# Download the necessary NLTK corpora if not already present. Every corpus
# whose words are read when the token list is built must be listed here,
# including "brown"; a missing corpus raises LookupError on first access.
for corpus_name in ("brown", "reuters", "gutenberg", "webtext"):
    download(corpus_name, quiet=True)
11+
12+
def _unigram_p(w: str) -> float:
    """Return the raw English unigram probability of *w*.

    The Zipf value reported by ``zipf_frequency(w, "en")`` is converted with
    P = 10 ** (zipf - 9); e.g. Zipf 6 corresponds to about one occurrence
    per 1 000 words.
    """
    return 10 ** (zipf_frequency(w, "en") - 9)
14+
15+
# One flat, lower-cased token stream: Brown, Reuters, Gutenberg and Webtext
# are concatenated in that order.
tokens = list(
    map(
        str.lower,
        chain.from_iterable(
            corpus.words() for corpus in (brown, reuters, gutenberg, webtext)
        ),
    )
)
# Count tables derived from the stream above.
_unigram_cnt = Counter(tokens)  # word -> occurrences
_bigram_cnt = Counter(bigrams(tokens))  # (word, next word) -> occurrences
_V = len(_unigram_cnt)  # number of distinct lower-cased tokens
22+
23+
24+
def backoff_prob(w1: str, w2: str) -> float:
    """Return the independence estimate P(w1) * P(w2).

    Both factors come from ``_unigram_p``; no co-occurrence counts are
    consulted.
    """
    p_first = _unigram_p(w1)
    p_second = _unigram_p(w2)
    return p_first * p_second
27+
28+
29+
def brown_bigram_prob(w1: str, w2: str) -> float:
    """Return the add-one (Laplace) smoothed estimate of P(w2 | w1).

    Counts are read from the module-level ``_bigram_cnt`` and
    ``_unigram_cnt`` tables; ``_V`` is added to the denominator as the
    smoothing term. Words missing from the tables count as zero.
    """
    pair_count = _bigram_cnt[(w1, w2)]
    context_count = _unigram_cnt[w1]
    return (pair_count + 1) / (context_count + _V)
32+
33+
34+
def bigram_prob(w1: str, w2: str, *, alpha: float = 0.1) -> float:
    """
    Hybrid bigram score for the pair (w1, w2).

    • pair present in the corpus bigram counts → its Laplace-smoothed
      P(w2 | w1) from ``brown_bigram_prob``
    • pair never observed → ``alpha`` times the unigram product returned
      by ``backoff_prob``

    `alpha` (default 0.1) scales the back-off value relative to the
    count-based estimates; tune it if the gap feels too big or too small.
    """
    if _bigram_cnt[(w1, w2)] != 0:  # observed at least once
        return brown_bigram_prob(w1, w2)
    # Unseen pair: fall back to the scaled unigram product.
    return alpha * backoff_prob(w1, w2)
48+
49+
50+
def bigram_grid(list1, list2, *, sort_desc: bool = True):
    """Score every pairing of a word from list1 with a word from list2.

    Words are lower-cased for scoring but reported exactly as given.

    Returns a list of ``(w1, w2, score)`` tuples. With ``sort_desc`` true the
    list is ordered by score, highest first; otherwise it stays in generation
    order (list1 outer, list2 inner).
    """
    scored = []
    for first in list1:
        for second in list2:
            scored.append((first, second, bigram_prob(first.lower(), second.lower())))
    if not sort_desc:
        return scored
    scored.sort(key=lambda row: row[2], reverse=True)
    return scored
58+
59+
60+
if __name__ == "__main__":
    # Demo: rank every pairing of two hand-picked candidate word lists.
    first_words = (
        "duh, ta, tea, toe, tie, dew, due, doe, dough, die, dart, door, "
        "thaw, though, there, tire, dare, donor, draw, tear, door, data, "
        "deter, tune, the"
    ).split(", ")
    second_words = (
        "sing, sink, sting, thing, wing, king, cling, grin, gin, gang, gone, "
        "gong, then, thens, ten, tang, tan, town, tongue, tinge, begin, bing, "
        "singe, swing, twin"
    ).split(", ")

    # Highest-scoring pairs are printed first (bigram_grid sorts by default).
    ranked_pairs = bigram_grid(first_words, second_words)
    for w1, w2, p in ranked_pairs:
        print(f"{w1} {w2:<10} P={p:.3e}")
76+
77+
# TODO: consider also computing the Levenshtein distance between the chunks.

0 commit comments

Comments
 (0)