katakana.py

changeset 0
659ab465152e
child 1
f9788970fa46
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/katakana.py	Tue Jul 28 21:51:54 2020 +0300
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+
+# why isn't this in functools...
+# https://www.geeksforgeeks.org/function-composition-in-python/
+def compose(*func):
+	'''
+	composes the given one-argument functions right to left:
+	compose(f, g, h)(x) is f(g(h(x))), and compose() is the identity
+	'''
+	def composed(x):
+		value = x
+		# the last function listed is applied first
+		for step in reversed(func):
+			value = step(value)
+		return value
+	return composed
+
+# mapping of romaji to katakana
+RAW_KATAKANA_TABLE = {
+	# base syllables only; full_katakana_table derives the 'hu'/'si'
+	# aliases and the small tsu and long vowel variants from these
+	'a': 'ア',
+	'ba': 'バ',
+	'be': 'ベ',
+	'bi': 'ビ',
+	'bo': 'ボ',
+	'bu': 'ブ',
+	'bya': 'ビャ',
+	'byo': 'ビョ',
+	'byu': 'ビュ',
+	'cha': 'チャ',
+	'che': 'チェ',
+	'chi': 'チ',
+	'cho': 'チョ',
+	'chu': 'チュ',
+	'da': 'ダ',
+	'de': 'デ',
+	'di': 'ディ',
+	'do': 'ド',
+	'du': 'ドゥ',
+	'dyu': 'デュ',
+	'e': 'エ',
+	'fa': 'ファ',
+	'fe': 'フェ',
+	'fi': 'フィ',
+	'fo': 'フォ',
+	'fu': 'フ',
+	'fyu': 'フュ',
+	'ga': 'ガ',
+	'ge': 'ゲ',
+	'gi': 'ギ',
+	'go': 'ゴ',
+	'gu': 'グ',
+	'gya': 'ギャ',
+	'gyo': 'ギョ',
+	'gyu': 'ギュ',
+	'ha': 'ハ',
+	'he': 'ヘ',
+	'hi': 'ヒ',
+	'ho': 'ホ',
+	'hya': 'ヒャ',
+	'hyo': 'ヒョ',
+	'hyu': 'ヒュ',
+	'i': 'イ',
+	'ja': 'ジャ',
+	'je': 'ジェ',
+	'ji': 'ジ',
+	'jo': 'ジョ',
+	'ju': 'ジュ',
+	'ka': 'カ',
+	'ke': 'ケ',
+	'ki': 'キ',
+	'ko': 'コ',
+	'ku': 'ク',
+	'kya': 'キャ',
+	'kyo': 'キョ',
+	'kyu': 'キュ',
+	'ma': 'マ',
+	'me': 'メ',
+	'mi': 'ミ',
+	'mo': 'モ',
+	'mu': 'ム',
+	'mya': 'ミャ',
+	'myo': 'ミョ',
+	'myu': 'ミュ',
+	'na': 'ナ',
+	'ne': 'ネ',
+	'ni': 'ニ',
+	'no': 'ノ',
+	'nu': 'ヌ',
+	'nya': 'ニャ',
+	'nyo': 'ニョ',
+	'nyu': 'ニュ',
+	'n': 'ン',
+	'o': 'オ',
+	'pa': 'パ',
+	'pe': 'ペ',
+	'pi': 'ピ',
+	'po': 'ポ',
+	'pu': 'プ',
+	'pya': 'ピャ',
+	'pyo': 'ピョ',
+	'pyu': 'ピュ',
+	'ra': 'ラ',
+	're': 'レ',
+	'ri': 'リ',
+	'ro': 'ロ',
+	'ru': 'ル',
+	'rya': 'リャ',
+	'ryo': 'リョ',
+	'ryu': 'リュ',
+	'sa': 'サ',
+	'se': 'セ',
+	'so': 'ソ',
+	'su': 'ス',
+	'sha': 'シャ',
+	'she': 'シェ',
+	'shi': 'シ',
+	'sho': 'ショ',
+	'shu': 'シュ',
+	'ta': 'タ',
+	'te': 'テ',
+	'ti': 'ティ',
+	'to': 'ト',
+	'tu': 'トゥ',
+	'tsa': 'ツァ',
+	'tse': 'ツェ',
+	'tso': 'ツォ',
+	'tsu': 'ツ',
+	'tyu': 'テュ',
+	'u': 'ウ',
+	'va': 'ヴァ',
+	've': 'ヴェ',
+	'vi': 'ヴィ',
+	'vo': 'ヴォ',
+	'vu': 'ヴ',
+	'wa': 'ワ',
+	'we': 'ウェ',
+	'wi': 'ウィ',
+	'wo': 'ウォ',
+	'ya': 'ヤ',
+	'ye': 'イェ',
+	# i + small i, built the same way as 'ye' (was ya + small i)
+	'yi': 'イィ',
+	'yo': 'ヨ',
+	'yu': 'ユ',
+	'za': 'ザ',
+	'ze': 'ゼ',
+	'zo': 'ゾ',
+	'zu': 'ズ',
+}
+
+def full_katakana_table(raw_table):
+	'''
+	adds small tsu and long vowel variants to the katakana table
+
+	raw_table maps romaji syllables to katakana and must contain the keys
+	'fu' and 'shi', which are also made available as 'hu' and 'si'.
+	returns a new dict; raw_table itself is not modified.
+	'''
+	katakana = dict(raw_table)
+	katakana['hu'] = katakana['fu']
+	katakana['si'] = katakana['shi']
+	# add small tsu versions (doubled leading consonant, e.g. 'kka')
+	for latin in list(katakana):
+		if len(latin) > 1 and latin[0] != 'n':
+			# we do not need a small tsu version for n because n is its own kana
+			katakana[latin[0] + latin] = 'ッ' + katakana[latin]
+	# add long vowel versions (doubled final vowel, e.g. 'kaa')
+	for latin in list(katakana):
+		# only keys ending in a vowel can be lengthened; the bare 'n' must
+		# not produce an 'nn' entry, otherwise a double n is rendered as
+		# n + long vowel mark instead of n followed by a na-row kana
+		if latin.endswith(('a', 'e', 'i', 'o', 'u')):
+			katakana[latin + latin[-1]] = katakana[latin] + 'ー'
+	return katakana
+
+def katakana_keys(kana_table):
+	'''
+	returns the romaji keys of kana_table as a list, longest first
+	(keys of equal length come out in reverse iteration order)
+	'''
+	by_length = sorted(kana_table, key = len)
+	by_length.reverse()
+	return by_length
+
+# module-level expanded table: RAW_KATAKANA_TABLE plus the aliases and the
+# small tsu and long vowel variants added by full_katakana_table
+katakana_table = full_katakana_table(RAW_KATAKANA_TABLE)
+
+def finnish_to_romaji(finnish):
+	# translates finnish text to Japanese romaji
+	# does not, however, fill in 'u' vowels to consonants, that is done
+	# by the splice_romaji function
+	#
+	# the text is lowercased, finnish letters are mapped to their romaji
+	# counterparts and everything that is not a letter a-z is dropped.
+	# the foreign letters c, q and x have no kana of their own (a lone one
+	# used to end in a KeyError during the katakana lookup), so they are
+	# rewritten too: x -> ks, q -> k, and c -> k unless it starts a
+	# 'ch' + vowel (or doubled 'cch' + vowel) syllable
+	from re import sub
+	# one simultaneous pass; gives the same result as replacing
+	# y, w, j, l, ä, ö, å one after another in that order
+	letters = str.maketrans({
+		'y': 'u',
+		'w': 'v',
+		'j': 'y',
+		'l': 'r',
+		'ä': 'a',
+		'ö': 'o',
+		'å': 'oo',
+		'q': 'k',
+		'x': 'ks',
+	})
+	romaji = sub(r'[^a-z]', '', finnish.lower().translate(letters))
+	# runs after the cleanup so that removed characters cannot separate
+	# a 'c' from the 'h' + vowel that follows it
+	return sub(r'c(?!c?h[aeiou])', 'k', romaji)
+
+def splice_romaji(romaji, keys):
+	'''
+	generator that cuts romaji into pieces, left to right
+
+	at each position the first entry of keys that the remaining text
+	starts with is yielded; if none matches, the single character at that
+	position is yielded with a 'u' appended
+	'''
+	position = 0
+	end = len(romaji)
+	while position < end:
+		piece = next((key for key in keys if romaji.startswith(key, position)), None)
+		if piece is None:
+			# no key fits here: fill in the 'u' vowel for a lone consonant
+			yield romaji[position] + 'u'
+			position += 1
+		else:
+			yield piece
+			position += len(piece)
+
+def splices_to_katakana(splices, katakana_table):
+	'''
+	looks up every romaji piece in katakana_table and joins the results
+	into one string; a piece missing from the table raises KeyError
+	'''
+	return ''.join(katakana_table[piece] for piece in splices)
+
+class Transliterator:
+	'''
+	callable that turns finnish text into katakana
+
+	the expanded katakana table and its longest-first key list are built
+	once per instance and reused on every call
+	'''
+	def __init__(self):
+		table = full_katakana_table(RAW_KATAKANA_TABLE)
+		self.cached_katakana_table = table
+		self.cached_katakana_keys = katakana_keys(table)
+	def __call__(self, finnish):
+		# finnish text -> romaji -> romaji pieces -> katakana
+		romaji = finnish_to_romaji(finnish)
+		pieces = splice_romaji(romaji, keys = self.cached_katakana_keys)
+		return splices_to_katakana(pieces, katakana_table = self.cached_katakana_table)
+	def __repr__(self):
+		return 'Transliterator()'
+
+# ready-to-use module-level instance; call it as transliterate(finnish_text)
+transliterate = Transliterator()

mercurial