katakana.py

Wed, 29 Jul 2020 23:45:53 +0300

author
Teemu Piippo <teemu@hecknology.net>
date
Wed, 29 Jul 2020 23:45:53 +0300
changeset 1
f9788970fa46
parent 0
659ab465152e
permissions
-rw-r--r--

begin work on bus compiler

#!/usr/bin/env python3

# why isn't this in functools...
# https://www.geeksforgeeks.org/function-composition-in-python/
def compose(*func):
	'''
	returns a function that chains the given functions from right to left,
	so that compose(f, g)(x) equals f(g(x)).
	with no functions given, the result simply returns its argument.
	'''
	def composed(value):
		# the rightmost function is applied first, so walk the tuple backwards
		for step in reversed(func):
			value = step(value)
		return value
	return composed

# mapping of romaji to katakana
# keys are lowercase romaji syllables, values are the katakana spelling.
# only the base syllables are listed here; the small tsu and long vowel
# variants are generated from these by full_katakana_table.
RAW_KATAKANA_TABLE = {
	'a': 'ア',
	'ba': 'バ',
	'be': 'ベ',
	'bi': 'ビ',
	'bo': 'ボ',
	'bu': 'ブ',
	'bya': 'ビャ',
	'byo': 'ビョ',
	'byu': 'ビュ',
	'cha': 'チャ',
	'che': 'チェ',
	'chi': 'チ',
	'cho': 'チョ',
	'chu': 'チュ',
	'da': 'ダ',
	'de': 'デ',
	'di': 'ディ',
	'do': 'ド',
	'du': 'ドゥ',
	'dyu': 'デュ',
	'e': 'エ',
	'fa': 'ファ',
	'fe': 'フェ',
	'fi': 'フィ',
	'fo': 'フォ',
	'fu': 'フ',
	'fyu': 'フュ',
	'ga': 'ガ',
	'ge': 'ゲ',
	'gi': 'ギ',
	'go': 'ゴ',
	'gu': 'グ',
	'gya': 'ギャ',
	'gyo': 'ギョ',
	'gyu': 'ギュ',
	'ha': 'ハ',
	'he': 'ヘ',
	'hi': 'ヒ',
	'ho': 'ホ',
	'hya': 'ヒャ',
	'hyo': 'ヒョ',
	'hyu': 'ヒュ',
	'i': 'イ',
	'ja': 'ジャ',
	'je': 'ジェ',
	'ji': 'ジ',
	'jo': 'ジョ',
	'ju': 'ジュ',
	'ka': 'カ',
	'ke': 'ケ',
	'ki': 'キ',
	'ko': 'コ',
	'ku': 'ク',
	'kya': 'キャ',
	'kyo': 'キョ',
	'kyu': 'キュ',
	'ma': 'マ',
	'me': 'メ',
	'mi': 'ミ',
	'mo': 'モ',
	'mu': 'ム',
	'mya': 'ミャ',
	'myo': 'ミョ',
	'myu': 'ミュ',
	'na': 'ナ',
	'ne': 'ネ',
	'ni': 'ニ',
	'no': 'ノ',
	'nu': 'ヌ',
	'nya': 'ニャ',
	'nyo': 'ニョ',
	'nyu': 'ニュ',
	'n': 'ン',
	'o': 'オ',
	'pa': 'パ',
	'pe': 'ペ',
	'pi': 'ピ',
	'po': 'ポ',
	'pu': 'プ',
	'pya': 'ピャ',
	'pyo': 'ピョ',
	'pyu': 'ピュ',
	'ra': 'ラ',
	're': 'レ',
	'ri': 'リ',
	'ro': 'ロ',
	'ru': 'ル',
	'rya': 'リャ',
	'ryo': 'リョ',
	'ryu': 'リュ',
	'sa': 'サ',
	'se': 'セ',
	'so': 'ソ',
	'su': 'ス',
	'sha': 'シャ',
	'she': 'シェ',
	'shi': 'シ',
	'sho': 'ショ',
	'shu': 'シュ',
	'ta': 'タ',
	'te': 'テ',
	'ti': 'ティ',
	'to': 'ト',
	'tu': 'トゥ',
	'tsa': 'ツァ',
	'tse': 'ツェ',
	'tso': 'ツォ',
	'tsu': 'ツ',
	'tyu': 'テュ',
	'u': 'ウ',
	'va': 'ヴァ',
	've': 'ヴェ',
	'vi': 'ヴィ',
	'vo': 'ヴォ',
	'vu': 'ヴ',
	'wa': 'ワ',
	'we': 'ウェ',
	'wi': 'ウィ',
	'wo': 'ウォ',
	'ya': 'ヤ',
	'ye': 'イェ',
	# i + small i, in line with 'ye' above (ヤ would read as "ya")
	'yi': 'イィ',
	'yo': 'ヨ',
	'yu': 'ユ',
	'za': 'ザ',
	'ze': 'ゼ',
	'zo': 'ゾ',
	'zu': 'ズ',
}

def full_katakana_table(raw_table):
	'''
	builds the complete lookup table from the raw romaji to katakana table.

	the result is a new table holding the raw entries, the aliases 'hu'
	(same as 'fu') and 'si' (same as 'shi'), a doubled-consonant variant
	prefixed with a small tsu for every multi-letter syllable that does not
	start with n, and a long vowel variant for every entry except 'n'.
	raw_table must contain the keys 'fu' and 'shi'.
	'''
	table = dict(raw_table)
	table['hu'] = table['fu']
	table['si'] = table['shi']
	# small tsu versions. the keys are snapshotted because the table grows
	# while we go through it.
	for romaji in tuple(table):
		# single letters have no consonant to double, and n is its own kana
		if len(romaji) <= 1 or romaji[0] == 'n':
			continue
		table[romaji[0] + romaji] = 'ッ' + table[romaji]
	# long vowel versions, also for the small tsu entries added above
	for romaji in tuple(table):
		if romaji == 'n':
			continue
		table[romaji + romaji[-1]] = table[romaji] + 'ー'
	return table

def katakana_keys(kana_table):
	'''
	returns the keys of the table as a list, longest keys first.
	'''
	ordered = sorted(kana_table.keys(), key = len)
	ordered.reverse()
	return ordered

def finnish_to_romaji(finnish):
	'''
	translates finnish text to Japanese romaji.

	the text is lowercased, finnish letters are mapped to their closest
	romaji counterparts, and everything that is not a letter a-z after
	that (whitespace, digits, punctuation, other accented letters) is
	dropped. returns the resulting string.

	does not, however, fill in 'u' vowels to consonants, that is done
	by the splice_romaji function.
	'''
	from re import sub
	# all letters are mapped in a single pass, so a replacement is never
	# replaced again (j becomes y, but that y does not become u)
	letter_map = str.maketrans({
		'y': 'u',
		'w': 'v',
		'j': 'y',
		'l': 'r',
		'ä': 'a',
		'ö': 'o',
		'x': 'ks',
		'c': 'k',
		'å': 'oo',
		# q is pronounced like k. left alone it would later be spliced
		# into the syllable 'qu', which has no katakana entry.
		'q': 'k',
	})
	return sub(r'[^a-z]', '', finnish.lower().translate(letter_map))

def splice_romaji(romaji, keys):
	'''
	generator that cuts a romaji string into syllables, front to back.

	at each position the first key in keys that matches there is yielded
	and skipped over. if no key matches, the single letter at that
	position is yielded with a 'u' appended and the scan moves on by one
	letter.
	'''
	position = 0
	while position < len(romaji):
		syllable = next(
			(key for key in keys if romaji.startswith(key, position)),
			None,
		)
		if syllable is None:
			# lone letter with no matching key: fill in the 'u' vowel
			yield romaji[position] + 'u'
			position += 1
		else:
			yield syllable
			position += len(syllable)

def splices_to_katakana(splices, katakana_table):
	'''
	looks up every romaji syllable in katakana_table and returns the
	results joined into one string. a syllable missing from the table
	raises KeyError.
	'''
	return ''.join(katakana_table[syllable] for syllable in splices)

class Transliterator:
	'''
	callable that transliterates finnish text into katakana.
	the full katakana table and its key list are built once when the
	object is created and reused on every call.
	'''
	def __init__(self):
		table = full_katakana_table(RAW_KATAKANA_TABLE)
		self.cached_katakana_table = table
		self.cached_katakana_keys = katakana_keys(table)
	def __call__(self, finnish):
		# finnish text -> romaji -> romaji syllables -> katakana
		romaji = finnish_to_romaji(finnish)
		splices = splice_romaji(romaji, keys = self.cached_katakana_keys)
		return splices_to_katakana(splices, katakana_table = self.cached_katakana_table)
	def __repr__(self):
		return 'Transliterator()'

# module-level instance, built once at import time.
# it is callable: transliterate(finnish_text) returns the katakana string.
transliterate = Transliterator()

mercurial