# ("Newer" / "Older" page-navigation text from the web scrape; not part of the program)
# (should work with either Python 2 or Python 3)
# cantonese.py - Python functions for processing Cantonese transliterations
# (uses eSpeak and Gradint for help with some of them)
# v1.47 (c) 2013-15,2017-23 Silas S. Brown. License: GPL
# Module state: eSpeak transliteration cache and its on-disk location.
cache = {} # to avoid repeated eSpeak runs,
# zi -> jyutping or (pinyin,) -> translit
dryrun_mode = False # True = prepare to populate cache in batch
jyutping_dryrun,pinyin_dryrun = set(),set()
import re, pickle, os, sys
if '--cache' in sys.argv:
    cache_fname = sys.argv[sys.argv.index('--cache')+1]
else: cache_fname = os.environ.get("JYUTPING_CACHE","/tmp/.jyutping-cache")
# NOTE: unpickling trusts the cache file; it is assumed to be this
# program's own previous output, not untrusted input.
try: cache = pickle.Unpickler(open(cache_fname,"rb")).load()
except Exception: pass # no/unreadable/corrupt cache: keep the empty dict (the 'except' clause was missing from this copy of the file)
extra_zhy_dict = { # TODO: add these to the real zhy_list in eSpeak
def S(v): # make sure it's a string in both Python 2 and 3
    """Coerce v to a text string: on Python 3, decode UTF-8 bytes;
    anything that can't be decoded (already str, not valid UTF-8,
    or not bytes at all) is returned unchanged."""
    if type("")==type(u""): # Python 3
        try: return v.decode('utf-8') # in case it's bytes
        except (AttributeError, UnicodeDecodeError): return v # narrowed from bare 'except' so real errors aren't hidden
    else: return v # Python 2: str and unicode are both fine as-is
def B(v): # make sure it's bytes in Python 3, str in Python 2
    """Encode text to UTF-8 bytes; anything already bytes is returned untouched."""
    is_text = (type(v)==type(u""))
    if is_text:
        return v.encode('utf-8')
    return v
if not type(hanzi)==type(u""): hanzi=hanzi.decode('utf-8')
for k,v in extra_zhy_dict.items(): hanzi=hanzi.replace(k,v)
global espeak
if not espeak:
espeak = import_gradint().ESpeakSynth()
if not espeak.works_on_this_platform(): # must call
raise Exception("espeak.works_on_this_platform")
assert espeak.supports_language("zhy")
if not hanzi in cache: jyutping_dryrun.add(hanzi)
return "aai1" # placeholder value
elif jyutping_dryrun:
jyutping_dryrun = list(jyutping_dryrun)
vals = espeak.transliterate_multiple("zhy",jyutping_dryrun,0)
assert len(jyutping_dryrun)==len(vals)
for k,v in zip(jyutping_dryrun,vals):
cache[k]=S(v).replace("7","1").lower() # see below
if hanzi in cache: jyutping = cache[hanzi]
else: cache[hanzi] = jyutping = S(espeak.transliterate("zhy",hanzi,forPartials=0)).replace("7","1").lower() # .lower() needed because espeak sometimes randomly capitalises e.g. 2nd hanzi of 'hypocrite' (Mandarin xuwei de ren)
if mustWork: assert jyutping.strip(), "No translit. result for "+repr(hanzi)
elif not jyutping.strip(): jyutping=""
return jyutping
espeak = 0 # placeholder; replaced by gradint's ESpeakSynth instance when first needed (see the get_jyutping logic above)
def hanzi_only(unitext):
    """Strip unitext down to han characters: the U+4E00..U+A6FF range
    plus everything above the Basic Multilingual Plane."""
    return u"".join(ch for ch in unitext
                    if 0x4e00 <= ord(ch) < 0xa700 or ord(ch) >= 0x10000)
def py2nums(pinyin):
    """Convert tone-marked pinyin to tone-numbered pinyin via eSpeak's
    "zh" voice, memoising results in `cache` under the 1-tuple key
    (pinyin,).  Requires the module-level `espeak` to have been
    initialised already (get_jyutping does that)."""
    if not type(pinyin)==type(u""):
        pinyin = pinyin.decode('utf-8')
    global pinyin_dryrun
    if pinyin_dryrun:
        # Flush everything queued during the dry-run pass in a single
        # eSpeak batch call (much faster than one call per phrase).
        # NOTE(review): pinyin_dryrun is not cleared afterwards -
        # presumably later lookups all hit `cache` so the re-run is
        # only wasteful, not wrong; confirm against upstream.
        pinyin_dryrun = list(pinyin_dryrun)
        vals = espeak.transliterate_multiple("zh",pinyin_dryrun,0)
        assert len(pinyin_dryrun)==len(vals)
        for i in range(len(pinyin_dryrun)):
            cache[(pinyin_dryrun[i],)]=vals[i]
    if (pinyin,) in cache: pyNums = cache[(pinyin,)]
    else: pyNums = espeak.transliterate("zh",pinyin,forPartials=0) # (this transliterate just does tone marks to numbers, adds 5, etc; forPartials=0 because we DON'T want to change letters like X into syllables, as that won't happen in jyutping and we're going through it tone-by-tone)
    assert pyNums and pyNums.strip(), "espeak.transliterate returned %s for %s" % (repr(pyNums),repr(pinyin))
    # Normalise erhua: "er<tone>" after a letter becomes e<tone>r5, and a
    # trailing bare "a" gets the neutral tone 5.
    return re.sub("a$","a5",re.sub("(?<=[a-zA-Z])er([1-5])",r"e\1r5",S(pyNums)))
# The builtin next() works on both Python 2.6+ and Python 3, so the
# hand-rolled version dispatch (gen.__next__() vs gen.next()) is
# unnecessary; the getNext name is kept for existing callers.
getNext = next
def adjust_jyutping_for_pinyin(hanzi,jyutping,pinyin):
# If we have good quality (proof-read etc) Mandarin pinyin, this can sometimes improve the automatic Cantonese transcription
if not type(hanzi)==type(u""): hanzi = hanzi.decode('utf-8')
hanzi = hanzi_only(hanzi)
if not re.search(py2j_chars,hanzi): return jyutping
pinyin = re.findall('[A-Za-z]*[1-5]',py2nums(pinyin))
if not len(pinyin)==len(hanzi): return jyutping # can't fix
i = 0 ; tones = re.finditer('[1-7]',jyutping) ; j2 = []
for h,p in zip(list(hanzi),pinyin):
try: j = getNext(tones).end()
except StopIteration: return jyutping # one of the hanzi has no Cantonese reading in our data: we'll warn "failed to fix" below
if h in py2j and p.lower() in py2j[h]: j2[-1]=j2[-1][:re.search("[A-Za-z]*[1-7]$",j2[-1]).start()]+py2j[h][p.lower()]
py2j={
u"\u4E2D":{"zhong1":"zung1","zhong4":"zung3"},
u"\u4E3A\u70BA":{"wei2":"wai4","wei4":"wai6"},
u"\u4E50\u6A02":{"le4":"lok6","yue4":"ngok6"},
u"\u4EB2\u89AA":{"qin1":"can1","qing4":"can3"},
u"\u4EC0":{"shen2":"sam6","shi2":"sap6"}, # unless zaap6
u"\u4FBF":{"bian4":"bin6","pian2":"pin4"},
u"\u5047":{"jia3":"gaa2","jia4":"gaa3"},
u"\u5174\u8208":{"xing1":"hing1","xing4":"hing3"},
# u"\u5207":{"qie4":"cai3","qie1":"cit3"}, # WRONG (rm'd v1.17). It's cit3 in re4qie4. It just wasn't in yiqie4 (which zhy_list has as an exception anyway)
  # NOTE(review): ~34 py2j entries were lost from this copy of the file
  # (the web scrape replaced them with bare line numbers 109-142);
  # restore them from the upstream cantonese.py before relying on this
  # table being complete.
u"\u521B\u5275":{"chuang1":"cong1","chuang4":"cong3"},
u"\u53EA":{"zhi1":"zek3","zhi3":"zi2"},
u"\u53F7\u865F":{"hao4":"hou6","hao2":"hou4"},
u"\u548C":{"he2":"wo4","he4":"wo6"},
u"\u54BD":{"yan1":"jin1","yan4":"jin3","ye4":"jit3"},
u"\u5708":{"juan4":"gyun6","quan1":"hyun1"},
u"\u597D":{"hao3":"hou2","hao4":"hou3"},
u"\u5C06\u5C07":{"jiang1":"zoeng1","jiang4":"zoeng3"},
u"\u5C11":{"shao3":"siu2","shao4":"siu3"},
u"\u5DEE":{"cha4":"caa1","cha1":"caa1","chai1":"caai1"},
u"\u5F37\u5F3A":{"qiang2":"koeng4","qiang3":"koeng5"},
u"\u62C5\u64D4":{"dan1":"daam1","dan4":"daam3"},
u"\u6323\u6399":{"zheng4":"zaang6","zheng1":"zang1"},
u"\u6570\u6578":{"shu3":"sou2","shu4":"sou3"},
u"\u671D":{"chao2":"ciu4","zhao1":"ziu1"},
u"\u6ED1":{"hua2":"waat6","gu3":"gwat1"},
u"\u6F02":{"piao1":"piu1","piao3 piao4":"piu3"},
u"\u76DB":{"sheng4":"sing6","cheng2":"sing4"},
u"\u76F8":{"xiang1":"soeng1","xiang4":"soeng3"},
u"\u770B":{"kan4":"hon3","kan1":"hon1"},
u"\u79CD\u7A2E":{"zhong3":"zung2","zhong4":"zung3"},
u"\u7EF7\u7E43":{"beng1":"bang1","beng3":"maang1"},
u"\u8208":{"xing1":"hing1","xing4":"hing3"},
u"\u843D":{"luo1 luo4 lao4":"lok6","la4":"laai6"},
u"\u8457":{"zhu4":"zyu3","zhuo2":"zoek3","zhuo2 zhao2 zhao1 zhe5":"zoek6"},
u"\u8981":{"yao4":"jiu3","yao1":"jiu1"},
u"\u89C1\u898B":{"jian4":"gin3","xian4":"jin6"},
u"\u89C9\u89BA":{"jue2":"gok3","jiao4":"gaau3"},
u"\u8B58\u8BC6":{"shi2 shi4":"sik1","zhi4":"zi3"},
u"\u8ABF\u8C03":{"diao4":"diu6","tiao2":"tiu4"},
u"\u91CF":{"liang2":"loeng4","liang4":"loeng6"},
u"\u9577\u957F":{"chang2":"coeng4","zhang3":"zoeng2"},
u"\u9593\u95F4":{"jian1":"gaan1","jian4":"gaan3"},
u"\u96BE\u96E3":{"nan2":"naan4","nan4":"naan6"}}
# Normalise py2j for lookup:
# 1. a multi-character key covers simplified+traditional variants of the
#    same zi - give each character its own entry;
# 2. a multi-pinyin key like "luo1 luo4 lao4" covers several Mandarin
#    readings sharing one Cantonese reading - one entry per reading.
for k in list(py2j.keys()):
    if len(k)>1:
        for c in list(k): py2j[c]=py2j[k]
        del py2j[k]
for _,v in py2j.items():
    for k in list(v.keys()): # restored: this inner loop line was missing from this copy, leaving k dangling from the loop above
        if len(k.split())>1:
            for w in k.split(): v[w]=v[k]
            del v[k]
py2j_chars = re.compile(u'['+''.join(list(py2j.keys()))+']')
j = S(j).lower().replace("j","y").replace("z","j")
return j.lower().replace("ohek","euk")

# (commit-attribution text from the web scrape removed; not part of the program)
def jyutping_to_lau_java(jyutpingNo=2,lauNo=1):
    # for annogen.py 3.29+ --annotation-postprocess to ship Jyutping and generate Lau at runtime
    """Return a fragment of Java source (as a Python string) that
    post-processes <rt> ruby annotations: annotation number jyutpingNo
    gets soft hyphens inserted after tone digits, and annotation number
    lauNo is converted from Jyutping to Sidney Lau (using the jlRep
    replacement pairs baked into the generated code) with superscript
    tone numbers and original capitalisation preserved."""
    return 'if(annotNo=='+str(jyutpingNo)+'||annotNo=='+str(lauNo)+'){m=Pattern.compile("<rt>(.*?)</rt>").matcher(r);sb=new StringBuffer();while(m.find()){String r2=(annotNo=='+str(jyutpingNo)+'?m.group(1).replaceAll("([1-7])(.)","$1­$2"):(m.group(1)+" ").toLowerCase().replace("j","y").replace("z","j")'+''.join('.replace("'+k+'","'+v+'")' for k,v in jlRep)+'.toLowerCase().replace("ohek","euk").replaceAll("([1-7])","<sup>$1</sup>-").replace("- "," ").replaceAll(" $","")),tmp=m.group(1).substring(0,1);if(annotNo=='+str(lauNo)+'&&tmp.equals(tmp.toUpperCase()))r2=r2.substring(0,1).toUpperCase()+r2.substring(1);m.appendReplacement(sb,"<rt>"+r2+"</rt>");}m.appendTail(sb); r=sb.toString();}' # TODO: can probably go faster with mapping for some of this
def incomplete_lau_to_jyutping(l):
    # incomplete: assumes Lau didn't do the "aa" -> "a" rule
    """Best-effort Sidney Lau -> Jyutping: apply the ljRep replacement
    pairs, then undo Lau's j/y letter swap (via z as a temporary)."""
    for lau_form, jyut_form in ljRep:
        l = l.replace(lau_form, jyut_form)
    return l.lower().replace("j","z").replace("y","j")
def incomplete_lau_to_yale_u8(l):
    """Sidney Lau -> Yale (UTF-8 bytes), going via Jyutping;
    'incomplete' in the same sense as incomplete_lau_to_jyutping."""
    jyutping = incomplete_lau_to_jyutping(l)
    return jyutping_to_yale_u8(jyutping)
# Jyutping -> Sidney Lau replacement pairs.  Finals that are spelt the
# same in both systems map to their own UPPERCASE form so that later,
# shorter replacements cannot re-match inside already-converted text;
# the conversion lowercases everything at the end.
jlRep = [(unchanged,unchanged.upper()) for unchanged in "aai aau aam aang aan aap aat aak ai au am ang an ap at ak a ei eng ek e iu im ing in ip it ik i oi ong on ot ok ung uk".split()] + [("eoi","UI"),("eon","UN"),("eot","UT"),("eok","EUK"),("oeng","EUNG"),("oe","EUH"),("c","ch"),("ou","O"),("o","OH"),("yu","UE"),("u","OO")]
jlRep.sort(key=lambda a:-len(a[0])) # longest 1st
# u to oo includes ui to ooi, un to oon, ut to oot
# yu to ue includes yun to uen and yut to uet
# drawing from the table on http://www.omniglot.com/writing/cantonese.htm plus this private communication:
# Jyutping "-oeng" maps to Sidney Lau "-eung".
# Jyutping "jyu" maps to Sidney Lau "yue". (consequence of yu->ue, j->y)
# NOTE(review): ljRep (the reverse, Lau -> Jyutping pairs used by
# incomplete_lau_to_jyutping) is sorted here but its definition is not
# present in this copy of the file - presumably lost in the scrape;
# restore it from upstream before running.
ljRep.sort(key=lambda a:-len(a[0])) # longest 1st
def ping_or_lau_to_syllable_list(j):
    """Split a Jyutping or Sidney Lau string into a list of syllables:
    punctuation becomes a separator, and a space is inserted after each
    final tone digit."""
    text = S(j)
    text = re.sub(r"[!-/:-@^-`]"," ",text)          # punctuation -> spaces
    text = re.sub(r"([1-9])(?![0-9])",r"\1 ",text)  # break after each tone digit
    return text.split()
def hyphenate_ping_or_lau_syl_list(sList,groupLens=None):
    """Hyphenate a Jyutping/Lau syllable list into words; thin wrapper
    around hyphenate_syl_list (kept separate for symmetry with
    hyphenate_yale_syl_list, which also accepts strings)."""
    return hyphenate_syl_list(sList,groupLens)
def hyphenate_yale_syl_list(sList,groupLens=None):
    """Hyphenate a Yale syllable list (or space-separated string) into
    words according to groupLens."""
    # (if sList is a string, the syllables must be space-separated,
    # which will be the case if to_yale functions below are used)
    if type(sList)==list:
        return hyphenate_syl_list(sList,groupLens)
    return hyphenate_syl_list(sList.split(),groupLens)
def hyphenate_syl_list(sList,groupLens=None):
    """Join the syllables of sList into space-separated words, with
    hyphens between the syllables of each word.  groupLens gives the
    number of syllables per word; if unknown, each syllable stands
    alone (i.e. no hyphenation), unless --hyphenate-all is given."""
    assert type(sList) == list
    if '--hyphenate-all' in sys.argv: groupLens = [len(sList)]
    elif not groupLens: groupLens = [1]*len(sList) # don't hyphenate at all if we don't know
    else: assert sum(groupLens) == len(sList), "sum("+repr(groupLens)+")!=len("+repr(sList)+")"
    # Restored loop scaffolding (missing from this copy): walk the
    # groups, hyphenating within each, separating words with spaces.
    r = [] ; start = 0
    for g in groupLens:
        r.append("-".join(S(x) for x in sList[start:start+g]))
        start += g
    return ' '.join(r)
def jyutping_to_yale_TeX(j): # returns space-separated syllables
for syl in ping_or_lau_to_syllable_list(S(j).lower().replace("eo","eu").replace("oe","eu").replace("j","y").replace("yyu","yu").replace("z","j").replace("c","ch")):
for i in range(len(syl)):
if syl[i] in "aeiou":
vowel=i ; break
if vowel==None and re.match(r"h?(m|ng)[456]",syl): # standalone nasal syllables
vowel = syl.find('m')
if vowel<0: vowel = syl.index('n')
lastVowel = syl.find('g')
if lastVowel<0: lastVowel = vowel
ret.append(syl.upper()) ; continue # English word or letter in the Chinese?
if syl[vowel:vowel+2] == "aa" and (len(syl)<vowel+2 or syl[vowel+2] in "123456"):
syl=syl[:vowel]+syl[vowel+1:] # final aa -> a
# the tonal 'h' goes after all the vowels but before any consonants:
for i in range(len(syl)-1,-1,-1):
if syl[i] in "aeiou":
lastVowel=i ; break
if syl[-1] in "1234567":
# get_jyutping replaces 7 with 1 because zhy_list is
# more Canton-type than Hong Kong-type Cantonese and
# there is considerable disagreement on which "1"s
# should be "7"s, but if you pass any "7" into the
# jyutping_to_yale functions we can at least process
# it here:
tone = ["\=",r"\'","",r"\`",r"\'","",r"\`"][int(syl[-1])-1]
if syl[-1] in "456":
syl=syl[:lastVowel+1]+"h"+syl[lastVowel+1:]
ret.append((syl[:vowel]+tone+syl[vowel:-1]).replace(r"\=i",r"\=\i{}").replace(r"\=I",r"\=\I{}"))
else: ret.append(syl.upper()) # English word or letter in the Chinese?
return ' '.join(ret)
def jyutping_to_yale_u8(j): # returns space-separated syllables
    """Convert Jyutping to Yale romanisation as UTF-8 bytes: take the
    TeX form, turn the accent macros into Unicode combining marks on
    the following letter, and NFC-compose the result."""
    import unicodedata
    def _apply_accents(text, pairs):
        # A TeX accent macro precedes its letter; move each one after
        # the letter as a combining character instead.
        for tex_macro, combining in pairs:
            text = re.sub(re.escape(tex_macro)+r"(.)", r"\1"+combining, text)
        return text
    if type(u"")==type(""): toUnicode = str # Python 3
    else: toUnicode = unicode # Python 2
    tex = jyutping_to_yale_TeX(j).replace(r"\i{}","i").replace(r"\I{}","I")
    accented = _apply_accents(toUnicode(tex),
                              [(r"\`",u"\u0300"),(r"\'",u"\u0301"),(r"\=",u"\u0304")])
    return unicodedata.normalize('NFC',accented).encode('utf-8')
def superscript_digits_TeX(j):
    # for jyutping and Sidney Lau
    """Wrap each tone digit of j in TeX superscript markup."""
    # One regex pass: the previous chained str.replace() calls corrupted
    # digits inside already-inserted markup (e.g. the 3 of "-0.3ex").
    return re.sub(r"([1-9])",r"\\raisebox{-0.3ex}{$^\1$}\\hspace{0pt}",j)
def superscript_digits_HTML(j):
    """Wrap each tone digit of j in HTML <sup> markup.  (Restored: this
    def line was lost from this copy of the file, fusing the body into
    superscript_digits_TeX, although the function is called below.)"""
    for digit in "123456789": j=j.replace(digit,"<sup>"+digit+"</sup>")
    return j
def superscript_digits_UTF8(j):
    """Replace tone digits 1-9 with Unicode superscript characters,
    returning UTF-8 bytes."""
    # WARNING: not all fonts have all digits; many have only the first 3. superscript_digits_HTML might be better for browsers, even though it does produce more bytes.
    j = S(j)
    for i, sup in enumerate(u"¹²³⁴⁵⁶⁷⁸⁹"):
        j = j.replace(str(i+1), S(sup.encode('utf-8')))
    if type(j)==type(u""): j=j.encode('utf-8') # Python 3
    return j
def import_gradint():
    """Import the gradint module (once), neutralising its command-line
    parsing and eSpeak preprocessors, and return it.  Callers do
    import_gradint().ESpeakSynth()."""
    global gradint
    try: return gradint # already imported on an earlier call?
    except NameError: pass # narrowed from bare 'except'
    # when importing gradint, make sure no command line
    # (gradint inspects sys.argv at import time)
    tmp,sys.argv = sys.argv,sys.argv[:1]
    try:
        import gradint # binds the module-level name via the 'global' above
    finally:
        sys.argv = tmp # restore argv even if the import fails
    gradint.espeak_preprocessors = {}
    return gradint # restored: this line was missing from this copy, so first call returned None
try: unichr # Python 2
except NameError: unichr = chr # Python 3: unichr was removed; chr handles all code points (shim was missing from this copy)
def do_song_subst(hanzi_u8):
    """Pre-substitution for song lyrics before get_jyutping."""
    return B(hanzi_u8).replace(unichr(0x4f7f).encode('utf-8'),unichr(0x38c8).encode('utf-8')) # Mandarin shi3 (normally jyutping sai2) is usually si3 in songs, so substitute a rarer character that unambiguously has that reading before sending to get_jyutping

# (commit-attribution text from the web scrape removed; not part of the program)
# with '#' separators if --yale#lau in sys.argv,
# also --yale#ping and --yale#lau#ping accepted);
# if there's a # in the line, assume it's hanzi#pinyin
# (for annogen.py --reannotator="##python cantonese.py")
lines = sys.stdin.read().replace("\r\n","\n").split("\n")
if lines and not lines[-1]: del lines[-1]
dryrun_mode = True
if '#' in l: l,pinyin = l.split('#')
else: pinyin = None
if pinyin and not type(pinyin)==type(u""):
pinyin = pinyin.decode('utf-8')
if pinyin and not (pinyin,) in cache:
pinyin_dryrun.add(pinyin)
for w in pinyin.split():
for h in w.split('-'):
pinyin_dryrun.add(h)
for l in lines:
if '#' in l: l,pinyin = l.split('#')
else: pinyin = None
if not jyutping: groupLens = None # likely a Unihan-only 'fallback readings' zi that has no Cantonese
elif pinyin:
jyutping = adjust_jyutping_for_pinyin(l,jyutping,pinyin)
groupLens = [0]
for syl,space in re.findall('([A-Za-z]*[1-5])( *)',' '.join('-'.join(py2nums(h) for h in w.split('-')) for w in pinyin.split())): # doing it this way so we're not relying on espeak transliterate_multiple to preserve spacing and hyphenation
groupLens[-1] += 1
if space: groupLens.append(0)
if not groupLens[-1]: groupLens=groupLens[:-1]
lenWanted = len(ping_or_lau_to_syllable_list(jyutping))
if sum(groupLens) > lenWanted: # probably silent -r to drop
for i,word in enumerate(py2nums(pinyin).split()):
groupLens[i] -= 1
if sum(groupLens)==lenWanted: break
if not sum(groupLens)==lenWanted:
sys.stderr.write("WARNING: failed to fix "+pinyin+" ("+py2nums(pinyin)+") to "+jyutping+" ("+repr(ping_or_lau_to_syllable_list(jyutping))+") from "+l+", omitting\n")
groupLens = None ; jyutping = ""
else: groupLens = None
if "--yale#lau" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens)+"#"+superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping),groupLens)))

# (commit-attribution text from the web scrape removed; not part of the program)
elif '--yale#ping' in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens)+"#"+jyutping.replace(' ',''))
elif "--yale#lau#ping" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens)+"#"+superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping),groupLens))+"#"+jyutping.replace(' ',''))
elif "--yale" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens))
else: print (superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping),groupLens)))
# Persist the updated transliteration cache for the next run
# (best-effort: an unwritable cache_fname is silently ignored
# rather than crashing at exit).
try: pickle.Pickler(open(cache_fname,"wb"),-1).dump(cache)
except: pass