#!/usr/bin/env python
# -*- coding: utf-8 -*-
#  (should work with either Python 2 or Python 3)

# cantonese.py - Python functions for processing Cantonese transliterations
# (uses eSpeak and Gradint for help with some of them)

# v1.47 (c) 2013-15,2017-23 Silas S. Brown.  License: GPL

# Shared transliteration cache, so that eSpeak is not run again
# for text it has already processed.  Two kinds of key are used:
#   hanzi string    -> jyutping
#   (pinyin,) tuple -> transliteration
cache = dict()
# While dryrun_mode is True, requests are only collected (in the
# two sets below) so the cache can later be populated in one batch
dryrun_mode = False
jyutping_dryrun = set()
pinyin_dryrun = set()
Silas S. Brown's avatar
Silas S. Brown committed

import re, pickle, os, sys
# Where the pickled cache lives: the argument after --cache on the
# command line, else $JYUTPING_CACHE, else a fixed file in /tmp
if '--cache' in sys.argv:
  cache_fname = sys.argv[sys.argv.index('--cache')+1]
else: cache_fname = os.environ.get("JYUTPING_CACHE","/tmp/.jyutping-cache")
# Best-effort load: a missing or unreadable cache file just means
# we start with whatever 'cache' already holds.
# NOTE: unpickling can execute arbitrary code, so the cache file
# must be trusted (the default path is in world-writable /tmp).
try:
  with open(cache_fname,"rb") as _cache_file: # closed promptly, even on error
    cache = pickle.Unpickler(_cache_file).load()
except Exception: pass # (not a bare except: let Ctrl-C etc through)
Silas S. Brown's avatar
Silas S. Brown committed

extra_zhy_dict = { # TODO: add these to the real zhy_list in eSpeak
Silas S. Brown's avatar
Silas S. Brown committed

def S(v):
  """Return v as a text string in both Python 2 and Python 3.

  On Python 3, bytes are decoded as UTF-8; a value that cannot be
  decoded (already str, not bytes at all, or invalid UTF-8) is
  returned unchanged.  On Python 2, v is always returned unchanged.
  """
  if type("")==type(u""): # Python 3
    try: return v.decode('utf-8') # in case it's bytes
    except Exception: return v # no .decode method, or not valid UTF-8
  else: return v
def B(v):
  """Return v as bytes in Python 3 (str in Python 2).

  Unicode text is encoded as UTF-8; any other value is returned
  unchanged.
  """
  is_unicode = (type(v)==type(u""))
  return v.encode('utf-8') if is_unicode else v

Silas S. Brown's avatar
Silas S. Brown committed
def get_jyutping(hanzi,mustWork=1):
  """Return eSpeak's Jyutping transliteration of hanzi, using the cache.

  hanzi: Unicode string, or UTF-8 bytes (decoded here).
  mustWork: if true, assert that the result is not blank;
    if false, a blank result is returned as "".

  While dryrun_mode is set nothing is transliterated: text not yet
  in the cache is queued in jyutping_dryrun and the placeholder
  "aai1" is returned.  The first call made after dryrun_mode is
  cleared sends the whole queue to eSpeak in one batch and caches
  the results before answering.
  """
  global espeak, jyutping_dryrun
  if not type(hanzi)==type(u""): hanzi=hanzi.decode('utf-8')
  for k,v in extra_zhy_dict.items(): hanzi=hanzi.replace(k,v)
  if not espeak: # first call: set up the synth object
      espeak = import_gradint().ESpeakSynth()
      if not espeak.works_on_this_platform(): # must call
          raise Exception("espeak.works_on_this_platform")
      assert espeak.supports_language("zhy")
  def tidy(translit):
      # 7 -> 1: see the tone comments in jyutping_to_yale_TeX.
      # .lower() needed because espeak sometimes randomly
      # capitalises e.g. 2nd hanzi of 'hypocrite' (Mandarin
      # xuwei de ren)
      return S(translit).replace("7","1").lower()
  if dryrun_mode:
      if not hanzi in cache: jyutping_dryrun.add(hanzi)
      return "aai1" # placeholder value
  elif jyutping_dryrun: # flush the queue built up by the dry run
      pending = list(jyutping_dryrun)
      vals = espeak.transliterate_multiple("zhy",pending,0)
      assert len(pending)==len(vals)
      for k,v in zip(pending,vals): cache[k]=tidy(v)
      jyutping_dryrun = set()
  if hanzi in cache: jyutping = cache[hanzi]
  else: cache[hanzi] = jyutping = tidy(espeak.transliterate("zhy",hanzi,forPartials=0))
  if mustWork: assert jyutping.strip(), "No translit. result for "+repr(hanzi)
  elif not jyutping.strip(): jyutping=""
  return jyutping
espeak = 0 # the synth object, created by the first get_jyutping call

Silas S. Brown's avatar
Silas S. Brown committed
# Return only those characters of a Unicode string whose code
# points are in U+4E00..U+A6FF or are U+10000 and above
def hanzi_only(unitext): return u"".join(ch for ch in unitext if ord(ch)>=0x10000 or 0x4e00<=ord(ch)<0xa700)
  if not type(pinyin)==type(u""):
    pinyin = pinyin.decode('utf-8')
  if not pinyin.strip(): return ""
  global pinyin_dryrun
  if pinyin_dryrun:
    pinyin_dryrun = list(pinyin_dryrun)
Silas S. Brown's avatar
Silas S. Brown committed
    vals = espeak.transliterate_multiple("zh",pinyin_dryrun,0)
    assert len(pinyin_dryrun)==len(vals)
    for i in range(len(pinyin_dryrun)):
    pinyin_dryrun = set()
  if (pinyin,) in cache: pyNums = cache[(pinyin,)]
Silas S. Brown's avatar
Silas S. Brown committed
  else: pyNums = espeak.transliterate("zh",pinyin,forPartials=0) # (this transliterate just does tone marks to numbers, adds 5, etc; forPartials=0 because we DON'T want to change letters like X into syllables, as that won't happen in jyutping and we're going through it tone-by-tone)
  assert pyNums and pyNums.strip(), "espeak.transliterate returned %s for %s" % (repr(pyNums),repr(pinyin))
  return re.sub("a$","a5",re.sub("(?<=[a-zA-Z])er([1-5])",r"e\1r5",S(pyNums)))
# getNext(gen): fetch the next item from an iterator, whichever
# method name this version of Python gives iterators for that
if type("")!=type(u""): # Python 2
  def getNext(gen): return gen.next()
else: # Python 3
  def getNext(gen): return gen.__next__()
def adjust_jyutping_for_pinyin(hanzi,jyutping,pinyin):
  """Correct an automatic Jyutping transcription using known-good pinyin.

  If we have good quality (proof-read etc) Mandarin pinyin, this
  can sometimes improve the automatic Cantonese transcription:
  for a character listed in py2j, the Mandarin reading selects
  which Cantonese reading to use.

  hanzi: the text (Unicode, or UTF-8 bytes); only the characters
    kept by hanzi_only are considered.
  jyutping: the transcription to correct.
  pinyin: the Mandarin reading, in any form py2nums accepts.

  Returns the corrected Jyutping, or jyutping itself when no
  character of hanzi is in py2j or when the syllables cannot be
  lined up with the characters.
  """
  if not type(hanzi)==type(u""): hanzi = hanzi.decode('utf-8')
  hanzi = hanzi_only(hanzi)
  if not re.search(py2j_chars,hanzi): return jyutping # nothing here is in py2j
  pinyin = re.findall('[A-Za-z]*[1-5]',py2nums(pinyin))
  if not len(pinyin)==len(hanzi): return jyutping # can't fix
  jyutping = S(jyutping)
  tones = re.finditer('[1-7]',jyutping) # each tone digit ends a syllable
  pieces = [] ; start = 0
  for h,p in zip(hanzi,pinyin):
    try: end = getNext(tones).end()
    except StopIteration: return jyutping # one of the hanzi has no Cantonese reading in our data: we'll warn "failed to fix" below
    piece = jyutping[start:end] ; start = end
    p = p.lower()
    if h in py2j and p in py2j[h]:
      # keep anything in front of the syllable (spacing etc),
      # and swap the syllable itself for the py2j reading
      piece = piece[:re.search("[A-Za-z]*[1-7]$",piece).start()]+py2j[h][p]
    pieces.append(piece)
  return "".join(pieces)+jyutping[start:]
Silas S. Brown's avatar
Silas S. Brown committed
u"\u4EC0":{"shen2":"sam6","shi2":"sap6"}, # unless zaap6
Silas S. Brown's avatar
Silas S. Brown committed
Silas S. Brown's avatar
Silas S. Brown committed
Silas S. Brown's avatar
Silas S. Brown committed
# u"\u5207":{"qie4":"cai3","qie1":"cit3"}, # WRONG (rm'd v1.17).  It's cit3 in re4qie4.  It just wasn't in yiqie4 (which zhy_list has as an exception anyway)
Silas S. Brown's avatar
Silas S. Brown committed
u"\u6F02":{"piao1":"piu1","piao3 piao4":"piu3"},
u"\u843D":{"luo1 luo4 lao4":"lok6","la4":"laai6"},
u"\u8457":{"zhu4":"zyu3","zhuo2":"zoek3","zhuo2 zhao2 zhao1 zhe5":"zoek6"},
u"\u8B58\u8BC6":{"shi2 shi4":"sik1","zhi4":"zi3"},
# Expand the shorthand used in the py2j table, so that it can be
# looked up one hanzi and one pinyin syllable at a time.
# 1. A key of several hanzi becomes one key per hanzi (all of
#    them sharing the same dictionary of readings):
for k in list(py2j.keys()):
  if len(k)>1:
    for c in list(k): py2j[c]=py2j[k]
    del py2j[k]
# 2. A space-separated key of several pinyin syllables becomes
#    one key per syllable:
for _,v in py2j.items():
  for k in list(v.keys()):
    if len(k.split())>1:
      for w in k.split(): v[w]=v[k]
      del v[k]
# Regular expression matching any one hanzi that has a py2j entry
py2j_chars = re.compile(u'['+''.join(list(py2j.keys()))+']')
Silas S. Brown's avatar
Silas S. Brown committed

Silas S. Brown's avatar
Silas S. Brown committed
def jyutping_to_lau(j):
  """Convert Jyutping (text, or UTF-8 bytes) to Sidney Lau romanisation.

  The result is lower case with the tone digits left as they are.
  """
  # The j and z initials first
  j = S(j).lower().replace("j","y").replace("z","j")
  # Then the jlRep table (longest patterns first).  The input is
  # lower case at this point and nearly all of jlRep's replacement
  # text is upper case, so text that has been converted is not
  # matched again by a later pattern.
  for k,v in jlRep: j=j.replace(k,v)
  # "oek" comes out of the table as "ohek": Lau writes it "euk"
  return j.lower().replace("ohek","euk")
def jyutping_to_lau_java(jyutpingNo=2,lauNo=1):
  # Returns Java code as a string, for annogen.py 3.29+
  # --annotation-postprocess: lets an app ship Jyutping only and
  # generate Lau at runtime.  jyutpingNo and lauNo are the
  # annotNo values the Java code tests for.
  jNo = str(jyutpingNo) ; lNo = str(lauNo)
  # the jlRep table as a chain of Java .replace() calls
  lauReplaces = ''.join('.replace("'+k+'","'+v+'")' for k,v in jlRep)
  return ''.join([
    'if(annotNo==', jNo, '||annotNo==', lNo,
    '){m=Pattern.compile("<rt>(.*?)</rt>").matcher(r);sb=new StringBuffer();while(m.find()){String r2=(annotNo==', jNo,
    '?m.group(1).replaceAll("([1-7])(.)","$1&shy;$2"):(m.group(1)+" ").toLowerCase().replace("j","y").replace("z","j")',
    lauReplaces,
    '.toLowerCase().replace("ohek","euk").replaceAll("([1-7])","<sup>$1</sup>-").replace("- "," ").replaceAll(" $","")),tmp=m.group(1).substring(0,1);if(annotNo==', lNo,
    '&&tmp.equals(tmp.toUpperCase()))r2=r2.substring(0,1).toUpperCase()+r2.substring(1);m.appendReplacement(sb,"<rt>"+r2+"</rt>");}m.appendTail(sb); r=sb.toString();}',
  ]) # TODO: can probably go faster with mapping for some of this
Silas S. Brown's avatar
Silas S. Brown committed
def incomplete_lau_to_jyutping(l):
  """Convert Sidney Lau romanisation (text, or UTF-8 bytes) to Jyutping.

  Incomplete: assumes Lau didn't do the "aa" -> "a" rule.
  The result is lower case.
  """
  # Undo the "ohek" -> "euk" step of jyutping_to_lau first, so
  # that the ljRep table (longest patterns first) can do the rest
  l = S(l).lower().replace("euk","ohek")
  for k,v in ljRep: l=l.replace(k,v)
  # Finally the initials: Lau j is Jyutping z, Lau y is Jyutping j
  return l.lower().replace("j","z").replace("y","j")
def incomplete_lau_to_yale_u8(l):
  # Lau -> Jyutping -> Yale, so this is only as complete as
  # incomplete_lau_to_jyutping is
  jyutping = incomplete_lau_to_jyutping(l)
  return jyutping_to_yale_u8(jyutping)
Silas S. Brown's avatar
Silas S. Brown committed
# jlRep: (Jyutping, Sidney Lau) replacement pairs for syllable
# finals (plus the initial c), used by jyutping_to_lau.  Finals
# spelled the same in both systems are listed too, mapped to their
# own upper case, so that once replaced they are out of the way of
# the shorter patterns that follow.
jlRep = [(unchanged,unchanged.upper()) for unchanged in "aai aau aam aang aan aap aat aak ai au am ang an ap at ak a ei eng ek e iu im ing in ip it ik i oi ong on ot ok ung uk".split()] + [("eoi","UI"),("eon","UN"),("eot","UT"),("eok","EUK"),("oeng","EUNG"),("oe","EUH"),("c","ch"),("ou","O"),("o","OH"),("yu","UE"),("u","OO")]
jlRep.sort(key=lambda a:-len(a[0])) # longest 1st
# u to oo includes ui to ooi, un to oon, ut to oot
# yu to ue includes yun to uen and yut to uet
# drawing from the table on http://www.omniglot.com/writing/cantonese.htm plus this private communication:
# Jyutping "-oeng" maps to Sidney Lau "-eung".
# Jyutping "jyu" maps to Sidney Lau "yue". (consequence of yu->ue, j->y)

# ljRep: the same table reversed (Lau in lower case -> Jyutping in
# upper case), used by incomplete_lau_to_jyutping
ljRep=[(b.lower(),a.upper()) for a,b in jlRep]
ljRep.sort(key=lambda a:-len(a[0])) # longest 1st
Silas S. Brown's avatar
Silas S. Brown committed

def ping_or_lau_to_syllable_list(j):
  # Split Jyutping or Lau text into a list of syllables.
  # Punctuation in the ASCII ranges !-/ :-@ and ^-` counts as
  # a space:
  spaced = re.sub(r"[!-/:-@^-`]"," ",S(j))
  # and a syllable also ends after a digit 1-9 that is not
  # followed by another digit, even if no space was written there
  return re.sub(r"([1-9])(?![0-9])",r"\1 ",spaced).split()
Silas S. Brown's avatar
Silas S. Brown committed

def hyphenate_ping_or_lau_syl_list(sList,groupLens=None):
    """Hyphenate Jyutping or Lau syllables into words.

    sList: a list of syllables, or a string, which is first split
      by ping_or_lau_to_syllable_list.
    groupLens: how many syllables are in each word: see
      hyphenate_syl_list, which does the work.
    """
    if type(sList) in [str,type(u"")]:
        sList = ping_or_lau_to_syllable_list(sList)
    return hyphenate_syl_list(sList,groupLens)
def hyphenate_yale_syl_list(sList,groupLens=None):
    """Hyphenate Yale syllables into words.

    sList: a list of syllables, or a string, which is split at
      whitespace (so if sList is a string, the syllables must be
      space-separated, which will be the case if the to_yale
      functions below are used).
    groupLens: how many syllables are in each word: see
      hyphenate_syl_list, which does the work.
    """
    if not type(sList)==list: sList = sList.split()
    return hyphenate_syl_list(sList,groupLens)
def hyphenate_syl_list(sList,groupLens=None):
    """Join syllables into hyphenated words separated by spaces.

    sList: list of syllables (each is passed through S).
    groupLens: list giving the number of syllables in each word,
      in order; it must add up to len(sList).  If it is omitted
      or empty, every syllable is a word by itself.  If
      --hyphenate-all is on the command line, groupLens is
      ignored and all the syllables are joined into one word.

    Raises AssertionError if sList is not a list, or if groupLens
    does not add up to its length.
    """
    assert type(sList) == list
    if '--hyphenate-all' in sys.argv: groupLens = [len(sList)]
    elif not groupLens: groupLens = [1]*len(sList) # don't hyphenate at all if we don't know
    else: assert sum(groupLens) == len(sList), "sum("+repr(groupLens)+")!=len("+repr(sList)+")"
    words = [] ; start = 0
    for g in groupLens: # take the next g syllables as one word
        words.append("-".join(S(x) for x in sList[start:start+g]))
        start += g
    return " ".join(words)
Silas S. Brown's avatar
Silas S. Brown committed
def jyutping_to_yale_TeX(j): # returns space-separated syllables
Silas S. Brown's avatar
Silas S. Brown committed
  for syl in ping_or_lau_to_syllable_list(S(j).lower().replace("eo","eu").replace("oe","eu").replace("j","y").replace("yyu","yu").replace("z","j").replace("c","ch")):
Silas S. Brown's avatar
Silas S. Brown committed
Silas S. Brown's avatar
Silas S. Brown committed
    for i in range(len(syl)):
      if syl[i] in "aeiou":
        vowel=i ; break
Silas S. Brown's avatar
Silas S. Brown committed
    if vowel==None and re.match(r"h?(m|ng)[456]",syl): # standalone nasal syllables
Silas S. Brown's avatar
Silas S. Brown committed
      vowel = syl.find('m')
      if vowel<0: vowel = syl.index('n')
Silas S. Brown's avatar
Silas S. Brown committed
      lastVowel = syl.find('g')
      if lastVowel<0: lastVowel = vowel
Silas S. Brown's avatar
Silas S. Brown committed
    if vowel==None:
Silas S. Brown's avatar
Silas S. Brown committed
      ret.append(syl.upper()) ; continue # English word or letter in the Chinese?
    if syl[vowel:vowel+2] == "aa" and (len(syl)<vowel+2 or syl[vowel+2] in "123456"):
      syl=syl[:vowel]+syl[vowel+1:] # final aa -> a
    # the tonal 'h' goes after all the vowels but before any consonants:
    for i in range(len(syl)-1,-1,-1):
      if syl[i] in "aeiou":
        lastVowel=i ; break
    if syl[-1] in "1234567":
      # get_jyutping replaces 7 with 1 because zhy_list is
      # more Canton-type than Hong Kong-type Cantonese and
      # there is considerable disagreement on which "1"s
      # should be "7"s, but if you pass any "7" into the
      # jyutping_to_yale functions we can at least process
      # it here:
      tone = ["\=",r"\'","",r"\`",r"\'","",r"\`"][int(syl[-1])-1]
Silas S. Brown's avatar
Silas S. Brown committed
      if syl[-1] in "456":
Silas S. Brown's avatar
Silas S. Brown committed
    else: ret.append(syl.upper()) # English word or letter in the Chinese?
  return ' '.join(ret)

Silas S. Brown's avatar
Silas S. Brown committed
def jyutping_to_yale_u8(j): # returns space-separated syllables
  # Same as jyutping_to_yale_TeX, except that the tone accents
  # are Unicode instead of TeX commands, and the result is
  # encoded as UTF-8
  import unicodedata
  if type(u"")==type(""): U=str # Python 3
  else: U=unicode # Python 2
  # each TeX accent command with the Unicode combining character
  # that replaces it
  accents = [(r"\`",u"\u0300"),(r"\'",u"\u0301"),(r"\=",u"\u0304")]
  yale = U(jyutping_to_yale_TeX(j).replace(r"\i{}","i").replace(r"\I{}","I"))
  for texAccent,combining in accents:
    # TeX puts the accent before its letter; a combining
    # character has to come after it
    yale = re.sub(re.escape(texAccent)+r"(.)",r"\1"+combining,yale)
  return unicodedata.normalize('NFC',yale).encode('utf-8')
Silas S. Brown's avatar
Silas S. Brown committed

Silas S. Brown's avatar
Silas S. Brown committed
def superscript_digits_TeX(j):
  """Return j with each of the digits 1-9 made a TeX superscript.

  For jyutping and Sidney Lau.  j may be text or UTF-8 bytes
  (it is passed through S).
  """
  # This has to be a single pass over the input.  Replacing one
  # digit at a time would, on reaching "3", also rewrite the 3 of
  # the "-0.3ex" already inserted for every tone 1 and tone 2.
  return re.sub("[1-9]",lambda m:r"\raisebox{-0.3ex}{$^"+m.group(0)+r"$}\hspace{0pt}",S(j))

def superscript_digits_HTML(j):
  """Return j with each of the digits 1-9 wrapped in HTML <sup>...</sup>.

  j may be text or UTF-8 bytes (it is passed through S).
  """
  j = S(j)
  for digit in "123456789":
    j = j.replace(digit,"<sup>"+digit+"</sup>")
  return j

def superscript_digits_UTF8(j):
  """Return j with the digits 1-9 changed to Unicode superscript digits.

  j may be text or UTF-8 bytes (it is passed through S).  On
  Python 3 the result is encoded as UTF-8 bytes.

  WARNING: not all fonts have all digits; many have only the
  first 3.  superscript_digits_HTML might be better for browsers,
  even though it does produce more bytes.
  """
  j = S(j)
  for n,sup in enumerate(u"¹²³⁴⁵⁶⁷⁸⁹",1): # n is the digit that sup replaces
    j = j.replace(str(n),S(sup.encode('utf-8')))
  if type(j)==type(u""): j=j.encode('utf-8') # Python 3
  return j

def import_gradint():
    """Import the gradint module (first call only) and return it.

    The module is kept in the global 'gradint', and its
    espeak_preprocessors is set to an empty dictionary.  Whatever
    exception the import raises is passed on to the caller.
    """
    global gradint
    try: return gradint # already imported by an earlier call
    except NameError: pass
    # when importing gradint, make sure it sees no command line
    # (presumably it would otherwise act on our arguments)
    tmp,sys.argv = sys.argv,sys.argv[:1]
    try: import gradint
    finally: sys.argv = tmp # put it back even if the import failed
    gradint.espeak_preprocessors = {}
    return gradint

def do_song_subst(hanzi_u8):
  # Mandarin shi3 (normally jyutping sai2) is usually si3 in songs, so substitute a rarer character that unambiguously has that reading before sending to get_jyutping.
  # Takes text or UTF-8 bytes; returns UTF-8 bytes with U+4F7F replaced by U+38C8.
  # (Written as \u escapes because unichr is not a Python 3 builtin.)
  return B(hanzi_u8).replace(u"\u4f7f".encode('utf-8'),u"\u38c8".encode('utf-8'))
Silas S. Brown's avatar
Silas S. Brown committed

Silas S. Brown's avatar
Silas S. Brown committed
if __name__ == "__main__":
Silas S. Brown's avatar
Silas S. Brown committed
    # command-line use: output Lau for each line of stdin
Silas S. Brown's avatar
Silas S. Brown committed
    # (or Yale if there's a --yale in sys.argv, or both
    # with '#' separators if --yale#lau in sys.argv,
    # also --yale#ping and --yale#lau#ping accepted);
Silas S. Brown's avatar
Silas S. Brown committed
    # if there's a # in the line, assume it's hanzi#pinyin
    # (for annogen.py --reannotator="##python cantonese.py")
Silas S. Brown's avatar
Silas S. Brown committed
    lines = sys.stdin.read().replace("\r\n","\n").split("\n")
    if lines and not lines[-1]: del lines[-1]
    dryrun_mode = True
Silas S. Brown's avatar
Silas S. Brown committed
    def songSubst(l):
Silas S. Brown's avatar
Silas S. Brown committed
      if '--song-lyrics' in sys.argv: l=do_song_subst(l)
Silas S. Brown's avatar
Silas S. Brown committed
      return l
Silas S. Brown's avatar
Silas S. Brown committed
    for l in lines:
      if '#' in l: l,pinyin = l.split('#')
      else: pinyin = None
Silas S. Brown's avatar
Silas S. Brown committed
      if pinyin and not type(pinyin)==type(u""):
        pinyin = pinyin.decode('utf-8')
      if pinyin and not (pinyin,) in cache:
        for w in pinyin.split():
          for h in w.split('-'):
Silas S. Brown's avatar
Silas S. Brown committed
    dryrun_mode = False
Silas S. Brown's avatar
Silas S. Brown committed
    for l in lines:
      if '#' in l: l,pinyin = l.split('#')
      else: pinyin = None
Silas S. Brown's avatar
Silas S. Brown committed
      jyutping = get_jyutping(songSubst(l),0)
      if not jyutping: groupLens = None # likely a Unihan-only 'fallback readings' zi that has no Cantonese
      elif pinyin:
        jyutping = adjust_jyutping_for_pinyin(l,jyutping,pinyin)
        groupLens = [0]
        for syl,space in re.findall('([A-Za-z]*[1-5])( *)',' '.join('-'.join(py2nums(h) for h in w.split('-')) for w in pinyin.split())): # doing it this way so we're not relying on espeak transliterate_multiple to preserve spacing and hyphenation
          groupLens[-1] += 1
          if space: groupLens.append(0)
        if not groupLens[-1]: groupLens=groupLens[:-1]
        lenWanted = len(ping_or_lau_to_syllable_list(jyutping))
        if sum(groupLens) > lenWanted: # probably silent -r to drop
          for i,word in enumerate(py2nums(pinyin).split()):
Silas S. Brown's avatar
Silas S. Brown committed
            if re.search("[1-5]r5",word):
              groupLens[i] -= 1
              if sum(groupLens)==lenWanted: break
        if not sum(groupLens)==lenWanted:
          sys.stderr.write("WARNING: failed to fix "+pinyin+" ("+py2nums(pinyin)+") to "+jyutping+" ("+repr(ping_or_lau_to_syllable_list(jyutping))+") from "+l+", omitting\n")
          groupLens = None ; jyutping = ""
      else: groupLens = None
      if "--yale#lau" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens)+"#"+superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping),groupLens)))
      elif '--yale#ping' in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens)+"#"+jyutping.replace(' ',''))
      elif "--yale#lau#ping" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens)+"#"+superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping),groupLens))+"#"+jyutping.replace(' ',''))
      elif "--yale" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping),groupLens))
      else: print (superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping),groupLens)))
    # Save the cache for the next run.  Best effort only: if the
    # file can't be written we simply don't have a cache next time.
    try:
        with open(cache_fname,"wb") as cache_file: # flushed and closed on exit
            pickle.Pickler(cache_file,-1).dump(cache)
    except Exception: pass # (not a bare except: let Ctrl-C etc through)