#!/usr/bin/env python
# -*- coding: utf-8 -*-
#  (should work with either Python 2 or Python 3)

# cantonese.py - Python functions for processing Cantonese transliterations
# (uses eSpeak and Gradint for help with some of them)

# v1.31 (c) 2013-15,2017-20 Silas S. Brown.  License: GPL

dryrun_mode = False # True makes get_jyutping batch it up for later (then run and save cache on first call with False)
# jyutping_cache: hanzi -> jyutping results, persisted between runs in /tmp.
# jyutping_dryrun: hanzi queued while dryrun_mode is on, transliterated in
# one batch eSpeak call later (see get_jyutping).
jyutping_cache = {} ; jyutping_dryrun = set()
import re, pickle
# Best-effort cache load: any failure just starts with an empty cache.
# NOTE(review): unpickling a file from world-writable /tmp can execute
# arbitrary code if another local user plants a malicious cache file --
# consider a per-user cache location.
try: jyutping_cache = pickle.Unpickler(open("/tmp/.jyutping-cache","rb")).load()
except: pass

extra_zhy_dict = { # TODO: add these to the real zhy_list in eSpeak
  u"\u9c85":"bat3",u"\u9b81":"bat3",
}

def S(v): # make sure it's a string in both Python 2 and 3
  """Return v as a text string.

  On Python 3, UTF-8 bytes are decoded; str (and bytes that are not
  valid UTF-8) are returned unchanged.  On Python 2 everything is
  returned unchanged.  (The bare except here used to swallow *any*
  exception, including KeyboardInterrupt; only these two can actually
  occur from the decode call.)"""
  if type("")==type(u""): # Python 3
    try: return v.decode('utf-8') # in case it's bytes
    except (AttributeError, UnicodeDecodeError): return v # already str / undecodable
  else: return v
def B(v): # make sure it's bytes in Python 3, str in Python 2
  """UTF-8-encode text; anything that is not a text string passes through."""
  if type(v)!=type(u""):
    return v # already bytes (Py3) / str (Py2)
  return v.encode('utf-8')

def get_jyutping(hanzi,mustWork=1):
  """Transliterate a hanzi string to Cantonese jyutping via eSpeak's
  "zhy" voice (initialised lazily through Gradint).

  In dryrun_mode the hanzi is only queued and a dummy "aai1" is
  returned; the first non-dryrun call then flushes the whole queue in
  one eSpeak batch call and saves the cache to /tmp (best-effort).

  If mustWork is true, an empty transliteration raises AssertionError;
  otherwise "" is returned for it."""
  if not type(hanzi)==type(u""): hanzi=hanzi.decode('utf-8')
  # substitute characters eSpeak's zhy voice doesn't know with their
  # jyutping directly (see extra_zhy_dict)
  for k,v in extra_zhy_dict.items(): hanzi=hanzi.replace(k,v)
  global espeak
  if not espeak:
      espeak = import_gradint().ESpeakSynth()
      if not espeak.works_on_this_platform(): # must call
          raise Exception("espeak.works_on_this_platform")
      assert espeak.supports_language("zhy")

  global jyutping_dryrun
  if dryrun_mode:
      if not hanzi in jyutping_cache: jyutping_dryrun.add(hanzi)
      return "aai1" # dummy value
  elif jyutping_dryrun: # flush the queue collected during dryrun_mode
      jyutping_dryrun = list(jyutping_dryrun) # fix an order so results can be zipped back
      vals = espeak.transliterate_multiple("zhy",jyutping_dryrun,0)
      assert len(jyutping_dryrun)==len(vals)
      for k,v in zip(jyutping_dryrun,vals):
        jyutping_cache[k]=S(v).replace("7","1").lower() # see below
      jyutping_dryrun = set()
      # best-effort cache save (e.g. read-only /tmp is not fatal):
      try: pickle.Pickler(open("/tmp/.jyutping-cache","wb"),-1).dump(jyutping_cache)
      except: pass
  if hanzi in jyutping_cache: jyutping = jyutping_cache[hanzi]
  else: jyutping_cache[hanzi] = jyutping = S(espeak.transliterate("zhy",hanzi,forPartials=0)).replace("7","1").lower() # .lower() needed because espeak sometimes randomly capitalises e.g. 2nd hanzi of 'hypocrite' (Mandarin xuwei de ren)
  if mustWork: assert jyutping.strip(), "No translit. result for "+repr(hanzi)
  elif not jyutping.strip(): jyutping=""
  return jyutping
espeak = 0 # sentinel: replaced by a Gradint ESpeakSynth instance on first get_jyutping call

def hanzi_only(unitext):
  """Keep only CJK characters: U+4E00..U+A6FF plus everything above U+FFFF."""
  kept = [ch for ch in unitext if 0x4e00 <= ord(ch) < 0xa700 or ord(ch) >= 0x10000]
  return u"".join(kept)
def adjust_jyutping_for_pinyin(hanzi,jyutping,pinyin):
  # If we have good quality (proof-read etc) Mandarin pinyin, this can sometimes improve the automatic Cantonese transcription
  """Patch eSpeak's jyutping for characters whose Cantonese reading is
  determined by their (trusted) Mandarin reading, using the py2j table.
  Returns jyutping unchanged if no such character occurs or the pinyin
  cannot be aligned one syllable per hanzi.
  NOTE(review): reads the module-level espeak object directly, so
  get_jyutping must have been called at least once first -- confirm."""
  if not type(hanzi)==type(u""): hanzi = hanzi.decode('utf-8')
  hanzi = hanzi_only(hanzi)
  if not re.search(py2j_chars,hanzi): return jyutping
  if not type(pinyin)==type(u""): pinyin = pinyin.decode('utf-8')
  assert pinyin.strip(), "blank pinyin" # saves figuring out a findall TypeError
  py2 = espeak.transliterate("zh",pinyin,forPartials=0) # TODO: dryrun_mode ?  (this transliterate just does tone marks to numbers, adds 5, etc; forPartials=0 because we DON'T want to change letters like X into syllables, as that won't happen in jyutping and we're going through it tone-by-tone)
  assert py2 and py2.strip(), "espeak.transliterate returned %s for %s" % (repr(py2),repr(pinyin))
  pinyin = re.findall('[A-Za-z]*[1-5]',S(py2))
  if not len(pinyin)==len(hanzi): return jyutping # can't fix
  jyutping = S(jyutping)
  i = 0 ; tones = re.finditer('[1-7]',jyutping) ; j2 = []
  for h,p in zip(list(hanzi),pinyin):
    # next(tones) works on Python 2.6+ AND 3; the old tones.next() was
    # Python-2-only and raised AttributeError under Python 3
    try: j = next(tones).end()
    except StopIteration: raise Exception("Ran out of tones in "+jyutping+" when zipping "+repr(hanzi)+'/'+repr(pinyin))
    j2.append(jyutping[i:j]) ; i = j # one tone-terminated chunk per hanzi
    # replace the final syllable of the chunk when py2j disambiguates it:
    if h in py2j and p.lower() in py2j[h]: j2[-1]=j2[-1][:re.search("[A-Za-z]*[1-7]$",j2[-1]).start()]+py2j[h][p.lower()]
  return "".join(j2)+jyutping[i:]
# Mandarin-pinyin -> jyutping disambiguation table for hanzi with more
# than one reading.  Keys are one or more hanzi (simplified and
# traditional variants grouped together); values map pinyin syllable(s)
# (space-separated alternatives allowed) to the matching jyutping.
py2j={
u"\u4E2D":{"zhong1":"zung1","zhong4":"zung3"},
u"\u4E3A\u70BA":{"wei2":"wai4","wei4":"wai6"},
u"\u4E50\u6A02":{"le4":"lok6","yue4":"ngok6"},
u"\u4EB2\u89AA":{"qin1":"can1","qing4":"can3"},
u"\u4F20\u50B3":{"chuan2":"cyun4","zhuan4":"zyun6"},
u"\u4FBF":{"bian4":"pin4","pian2":"bin6"},
u"\u5047":{"jia3":"gaa2","jia4":"gaa3"},
u"\u5174\u8208":{"xing1":"hing1","xing4":"hing3"},
# u"\u5207":{"qie4":"cai3","qie1":"cit3"}, # WRONG (rm'd v1.17).  It's cit3 in re4qie4.  It just wasn't in yiqie4 (which zhy_list has as an exception anyway)
u"\u521B\u5275":{"chuang1":"cong1","chuang4":"cong3"},
u"\u53EA":{"zhi1":"zek3","zhi3":"zi2"},
u"\u53F7\u865F":{"hao4":"hou6","hao2":"hou4"},
u"\u548C":{"he2":"wo4","he4":"wo6"},
u"\u54BD":{"yan1":"jin1","yan4":"jin3","ye4":"jit3"},
u"\u5708":{"juan4":"gyun6","quan1":"hyun1"},
u"\u597D":{"hao3":"hou2","hao4":"hou3"},
u"\u5C06\u5C07":{"jiang1":"zoeng1","jiang4":"zoeng3"},
u"\u5C11":{"shao3":"siu2","shao4":"siu3"},
u"\u5DEE":{"cha4":"caa1","cha1":"caa1","chai1":"caai1"},
u"\u5F37\u5F3A":{"qiang2":"koeng4","qiang3":"koeng5"},
u"\u62C5\u64D4":{"dan1":"daam1","dan4":"daam3"},
u"\u6323\u6399":{"zheng4":"zaang6","zheng1":"zang1"},
u"\u6570\u6578":{"shu3":"sou2","shu4":"sou3"},
u"\u671D":{"chao2":"ciu4","zhao1":"ziu1"},
u"\u6ED1":{"hua2":"waat6","gu3":"gwat1"},
u"\u6F02":{"piao1":"piu1","piao3 piao4":"piu3"},
u"\u76DB":{"sheng4":"sing6","cheng2":"sing4"},
u"\u76F8":{"xiang1":"soeng1","xiang4":"soeng3"},
u"\u770B":{"kan4":"hon3","kan1":"hon1"},
u"\u79CD\u7A2E":{"zhong3":"zung2","zhong4":"zung3"},
u"\u7EF7\u7E43":{"beng1":"bang1","beng3":"maang1"},
u"\u8208":{"xing1":"hing1","xing4":"hing3"},
u"\u843D":{"luo1 luo4 lao4":"lok6","la4":"laai6"},
u"\u8457":{"zhu4":"zyu3","zhuo2":"zoek3","zhuo2 zhao2 zhao1 zhe5":"zoek6"},
u"\u8981":{"yao4":"jiu3","yao1":"jiu1"},
u"\u89C1\u898B":{"jian4":"gin3","xian4":"jin6"},
u"\u89C9\u89BA":{"jue2":"gok3","jiao4":"gaau3"},
u"\u8B58\u8BC6":{"shi2 shi4":"sik1","zhi4":"zi3"},
u"\u8ABF\u8C03":{"diao4":"diu6","tiao2":"tiu4"},
u"\u91CF":{"liang2":"loeng4","liang4":"loeng6"},
u"\u9577\u957F":{"chang2":"coeng4","zhang3":"zoeng2"},
u"\u9593\u95F4":{"jian1":"gaan1","jian4":"gaan3"},
u"\u96BE\u96E3":{"nan2":"naan4","nan4":"naan6"}}
# Expand multi-character keys so every hanzi gets its own entry:
for k in list(py2j.keys()):
   if len(k)>1:
    for c in list(k): py2j[c]=py2j[k]
    del py2j[k]
# Expand space-separated pinyin alternatives into individual keys:
for _,v in py2j.items():
  for k in list(v.keys()):
    if len(k.split())>1:
      for w in k.split(): v[w]=v[k]
      del v[k]
# Character class matching any hanzi that has an entry above (keys are
# all single characters after the expansion loops):
py2j_chars = re.compile(u'['+''.join(list(py2j.keys()))+']')

def jyutping_to_lau(j):
  """Convert jyutping romanisation to Sidney Lau romanisation."""
  # initials first: j->y, then z->j (order matters: jyutping z becomes Lau j)
  text = S(j).lower().replace("j","y").replace("z","j")
  # finals/vowels: jlRep is sorted longest-first, and its replacements are
  # uppercased so later (shorter) rules cannot re-match inside them
  for pattern, replacement in jlRep:
    text = text.replace(pattern, replacement)
  text = text.lower()
  return text.replace("aa","a").replace("ohek","euk")
def incomplete_lau_to_jyutping(l):
  """Convert Sidney Lau romanisation back to jyutping.
  "incomplete" because it assumes the Lau input did NOT have the
  "aa" -> "a" rule applied."""
  text = S(l).lower().replace("euk","ohek")
  # ljRep is the longest-first reverse of jlRep, with uppercase guards
  for pattern, replacement in ljRep:
    text = text.replace(pattern, replacement)
  return text.lower().replace("j","z").replace("y","j")
def incomplete_lau_to_yale_u8(l):
  """Convert incomplete-Lau romanisation to Yale (UTF-8 bytes) via jyutping."""
  jyutping = incomplete_lau_to_jyutping(l)
  return jyutping_to_yale_u8(jyutping)
# jyutping -> Lau replacement pairs.  Finals that are spelt the same in
# both systems map to their own uppercase form, purely so that later
# rules in jyutping_to_lau cannot match inside already-converted text.
jlRep = [(unchanged,unchanged.upper()) for unchanged in "aai aau aam aang aan aap aat aak ai au am ang an ap at ak a ei eng ek e iu im ing in ip it ik i oi ong on ot ok ung uk".split()] + [("eoi","UI"),("eon","UN"),("eot","UT"),("eok","EUK"),("oeng","EUNG"),("oe","EUH"),("c","ch"),("ou","O"),("o","OH"),("yu","UE"),("u","OO")]
jlRep.sort(key=lambda a:-len(a[0])) # longest 1st
# u to oo includes ui to ooi, un to oon, ut to oot
# yu to ue includes yun to uen and yut to uet
# drawing from the table on http://www.omniglot.com/writing/cantonese.htm plus this private communication:
# Jyutping "-oeng" maps to Sidney Lau "-eung".
# Jyutping "jyu" maps to Sidney Lau "yue". (consequence of yu->ue, j->y)
# ljRep is the reverse mapping (Lau -> jyutping), used by incomplete_lau_to_jyutping:
ljRep=[(b.lower(),a.upper()) for a,b in jlRep]
ljRep.sort(key=lambda a:-len(a[0])) # longest 1st

def ping_or_lau_to_syllable_list(j):
  """Split a jyutping or Sidney Lau string into a list of syllables."""
  # ASCII punctuation in the ranges !-/ :-@ ^-` acts as a separator
  # (letters, digits and [ \ ] are kept)
  cleaned = re.sub(r"[!-/:-@^-`]", " ", S(j))
  # a tone digit not followed by another digit ends a syllable
  spaced = re.sub(r"([1-9])(?![0-9])", r"\1 ", cleaned)
  return spaced.split()

def hyphenate_ping_or_lau_syl_list(sList,groupLens=None):
    """Like hyphenate_syl_list, but also accepts an unsplit
    jyutping/Lau string, which is split into syllables first."""
    if type(sList) in (str, type(u"")):
        sList = ping_or_lau_to_syllable_list(sList)
    return hyphenate_syl_list(sList, groupLens)
def hyphenate_yale_syl_list(sList,groupLens=None):
    """Like hyphenate_syl_list for Yale romanisations.  A string
    argument must have space-separated syllables (which is what the
    to_yale functions below produce)."""
    if type(sList) != list:
        sList = sList.split()
    return hyphenate_syl_list(sList, groupLens)
def hyphenate_syl_list(sList,groupLens=None):
    """Join syllables with '-' within each group and ' ' between groups.
    groupLens gives the number of syllables per group; with no
    groupLens, each syllable stands alone (no hyphens) unless
    --hyphenate-all is on the command line, in which case the whole
    list becomes one hyphenated word (may suit annogen.py's
    --reannotator option, though copying the pinyin's spacing would be
    better where the pinyin line is spaced but the hanzi line is not)."""
    assert type(sList) == list
    if not groupLens and '--hyphenate-all' in sys.argv:
        groupLens = [len(sList)]
    if not groupLens:
        groupLens = [1]*len(sList) # don't hyphenate at all if we don't know
    else:
        assert sum(groupLens) == len(sList), "sum("+repr(groupLens)+")!=len("+repr(sList)+")"
    out = [] ; pos = 0
    for count in groupLens:
        chunk = sList[pos:pos+count]
        pos += count
        out.append("-".join(S(s) for s in chunk))
    return " ".join(out)
    
def jyutping_to_yale_TeX(j): # returns space-separated syllables
  """Convert jyutping to Yale romanisation in TeX markup (tone accents
  written as \\= \\' \\` before the vowel, low tones marked with 'h').
  Syllables with no vowel that aren't standalone nasal syllables are
  assumed to be English words/letters and passed through uppercased."""
  ret=[]
  for syl in ping_or_lau_to_syllable_list(S(j).lower().replace("eo","eu").replace("oe","eu").replace("j","y").replace("yyu","yu").replace("z","j").replace("c","ch")):
    vowel=lastVowel=None
    for i in range(len(syl)):
      if syl[i] in "aeiou":
        vowel=i ; break
    if vowel==None and re.match(r"h?(m|ng)[456]",syl): # standalone nasal syllables
      vowel = syl.find('m')
      if vowel<0: vowel = syl.index('n')
      lastVowel = syl.find('g')
      if lastVowel<0: lastVowel = vowel
    if vowel==None:
      ret.append(syl.upper()) ; continue # English word or letter in the Chinese?
    # final aa -> a, i.e. "aa" followed by nothing or only a tone digit.
    # (Fixed off-by-one: this used len(syl)<vowel+2, which can never be
    # true when syl[vowel:vowel+2]=="aa" -- that slice needs len>=vowel+2
    # -- so a toneless final "aa" used to raise IndexError on syl[vowel+2].)
    if syl[vowel:vowel+2] == "aa" and (len(syl)==vowel+2 or syl[vowel+2] in "123456"):
      syl=syl[:vowel]+syl[vowel+1:]
    # the tonal 'h' goes after all the vowels but before any consonants:
    for i in range(len(syl)-1,-1,-1):
      if syl[i] in "aeiou":
        lastVowel=i ; break
    if syl[-1] in "123456":
      tone = ["\=",r"\'","",r"\`",r"\'",""][int(syl[-1])-1] # accent per tone 1-6
      if syl[-1] in "456": # low tones take the 'h'
        syl=syl[:lastVowel+1]+"h"+syl[lastVowel+1:]
      # strip the tone digit, insert the accent, use dotless i under a macron:
      ret.append((syl[:vowel]+tone+syl[vowel:-1]).replace(r"\=i",r"\=\i{}").replace(r"\=I",r"\=\I{}"))
    else: ret.append(syl.upper()) # English word or letter in the Chinese?
  return ' '.join(ret)

def jyutping_to_yale_u8(j): # returns space-separated syllables
  """Yale romanisation with Unicode accents, returned as UTF-8 bytes.
  Builds on jyutping_to_yale_TeX: each TeX accent (written BEFORE its
  letter) becomes the matching combining character (placed AFTER the
  letter), then the result is NFC-composed and encoded."""
  import unicodedata
  def mysub(z,l):
    # for each (TeX accent, combining char) pair, move the accent to
    # follow the letter it preceded
    for x,y in l:
      z = re.sub(re.escape(x)+r"(.)",r"\1"+y,z)
    return z
  if type(u"")==type(""): U=str # Python 3
  else: U=unicode # Python 2
  return unicodedata.normalize('NFC',mysub(U(jyutping_to_yale_TeX(j).replace(r"\i{}","i").replace(r"\I{}","I")),[(r"\`",u"\u0300"),(r"\'",u"\u0301"),(r"\=",u"\u0304")])).encode('utf-8')

def superscript_digits_TeX(j):
  # for jyutping and Sidney Lau
  """Mark up tone digits 1-9 as TeX superscripts.

  Built character by character rather than by sequential str.replace:
  the old version's replacement text contains "0.3ex", so after
  replacing a '1' or '2' the later replace("3",...) pass corrupted the
  '3' inside the already-inserted \\raisebox markup."""
  out = []
  for ch in S(j):
    if ch in "123456789":
      out.append(r"\raisebox{-0.3ex}{$^" + ch + r"$}\hspace{0pt}")
    else:
      out.append(ch)
  return "".join(out)

def superscript_digits_HTML(j):
  """Wrap each tone digit 1-9 in <sup>..</sup> for HTML output."""
  result = S(j)
  for n in range(1, 10):
    d = str(n)
    result = result.replace(d, "<sup>" + d + "</sup>")
  return result

def superscript_digits_UTF8(j):
  # WARNING: not all fonts have all digits; many have only the first 3.  superscript_digits_HTML might be better for browsers, even though it does produce more bytes.
  """Replace tone digits 1-9 with Unicode superscript characters;
  returns UTF-8 bytes on Python 3."""
  j = S(j)
  for d in "123456789":
    sup = S(u"¹²³⁴⁵⁶⁷⁸⁹"[int(d)-1].encode('utf-8'))
    j = j.replace(d, sup)
  if type(j)==type(u""): j=j.encode('utf-8') # Python 3
  return j

import sys

def import_gradint():
    """Import (once) and return the gradint module.

    gradint inspects sys.argv at import time, so our command line is
    hidden from it for the duration of the import.  sys.argv is now
    restored in a finally block, so it is no longer left truncated if
    the import raises."""
    global gradint
    try: return gradint # already imported
    except NameError: pass
    # when importing gradint, make sure no command line
    tmp,sys.argv = sys.argv,sys.argv[:1]
    try: import gradint
    finally: sys.argv = tmp
    return gradint

def do_song_subst(hanzi_u8):
    """Mandarin shi3 (normally jyutping sai2) is usually si3 in songs, so
    substitute a rarer character that unambiguously has that reading
    before sending to get_jyutping.  (Was written with unichr(), which
    does not exist in Python 3; u"" escape literals encode to the same
    UTF-8 bytes on both Pythons.)"""
    return B(hanzi_u8).replace(u"\u4f7f".encode('utf-8'), u"\u38c8".encode('utf-8'))

if __name__ == "__main__":
    # command-line use: output Lau for each line of stdin
    # (or Yale if there's a --yale in sys.argv, or both
    # with '#' separators if --yale#lau in sys.argv);
    # if there's a # in the line, assume it's hanzi#pinyin
    # (for annogen.py --reannotator="##python cantonese.py")
    lines = sys.stdin.read().replace("\r\n","\n").split("\n")
    if lines and not lines[-1]: del lines[-1] # drop blank after final newline
    # Pass 1: dryrun, so all hanzi are queued and eSpeak runs just once
    dryrun_mode = True
    def songSubst(l):
      # optional shi3 -> si3 song substitution (see do_song_subst)
      if '--song-lyrics' in sys.argv: l=do_song_subst(l)
      return l
    for l in lines:
      if '#' in l: l=l[:l.index('#')] # hanzi part only
      get_jyutping(songSubst(l))
    # Pass 2: real run (the first get_jyutping call flushes the batch)
    dryrun_mode = False
    for l in lines:
      # NOTE(review): a line with more than one '#' would make this
      # 2-value unpack raise ValueError -- confirm input format
      if '#' in l: l,pinyin = l.split('#')
      else: pinyin = None
      jyutping = get_jyutping(songSubst(l),0)
      if pinyin: jyutping = adjust_jyutping_for_pinyin(l,jyutping,pinyin)
      if "--yale#lau" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping))+"#"+superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping))))
      elif "--yale#lau#ping" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping))+"#"+superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping)))+"#"+jyutping.replace(' ',''))
      elif "--yale" in sys.argv: print (hyphenate_yale_syl_list(jyutping_to_yale_u8(jyutping)))
      else: print (superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(jyutping_to_lau(jyutping))))