FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
Commit 9081104f authored by Silas S. Brown's avatar Silas S. Brown
Browse files

Gradint update

git-svn-id: http://svn.code.sf.net/p/e-guidedog/code/ssb22/gradint@1665 29193198-4895-4776-b068-10539e920549
parent abd9c57b
No related branches found
No related tags found
No related merge requests found
......@@ -18,7 +18,7 @@ so this is only for reference. Requires an Android SDK installation.
cd /tmp
wget http://android-scripting.googlecode.com/hg/android/script_for_android_template.zip
wget http://people.pwf.cam.ac.uk/ssb22/gradint/gradint-android.zip
wget http://people.ds.cam.ac.uk/ssb22/gradint/gradint-android.zip
mkdir android
cd android
unzip ../gradint-android.zip
......@@ -18,3 +18,5 @@ samples.cgi - CGI script to browse a samples directory
or that the site is not publicly viewable)
espeak.cgi - script that lets a Web user play with espeak options
Other files - see description at the top of the file
# -*- coding: utf-8 -*-
# cantonese.py - Python functions for processing Cantonese transliterations
# (uses eSpeak and Gradint for help with some of them)
# (c) 2013 Silas S. Brown. License: GPL
# Module-level state used by get_jyutping.
# dryrun_mode: True makes get_jyutping just batch it up for later
dryrun_mode = False
# jyutping_cache: results already obtained, keyed by the text that was looked up
jyutping_cache = {}
# jyutping_dryrun: the batch that get_jyutping hands to
# espeak.transliterate_multiple in a single call
jyutping_dryrun = set()
def get_jyutping(hanzi,mustWork=1):
    """Return eSpeak's Cantonese ("zhy") transliteration of hanzi.

    The result is lower-cased and has every "7" rewritten as "1".
    Results are remembered in jyutping_cache.

    While the module-level dryrun_mode is true, nothing is
    transliterated: hanzi is only added to the jyutping_dryrun batch
    and the dummy value "aai1" is returned.  The first call made after
    dry-run mode ends transliterates the whole batch with one
    transliterate_multiple call and stores the results in the cache.

    mustWork: if true, an empty transliteration fails an assertion;
    if false, an empty (or all-whitespace) result is returned as "".

    Raises Exception if eSpeak does not work on this platform.
    """
    global espeak, jyutping_dryrun
    if not espeak:
        # first use: create the synth object via Gradint
        espeak = import_gradint().ESpeakSynth()
        if not espeak.works_on_this_platform(): # must call
            raise Exception("espeak.works_on_this_platform")
        assert espeak.supports_language("zhy")
    if dryrun_mode:
        jyutping_dryrun.add(hanzi) # batch it up for later
        return "aai1" # dummy value
    if jyutping_dryrun:
        # dry run is over: do the whole batch in one call
        batch = list(jyutping_dryrun)
        vals = espeak.transliterate_multiple("zhy",batch,0)
        assert len(batch)==len(vals)
        for k,v in zip(batch,vals):
            # same normalisation as the single-item path below
            jyutping_cache[k] = v.replace("7","1").lower()
        jyutping_dryrun = set()
    if hanzi in jyutping_cache: jyutping = jyutping_cache[hanzi]
    else: jyutping_cache[hanzi] = jyutping = espeak.transliterate("zhy",hanzi,forPartials=0).replace("7","1").lower() # .lower() needed because espeak sometimes randomly capitalises e.g. 2nd hanzi of 'hypocrite' (Mandarin xuwei de ren)
    if mustWork: assert jyutping.strip(), "No translit. result for "+repr(hanzi)
    elif not jyutping.strip(): jyutping=""
    return jyutping
# eSpeak synth object; stays 0 (false) until get_jyutping creates it on first use
espeak = 0
def jyutping_to_lau(j):
    """Convert Jyutping text j to Sidney Lau romanisation (lower case).

    Initials are handled first (j -> y, then z -> j); after that every
    (jyutping, lau) pair of the module-level jlRep table is applied in
    table order.
    """
    lau = j.lower()
    lau = lau.replace("j","y")
    lau = lau.replace("z","j")
    # jlRep writes its replacements in upper case (apart from c -> ch),
    # so converted text is not matched again by the lower-case keys
    # that follow; the case is folded back at the end
    for ping,replacement in jlRep:
        lau = lau.replace(ping,replacement)
    return lau.lower()
# Replacement table for jyutping_to_lau: (jyutping, Sidney Lau) pairs.
# The first part lists finals that are spelled the same in both systems
# (they map to their own upper-case form); the second part lists the
# spellings that differ.
jlRep = [(unchanged,unchanged.upper()) for unchanged in "aai aau aam aang aan aap aat aak ai au am ang an ap at ak a ei eng ek e iu im ing in ip it ik i oi ong on ot ok ung uk".split()] + [("eoi","UI"),("eon","UN"),("eot","UT"),("eok","EUK"),("oeng","EUNG"),("oe","EUH"),("c","ch"),("ou","O"),("o","OH"),("yu","UE"),("u","OO")]
# Longest keys first, so that e.g. "aang" is tried before "ang" and "a".
# A key function is used instead of a positional comparison function so
# the sort also works on Python 3; the sort is stable, so entries of
# equal length keep their original relative order either way.
jlRep.sort(key=lambda pair: -len(pair[0]))
# u to oo includes ui to ooi, un to oon, ut to oot
# yu to ue includes yun to uen and yut to uet
# drawing from the table on http://www.omniglot.com/writing/cantonese.htm plus this private communication:
# Jyutping "-oeng" maps to Sidney Lau "-eung".
# Jyutping "jyu" maps to Sidney Lau "yue". (consequence of yu->ue, j->y)
def ping_or_lau_to_syllable_list(j):
    """Split romanised text j into a list of syllable strings.

    Every character that is not an ASCII letter or digit counts as a
    separator, and a break is also made after each digit 1-9.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9]"," ",j)
    pieces = []
    for ch in cleaned:
        if ch in "123456789":
            # a digit 1-9 closes the current syllable
            pieces.append(ch+" ")
        else:
            pieces.append(ch)
    return "".join(pieces).split()
import re
def hyphenate_ping_or_lau_syl_list(sList,groupLens=None):
    """Join syllables into hyphenated groups separated by spaces.

    sList: a list of syllables, or a string (which is first split
    with ping_or_lau_to_syllable_list).
    groupLens: optional list giving the number of syllables in each
    group; the lengths must add up to len(sList) (checked with an
    assertion).  If omitted or empty, all syllables form one group.

    Returns the groups separated by spaces, with the syllables inside
    each group joined by "-".
    """
    # type("") and type(u"") are str and unicode on Python 2, and both
    # str on Python 3 (where the name "unicode" does not exist)
    if isinstance(sList,(type(""),type(u""))):
        sList = ping_or_lau_to_syllable_list(sList)
    if not groupLens: groupLens = [len(sList)]
    else: assert sum(groupLens) == len(sList)
    r = [] ; start = 0
    for g in groupLens:
        # one hyphenated word per group
        r.append("-".join(sList[start:start+g]))
        start += g
    return " ".join(r)
# jyutping_to_yale_TeX(j): rewrites Jyutping text j syllable by syllable,
# using TeX accent commands for the tones, and returns the converted
# syllables joined by spaces.
# NOTE(review): as it stands this block has lost its indentation and
# some statements appear to be missing (see the notes below) - restore
# them from the upstream file before relying on this function.
def jyutping_to_yale_TeX(j):
# NOTE(review): 'ret' is appended to below but no initialisation of it
# is visible here - presumably a 'ret=[]' line belongs at this point.
# The spelling changes (eo/oe -> eu, j -> y, yyu -> yu, z -> j, c -> ch)
# are applied to the whole lower-cased text before it is split up:
for syl in ping_or_lau_to_syllable_list(j.lower().replace("eo","eu").replace("oe","eu").replace("j","y").replace("yyu","yu").replace("z","j").replace("c","ch")):
# find the index of the first vowel of the syllable
# NOTE(review): no per-syllable reset of 'vowel' is visible, so a
# syllable without vowels would reuse the previous syllable's value
for i in range(len(syl)):
if syl[i] in "aeiou":
vowel=i ; break
# NOTE(review): 'not vowel' is also true when the first vowel is at
# index 0 (a syllable that starts with a vowel) - confirm this is intended
if not vowel:
ret.append(syl.upper()) ; continue # English word or letter in the Chinese?
if syl[vowel:vowel+2] == "aa" and (len(syl)<vowel+2 or syl[vowel+2] in "123456"):
syl=syl[:vowel]+syl[vowel+1:] # final aa -> a
# the tonal 'h' goes after all the vowels but before any consonants:
# (so search backwards for the index of the last vowel)
for i in range(len(syl)-1,-1,-1):
if syl[i] in "aeiou":
lastVowel=i ; break
# NOTE(review): the body of the next 'if' is not visible; presumably it
# inserted the tonal 'h' after lastVowel for tones 4-6 - confirm upstream
if syl[-1] in "456":
# tones 1-3: put \` or \' (or nothing for tone 3) in front of the first
# vowel and drop the tone digit
if syl[-1] in "123":
ret.append(syl[:vowel]+[r"\`",r"\'",r""][int(syl[-1])-1]+syl[vowel:-1]) # TODO do we want \= in the 3rd one? what if it's over an i ?
# NOTE(review): no branch that outputs tones 4-6 is visible here, so
# with the lines as shown they would fall through to the 'else' below
else: ret.append(syl.upper()) # English word or letter in the Chinese?
return ' '.join(ret)
def superscript_digits_TeX(j):
    """Return j with each digit 1-9 turned into a TeX superscript.

    Every such digit d becomes $^d$\\hspace{0pt}; the digit 0 and all
    other text are left alone.
    """
    # for jyutping and Sidney Lau
    marked = j
    for digit in map(str,range(1,10)):
        superscript = r"$^"+digit+r"$\hspace{0pt}"
        marked = marked.replace(digit,superscript)
    return marked
def superscript_digits_HTML(j):
    """Return j with each digit 1-9 wrapped in an HTML <sup> element.

    The digit 0 and all other text are left alone.
    """
    marked = j
    for value in range(1,10):
        digit = str(value)
        marked = marked.replace(digit,"<sup>"+digit+"</sup>")
    return marked
def superscript_digits_UTF8(j):
    """Return j with each digit 1-9 replaced by its superscript character.

    A byte string gets the UTF-8 encoded superscripts (as before) and
    stays a byte string; a text (unicode) string gets the superscript
    characters themselves and stays text.  Replacing in the same type
    as the input avoids mixing bytes and text, which fails for unicode
    input on Python 2 and for str input on Python 3.
    """
    # WARNING: not all fonts have all digits; many have only the first 3. superscript_digits_HTML might be better for browsers, even though it does produce more bytes.
    superscripts = u"¹²³⁴⁵⁶⁷⁸⁹"
    is_bytes = isinstance(j,bytes) # on Python 2, bytes is str
    for value,sup in enumerate(superscripts,1):
        digit = str(value)
        if is_bytes:
            j = j.replace(digit.encode('ascii'),sup.encode('utf-8'))
        else:
            j = j.replace(digit,sup)
    return j
import sys
def annogen_reannotate(input_c,annotate_func):
    """Return input_c with the annotation of every o()/o2() call redone.

    input_c: text containing calls of the form o("text","annotation");
    or o2("text","annotation","extra");
    annotate_func: called with the first argument of each call; its
    result replaces the second argument.

    The function name, the first argument, any third argument and all
    text outside the calls are copied through unchanged.
    """
    # re-annotates any annogen o() and o2() calls
    # TODO: annotate_func is called separately for each
    # o() and o2() call; should we group and degroup it
    # so it has access to the whole phrase?
    global dryrun_mode
    call_pattern = r'(o2?)\("([^"]*)","[^"]*"(,"[^"]*")?\);'
    # Dry-run pass: let get_jyutping batch up every text first.
    # try/finally so a failure cannot leave dry-run mode switched on.
    dryrun_mode = True
    try:
        for m in re.finditer(call_pattern,input_c): get_jyutping(m.group(2))
    finally: dryrun_mode = False
    i = 0 ; out = []
    for m in re.finditer(call_pattern,input_c):
        func,text,rest = m.groups()
        if not rest: rest = "" # no third argument in this call
        out.append(input_c[i:m.start()]) # text since the previous call
        out.append(func+'("'+text+'","'+annotate_func(text)+'"'+rest+');')
        i = m.end()
    out.append(input_c[i:]) # text after the last call
    return "".join(out)
def import_gradint():
    """Import the gradint module on first use and return it.

    The module is stored in the global name 'gradint', so later calls
    return it straight away.  Raises ImportError if gradint cannot be
    imported.
    """
    global gradint
    try: return gradint
    except NameError: pass # not imported yet
    # when importing gradint, make sure no command line
    # arguments are visible to it: only sys.argv[0] is left in
    # place during the import, and the real sys.argv is put back
    # afterwards even if the import fails
    tmp,sys.argv = sys.argv,sys.argv[:1]
    try: import gradint
    finally: sys.argv = tmp
    return gradint
if __name__ == "__main__":
    # command-line use: redo annotator.c on stdin to S.Lau
    def _sidney_lau_html(h):
        # Jyutping (an empty result is allowed) -> Sidney Lau ->
        # hyphenated -> tone digits as HTML superscripts
        lau = jyutping_to_lau(get_jyutping(h,0))
        return superscript_digits_HTML(hyphenate_ping_or_lau_syl_list(lau))
    sys.stdout.write(annogen_reannotate(sys.stdin.read(),_sidney_lau_html))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment