Newer
Older

Silas S. Brown
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
#!/usr/bin/env python2
# Character-learning support program
# (C) 2006-2013 Silas S. Brown. Version 0.1471.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# ---- Settings ----------------------------------------------------------
# Address and first port number tried by the built-in web server.
listenAddr = '127.0.0.1'
firstPortNo = 9876

# Input files used for first-time setup.
tableFile = "characters.txt"
knownFile = "known-chars.txt"

# Progress is saved to (and restored from) this file.
dumpFile = "charlearn-data"

# Characters listed here get more revision next time.  (Original note: "will
# be deleted after integration into progress".)
reviseFile = "revise.txt"
import sys,os.path
# Speech is optional.  gradint is imported only when the last argument is not
# a "--option" (options are processed at the end of the file and need no
# speech) and gradint.py exists in the current directory; otherwise gradint
# stays None and characters are not spoken.
gradint = None
if not sys.argv[-1].startswith("--") and os.path.isfile("gradint.py"):
    import gradint
import commands,random,cPickle,BaseHTTPServer,os,thread,string,time,socket
# Refuse to run on anything other than Python 2.x.
if sys.version_info[0] != 2:
    sys.stderr.write("Sorry, charlearn cannot run on Python "+repr(sys.version_info[0])+"\nNeeds Python 2.x\n")
    sys.exit(1)
def byPriority(a, b):
    """cmp-style comparator on the .priority attribute.

    Returns a negative number when a has the lower priority number, zero
    when they are equal and a positive number otherwise.
    """
    difference = a.priority - b.priority
    return difference
# ---- Tuning constants --------------------------------------------------
# Characters are sorted by ascending priority number, so a lower number
# means the character is asked sooner.
priorityIfGotWrong = -10        # given to a character that was answered wrongly
priorityOfOtherCharWrong = -4   # upper limit for the character it was mistaken for
priorityOfGroupWrong = 0        # upper limit for the rest of its similarity group
maxShowInGroup = 5              # "Not to be confused with" lists at most maxShowInGroup-1 others
priorityBreakGroup = 10         # a character at or above this leaves its similarity group

# Session length: starts at initSessionLen and may grow up to maxSessionLen.
initSessionLen = 2
sessionLen = initSessionLen
maxSessionLen = 10
# A session is sampled from the first int(sessionLen*sampleConst) characters.
sampleConst = 1.5
def updateSessionLen():
    """Raise the global sessionLen to the length suggested by the database.

    The suggestion is the second item returned by thechars.countKnown().
    sessionLen never shrinks here and never exceeds maxSessionLen.
    """
    global sessionLen
    # did have /sampleConst after countKnown()[1] but doesn't seem necessary
    # (the +0.95 before int() rounds up, which only matters for fractions)
    suggested = int(thechars.countKnown()[1]+0.95)
    noShorter = max(sessionLen, suggested)
    sessionLen = min(noShorter, maxSessionLen)
# hanzi -> 1 for every character already spoken; kept at module level
# (not on the character objects) so that it is not pickled with them.
already_spoken = {}
# Non-zero while a background speech thread is still running.
gradint_busy = 0
def speak_bkg():
    """Thread body: let gradint synthesize, then clear the busy flag.

    The flag is cleared in a 'finally' clause.  SingleChar.speak() waits in
    a loop for gradint_busy to become 0, so if a failed synthesis left the
    flag set, the next call to speak() would never return.
    """
    global gradint_busy
    try:
        gradint.just_synthesize()
    finally:
        gradint_busy = 0
class SingleChar:
    """One character of the table together with its learning state.

    Attributes:
      hanzi           -- the character (a byte string in the table's charset)
      pinyin          -- its reading; may contain newlines
      priority        -- integer score; lower numbers are asked sooner
      similarityGroup -- number of the group of characters it has been
                         confused with, or None
      supposedToKnow  -- 1 once the character has been shown or marked known
    """
    def __init__(self,hanzi,pinyin):
        self.hanzi = hanzi ; self.pinyin = pinyin
        self.priority = 0 ; self.similarityGroup = None
        self.supposedToKnow = 0

    def formatPinyin(self):
        """Return the pinyin with newlines turned into HTML line breaks."""
        return self.pinyin.replace("\n","<BR>") # (could make it into actual tone marks also)

    def htmlString(self,parent,step=1,left=0):
        """Return the HTML page for this character at the given step.

        parent is the CharDbase; left is the number of characters remaining
        in the session (only shown at step 1).  Steps:
           1  show the character and ask whether it is known
           2  show the pinyin and ask whether the user got it right
           3  ask which other character the user thought it was
           0  got it wrong: reset priority, speak, save
          <0  got it right: raise priority, save (and, if the session has
              more characters, return the next character's page instead)
        Calling this marks the character as supposedToKnow.
        """
        global already_spoken
        self.supposedToKnow = 1
        r='<html><head><title>hanzi</title><meta http-equiv="Content-Type" content="text/html; charset=%s"></head><body><h1>%s</h1>' % (parent.charset,self.hanzi)
        if step==1:
            r += self.yesno('Do you know what this is? (%d remaining)' % left,2,0)
        else:
            r += self.formatPinyin() + "<HR>"
            if step<=0:
                if self.similarityGroup:
                    others = [c for c in parent.chars
                              if c.similarityGroup == self.similarityGroup and c is not self]
                    others.sort(key=lambda c: c.priority)
                    r += "Not to be confused with:"
                    for c in others[:maxShowInGroup-1]:
                        r += '<h1>%s</h1>%s' % (c.hanzi,c.formatPinyin())
                    r += '<hr>'
                if parent.thisSession:
                    r += '<A HREF="/%s">Next character</A>' % str(random.random())
                    if step==-1:
                        # got it right - might as well take that link automatically
                        r = parent.processRequest("/").replace('</body></html>','')
                else:
                    updateSessionLen()
                    r += '<A HREF="/quit">Quit</A> | <A HREF="/%s">Another %d</A>' % (str(random.random()),sessionLen)
                if step==0:
                    self.priority = priorityIfGotWrong
                    self.speak(parent.charset)
                else:
                    # knew it
                    self.priority += 1
                    if self.priority > 0:
                        if self.priority < 25000: self.priority *= 2 # give new characters a chance
                        else: self.priority = 50000 # level off
                    else:
                        # TRY this for a while - will make chars got-wrong recover more quickly (again to give new chars a chance)
                        # (explicit floor division: same result as Python 2's int "/")
                        self.priority //= 2
                parent.save()
            elif step==2:
                r += self.yesno('Did you get it right?',-1,3)
                self.speak(parent.charset)
            elif step==3:
                r += 'What did you think it was?<P>'
                toOut = [] # (pinyin,hanzi,id,is-in-same-group)
                for c in parent.chars:
                    # force a real boolean: the bare expression can evaluate to None
                    sameGrp = bool(c.similarityGroup and c.similarityGroup==self.similarityGroup)
                    if c.supposedToKnow and c is not self:
                        toOut.append((c.pinyin,c.hanzi,id(c),sameGrp)) # NOT formatPinyin, because may want to i-search it
                toOut.sort()
                if len(toOut) > 20: r += "(Hint: On some browsers you can use find-as-you-type)<P>"
                for outSameGroup in [True,False]:
                    oldL = len(r)
                    for p,hanzi,val,sameGrp in toOut:
                        if sameGrp==outSameGroup:
                            r += '%s <A HREF="/%d_%d">%s</A><BR>' % (hanzi,id(self),val,p)
                    if len(r)>oldL and outSameGroup: r += '<HR>' # between chars in same group and others
                r += '<A HREF="/%d=0">None of the above</A>' % id(self)
        # NOTE(review): nesting reconstructed; this check is taken to apply to every step.
        if not parent.thisSession:
            already_spoken = {} # reset it so "Another N" does speak them
        return r + '</body></html>'

    def speak(self,charset):
        """Speak this character once per round, if gradint is available.

        Blocks (polling every 0.5s) while an earlier synthesis is running.
        """
        global gradint_busy
        if self.hanzi in already_spoken: return
        already_spoken[self.hanzi] = 1 # don't set a self. attribute - it'll get pickled for next session
        if gradint:
            # Convert first, so a decoding error cannot leave the busy flag set.
            text = self.hanzi.decode(charset).encode('utf-8')
            while gradint_busy: time.sleep(0.5)
            gradint_busy = 1
            # Hand the text over only after the previous synthesis has finished,
            # so it cannot be overwritten while still in use.
            gradint.justSynthesize = text
            thread.start_new_thread(speak_bkg,())

    def yesno(self,question,ifyes,ifno):
        """Return question followed by Yes/No links to steps ifyes and ifno."""
        # (don't use the js anywhere except yes/no, because 'next character' etc may have too much on the screen and we don't want the focus() to scroll)
        links = '<P><A ID="y" HREF="/%d=%d">Yes</A><SCRIPT>document.getElementById("y").focus()</SCRIPT> | <A HREF="/%d=%d">No</A>' % (id(self),ifyes,id(self),ifno)
        return question + links
# Module-level placeholder, initially None.
the_speaker_process = None
def terminate_server():
    """Abort the whole process after a one-second delay.

    Started in a separate thread by the /quit request so that the response
    can still be delivered before the process goes away.
    """
    # portable signal.alarm(1)
    time.sleep(1)
    os.abort()
class CharDbase:
    """The character table plus the state of the current session.

    Attributes:
      chars            -- list of SingleChar objects
      thisSession      -- characters still to be asked in this session
                          (taken from the end with pop())
      nextPriority     -- priority number given to the next table entry
      similarityGroups -- number of the most recently created group
      charset          -- charset named by the table file (default iso-8859-1)
    """
    def __init__(self):
        self.counter = 0 ; self.nextPriority = 0
        self.similarityGroups = 0
        self.chars = [] ; self.thisSession = []
        self.readTable() ; self.readKnown() ; self.readRevise()

    def _sortByPriority(self):
        """Sort self.chars in place by ascending priority number."""
        self.chars.sort(key=lambda c: c.priority)

    def _firstWords(self,fileName):
        """Return the first word of every non-blank line of fileName.

        Returns an empty list when the file cannot be opened.
        """
        try: f = open(fileName)
        except IOError: return []
        try: lines = f.readlines()
        finally: f.close()
        words = []
        for line in lines:
            fields = line.split()
            if fields: words.append(fields[0]) # blank lines have no first word
        return words

    def debug_printKnown(self):
        """Print a coding line, then priority and hanzi of each known character."""
        print("-*- coding: %s -*-" % (self.charset,))
        for c in self.chars:
            if c.supposedToKnow: print("%s %s" % (c.priority,c.hanzi))

    def readTable(self):
        """Read tableFile into self.chars.

        An optional first line starting "charset:" names the charset.  When
        characters are already loaded, entries already present are skipped.
        Raises IOError if tableFile cannot be opened.
        """
        addingTo = 0
        if self.chars: addingTo = 1
        f = open(tableFile)
        try: lines = f.readlines()
        finally: f.close()
        if lines and lines[0].startswith("charset:"):
            self.charset = lines[0].split()[-1]
            lines = lines[1:]
        else: self.charset = "iso-8859-1"
        for line in lines: self.addCharFromFreqTable(line,addingTo)

    def readKnown(self):
        """Mark every character listed in knownFile as known (if the file exists)."""
        for hanzi in self._firstWords(knownFile): self.makeCharKnown(hanzi)

    def readRevise(self):
        """Schedule every character listed in reviseFile for revision (if the file exists)."""
        for hanzi in self._firstWords(reviseFile): self.makeCharRevise(hanzi)

    def makeCharKnown(self,hanzi):
        """Mark hanzi as known; warn on standard output if it is not in the table."""
        if not hanzi: return # blank lines etc
        for c in self.chars:
            if c.hanzi==hanzi:
                if not c.supposedToKnow:
                    c.supposedToKnow = 1
                    c.priority = priorityOfGroupWrong # just to check
                return
        print("WARNING: character '%s' in %s was not in %s - ignoring" % (repr(hanzi),knownFile,tableFile))

    def makeCharRevise(self,hanzi):
        """Give hanzi the got-wrong priority; warn if it is not in the table."""
        if not hanzi: return # blank lines etc
        for c in self.chars:
            if c.hanzi==hanzi:
                c.supposedToKnow = 1
                c.priority = priorityIfGotWrong
                return
        print("WARNING: character '%s' in %s was not in %s - ignoring" % (repr(hanzi),reviseFile,tableFile))

    def addCharFromFreqTable(self,line,checkAlreadyThere):
        """Add one table line ("hanzi pinyin...") to self.chars.

        A literal backslash-n in the pinyin becomes a newline.  Blank lines
        are ignored and a line with no pinyin gets an empty one.  With
        checkAlreadyThere set, a hanzi already present is not added again
        (a priority number is still used up, as before).
        """
        fields = line.split(None,1)
        if not fields: return # blank line
        hanzi = fields[0]
        if len(fields) > 1: pinyin = fields[1]
        else: pinyin = ""
        c=SingleChar(hanzi,pinyin.replace("\\n","\n"))
        c.priority = self.nextPriority ; self.nextPriority += 1
        if checkAlreadyThere:
            for c2 in self.chars:
                if c2.hanzi == hanzi: return
        self.chars.append(c)

    def charIdToChar(self,charId):
        """Return the character whose id() is charId.

        Raises LookupError when there is none (for example a link left over
        from an earlier run of the program).
        """
        for c in self.chars:
            if id(c)==charId: return c
        raise LookupError("no character with id %r" % (charId,))

    def processRequest(self,path):
        """Return the HTML page for the requested URL path.

        "/<id>=<step>"   show character <id> at the given step
        "/<id>_<id2>"    character <id> was mistaken for <id2>: put both in
                         one similarity group and treat <id> as got-wrong
        "/status"        list the known characters with their priorities
        "/checkallknown" start a session with every known character
        anything else    ask the next character of the session
        """
        if '=' in path:
            charId,step = map(lambda x:int(x),path[1:].split('='))
            char = self.charIdToChar(charId)
        elif '_' in path: # grouping
            char,char2 = map(lambda x:self.charIdToChar(int(x)),path[1:].split('_'))
            if not char.similarityGroup and not char2.similarityGroup: # new group:
                self.similarityGroups += 1
                char.similarityGroup = char2.similarityGroup = self.similarityGroups
            elif not char.similarityGroup: char.similarityGroup = char2.similarityGroup
            elif not char2.similarityGroup: char2.similarityGroup = char.similarityGroup
            elif not char.similarityGroup == char2.similarityGroup: # merge 2 different groups:
                # Remember the old group number first: char2 is itself in
                # self.chars, so comparing against char2.similarityGroup
                # inside the loop would stop matching once char2 is moved.
                oldGroup = char2.similarityGroup
                for c in self.chars:
                    if c.similarityGroup == oldGroup: c.similarityGroup = char.similarityGroup
            step = 0 # normal got-wrong for this character
            char.priority = priorityIfGotWrong # here also, for the loop below
            char2.priority = min(char2.priority,priorityOfOtherCharWrong)
            for c in self.chars:
                if c.similarityGroup == char.similarityGroup:
                    if c.priority >= priorityBreakGroup: c.similarityGroup=None
                    elif c.priority > priorityOfGroupWrong: c.priority = priorityOfGroupWrong
        elif path=="/status":
            self._sortByPriority()
            cp=self.chars[:] ; r='<html><head><title>Current Status</title><meta http-equiv="Content-Type" content="text/html; charset=%s"></head><body><h2>Current Status</h2>(score/priority number is shown to the left of each item)<br>' % (self.charset,)
            while cp:
                if not cp[0].supposedToKnow:
                    del cp[0] ; continue
                if cp[0].priority >= priorityBreakGroup: thisGrp=[0]
                else:
                    # indexes of cp[0] and of the members of its similarity group
                    thisGrp=[x for x in range(len(cp))
                             if x==0 or (cp[x].similarityGroup and cp[x].similarityGroup==cp[0].similarityGroup and cp[x].priority < priorityBreakGroup)]
                if len(thisGrp)>1 and not r.endswith("<hr>"): r+="<hr>"
                if len(thisGrp)>1: r+="<em>"+str(len(thisGrp))+" similar items:</em><br>"
                for g in thisGrp: r += str(cp[g].priority)+": "+cp[g].hanzi+" "+cp[g].pinyin+"<br>"
                if len(thisGrp)>1: r+="<hr>"
                thisGrp.reverse() # delete from the end so earlier indexes stay valid
                for toDel in thisGrp: del cp[toDel]
            return r+"</body></html>"
        else:
            if path=="/checkallknown": self.thisSession = [c for c in self.chars if c.supposedToKnow] # TODO: Document this URL
            char,step = self.chooseChar(),1
        return char.htmlString(self,step,len(self.thisSession))

    def chooseChar(self):
        """Return the next character to ask, starting a new session if needed."""
        if not self.thisSession:
            self._sortByPriority()
            if sessionLen==initSessionLen:
                self.thisSession = self.chars[:sessionLen] # introduce in order the first time (especially if the second one is just a straight line ("yi1"), as one beginner thought the program had gone wrong when he saw this)
                self.thisSession.reverse() # because taken out by pop()
            else: self.thisSession = random.sample(self.chars[:int(sessionLen*sampleConst)],sessionLen) # TODO need a better way than that. NB high priority should be VERY likely, but others should have a chance. try as-is for now
        return self.thisSession.pop()

    def save(self):
        """Pickle the whole database to dumpFile."""
        # NOTE(review): a binary pickle protocol (-1) is written to a file opened in
        # text mode.  The mode is left unchanged because the loader also opens the
        # file in text mode; the two should be switched to binary together.
        out = open(dumpFile,"w")
        try: cPickle.Pickler(out,-1).dump(self)
        finally: out.close()

    def countKnown(self):
        """Return (charsSeen, sessnLen, secure, insecure).

        charsSeen is the number of characters marked supposedToKnow; secure
        and insecure list their hanzi (priority > 0 and <= 0 respectively)
        in priority order.  sessnLen is the number of known characters that
        sort before the third not-yet-known character, or 0 when fewer than
        two not-yet-known characters exist.  Sorts self.chars as a side effect.
        """
        charsSeen = sessnLen = charsSecure = newChars = 0
        secure=[] ; insecure=[]
        self._sortByPriority()
        for c in self.chars:
            if c.supposedToKnow:
                charsSeen += 1
                if c.priority>0: secure.append(c.hanzi)
                else: insecure.append(c.hanzi)
            else: newChars += 1
            if newChars == 2: sessnLen = charsSeen
        return charsSeen,sessnLen,secure,insecure
# ---- Load saved progress, or build a new database on first run ----
try:
    dumped = open(dumpFile)
except IOError: dumped = None
if dumped:
    # NOTE: unpickling can execute code contained in the file, so dumpFile
    # must only ever be this program's own save file.
    try: thechars = cPickle.Unpickler(dumped).load()
    finally: dumped.close()
    thechars.thisSession = []
    # Re-read any input file that has been modified since progress was last
    # saved.  A file that is missing (OSError from os.stat) is simply skipped;
    # this now also applies to tableFile, which is only needed for setup.
    for _inputFile,_reader in ((tableFile,thechars.readTable),
                               (knownFile,thechars.readKnown),
                               (reviseFile,thechars.readRevise)):
        try:
            if os.stat(_inputFile).st_mtime > os.stat(dumpFile).st_mtime: _reader()
        except OSError: pass
    updateSessionLen()
else:
    thechars=CharDbase()
class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serves the pages produced by thechars.processRequest()."""
    def do_GET(self):
        """Answer one GET request.

        Paths starting "/fav" get an empty 404.  Paths starting "/quit" get
        the status page with a termination notice, after which the process
        is terminated.  Every other path is passed to
        thechars.processRequest(); a link that cannot be parsed or that
        names an unknown character is answered with a 404 page.
        """
        if self.path.startswith("/fav"):
            self.send_response(404) ; self.end_headers() ; return
        quitting = self.path.startswith("/quit")
        # Build the page BEFORE sending any headers, so that a bad request
        # is not answered with a "200 OK" and an empty body.
        if quitting:
            page = thechars.processRequest("/status")
            bodyStart = page.index("<body>")+6
            page = page[:bodyStart]+"Server terminating."+page[bodyStart:]
        else:
            try: page = thechars.processRequest(self.path)
            except (ValueError, LookupError, AssertionError):
                # e.g. non-numeric "/x=y", or a character id from an earlier run
                self.log_error("Bad request %s: %s", self.path, sys.exc_info()[1])
                self.send_error(404, "Unknown or out-of-date link")
                return
        self.send_response(200)
        self.send_header("Content-type","text/html; charset="+thechars.charset)
        self.end_headers()
        self.wfile.write(page)
        if quitting: thread.start_new_thread(terminate_server,()) # can terminate the server after this request
        self.wfile.close() # needed or will wait for bkg speaking processes etc
def do_session():
    """Start the web server, try to open a browser on it, and serve forever.

    Ports firstPortNo .. firstPortNo+99 are tried in turn.  If none can be
    used, an error is written to standard error and the program exits with
    status 1.
    """
    server = None
    for portNo in range(firstPortNo,firstPortNo+100):
        try:
            server = BaseHTTPServer.HTTPServer((listenAddr,portNo),RequestHandler)
            break
        except socket.error: pass # could not use this port; try the next one
    if server is None:
        # explicit check rather than assert, which is skipped under "python -O"
        sys.stderr.write("Couldn't find a port to run the server on\n")
        sys.exit(1)
    if ("win" not in sys.platform) and commands.getoutput("which x-www-browser 2>/dev/null"): # (try to find x-www-browser, but not on windows/cygwin/darwin)
        os.system("x-www-browser http://localhost:%d/%s &" % (portNo,str(random.random()))) # shouldn't need a sleep as should take a while to start anyway
    else:
        try:
            import webbrowser
            webbrowser.open_new("http://localhost:%d/%s" % (portNo,str(random.random())))
        except ImportError: pass # fall through to command-line message
    # Do this as well, in case that command failed:
    # (three blank lines before and after the message, as before)
    print("\n\n\n"
          "Server running. If a web browser does not appear automatically,\n"
          "please start one yourself and go to\n"
          "http://localhost:%d/%d\n\n\n" % (portNo,random.randint(1,99999)))
    server.serve_forever()
# ---- Command line: the last argument selects what to do ----
#   --count        print how many characters are known (and how many seem secure)
#   --show-secure  print the characters that seem secure
#   --show-wfx     print the database as a Wenlin flashcard XML file
#   anything else  run an interactive session in a web browser
option = sys.argv[-1]
if option in ('--count','--show-secure'):
    x,y,sec,insec=thechars.countKnown()
    if option=='--count':
        print("%d (of which %d seem secure)" % (x,len(sec)))
    else:
        print(" ".join(sec))
elif option=='--show-wfx':
    # the result of this might need charset conversion
    # (and the conversion of charlearn scores to Wenlin histories is only approximate)
    print("""<?xml version='1.0'?>
<!-- Wenlin Flashcard XML file -->
<stack owner='Anonymous' reward='points'>""")
    thechars.chars.sort(byPriority)
    for c in thechars.chars:
        print("<card type='d'><question>"+c.hanzi+"</question>")
        trials = ""
        score = 0
        if c.supposedToKnow:
            if c.priority < 0:
                # one wrong answer, then one right answer for each halving
                # needed to get from priorityIfGotWrong up to the priority
                trials += "n"
                p = priorityIfGotWrong
                while p < c.priority:
                    trials += "y"
                    score += 1
                    p //= 2 # floor division, as "/" does for Python 2 ints
            # one right answer for each doubling needed to reach the priority
            p = 1
            while p < c.priority:
                trials += "y"
                score += 1
                p *= 2
        print("<history score='%d' trials='%d' recent='%s'></history></card>" % (score,len(trials),trials))
    print("</stack>")
else:
    do_session()