FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ssb22/gradint
  • st822/gradint
2 results
Show changes
Showing
with 448 additions and 362 deletions
#!/usr/bin/env python
# (should work in either Python 2 or Python 3)
# Character-learning support program
# (C) 2006-2013, 2020 Silas S. Brown. Version 0.3.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# Where to find history:
# on GitHub at https://github.com/ssb22/gradint
# and on GitLab at https://gitlab.com/ssb22/gradint
# and on BitBucket https://bitbucket.org/ssb22/gradint
# and at https://gitlab.developers.cam.ac.uk/ssb22/gradint
# and in China: https://gitee.com/ssb22/gradint
# --- Settings for the local web interface ---
listenAddr = '127.0.0.1'   # address the HTTP server binds to
firstPortNo = 9876         # first port number to try

# --- Data files (looked up in the current directory) ---
tableFile = "characters.txt"    # for first-time setup
knownFile = "known-chars.txt"   # ditto
dumpFile = "charlearn-data"     # for saving progress
# For requesting more revision next time
# (will be deleted after integration into progress)
reviseFile = "revise.txt"
import sys,os.path
if sys.argv[-1].startswith("--"): gradint = None # (don't need to speak if we're processing options, see at end)
elif os.path.isfile("gradint.py"): import gradint
else: gradint = None # won't speak characters
import random,os,time,socket
try: from subprocess import getoutput
except: from commands import getoutput
try: from cPickle import Pickler,Unpickler
except: from pickle import Pickler,Unpickler
try: from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
except: from http.server import BaseHTTPRequestHandler, HTTPServer
try: import thread
except: import _thread as thread
def byPriority(a):
    """Sort key: return the ``priority`` attribute of ``a``."""
    return a.priority
# Priorities assigned when a mistake is made (lists are sorted ascending,
# so lower numbers come up sooner)
priorityIfGotWrong = -10        # the character that was answered wrongly
priorityOfOtherCharWrong = -4   # upper limit for the character it was mistaken for
priorityOfGroupWrong = 0        # upper limit for the rest of its similarity group

# Similarity-group display and break-up
maxShowInGroup = 5
priorityBreakGroup = 10

# Session length: starting value, current value, upper limit,
# and the over-sampling factor used when picking a session
initSessionLen = 2
sessionLen = initSessionLen
maxSessionLen = 10
sampleConst = 1.5
def updateSessionLen():
    """Raise the global ``sessionLen`` to the length suggested by
    ``thechars.countKnown()``; it never shrinks and never exceeds
    ``maxSessionLen``."""
    global sessionLen
    # did have /sampleConst after countKnown()[1] but doesn't seem necessary
    suggested = int(thechars.countKnown()[1]+0.95)
    if suggested < sessionLen: suggested = sessionLen
    if suggested > maxSessionLen: suggested = maxSessionLen
    sessionLen = suggested
already_spoken = {}   # hanzi -> 1 for characters already spoken (see SingleChar.speak)
gradint_busy = 0      # 1 while a background synthesis thread is running


def speak_bkg():
    """Thread body: run gradint's synthesis, then clear the busy flag.

    The flag is cleared in a ``finally`` clause: SingleChar.speak() waits
    in a loop for ``gradint_busy`` to drop, so leaving it set after a
    failed synthesis would block every later request forever.
    """
    global gradint_busy
    try:
        gradint.just_synthesize()
    finally:
        gradint_busy = 0
class SingleChar:
    """One character of the table together with its learning state.

    Attributes:
      hanzi           -- the character itself
      pinyin          -- its reading (may contain newlines)
      priority        -- sort key; lists are sorted ascending so lower values come first
      similarityGroup -- number shared by characters that were confused with each other, or None
      supposedToKnow  -- 1 once the character has been shown or listed as known
    """
    def __init__(self,hanzi,pinyin):
        self.hanzi = hanzi ; self.pinyin = pinyin
        self.priority = 0 ; self.similarityGroup = None
        self.supposedToKnow = 0
    def formatPinyin(self):
        """Return the reading with newlines turned into <BR>."""
        return self.pinyin.replace("\n","<BR>") # (could make it into actual tone marks also)
    def htmlString(self,parent,step=1,left=0):
        """Build the HTML page for this character and update its state.

        parent -- the CharDbase (supplies charset, chars, thisSession,
                  processRequest and save)
        step   -- 1: ask whether the character is known;
                  2: show the reading and ask whether the guess was right;
                  3: ask which character it was mistaken for;
                  0: it was got wrong; -1: it was got right
        left   -- number of characters remaining, shown at step 1

        Returns the page as text (the caller encodes it).
        """
        global already_spoken
        self.supposedToKnow = 1
        r=u'<html><head><title>hanzi</title><meta http-equiv="Content-Type" content="text/html; charset=%s"></head><body><h1>%s</h1>' % (parent.charset,self.hanzi)
        if step==1: r+=self.yesno('Do you know what this is? (%d remaining)' % left,2,0)
        else:
            r += self.formatPinyin() + "<HR>"
            if step<=0:
                if self.similarityGroup:
                    l = []
                    for c in parent.chars:
                        if c.similarityGroup == self.similarityGroup and not id(c)==id(self): l.append(c)
                    l.sort(key=byPriority)
                    r+="Not to be confused with:"
                    for c in l[:maxShowInGroup-1]: r+='<h1>%s</h1>%s' % (c.hanzi,c.formatPinyin())
                    r += '<hr>'
                if parent.thisSession:
                    r+='<A HREF="/%s">Next character</A>' % str(random.random())
                    if step==-1:
                        # got it right - might as well take that link automatically
                        r=parent.processRequest("/").decode(parent.charset).replace('</body></html>','')
                else:
                    updateSessionLen()
                    r+='<A HREF="/quit">Quit</A> | <A HREF="/%s">Another %d</A>' % (str(random.random()),sessionLen)
                if step==0:
                    self.priority=priorityIfGotWrong
                    self.speak(parent.charset)
                else:
                    # knew it
                    self.priority += 1
                    if self.priority > 0:
                        if self.priority < 25000: self.priority *= 2 # give new characters a chance
                        else: self.priority = 50000 # level off
                    # Floor division, so priorities stay integers on Python 3
                    # exactly as "/" kept them on Python 2.
                    else: self.priority //= 2 # TRY this for a while - will make chars got-wrong recover more quickly (again to give new chars a chance)
                parent.save()
            elif step==2:
                r+=self.yesno('Did you get it right?',-1,3)
                self.speak(parent.charset)
            elif step==3:
                r+='What did you think it was?<P>'
                toOut = [] # (pinyin,hanzi,id,is-in-same-group)
                for c in parent.chars:
                    if c.similarityGroup and c.similarityGroup==self.similarityGroup: sameGrp=True
                    else: sameGrp=False # need to do it this way because Python sometimes returns 'None' from that expression
                    if c.supposedToKnow and not id(c)==id(self): toOut.append((c.pinyin,c.hanzi,id(c),sameGrp)) # NOT formatPinyin, because may want to i-search it
                toOut.sort()
                if len(toOut) > 20: r+="(Hint: On some browsers you can use find-as-you-type)<P>"
                for outSameGroup in [True,False]:
                    oldL=len(r)
                    for p,hanzi,val,sameGrp in toOut:
                        if sameGrp==outSameGroup: r+='%s <A HREF="/%d_%d">%s</A><BR>' % (hanzi,id(self),val,p)
                    if len(r)>oldL and outSameGroup: r += '<HR>' # between chars in same group and others
                r+='<A HREF="/%d=0">None of the above</A>' % id(self)
        if not parent.thisSession:
            already_spoken = {} # reset it so "Another N" does speak them
        return r + '</body></html>'
    def speak(self,charset):
        """Speak this character in the background via gradint (if loaded),
        at most once until ``already_spoken`` is reset."""
        global gradint_busy
        if self.hanzi in already_spoken: return
        already_spoken[self.hanzi] = 1 # don't set a self. attribute - it'll get pickled for next session
        if gradint:
            text = self.hanzi
            # hanzi is normally already text (the table is decoded on
            # reading), so only decode it if it is still a byte string.
            if isinstance(text,bytes): text = text.decode(charset)
            gradint.justSynthesize = text.encode('utf-8')
            while gradint_busy: time.sleep(0.5)
            gradint_busy = 1
            thread.start_new_thread(speak_bkg,())
    def yesno(self,question,ifyes,ifno):
        """Return ``question`` followed by Yes/No links leading to steps
        ``ifyes`` and ``ifno`` for this character."""
        # (don't use the js anywhere except yes/no, because 'next character' etc may have too much on the screen and we don't want the focus() to scroll)
        return question+'<P><A ID="y" HREF="/%d=%d">Yes</A><SCRIPT>document.getElementById("y").focus()</SCRIPT> | <A HREF="/%d=%d">No</A>' % (id(self),ifyes,id(self),ifno)
the_speaker_process = None


def terminate_server():
    """Wait one second, then abort the whole process."""
    # portable signal.alarm(1)
    time.sleep(1)
    os.abort()
def B(s):
    """Return ``s`` as a byte string.

    On Python 3, text is encoded as UTF-8 and bytes are passed through
    unchanged (so that, like S(), the function can safely be applied to a
    value that is already converted).  On Python 2, ``s`` is returned as is.
    """
    if type(u"")==type("") and not isinstance(s,bytes): return s.encode('utf-8')
    return s
def S(s):
    """Return ``s`` as a native ``str``: on Python 3 anything that is not
    already a ``str`` is decoded as UTF-8; otherwise ``s`` is returned
    unchanged."""
    runningPython3 = (type(u"") == type(""))
    if runningPython3 and type(s) != type(""):
        return s.decode('utf-8')
    return s
class CharDbase:
    """The set of characters being learnt, plus the current session.

    Instances are pickled to ``dumpFile`` by save().

    Attributes:
      chars            -- list of SingleChar
      thisSession      -- characters still to be asked in the current session
      charset          -- encoding of the table file, also used for the web pages
      nextPriority     -- priority given to the next character added from the table
      similarityGroups -- highest similarity-group number allocated so far
    """
    def __init__(self):
        self.counter = 0 ; self.nextPriority = 0
        self.similarityGroups = 0
        self.chars = [] ; self.thisSession = []
        self.readTable() ; self.readKnown() ; self.readRevise()
    def debug_printKnown(self):
        """Print the priority and character of every known character."""
        print ("-*- coding: %s -*-" % (self.charset,))
        for c in self.chars:
            if c.supposedToKnow: print ("%s %s" % (c.priority,c.hanzi))
    def readTable(self):
        """Read ``tableFile`` (optionally starting with a "charset:" line)
        and add its characters; when characters are already loaded, only
        ones not yet present are added."""
        addingTo = 0
        if self.chars: addingTo = 1
        with open(tableFile,'rb') as f: lines = f.readlines()
        if lines and lines[0].startswith(B("charset:")):
            self.charset = S(lines[0].split()[-1])
            lines = lines[1:]
        else: self.charset = "iso-8859-1"
        for line in lines: self.addCharFromFreqTable(line.decode(self.charset),addingTo)
    @staticmethod
    def _firstWords(lines,charset):
        """Return the first whitespace-separated word of every non-blank
        line, decoding byte-string lines with ``charset``."""
        words = []
        for line in lines:
            if isinstance(line,bytes): line = line.decode(charset)
            fields = line.split()
            if fields: words.append(fields[0]) # blank lines have no fields
        return words
    def _readWordFile(self,fname):
        """Return the first word of each line of ``fname``, or [] if it
        cannot be opened."""
        try: f = open(fname,'rb')
        except IOError: return []
        # The words are compared with the (decoded) table entries, so decode
        # them the same way.  Assumes the file uses the table's charset.
        try: return self._firstWords(f.readlines(),self.charset)
        finally: f.close()
    def readKnown(self):
        """Mark every character listed in ``knownFile`` as known."""
        for hanzi in self._readWordFile(knownFile): self.makeCharKnown(hanzi)
    def readRevise(self):
        """Schedule every character listed in ``reviseFile`` for revision."""
        for hanzi in self._readWordFile(reviseFile): self.makeCharRevise(hanzi)
    def makeCharKnown(self,hanzi):
        """Mark ``hanzi`` as known (with a priority that gets it checked),
        unless it already is; warn if it is not in the table."""
        if not hanzi: return # blank lines etc
        for c in self.chars:
            if c.hanzi==hanzi:
                if not c.supposedToKnow:
                    c.supposedToKnow = 1
                    c.priority = priorityOfGroupWrong # just to check
                return
        print ("WARNING: character '%s' in %s was not in %s - ignoring" % (repr(hanzi),knownFile,tableFile))
    def makeCharRevise(self,hanzi):
        """Mark ``hanzi`` as known and give it the got-wrong priority;
        warn if it is not in the table."""
        if not hanzi: return # blank lines etc
        for c in self.chars:
            if c.hanzi==hanzi:
                c.supposedToKnow = 1
                c.priority = priorityIfGotWrong
                return
        print ("WARNING: character '%s' in %s was not in %s - ignoring" % (repr(hanzi),reviseFile,tableFile))
    def addCharFromFreqTable(self,line,checkAlreadyThere):
        """Add the character described by one table line ("hanzi reading").

        Blank lines are skipped, and a line without a reading is reported
        and skipped.  With ``checkAlreadyThere`` set, a character already
        in the list is not added again.
        """
        fields = line.split(None,1)
        if not fields: return # blank line
        if len(fields) < 2:
            print ("WARNING: no reading given for %s in %s - ignoring" % (repr(fields[0]),tableFile))
            return
        hanzi,pinyin = fields
        c=SingleChar(hanzi,pinyin.replace("\\n","\n"))
        c.priority = self.nextPriority ; self.nextPriority += 1
        if checkAlreadyThere:
            for c2 in self.chars:
                if c2.hanzi == hanzi: return
        self.chars.append(c)
    def charIdToChar(self,charId):
        """Return the character whose id() equals ``charId``."""
        char = None
        for c in self.chars:
            if id(c)==charId:
                char = c ; break
        assert char ; return char
    def _mergeGroups(self,keepGroup,dropGroup):
        """Move every character of similarity group ``dropGroup`` into
        ``keepGroup``."""
        # The group numbers are passed in as values, so re-labelling one of
        # the two characters that triggered the merge cannot change which
        # group is being emptied part-way through the loop.
        for c in self.chars:
            if c.similarityGroup == dropGroup: c.similarityGroup = keepGroup
    def processRequest(self,path):
        """Handle one URL path and return the resulting page as bytes.

        "/ID=STEP"  shows character ID at the given step;
        "/ID_ID2"   records that ID was mistaken for ID2 (they are put in
                    the same similarity group) and treats ID as got wrong;
        "/status"   lists the known characters with their priorities;
        anything else picks the next character of the session
        ("/checkallknown" first queues every known character).
        """
        if '=' in path:
            charId,step = map(lambda x:int(x),path[1:].split('='))
            char = self.charIdToChar(charId)
        elif '_' in path: # grouping
            char,char2 = map(lambda x:self.charIdToChar(int(x)),path[1:].split('_'))
            if not char.similarityGroup and not char2.similarityGroup: # new group:
                self.similarityGroups += 1
                char.similarityGroup = char2.similarityGroup = self.similarityGroups
            elif not char.similarityGroup: char.similarityGroup = char2.similarityGroup
            elif not char2.similarityGroup: char2.similarityGroup = char.similarityGroup
            elif not char.similarityGroup == char2.similarityGroup: # merge 2 different groups:
                self._mergeGroups(char.similarityGroup,char2.similarityGroup)
            step = 0 # normal got-wrong for this character
            char.priority = priorityIfGotWrong # here also, for the loop below
            char2.priority = min(char2.priority,priorityOfOtherCharWrong)
            for c in self.chars:
                if c.similarityGroup == char.similarityGroup:
                    if c.priority >= priorityBreakGroup: c.similarityGroup=None
                    elif c.priority > priorityOfGroupWrong: c.priority = priorityOfGroupWrong
        elif path=="/status":
            self.chars.sort(key=byPriority)
            cp=self.chars[:] ; r='<html><head><title>Current Status</title><meta http-equiv="Content-Type" content="text/html; charset=%s"></head><body><h2>Current Status</h2>(score/priority number is shown to the left of each item)<br>' % (self.charset,)
            while cp:
                if not cp[0].supposedToKnow:
                    del cp[0] ; continue
                if cp[0].priority >= priorityBreakGroup: thisGrp=[0]
                else: thisGrp=list(filter(lambda x:x==0 or (cp[x].similarityGroup and cp[x].similarityGroup==cp[0].similarityGroup and cp[x].priority < priorityBreakGroup),range(len(cp))))
                if len(thisGrp)>1 and not r.endswith("<hr>"): r+="<hr>"
                if len(thisGrp)>1: r+="<em>"+str(len(thisGrp))+" similar items:</em><br>"
                for g in thisGrp: r += str(cp[g].priority)+": "+cp[g].hanzi+" "+cp[g].pinyin+"<br>"
                if len(thisGrp)>1: r+="<hr>"
                thisGrp.reverse() # delete from the end so earlier indices stay valid
                for toDel in thisGrp: del cp[toDel]
            return (r+"</body></html>").encode(self.charset)
        else:
            if path=="/checkallknown": self.thisSession = list(filter(lambda x:x.supposedToKnow,self.chars)) # TODO: Document this URL
            char,step = self.chooseChar(),1
        return char.htmlString(self,step,len(self.thisSession)).encode(self.charset)
    def chooseChar(self):
        """Return the next character to ask, starting a new session of
        ``sessionLen`` characters if the current one is used up."""
        if not self.thisSession:
            self.chars.sort(key=byPriority)
            if sessionLen==initSessionLen:
                self.thisSession = self.chars[:sessionLen] # introduce in order the first time (especially if the second one is just a straight line ("yi1"), as one beginner thought the program had gone wrong when he saw this)
                self.thisSession.reverse() # because taken out by pop()
            else:
                # TODO need a better way than that. NB high priority should be VERY likely, but others should have a chance. try as-is for now
                pool = self.chars[:int(sessionLen*sampleConst)]
                # (a table shorter than sessionLen would make sample() raise)
                self.thisSession = random.sample(pool,min(sessionLen,len(pool)))
        return self.thisSession.pop()
    def save(self):
        """Pickle this object to ``dumpFile``."""
        with open(dumpFile,"wb") as f: Pickler(f,-1).dump(self)
    def countKnown(self):
        """Return (charsSeen, sessnLen, secure, insecure).

        charsSeen -- number of known characters
        sessnLen  -- number of known characters that, in priority order,
                     come before the third not-yet-known character
                     (0 if there are fewer than two unknown characters)
        secure    -- known characters with priority above 0
        insecure  -- the other known characters
        """
        charsSeen = sessnLen = newChars = 0
        secure=[] ; insecure=[]
        self.chars.sort(key=byPriority)
        for c in self.chars:
            if c.supposedToKnow:
                charsSeen += 1
                if c.priority>0: secure.append(c.hanzi)
                else: insecure.append(c.hanzi)
            else: newChars += 1
            if newChars == 2: sessnLen = charsSeen
        return charsSeen,sessnLen,secure,insecure
# Restore saved progress if there is any, otherwise start a new database.
try:
    dumped = open(dumpFile,"rb")
except IOError:
    dumped = None
if not dumped:
    thechars = CharDbase()
else:
    thechars = Unpickler(dumped).load()
    dumped.close()
    thechars.thisSession = []
    # Re-read any input file that was changed after progress was last saved.
    if os.stat(tableFile).st_mtime > os.stat(dumpFile).st_mtime: thechars.readTable()
    for optionalFile,reader in ((knownFile,thechars.readKnown),(reviseFile,thechars.readRevise)):
        try:
            if os.stat(optionalFile).st_mtime > os.stat(dumpFile).st_mtime: reader()
        except OSError: pass # e.g. the file does not exist
    updateSessionLen()
class RequestHandler(BaseHTTPRequestHandler):
    """Serves the pages produced by ``thechars`` over HTTP."""
    def do_GET(self):
        """Answer a GET: 404 for paths starting "/fav", the status page
        followed by server shutdown for "/quit", otherwise whatever
        ``thechars.processRequest`` returns."""
        if self.path.startswith("/fav"):
            self.send_response(404)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header("Content-type","text/html; charset="+thechars.charset)
        self.end_headers()
        if not self.path.startswith("/quit"):
            self.wfile.write(thechars.processRequest(self.path))
        else:
            page = thechars.processRequest("/status").decode(thechars.charset)
            bodyStart = page.index("<body>")+6
            page = page[:bodyStart]+"Server terminating."+page[bodyStart:]
            self.wfile.write(page.encode(thechars.charset))
            thread.start_new_thread(terminate_server,()) # can terminate the server after this request
        self.wfile.close() # needed or will wait for bkg speaking processes etc
def do_session():
    """Start the web server on the first free port of the 100 starting at
    ``firstPortNo``, try to open a browser on it, print the address, and
    serve requests until the process is terminated."""
    server = None
    for portNo in range(firstPortNo,firstPortNo+100):
        try: server = HTTPServer((listenAddr,portNo),RequestHandler)
        except socket.error: continue
        break
    assert server, "Couldn't find a port to run the server on"
    # (try to find x-www-browser, but not on windows/cygwin/darwin)
    haveXBrowser = ("win" not in sys.platform) and getoutput("which x-www-browser 2>/dev/null")
    if haveXBrowser:
        os.system("x-www-browser http://localhost:%d/%s &" % (portNo,str(random.random()))) # shouldn't need a sleep as should take a while to start anyway
    else:
        try:
            import webbrowser
            webbrowser.open_new("http://localhost:%d/%s" % (portNo,str(random.random())))
        except ImportError: pass # fall through to command-line message
    # Do this as well, in case that command failed:
    for _ in range(3): print ("")
    print ("Server running. If a web browser does not appear automatically,")
    print ("please start one yourself and go to")
    print ("http://localhost:%d/%d" % (portNo,random.randint(1,99999)))
    for _ in range(3): print ("")
    server.serve_forever()
# Act on the last command-line argument; with no recognised option,
# run the interactive web session.
option = sys.argv[-1]
if option=='--count':
    seen,_unused,secureList,_insecure = thechars.countKnown()
    print ("%d (of which %d seem secure)" % (seen,len(secureList)))
elif option=='--show-secure':
    secureList = thechars.countKnown()[2]
    print (" ".join(secureList))
elif option=='--show-wfx':
    # the result of this might need charset conversion
    # (and the conversion of charlearn scores to Wenlin histories is only approximate)
    print ("""<?xml version='1.0'?>
<!-- Wenlin Flashcard XML file -->
<stack owner='Anonymous' reward='points'>""")
    thechars.chars.sort(key=byPriority)
    for c in thechars.chars:
        print ("<card type='d'><question>"+c.hanzi+"</question>")
        trials = ""
        score = 0
        if c.supposedToKnow:
            if c.priority < 0:
                # one wrong answer, then one right answer for each halving
                # needed to get from the got-wrong priority to this one
                trials += "n"
                p = priorityIfGotWrong
                while p < c.priority:
                    trials += "y"
                    score += 1
                    p /= 2
            # one right answer for each doubling below the current priority
            p = 1
            while p < c.priority:
                trials += "y"
                score += 1
                p *= 2
        print ("<history score='%d' trials='%d' recent='%s'></history></card>" % (score,len(trials),trials))
    print ("</stack>")
else:
    do_session()
charset: euc-jp
あ a
い i
う u
え e
お o
か ka
き ki
く ku
け ke
こ ko
さ sa
し shi
す su
せ se
そ so
た ta
ち chi
つ tsu
て te
と to
な na
に ni
ぬ nu
ね ne
の no
は ha
ひ hi
ふ fu
へ he
ほ ho
ま ma
み mi
む mu
め me
も mo
や ya
ゆ yu
よ yo
ら ra
り ri
る ru
れ re
ろ ro
わ wa
を wo
ん n
ア a
イ i
ウ u
エ e
オ o
カ ka
キ ki
ク ku
ケ ke
コ ko
サ sa
シ shi
ス su
セ se
ソ so
タ ta
チ chi
ツ tsu
テ te
ト to
ナ na
ニ ni
ヌ nu
ネ ne
ノ no
ハ ha
ヒ hi
フ fu
ヘ he
ホ ho
マ ma
ミ mi
ム mu
メ me
モ mo
ヤ ya
ユ yu
ヨ yo
ラ ra
リ ri
ル ru
レ re
ロ ro
ワ wa
ヲ wo
ン n
Installing Gradint on Linux systems
-----------------------------------
Gradint does not need to be installed; it can
just run from the current directory.
If you do want to make a system-wide installation
(for example if you want to make a package for a
Linux distribution), I suggest doing the following
as root:
mkdir /usr/share/gradint
cp gradint.py /usr/share/gradint/
cd samples/utils
for F in *.py *.sh; do
export DestFile=/usr/bin/gradint-$(echo $F|sed -e 's/\..*//')
cp $F $DestFile
chmod +x $DestFile
done
cd ../.. ; rm -rf samples/utils
tar -zcf /usr/share/gradint/new-user.tgz \
advanced.txt settings.txt vocab.txt samples
# (EOF is quoted so that $HOME and $@ are written into the launcher
# literally, instead of being expanded now by the installing shell)
cat > /usr/bin/gradint <<"EOF"
#!/bin/bash
if ! test -e "$HOME/gradint"; then
echo "You will need some prompts and samples in your home directory."
echo "Is it OK to unpack an example into $HOME/gradint ?"
echo "Ctrl-C to quit or Enter to continue"
read
echo -n "Unpacking... "
mkdir "$HOME/gradint"
cd "$HOME/gradint"
tar -zxf /usr/share/gradint/new-user.tgz
echo "done."
echo "Please check the contents of $HOME/gradint"
echo "especially the README files."
echo "Then you can run gradint again."
exit
fi
cd "$HOME/gradint"
python /usr/share/gradint/gradint.py "$@"
EOF
chmod +x /usr/bin/gradint
For a distribution you might also have to write
man pages and tidy up the help text etc.
Depends: python + a sound player (e.g. alsa-utils)
Recommends: python-tk python-tksnack sox libsox-fmt-all madplay
File deleted
File deleted
在说一次
File deleted
#!/bin/bash
# Launcher inside start-gradint.app: runs gradint.py from the directory
# that contains the .app bundle.
# ("$0" is quoted throughout so that paths containing spaces survive.)
if test -e /usr/lib/tkConfig.sh || test -e /usr/local/lib/tkConfig.sh; then
  # run using only the Tk windows:
  cd "$(echo "$0" | sed -e 's|start-gradint.app/Contents/MacOS/start-gradint.*$||')" || exit 1
  exec pythonw gradint.py
else
  # run in Terminal:
  open -a Terminal.app "$(echo "$0" | sed -e 's|start-gradint.app/Contents/MacOS/start-gradint.*$|gradint.py|')"
fi
File deleted
File deleted
(]q}q}q]qt.
\ No newline at end of file
File deleted
English
xian4zai4 wo3men5 yao4 deng3, ran2hou4 fu4xi2. zai4 di4 yi1 ke4 wo3men5 hai2 mei2you3 xue2xi2 hen3 duo1 ci2yu3 suo3yi3 ting2dun4 bi3jiao4 chang2. dan4shi4 zai4 wei4lai2 de5 ke4 wo3men5 mei2you3 zhe4yang4 chang2 de5 ting2dun4.
#!/usr/bin/env python
import os,commands,sys
def equalise():
    """Adjust the volume of every file ending in "wav" in the current
    directory and, recursively, its subdirectories.

    For each file, sox's "stat" effect supplies a volume adjustment, which
    a second sox run then applies.  The original is replaced only if that
    second run succeeded and produced output; otherwise a warning is
    written and the file is left untouched.
    """
    oldDir=os.getcwd()
    for l in os.listdir(oldDir):
        # Anything we can chdir into is treated as a directory.
        try: os.chdir(l)
        except OSError: isDir = 0
        else: isDir = 1
        if isDir:
            equalise()
            os.chdir(oldDir)
        elif l.endswith("wav"):
            statFields = commands.getoutput('sox "%s" t.nul stat' % (l,)).split("\n")[-1].split()
            if not statFields:
                sys.stderr.write("WARNING: no output from sox for %s - leaving it unchanged\n" % (l,))
                continue
            vol = statFields[-1]
            status = os.system('sox -t wav - -t wav __adjusted vol %s < "%s"' % (vol,l))
            if status == 0 and os.path.exists('__adjusted'):
                os.remove(l) ; os.rename('__adjusted',l)
            else:
                sys.stderr.write("WARNING: sox failed on %s - leaving it unchanged\n" % (l,))
                try: os.remove('__adjusted') # discard any partial output
                except OSError: pass
    try: os.remove('t.nul')
    except OSError: pass
# Ask for confirmation before touching anything
sys.stdout.write(
    "WARNING - Use this script ONLY if there is a large\n"
    "perceptual variation in the volume levels. Works on all\n"
    "samples in current directory and subdirectories. Really go\n"
    "ahead?\n"
    "Press Ctrl-C to cancel or Enter to continue\n")
raw_input()
equalise()
#!/bin/bash
# Moves files from one samples directory (or filename prefix) to another,
# rewriting the progress file so that it refers to the new names.
export SamplesDir="samples/" # Must include trailing /
export ProgressFile="progress.txt"
if ! test -e "$SamplesDir"; then echo "Error: $SamplesDir does not exist (are you in the right directory?)"; exit 1; fi
if ! test -e "$ProgressFile"; then echo "Error: $ProgressFile does not exist (are you in the right directory?)";exit 1;fi
# Both names are required (pass "" explicitly for an empty new prefix)
if test $# -lt 2; then
  echo "Usage: $0 oldname newname"
  echo "oldname and newname are relative to $SamplesDir, and can be prefixes of several files/directories"
  echo "Moves files from one samples directory to another, keeping $ProgressFile adjusted.  Make sure gradint is not running (including waiting for start) when in use."
  exit 1
fi
export Src="$1"
export Dest="$2"
# Temporary copy of the progress file, kept next to it rather than at a
# fixed name in /tmp
NewProg="$ProgressFile.new.$$"
find "$SamplesDir" -follow -type f | grep "^$SamplesDir$Src" | \
while IFS= read -r SrcFile; do
  DestFile="$(echo "$SrcFile"|sed -e "s|^$SamplesDir$Src|$SamplesDir$Dest|")"
  mkdir -p "$DestFile" ; rmdir "$DestFile" # ensure parent dirs exist before moving file across
  mv -b "$SrcFile" "$DestFile"
  SrcFile="$(echo "$SrcFile"|sed -e "s|$SamplesDir||")"
  DestFile="$(echo "$DestFile"|sed -e "s|$SamplesDir||")"
  gzip -fdc "$ProgressFile" | sed -e "s|$SrcFile|$DestFile|g" > "$NewProg" ; mv "$NewProg" "$ProgressFile" # (ideally should re-write to batch these changes, but leave like this for now in case need to recover from unfinished operation)
done
rmdir "$SamplesDir$Src" 2>/dev/null >/dev/null # IF it's a directory
# log2opl.py (c) 2008 Silas S. Brown. License: GPL.
# This is a Python script to translate log.txt into an OPL
# program for a palmtop or smartphone running EPOC. The
# resulting file lesson.opl needs to be imported into Program
# and translated. The program will show the log of the lesson
# in real time, providing a countdown for each item. This
# is for use as a speaker's cue when demonstrating the
# graduated-interval method in an extemporaneous talk (works
# best with a lesson 1 so there are plenty of gaps to speak in).
# Make sure you're using vocab.txt or meaningful filenames.
# It may also be useful to set partialsDirectory=None
# If you have a PDA that can run Gradint by itself, then
# see ask_teacherMode in advanced.txt for a more flexible approach.
# Everything is handled as bytes: the output is opened in binary mode so
# that the explicit \r\n line endings are written unchanged, and the log
# text is copied through without being decoded.
curS = -5 # allow lead-in
with open("lesson.opl","wb") as o:
    o.write(b"PROC m:\r\nfont 8,9\r\n")
    with open("log.txt","rb") as log:
        for l in log:
            fields = l.split()
            if not fields: continue # blank line
            # first field is the time as minutes:seconds
            m,s = fields[0].split(b":") ; m,s = int(m),int(s)
            s=s+60*m
            # each call is passed the number of seconds since the previous item
            o.write(b"a:("+str(s-curS).encode("ascii")+b",\""+b" ".join(fields[1:])+b"\")\r\n")
            curS = s
    o.write(b'PRINT "Finished.":GET\r\nENDP\r\nPROC a:(secs%,a$)\r\nLOCAL i%\r\nPRINT " ";a$+chr$(13),\r\ni%=secs%\r\nWHILE i%\r\nprint CHR$(13)+GEN$(i%,2)+" ";\r\nPAUSE 20\r\ni%=i%-1\r\nENDWH\r\nPRINT CHR$(13)+" "\r\nENDP\r\n')
Note: Now that gradint supports MP3 input, you
can replace your WAVs with MP3s instead of
following the instructions here. See samples/ReadmeMP3.txt
for notes on getting this to work. You can
update all progress.txt's with the change like
this:
for N in $(find . -name progress.txt); do sed -e "s/\.wav/.mp3/g" < $N > n ; mv n $N; done
and do the encoding itself (in-place) with:
for N in $(find samples|grep wav$); do lame --cbr -b 48 -h -m m $N $(echo $N|sed -e 's/\.wav$/.mp3/') && rm $N; done
---------------------
To squash down to 128kbps (16k bytes/s), be in the directory above 'samples' and do:
for Dir in $(find samples/ -type d); do mkdir -p "compressed-$Dir"; done; for F in $(find samples/ -type f|grep wav$); do if test "$F" -nt "compressed-$F"; then sox "$F" -r 16000 -c 1 -b -u test.wav; if test $(wc -c test.wav|sed -e 's/ .*//') -lt $(wc -c "$F"|sed -e 's/ .*//'); then mv test.wav "compressed-$F"; else rm test.wav; cp -p "$F" "compressed-$F"; fi; fi; done; for F in $(find samples/|grep -v wav$); do cp -up "$F" "compressed-$F" 2>/dev/null; done
The result will be in a directory called compressed-samples. Any samples that were already smaller than the "compressed" versions, or anything that is not a .wav file, will simply be copied into compressed-samples uncompressed. Any files already in compressed-samples will not be touched unless the "samples" equivalent is newer. Additionally you may want to delete any samples in compressed-samples that are no longer in samples, in which case do this as well:
for F in $(find compressed-samples/ -type f); do if ! test -e $(echo "$F"|sed -e s/compressed-//); then rm "$F"; fi; done
To compress in place (erasing original files), go into samples directory and do:
for F in $(find . -type f|grep wav$); do sox "$F" -r 16000 -c 1 -b -u test.wav; if test $(wc -c test.wav|sed -e 's/ .*//') -lt $(wc -c "$F"|sed -e 's/ .*//'); then mv test.wav "$F"; else rm test.wav; fi; done
On some systems, 8-bit playback is noisy (e.g. because volume adjustments cause too many of those 8 bits to be lost); if you can't work around this then you could use 16-bit by deleting '-b -u' from the above commands, but the result will be twice as big.
#!/usr/bin/env python
# Program to support splitting a long sound file into
# several little ones.
# Needs 'sox' - if Windows, download from
# sox.sourceforge.net
# (e.g. http://prdownloads.sourceforge.net/sox/sox12172.zip
# - note gives a "select a mirror" dialogue) and put sox.exe
# in the same directory or on the path
# -----------------------
# lowpri: 2nd sort key by length ? (only matters if adding a lot of new words & phrases at same time)
import time,os,sndhdr,sys
try: import winsound
except: winsound=None
# True when sys.platform mentions "mac" or "darwin"
macsound = ("mac" in sys.platform) or ("darwin" in sys.platform)
if macsound:
    sys.stderr.write("Warning: You need to have qtplay (from gradint or wherever) in your PATH for this to work\n")
def rawcut(allData,fromSecs,toSecs,rate=22050,bits=16,channels=1):
    """Return the part of the raw sample data ``allData`` that lies
    between ``fromSecs`` and ``toSecs`` seconds."""
    firstByte = secbyte(fromSecs,rate,channels,bits)
    endByte = secbyte(toSecs,rate,channels,bits)
    return allData[firstByte:endByte]
def secbyte(sec,rate,channels,bits):
    """Convert a time in seconds (not necessarily an integer) to a byte
    offset in the raw data.

    The result is always a whole number of samples, i.e. a multiple of
    channels*(bits/8) bytes.
    """
    frameBytes = int(bits/8)*channels
    nearestSample = int(sec*rate+0.5) # nearest integer sample no
    return frameBytes * nearestSample
def readTimings(langs):
    """Record cut points interactively and return them as a list of
    (startSecs, endSecs) pairs measured from when the clock was started.

    Each prompt is labelled with the language of the next sample (cycling
    through ``langs``).  An empty line ends a sample, "c" moves the end of
    the previous sample to now, "q" finishes, and any other input just
    restarts the timing of the next sample.
    """
    if macsound: time.sleep(1) # OS X hack due to qtplay delay (1sec on an Intel 2GHz Core Duo running OSX 10.5)
    sys.stdout.write("Starting clock\n")
    # Now using time.time() rather than time.clock()
    # due to clock units confusion
    # Just have to hope the system is accurate enough
    offset = time.time()
    start = offset
    ret = []
    while True:
        ip = raw_input(langs[len(ret)%len(langs)]+": ")
        t = time.time()
        if not ip:
            ret.append((start-offset,t-offset))
        elif ip=="c" and ret:
            ret[-1]=(ret[-1][0],t-offset)
        start = t
        if ip=='q': break
    sys.stdout.write("Finishing at %f seconds\n" % (t-offset,))
    return ret
def instructions():
    """Print the key instructions and wait for Return to be pressed."""
    for text in (
            "Press Return between samples\n",
            "Enter 'c' to change the time of the last Return to this one\n",
            "Enter 'x' to omit this bit (e.g. silence)\n",
            "Enter 'q' when done (AFTER stopping last sample)\n",
            "PRESS RETURN TO START\n"):
        sys.stdout.write(text)
    raw_input()
def getParams():
    """Ask for the recording and its language(s) and convert it to raw.

    Returns (soxParams,wavFile,rawFile,lang1,lang2,rate,bits,channels),
    or None if the file could not be read or has an unsupported sample
    size (the caller asks again).  lang2 equals lang1 when the recording
    is not interleaved.
    """
    wavFile=raw_input("Enter filename of main recording: ")
    try: header = sndhdr.what(wavFile)
    except IOError: header = None # e.g. no such file
    if not header:
        sys.stdout.write("Problem opening that file\n")
        return None
    (wtype,rate,channels,wframes,bits) = header
    sys.stdout.write("WAV file is %d-bit\n" % (bits,))
    if bits==8: soxBits="-b -u" # unsigned
    elif bits==16: soxBits="-w -s" # signed
    elif bits==32: soxBits="-l -s" # signed
    else:
        sys.stdout.write("Unsupported bits per sample '%s'\n" % (bits,))
        return None
    soxParams = "-t raw %s -r %d -c %d" % (soxBits,rate,channels)
    rawFile = wavFile + ".raw"
    convertToRaw(soxParams,wavFile,rawFile)
    lang1=lang2=None
    while not lang1: lang1=raw_input("Enter first language on recording (e.g. zh): ")
    # Read the answer as text rather than evaluating it as an expression,
    # and ask again if it is not recognised.
    interleaved = None
    while interleaved is None:
        answer = raw_input("Are two languages interleaved? (1/0): ").strip().lower()
        if answer in ("1","y","yes"): interleaved = 1
        elif answer in ("0","n","no"): interleaved = 0
    if interleaved:
        while not lang2: lang2=raw_input("Enter second language on recording (e.g. en): ")
    else:
        lang2=lang1
        sys.stdout.write("OK - should run this program again for other language's recording\n")
    return soxParams,wavFile,rawFile,lang1,lang2,rate,bits,channels
def convertToWav(soxParams,rawFile,wavFile):
    """Run sox to turn ``rawFile`` (described by ``soxParams``) into
    ``wavFile``."""
    command = "sox %s \"%s\" \"%s\"" % (soxParams,rawFile,wavFile)
    os.system(command)
def convertToRaw(soxParams,wavFile,rawFile):
    """Run sox to turn ``wavFile`` into ``rawFile`` in the format given
    by ``soxParams``."""
    command = "sox \"%s\" %s \"%s\"" % (wavFile,soxParams,rawFile)
    os.system(command)
def main():
    """Ask for parameters until they are accepted, split the recording,
    then delete the temporary raw file."""
    params = None
    while not params:
        params = getParams()
    # params is (soxParams,wavFile,rawFile,lang1,lang2,rate,bits,channels)
    mainLoop(*params)
    os.unlink(params[2])
# Set lang1 & lang2 equal if not interleaving
def mainLoop(soxParams,wavFile,rawFile,lang1="zh",lang2="en",rate=22050,bits=16,channels=1):
    """Play ``wavFile`` while reading cut timings from the keyboard, then
    write each cut as NUMBER_LANG.wav in the current directory.

    ``rawFile`` is the raw-format copy of the recording that the samples
    are cut from; ``soxParams`` describes its format.  With two different
    languages the samples alternate lang1, lang2 and each pair shares a
    number.
    """
    # Files are closed explicitly so the caller can delete rawFile afterwards
    with open(rawFile,"rb") as f: allData=f.read()
    with open(wavFile,"rb") as f: f.read() # to cache before starting clock and 'play' (especially because just loaded the separate raw data) (could also play from raw data if got sox)
    instructions()
    # Start sound asynchronously - hope for the best that
    # the first clock reading is near enough to the actual
    # start of the sound
    if winsound: winsound.PlaySound(wavFile,winsound.SND_FILENAME | winsound.SND_ASYNC)
    elif macsound: os.spawnlp(os.P_NOWAIT,"qtplay","qtplay",wavFile)
    # else: os.spawnlp(os.P_NOWAIT,"play","play",wavFile)
    # Problem: What if 'play' o/p's at slightly less than the correct rate - will think the cuts are further on in the file than they really are.  (e.g. 16000Hz on a z61p Cygwin, "time play" shows it takes slightly longer than sox thinks the file is)
    # Better convert to 44100 just to make sure.
    else: os.system("sox \"%s\" -r 44100 -t wav - | play -t wav - &" % wavFile)
    # Read timings, cut up, and write out the samples
    samples = [ rawcut(allData,s,f,rate,bits,channels) for s,f in readTimings([lang1,lang2]) ]
    formatString = "%0"+str(len(str(int(len(samples)/(2-(lang2==lang1))-1))))+"d_%s"
    # (pad with 0s as necessary so it's in order)
    # (len(samples)-1 gives highest number, so len(str(l..))
    # gives number of digits in it)
    for i,sample in enumerate(samples):
        if i%2: lang=lang2
        else: lang=lang1
        if lang1==lang2: c=i
        else: c=int(i/2)
        fname = formatString % (c,lang)
        with open(fname,"wb") as f: f.write(sample)
        convertToWav(soxParams,fname,fname+".wav")
        os.unlink(fname)
        sys.stdout.write("Written %s.wav\n" % (fname,))
# Run interactively only when executed as a script (not when imported)
if __name__=="__main__": main()
#!/usr/bin/env python
# Program to strip any silence from the beginning/end of a
# sound file (must be real 0-bytes not background noise)
# (This is useful as a "splitter" post-processor when
# getting samples from CD-ROMs e.g. "Colloquial Chinese" -
# don't use audacity here because some versions of audacity
# distort 8-bit audio files)
# Needs 'sox' + splitter
from splitter import *
def _silenceBounds(allData,bits,channels):
    """Return (startIdx,endIdx) such that allData[startIdx:endIdx] is the
    raw data without the silent bytes at its beginning and end.

    Both indices fall on sample boundaries and never cut into a sample
    that contains a non-silent byte.  A silent byte is 128 for 8-bit
    (unsigned) data and 0 otherwise.
    """
    bytesPerSample = channels*int(bits/8)
    # Chosen by sample size, not frame size: 8-bit stereo is still unsigned
    if bits==8: silenceVal=b"\x80"
    else: silenceVal=b"\0"
    # (one-byte slices compare the same way on Python 2 and 3)
    startIdx = 0
    while startIdx < len(allData) and allData[startIdx:startIdx+1]==silenceVal:
        startIdx = startIdx + 1
    startIdx = (startIdx//bytesPerSample) * bytesPerSample # round down
    endIdx = len(allData)
    while endIdx and allData[endIdx-1:endIdx]==silenceVal:
        endIdx = endIdx - 1
    # round UP to the end of the sample, but not past the end of the data
    endIdx = min(len(allData), -((-endIdx)//bytesPerSample) * bytesPerSample)
    return startIdx,endIdx

for wavFile in sys.argv[1:]:
    # Figure out sox parameters
    header = sndhdr.what(wavFile)
    if not header: raise IOError("Problem opening %s" % (wavFile,))
    (wtype,rate,channels,wframes,bits) = header
    if bits==8: soxBits="-b -u" # unsigned
    elif bits==16: soxBits="-w -s" # signed
    elif bits==32: soxBits="-l -s" # signed
    else: raise Exception("Unsupported bits per sample")
    soxParams = "-t raw %s -r %d -c %d" % (soxBits,rate,channels)
    rawFile = wavFile + ".raw"
    # Now ready to convert to raw, and read it in
    convertToRaw(soxParams,wavFile,rawFile)
    o=open(rawFile,"rb")
    allData=o.read()
    o.close()
    # Now figure out how many samples we can take out
    startIdx,endIdx = _silenceBounds(allData,bits,channels)
    sys.stderr.write("Debugger: Clipping %s to %d:%d\n" % (wavFile,startIdx,endIdx))
    allData = allData[startIdx:endIdx]
    # Write back the file, and convert it back to wav
    o=open(rawFile,"wb")
    o.write(allData)
    o.close()
    convertToWav(soxParams,rawFile,wavFile)
    # Clean up
    os.unlink(rawFile)