# This file is part of the source code of Gradint
# (c) Silas S. Brown.
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 3 of the License, or
#    (at your option) any later version.
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

# Start of lessonplan.py - tracking progress and planning a lesson

class ProgressDatabase(object):
    def __init__(self,alsoScan=1,fromString=0):
        self.data = [] ; self.promptsData = {}
        self.unavail = [] ; self.saved_completely = 0
        if fromString or not self._load_from_binary():
            self._load_from_text(fromString)
            if self.data and not fromString: self.save_binary(self.data) # even before starting, to save time if they press Cancel and then try loading again without futher progressFile changes
        self.oldPromptsData = self.promptsData.copy() # in case have to save partial (see below)
        if alsoScan:
          global is_first_lesson ; is_first_lesson = (not self.data and not self.unavail) # hack
          self.data += self.unavail # because it might have become available again
          self.unavail = mergeProgress(self.data,scanSamples()+parseSynthVocab(vocabFile))
          if not cache_maintenance_mode:
            doLabel("Checking transliterations")
            global tList # for Python 2.1
            tList = {}
            def addVs(ff,dirBase): # add variants of ff to tList which we might need to transliterate
                dirBase,ff = B(dirBase),B(ff)
                if dirBase: dirBase += B(os.sep)
                if checkIn(dirBase+ff,variantFiles):
                   if B(os.sep) in ff: ffpath=ff[:ff.rfind(B(os.sep))+1]
                   else: ffpath=B("")
                   variantList=map(lambda x,f=ffpath:f+B(x),variantFiles[dirBase+ff])
                else: variantList = [ff]
                l=languageof(ff)
                for f in variantList:
                  f = B(f)
                  if f.lower().endswith(B(dottxt)):
                      text=bwspstrip(u8strip(read(dirBase+f)))
                  elif f.find(B("!synth"))==-1: continue # don't need to translit. filenames of wav's etc
                  else: text = textof(f)
                  if not checkIn(l,tList): tList[l]={}
                  tList[l][text]=1
            for ff in availablePrompts.lsDic.values(): addVs(ff,promptsDirectory)
            for _,l1,l2 in self.data:
                if not type(l1)==type([]): l1=[l1]
                for ff in l1+[l2]: addVs(ff,samplesDirectory)
            doLabel("Transliterating")
            for lang,dic in list(tList.items()):
                s = get_synth_if_possible(lang,0)
                if s and hasattr(s,"update_translit_cache"): s.update_translit_cache(lang,list(dic.keys()))
            del tList
        self.didScan = alsoScan
    def _load_from_binary(self):
        if pickledProgressFile and fileExists(pickledProgressFile):
            if pickle and not (fileExists(progressFile) and os.stat(progressFile)[8] > os.stat(pickledProgressFile)[8]): # we can unpickle the binary version, and text version has not been manually updated since it, so do this
                global firstLanguage, secondLanguage, otherLanguages
                if compress_progress_file or (unix and got_program("gzip")):
                    if paranoid_file_management: open(pickledProgressFile) # ensure ready
                    f = os.popen('gzip -fdc "'+pickledProgressFile+'"',popenRB)
                else: f=open(pickledProgressFile,"rb")
                try: thingsToSet, tup = pickle.Unpickler(f).load()
                except: return False # probably moved to a different Python version or something
                exec(thingsToSet)
                self._py3_fix()
                return True
            # otherwise drop out and return None
    def _load_from_text(self,fromString=0):
        if fromString: expr=fromString
        elif fileExists(progressFile):
            if compress_progress_file or (unix and got_program("gzip")):
                if paranoid_file_management: open(progressFile) # ensure ready
                expr = readB(os.popen('gzip -fdc "'+progressFile+'"',popenRB))
            else: expr = read(progressFile)
        else: expr = None
        if expr:
            expr = u8strip(expr).replace(B("\r\n"),B("\n")) # just in case progress.txt has been edited in Notepad
            # First, try evaluating it as self.data (legacy progress.txt from older versions).  If that doesn't work, execute it (newer versions).
            global firstLanguage, secondLanguage, otherLanguages
            try: self.data = eval(expr)
            except TypeError: raise Exception(progressFile+" has not been properly decompressed") # 'expected string without null bytes'
            except SyntaxError:
                try: import codeop
                except: codeop = 0
                if codeop: # try a lower-memory version (in case text file has been edited by hand and we're on NSLU2 or something) - don't compile all of it at once
                    lineCache = []
                    for l in expr.replace(B("\r\n"),B("\n")).split(B("\n")):
                        lineCache.append(l)
                        if lineCache[-1].endswith(B(",")): continue # no point trying to compile if it's obviously incomplete
                        code = codeop.compile_command("# coding=utf-8\n"+S(B("\n").join(lineCache)))
                        if code:
                            lineCache = []
                            exec(code)
                else: exec(B("# coding=utf-8\n")+expr)
            del expr
        # Remove legacy extentions in promptsData (needed only when loading from text, as this was before pickledProgressFile was added)
        for k in list(self.promptsData.keys()):
            if k.endswith(dotwav) or k.endswith(dotmp3):
                self.promptsData[k[:-len(dotwav)]]=self.promptsData[k]
                del self.promptsData[k]
        self._py3_fix()
    def _py3_fix(self):
        if not type("")==type(u""): return
        # we're Python 3, and we might have just loaded data from Python 2
        for l in [self.data,self.unavail]:
            for i in range(len(l)):
                for j in [1,2]:
                    if type(l[i][j])==str: l[i]=l[i][:j]+(S2(LB(l[i][j])),)+l[i][j+1:]
                    elif type(l[i][j])==list: l[i]=l[i][:j]+(map(lambda x:S2(LB(x)),l[i][j]),)+l[i][j+1:]
    def save(self,partial=0):
        if need_say_where_put_progress: show_info("Saving "+cond(partial,"partial ","")+"progress to "+progressFile+"... ")
        else: show_info("Saving "+cond(partial,"partial ","")+"progress... ")
        global progressFileBackup
        # Remove 0-repeated items (helps editing by hand)
        data = [] # don't use self.data - may want to make another lesson after saving
        for a,b,c in self.data:
            if a: data.append(denumber_filelists(a,b,c))
        sort(data,cmpfunc) # to normalise when using diff etc
        if progressFileBackup:
            try:
                import shutil
                shutil.copy2(progressFile,progressFileBackup) # preserve timestamp etc if shutil is available
            except:
                try: write(progressFileBackup,read(progressFile))
                except IOError: pass # maybe progressFile not made yet
            progressFileBackup = None
        while True:
          try:
            if compress_progress_file:
              if paranoid_file_management: fn=os.tempnam() # on some ftpfs setups gzip can fail causing silent corruption
              else: fn=progressFile
              f=os.popen('gzip -9 > "'+fn+'"','w')
            else: f = open(progressFile,'w')
            global progressFileHeader
            if type(u"")==type(""): # Python 3: ensure UTF-8
                import codecs
                realF,f = f,codecs.getwriter("utf-8")(f.buffer)
                progressFileHeader=progressFileHeader.replace("mode: python ","mode: python; coding: utf-8")
            else: realF = f
            f.write(progressFileHeader)
            f.write("firstLanguage=\"%s\"\nsecondLanguage=\"%s\"\n# otherLanguages=%s\n" % (firstLanguage,secondLanguage,otherLanguages)) # Note: they're declared "global" above (and otherLanguages commented out here for now, since may add to it in advanced.txt) (Note also save_binary below.)
            if self.didScan and maxNewWords: f.write("# collection=%d done=%d left=%d lessonsLeft=%d\n" % (len(self.data),len(data),len(self.data)-len(data),(len(self.data)-len(data)+maxNewWords-1)/maxNewWords))
            prettyPrintLongList(f,"self.data",data)
            f.write("self.promptsData=") ; pprint.PrettyPrinter(indent=2,width=60,stream=f).pprint(self.promptsData)
            prettyPrintLongList(f,"self.unavail",self.unavail)
            realF.close()
            if compress_progress_file and paranoid_file_management: write(progressFile,read(fn)),os.remove(fn)
            self.save_binary(data)
          except IOError: # This can happen for example on some PocketPC devices if you reconnect the power during progress save (which is likely if you return the device to the charger when lesson finished)
            if app or appuifw or android:
              if getYN("I/O fault when saving progress. Retry?"): continue
              # TODO else try to restore the backup?
            else: raise
          break
        if not partial: self.saved_completely = 1
        if not app and not appuifw and not android: show_info("done\n")
    def save_binary(self,data): # save a pickled version if possible (no error if not)
        if not (pickledProgressFile and pickle): return
        try:
            if compress_progress_file:
              if paranoid_file_management: fn=os.tempnam()
              else: fn=pickledProgressFile # TODO near-duplicate code with above
              f=os.popen('gzip -9 > "'+fn+'"',popenWB)
              if hasattr(f,'buffer'): _,f = f,f.buffer
            else: f = open(pickledProgressFile,'wb')
            pickle.Pickler(f,-1).dump(("self.data,self.promptsData,self.unavail,firstLanguage,secondLanguage = tup", (data,self.promptsData,self.unavail,firstLanguage,secondLanguage)))
            f.close()
            if compress_progress_file and paranoid_file_management: write(pickledProgressFile,read(fn)),os.remove(fn)
        except IOError: pass # OK if not got permissions to do it (NB need to catch the write as well because popen won't throw, and don't have to worry about a corrupted partial binary because loader would ignore it)
    def savePartial(self,filesNotPlayed):
        curPD,curDat = self.promptsData, self.data[:] # in case want to save a more complete one later
        self.promptsData = self.oldPromptsData # partial recovery of prompts not implemented
        if hasattr(self,"previous_filesNotPlayed"):
            i=0
            while i<len(filesNotPlayed):
                if checkIn(filesNotPlayed[i],self.previous_filesNotPlayed): i+=1
                else: del filesNotPlayed[i] # cumulative effects if managed to play it last time but not this time (and both lessons incomplete)
        self.previous_filesNotPlayed = filesNotPlayed = list2set(filesNotPlayed)
        if not filesNotPlayed:
            # actually done everything on overlaps
            self.promptsData=curPD
            return self.save()
        changed = 0
        for i in xrange(len(self.data)):
            if type(self.data[i][1])==type([]): l=self.data[i][1][:]
            else: l=[self.data[i][1]]
            l.append(self.data[i][2])
            found=0
            for ii in l:
              if checkIn(ii,filesNotPlayed):
                  self.data[i] = self.oldData[i]
                  found=1 ; break
            if not found and not self.data[i] == self.oldData[i]: changed = 1
        if changed: self.save(partial=1)
        elif app==None and not appuifw and not android: show_info("No sequences were fully complete so no changes saved\n")
        self.promptsData,self.data = curPD,curDat
    def makeLesson(self):
        global maxLenOfLesson
        self.l = Lesson()
        sort(self.data,cmpfunc) ; jitter(self.data)
        self.oldData = self.data[:] # for handling interrupts & partial progress saves
        self.exclude = {} ; self.do_as_poem = {}
        # First priority: Recently-learned old words
        # (But not too many - want room for new words)
        num=self.addToLesson(1,knownThreshold,1,recentInitialNumToTry,maxReviseBeforeNewWords)
        if num < maxReviseBeforeNewWords:
            # Weren't enough recently-learned old words
            # Do try to add SOMETHING before the new words
            num += self.addToLesson(knownThreshold,reallyKnownThreshold,1,recentInitialNumToTry,maxReviseBeforeNewWords-num)
            if num < maxReviseBeforeNewWords: self.addToLesson(reallyKnownThreshold,-1,1,1,maxReviseBeforeNewWords-num)
        # Now some new words
        self.addToLesson(0,0,newWordsTryAtLeast,newInitialNumToTry,maxNewWords)
        # Now some more recently-learned old words
        self.addToLesson(1,knownThreshold,1,recentInitialNumToTry,-1)
        self.addToLesson(knownThreshold,reallyKnownThreshold,1,recentInitialNumToTry,-1)
        # Finally, fill in the gaps with ancient stuff (1 try only of each)
        # But watch out for known poems
        poems, self.responseIndex = find_known_poems(self.data)
        for p in poems:
            for l in p: self.do_as_poem[self.responseIndex[l]] = p
        self.addToLesson(reallyKnownThreshold,-1,1,1,-1)
        if not self.l.events:
            # desperado if someone really wants extra revision of few words
            global randomDropLevel, randomDropLevel2
            rdl,rdl2,randomDropLevel,randomDropLevel2 = randomDropLevel,randomDropLevel2,0,0
            self.addToLesson(reallyKnownThreshold,-1,1,1,-1)
            randomDropLevel, randomDropLevel2 = rdl,rdl2
        l = self.l ; del self.l, self.responseIndex, self.do_as_poem
        if not l.events: raise Exception("Didn't manage to put anything in the lesson")
        if commentsToAdd: l.addSequence(commentSequence(),False)
        if orderlessCommentsToAdd:
            for c in orderlessCommentsToAdd:
                try:
                    l.addSequence([GluedEvent(Glue(1,maxLenOfLesson),fileToEvent(c,""))],False)
                except StretchedTooFar:
                    show_info(("Was trying to add %s\n" % (c,)),True)
                    raise
        # Add note on "long pause", for beginners
        longpause = "longpause_"+firstLanguage
        if not advancedPromptThreshold and not checkIn(longpause,availablePrompts.lsDic): longpause = "longpause_"+secondLanguage
        o=maxLenOfLesson ; maxLenOfLesson = max(l.events)[0]
        if checkIn(longpause,availablePrompts.lsDic) and self.promptsData.get(longpause,0)==0:
            try:
                def PauseEvent(longpause): return fileToEvent(availablePrompts.lsDic[longpause],promptsDirectory)
                firstPauseMsg = PauseEvent(longpause)
                # the 1st potentially-awkward pause is likely to be a beepThreshold-length one
                l.addSequence([GluedEvent(Glue(1,maxLenOfLesson),CompositeEvent([firstPauseMsg,Event(max(5,beepThreshold-firstPauseMsg.length))]))],False)
                while True:
                    l.addSequence([GluedEvent(Glue(1,maxLenOfLesson),CompositeEvent([PauseEvent(longpause),Event(50)]))],False)
                    self.promptsData[longpause] = 1
            except StretchedTooFar: pass
        maxLenOfLesson = o
        # Add "this is the end"
        try:
            pl=availablePrompts.getPromptList("end",self.promptsData,secondLanguage)
        except PromptException: pl = []
        t,event = max(l.events)
        t += event.length
        for p in pl:
            end_event = fileToEvent(p,promptsDirectory)
            l.events.append((t,end_event))
            t += end_event.length
        if not pl and fileExists(promptsDirectory+os.sep+"end"+dotwav):
            l.events.append((t,SampleEvent(promptsDirectory+os.sep+"end"+dotwav)))
            show_warning("Warning: Using legacy end"+dotwav+" - please change it to end_"+firstLanguage+dotwav+" and end_"+secondLanguage+dotwav+" (or "+extsep+"txt if you have synthesis)")
        l.cap_max_lateness()
        return l
    def addToLesson(self,minTimesDone=0,maxTimesDone=-1,minNumToTry=0,maxNumToTry=0,maxNumToAdd=-1):
        # Service routine - adds some words to the lesson
        # Words added must conform to the criteria specified
        # (i.e. range of how many times they've been done
        # before, and how many tries we can fit in now)
        # This is called a few times with different criteria
        # for the different priorities
        if maxNumToAdd==None: return 0
        numberAdded = 0
        newWordTimes = {}
        for numToTry in range(maxNumToTry,minNumToTry-1,-1):
            numFailures = 0 ; startTime = time.time() # for not taking too long
            for i in xrange(len(self.data)):
                if maxNumToAdd>-1 and numberAdded >= maxNumToAdd: break # too many
                if checkIn(i,self.exclude): continue # already had it
                (timesDone,promptFile,zhFile)=self.data[i]
                if timesDone < minTimesDone or (maxTimesDone>=0 and timesDone > maxTimesDone): continue # out of range this time
                if timesDone >= knownThreshold: thisNumToTry = min(random.choice([2,3,4]),numToTry)
                else: thisNumToTry = numToTry
                if timesDone >= randomDropThreshold and random.random() <= calcDropLevel(timesDone):
                    # dropping it at random
                    self.exclude[i] = 1 # pretend we've done it
                    continue
                if checkIn(i,self.do_as_poem):
                    # this is part of a "known poem" and let's try to do it in sequence
                    self.try_add_poem(self.do_as_poem[i]) ; continue
                oldPromptsData = self.promptsData.copy()
                seq=anticipationSequence(promptFile,zhFile,timesDone,timesDone+thisNumToTry,self.promptsData,introductions(zhFile,self.data))
                seq[0].timesDone = timesDone # for diagram.py (and now status messages) to know if it's a new word
                global earliestAllowedEvent ; earliestAllowedEvent = 0
                if not timesDone and type(promptFile)==type([]):
                    # for poems: if any previously-added new word makes part of the prompt, try to ensure this one is introduced AFTER that one
                    for f,t in list(newWordTimes.items()):
                        if checkIn(f,promptFile): earliestAllowedEvent = max(earliestAllowedEvent,t)
                if not timesDone: newWordTimes[zhFile] = maxLenOfLesson # by default (couldn't fit it in).  (add even if not type(promptFile)==type([]), because it might be a first line)
                try: self.l.addSequence(seq)
                except StretchedTooFar: # If this happens, couldn't fit the word in anywhere.  If this is "filling in gaps" then it's likely that we won't be able to fit in any more words this lesson, so stop trying.
                    earliestAllowedEvent = 0 # because there may be addSequence's outside this method
                    self.promptsData = oldPromptsData
                    numFailures += 1
                    if numFailures > 2 and time.time()>startTime+1: # TODO these numbers need to be constants.  (the +1 could also be cond(soundCollector,10,1) but we might want offline-generation to run fast also and it doesn't seem to make much difference)
                        break # give up trying to add more (we're taking too long)
                    else: continue
                except IOError: # maybe this file isn't accessible at the moment; keep the progress data though
                    show_warning("Excluding %s (problems reading)" % str(zhFile))
                    earliestAllowedEvent = 0 # because there may be addSequence's outside this method
                    self.exclude[i] = 1 # save trouble
                    continue
                numFailures = 0
                earliestAllowedEvent = 0 # because there may be addSequence's outside this method
                numberAdded = numberAdded + 1
                self.exclude[i] = 1
                # Keep a count
                if not timesDone: self.l.newWords += 1
                else: self.l.oldWords += 1
                self.data[i]=(timesDone+thisNumToTry,promptFile,zhFile)
                if not timesDone: newWordTimes[zhFile] = seq[0].getEventStart(0) # track where it started
        return numberAdded
    def try_add_poem(self,poem):
        poemSequence = []
        isPrefix=0 # keep choosing until we get an instruction that's a prefix
        while not isPrefix: i,isPrefix = randomInstruction(2,self.promptsData,languageof(poem[0])) # 2 so not listen-repeat or sayAgain and not drop-altogether (assuming sensible thresholds)
        poemSequence.append(filesToEvents(i,promptsDirectory))
        poemSequence.append(fileToEvent(poem[0]))
        for line in poem:
            e=fileToEvent(line)
            poemSequence.append(Event(e.length))
            poemSequence.append(e)
            self.exclude[self.responseIndex[line]] = 1 # (don't try to add it again this lesson, whether successful or not)
        poemSequence = [GluedEvent(initialGlue(),CompositeEvent(poemSequence))]
        poemSequence[0].endseq = False # boolean 'is it a new word'
        try: self.l.addSequence(poemSequence)
        except StretchedTooFar: return
        self.l.oldWords += 1 # have to only count it as one due to endseq handling
        for line in poem: self.data[self.responseIndex[line]]=(self.data[self.responseIndex[line]][0]+1,)+self.data[self.responseIndex[line]][1:]
    def veryExperienced(self):
        # used for greater abbreviation in the prompts etc
        x = getattr(self,'cached_very_experienced',None)
        if x==None:
            covered = 0
            for timesDone,promptFile,zhFile in self.data:
                if timesDone: covered += 1
            x = (covered > veryExperiencedThreshold)
            self.cached_very_experienced = x
        return x
    def message(self):
        covered = 0 ; total = len(self.data)
        actualCovered = 0 ; actualTotal = 0
        for timesDone,promptFile,zhFile in self.data:
            if timesDone:
                covered += 1
                if B(zhFile).find(B(exclude_from_coverage))==-1: actualCovered += 1
            if B(zhFile).find(B(exclude_from_coverage))==-1: actualTotal += 1
        l=cond(app,localise,lambda x:x)
        toRet = (l("You have %d words in your collection") % total)
        if not total==actualTotal: toRet += (" (actually %d)" % actualTotal)
        if covered:
            toRet += ("\n("+(l("%d new words + %d old words") % (total-covered,covered))+")")
            if not covered==actualCovered: toRet += (" (actually %d new %d old)" % (actualTotal-actualCovered,actualCovered))
        return toRet

def prettyPrintLongList(f,thing,data):
    # help the low-memory compile by splitting it up (also helps saving on slow machines, see below)
    step = 50 # number of items to do in one go
    if winCEsound: p=0 # don't use WinCE's PrettyPrinter here - it inconsistently escapes utf8 sequences (result can't reliably be edited in MADE etc)
    else: p=pprint.PrettyPrinter(indent=2,width=60,stream=f)
    for start in range(0,len(data),step):
        dat = data[start:start+step]
        if type("")==type(u""): # Python 3: probably best to output strings rather than bytes
            for i in range(len(dat)):
                for j in [1,2]:
                    if type(dat[i][j])==bytes:
                        dat[i]=dat[i][:j]+(S2(dat[i][j]),)+dat[i][j+1:]
                    elif type(dat[i][j])==list:
                        dat[i]=dat[i][:j]+(map(S2,dat[i][j]),)+dat[i][j+1:]
        if start: f.write(thing+"+=")
        else: f.write(thing+"=")
        if p:
            t = time.time()
            p.pprint(dat)
            if not start and (time.time()-t)*(len(data)/step) > 5: p=0 # machine is too slow - use faster version on next iteration
        else: # faster version - not quite as pretty
            f.write("[")
            for d in dat: f.write("  "+repr(d)+",\n")
            f.write("]\n")

def calcDropLevel(timesDone):
    # assume timesDone > randomDropThreshold
    if timesDone > randomDropThreshold2:
        return randomDropLevel2
    # or linear interpolation between the two thresholds
    return dropLevelK * timesDone + dropLevelC
# K*rdt1 + c = l1, K*rdt2 + c = l2
# K = (l2-l1)/(rdt2-rdt1)
# c = l1 - K*rdt1
try:
    dropLevelK = (randomDropLevel2-randomDropLevel)/(randomDropThreshold2-randomDropThreshold)
    dropLevelC = randomDropLevel-dropLevelK*randomDropThreshold
except ZeroDivisionError: # thresholds are the same
    dropLevelK = 0
    dropLevelC = randomDropLevel

def cmpfunc(x,y):
    # Comparison function for sorting progress data.  It's a hack for dealing with the problem caused by the ASCII code of '-' being lower than that of '/', so "directory-2/file" comes before "directory/file" unless hacked with this.  NB needs to be fast - don't "".join() unnecessarily.
    r = cmpfunc_test(x[0],y[0])
    if r: return r # skipping the rest if x[0]!=y[0]
    if x[0]: return cmpfunc_test(x,y) # our special order is needed only for new words (to ensure correct order of introduction)
    def my_toString(x):
        if type(x)==type([]): return B("").join(map(B,x))
        else: return B(x)
    x2 = (my_toString(x[1]).replace(B(os.sep),chr(0)), my_toString(x[2]).replace(B(os.sep),chr(0)))
    y2 = (my_toString(y[1]).replace(B(os.sep),chr(0)), my_toString(y[2]).replace(B(os.sep),chr(0)))
    return cmpfunc_test(x2,y2)
def cmpfunc_test(x,y):
    try:
        if x < y: return -1
        elif x > y: return 1
        else: return 0
    except: # probably Python 3 can't compare list vs string
        if x[0] < y[0]: return -1
        elif x[0] > y[0]: return 1
        x,y = repr(x),repr(y)
        if x < y: return -1
        elif x > y: return 1
        else: return 0

def denumber_filelists(r,x,y):
    if type(x)==type([]): x=map(lambda z:denumber_synth(z),x)
    else: x=denumber_synth(x)
    if type(y)==type([]): y=map(lambda z:denumber_synth(z),y)
    else: y=denumber_synth(y)
    return (r,x,y)
def denumber_synth(z,also_norm_extsep=0):
    z=B(z) ; zf = z.find(B("!synth:"))
    if zf>=0:
        z=lower(z[zf:]) # so ignores the priority-number it had (because the vocab.txt file might have been re-organised hence changing all the numbers).  Also a .lower() so case changes don't change progress.  (Old versions of gradint said .lower() when parsing vocab.txt, but this can cause problems with things like Mc[A-Z].. in English espeak)
        if z.endswith(B(dotwav)) or z.endswith(B(dotmp3)): return z[:z.rindex(B(extsep))] # remove legacy extensions from synth vocab
    elif also_norm_extsep: return z.replace(B("\\"),B("/")).replace(B("."),B("/")) # so compares equally across platforms with os.sep and extsep differences
    return z

def norm_filelist(x,y):
    def noext(x): return (B(x)+B(' '))[:B(x).rfind(B(extsep))] # so user can change e.g. wav to mp3 without disrupting progress.txt (the ' ' is simply removed if rfind returns -1)
    if type(x)==type([]): x=tuple(map(lambda z,noext=noext:denumber_synth(noext(z),1),x))
    else: x=denumber_synth(noext(x),1)
    if type(y)==type([]): y=tuple(map(lambda z,noext=noext:denumber_synth(noext(z),1),y))
    else: y=denumber_synth(noext(y),1)
    return (x,y)
def mergeProgress(progList,scan):
    # Merges a progress database with a samples scan, to
    # pick up any new samples that were added since last
    # time.  Appends to progList.  Return value see below.
    proglistDict = {} ; scanlistDict = {} ; n = 0
    while n<len(progList):
        i,j,k = progList[n]
        if i:
            proglistDict[norm_filelist(j,k)]=n
            # (DO need to call denumber_synth (called by
            # norm_filelist) on existing data, because might
            # be loading a legacy progress.txt which has
            # numbers before !synth) (as well as the .lower() thing)
            n += 1
        else: del progList[n]
        # (take out any 0s - add them back in only if still
        # in the scan.  This makes re-organisation etc
        # easier.  NB this duplicates the functionality in
        # save(), but useful if upgrading from an old
        # version.)
    renames = {}
    for (_,j,k) in scan:
        key = norm_filelist(j,k)
        if checkIn(key,proglistDict):
            # an existing item - but in the case of synth'd vocab, we need to take the capitals/lower-case status from the scan rather than from the progress file (see comment above in denumber_synth) so:
            progList[proglistDict[key]]=(progList[proglistDict[key]][0],j,k)
        elif type(key[0])==type("") and (key[0]+key[1]).find("!synth")==-1 and ("_" in key[0] and "_" in key[1]):
            # a file which might have been renamed and we may be able to catch a case of appending text to digits (but we don't (yet?) support doing this with poetry, hence the type() precondition)
            # TODO document that we do this in samples/readme and possibly the autosplit scripts etc, although nowadays recording GUI is more likely to be used and it lends itself to rename-all-but-digits.
            normK = key[1]
            lastDirsep = normK.rfind(os.sep)
            ki = len(normK)-1 ; found=0
            while ki>lastDirsep:
                while ki>lastDirsep and not "0"<=normK[ki]<="9": ki -= 1
                if ki<=lastDirsep: break
                key2 = (key[0][:ki+1]+key[0][key[0].rindex("_"):],key[1][:ki+1]+key[1][key[1].rindex("_"):])
                if checkIn(key2,proglistDict):
                    if not checkIn(key2,renames): renames[key2] = []
                    renames[key2].append((j,k))
                    found=1 ; break
                while ki>lastDirsep and "0"<=normK[ki]<="9": ki -= 1
            if not found: progList.append((0,j,k)) # new item
        else: progList.append((0,j,k)) # ditto
        scanlistDict[key]=1
    for k,v in list(renames.items()):
        if checkIn(k,scanlistDict) or len(v)>1: # can't make sense of this one - just add the new stuff
            for jj,kk in v: progList.append((0,jj,kk))
        else: progList[proglistDict[k]]=(progList[proglistDict[k]][0],v[0][0],v[0][1])
    # finally, separate off any with non-0 progress that are
    # no longer available (keep them because they may come
    # back later, but useful to make the distinction in case
    # want to manually edit progress.txt)
    n = 0 ; unavailList = []
    while n<len(progList):
        i,j,k = progList[n]
        if not checkIn(norm_filelist(j,k), scanlistDict):
            unavailList.append((i,j,k))
            del progList[n]
        else: n += 1
    return unavailList

def jitter(list):
    # Adds some random 'jitter' to a list (in-place)
    # Assumes item is a tuple and item[0] might be ==
    # Doesn't touch "new" words (tries==0) (assumes they're
    # all at top, so doesn't have to test for new word after
    # already-tried word).
    # HOWEVER, now handles the 'limit' feature for new words

#     swappedLast = 0
#     for i in range(len(list)-1):
#         if list[i][0] and ((list[i][0] == list[i+1][0] and random.choice([1,2])==1) or (not list[i][0] == list[i+1][0] and random.choice([1,2,3,4,5,6])==1 and not swappedLast)):
#             x = list[i]
#             del list[i]
#             list.insert(i+1,x)
#             swappedLast = 1
#         else: swappedLast = 0

    # Algorithm below implemented 2005-04-13 to deal with
    # larger vocabularies (thousands of words - previously
    # only the first few hundred ever got considered even
    # with random drop).  Divide words into groups and
    # shuffle each group.  To begin with each group is the
    # items that have the same repetition count, but as this
    # gets large we rapidly tolerate increasing differences
    # in repetition count in the same group.
    i = 0 ; groupStart = -1
    while i <= len(list):
        if i<len(list) and not list[i][0]: pass # leave it
        elif i<len(list) and groupStart<0:
            groupStart = i
            try:
                incrementThreshold = int(math.exp(list[groupStart][0]*shuffleConstant/(randomDropThreshold+1)-shuffleConstant)) # (not sure about the +1) (NB strict int, NOT nearest)
            except OverflowError: incrementThreshold=sys.maxint
        elif groupStart>=0 and (i==len(list) or list[i][0] - list[groupStart][0] > incrementThreshold):
            l2 = list[groupStart:i] ; random.shuffle(l2)
            del list[groupStart:i]
            for item in l2: list.insert(groupStart,item)
            groupStart = -1
            continue
        i += 1

    # Handle 'limit' feature: Of the new words that are
    # limited, put all but limit_words of them at the end of
    # the list (but this is done for EACH limit).
    # Also, all but 1 of 3rd, 4th etc languages to end (and rely on
    # directory order not to introduce them too early)
    # -> latter has now been commented out because do sometimes
    # need to work on them more quickly, and can limit manually
    limitCounts = {} ; i = 0 ; imax = len(list)
    while i < imax:
        if list[i][0]==0 and checkIn(list[i][-1],limitedFiles): # or not languageof(list[i][2])==secondLanguage):
            # if not languageof(list[i][2])==secondLanguage: countNo="other-langs"
            # else:
            countNo = limitedFiles[list[i][-1]]
            if not checkIn(countNo,limitCounts): limitCounts [countNo] = 0
            limitCounts [countNo] += 1
            # (below is a hack: if already moved something, set limit_words to 1.  May want to do it better than that e.g. go back and ensure the first thing only left 1 as well, or share out limit_words among any concurrently available new items that are just about to be introduced)
            if limitCounts [countNo] > cond(imax==len(list),limit_words,1) or (countNo=="other-langs" and limitCounts [countNo] > 1):
                list.append(list[i])
                del list[i]
                imax -= 1 # don't re-check the ones already moved to the end
                continue # no need to increment i
        i += 1

def find_known_poems(progressData):
    # If every line of a poem is known then it might be better to recite the whole thing in sequence
    # This function goes through progressData and extracts "known poems".  Returns: (a) a list of poems (each being a list of lines), (b) dictionary line -> index into progressData
    nextLineDic = {} # line -> next line
    responseIndex = {} # target response -> index into progressData
    hasPreviousLine = {} # line -> does it have a previous line
    for i in xrange(len(progressData)):
        response = progressData[i][2]
        responseIndex[response] = i
        if type(progressData[i][1])==type([]): line=progressData[i][1][cond(len(progressData[i][1])==2,0,-1)] # (the L2 is normally in last position, but it's in 1st position in a 2-item list - see the "line 1 doesn't have L1 but line 2 does" comment)
        else: line=progressData[i][1]
        if languageof(line)==languageof(response) and not line==response: # looks like part of a poem (and not the 'beginning' first line).  (Don't need any extra code to avoid mistaking 2nd-language-to-2nd-language word pairs as poems, because responseIndex will not get the "first line" and the "poem" won't be viable.)
            nextLineDic[line]=response # TODO check what would happen if 2 different poems in vocab.txt share an identical line (or if responseIndex is ambiguous in any way)
            hasPreviousLine[response]=True
    poems = []
    for poemFirstLine in filter(lambda x,hasPreviousLine=hasPreviousLine:not x in hasPreviousLine,nextLineDic.keys()):
        poemLines = [] ; line = poemFirstLine
        poem_is_viable = True
        while True:
            poemLines.append(line)
            if not checkIn(line,responseIndex) or progressData[responseIndex[line]][0] < reallyKnownThreshold:
                poem_is_viable = False ; break # whole poem not in database yet, or is but not well-rehearsed
            if not checkIn(line,nextLineDic): break
            line = nextLineDic[line]
        if poem_is_viable: poems.append(poemLines)
    return poems, responseIndex