#!/usr/bin/env python
# list-synth.py language
# List all words in that language that can be synthesized.
# Run in the same directory as gradint.py with all the settings.

paragraph_size = 255 # if not 0, try to split the output into paragraphs of at
                     # most this many characters (counted after decoding the utf-8)

hanzi_only = 1 # This is a hack for Chinese: if it's not 0, anything in
               # language "zh" will be listed only if it's entirely hanzi,
               # not pinyin (useful for quickly testing your vocab on an
               # online demo synth that supports only hanzi)

reverse_grouping = 1 # if set, paragraphs will be grouped backwards from the
                     # end of the list (useful if the newest words are at the
                     # end and you don't want a trailing half-paragraph)

# ------------------------------------------------------

outFilename = "cache-list"

import sys, os

lang = sys.argv[1:]
if not lang:
    sys.stderr.write("Please put a language abbreviation on the command line. See comments at the start of this script for details.\n")
    sys.exit()
lang = lang[0] # only the first language given is listed
sys.argv = [] # so gradint doesn't try to interpret our arguments
import gradint
if not gradint.synthCache:
    sys.stderr.write("Error - synthCache is not set in advanced.txt\n") ; sys.exit()
gradint.cache_maintenance_mode = 1

toList = []
wroteChars = 0 ; listed = {}
# Skip anything already noted in cachelist-done.txt (if that file exists)
if gradint.fileExists("cachelist-done"+gradint.dottxt):
    listed = gradint.list2set(filter(lambda x: x, gradint.u8strip(open("cachelist-done"+gradint.dottxt).read().replace("\r","\n")).split("\n")))

def maybe_list(s, directory):
    "Add s to toList if it's a synthesizable item in the requested language"
    if not s: return # in case poetry has some 2nd-language only
    if '!synth:' in s and "_" in s:
        textToSynth, langToSynth = gradint.textof(s), gradint.languageof(s)
    elif s.endswith(gradint.dottxt):
        textToSynth, langToSynth = gradint.readText(directory+os.sep+s), gradint.languageof(s, directory==gradint.promptsDirectory)
    else: return
    if not langToSynth == lang: return # we're not listing that language
    if textToSynth.lower() in listed: return # already listed or already done
    d = textToSynth.decode('utf-8')
    if hanzi_only and langToSynth=="zh" and not gradint.fix_compatibility(d).replace(" ","")==gradint.hanzi_and_punc(d): return
    global wroteChars
    if paragraph_size and wroteChars and wroteChars+len(d) > paragraph_size:
        toList.append("") ; wroteChars = 0 # start a new paragraph
    wroteChars += len(d)+2 # +2 for the comma and the \n
    toList.append(textToSynth+",")
    listed[textToSynth.lower()] = 1

inList = gradint.scanSamples() + gradint.parseSynthVocab(gradint.vocabFile)
if reverse_grouping: inList.reverse() # will be reversed again on output
for _, s1, s2 in inList:
    if type(s1)==type([]):
        for i in s1: maybe_list(i, gradint.samplesDirectory)
    else: maybe_list(s1, gradint.samplesDirectory)
    maybe_list(s2, gradint.samplesDirectory)
for f in gradint.AvailablePrompts().lsDic.values():
    for f2 in f:
        if f2.endswith("txt"): maybe_list(f2, gradint.promptsDirectory)
if reverse_grouping: toList.reverse()

# Paragraph breaks were added as empty strings; drop the comma that ends
# each paragraph (including the very last one).
for i in range(len(toList)+1):
    if i and (i==len(toList) or toList[i]=="") and toList[i-1].endswith(","):
        toList[i-1] = toList[i-1][:-1]

listFile = open(outFilename+gradint.dottxt, "w")
# Start with a utf-8 BOM so editors display the file correctly
listFile.write('\xef\xbb\xbfEdit this file, deleting this message and the phrases you don\'t want,\nget it recorded or synthesized, and split the recording into lines\n(using HardDiskOgg or Audacity or one of the utils)\nthen feed edited list + recordings to list2cache.py to get into '+gradint.synthCache+'\n\n')
listFile.write("\n".join(toList))
listFile.close()
sys.stdout.write("List written to %s (in utf-8)\n" % (outFilename+gradint.dottxt))
if gradint.textEditorCommand:
    os.system(gradint.textEditorCommand+" "+outFilename+gradint.dottxt)
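
# ------------------------------------------------------
# Example session (a sketch only; "zh" is just an illustrative language code,
# and list2cache.py's exact invocation is described in that script, not here):
#
#   python list-synth.py zh
#       -> writes the phrase list to outFilename + gradint.dottxt
#          (normally cache-list.txt, in utf-8) and opens it in
#          textEditorCommand if that is set in your settings
#
# Then edit the list, get it recorded or synthesized, split the recording
# into one file per line, and use list2cache.py to feed the edited list plus
# recordings into the synthCache directory named in advanced.txt.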