Can you attach files in this forum? Couldn't find the option. Oh well, here's the file.
#!/usr/bin/python # Version: 1.1 # Author: Steve Losh from sets import Set from optparse import OptionParser from xml.dom.minidom import parse AudioPath = 'audio/' DatafilePath = 'utterances.trmxml' CONFIDENCE_LOW = None #'500' CONFIDENCE_HIGH = None #'500' utterancesFile = None class Utterance: def __init__(self, audio, grammarSet, text): self.audio = audio self.grammarSet = grammarSet self.text = text def __str__(self): return "SWIrecAcousticStateReset\ntranscription " + self.text \ + "\nrecognize " + AudioPath + self.audio def getGrammarPaths(): """Get the paths of all the grammars needed. Returns a Set containing the results. If a grammar is listed more than once in the transcription manifest it will only appear once in these results. TODO: Find a less fragile way to split off the server half of the URIs.""" grammarTags = utterancesFile.getElementsByTagName('Grammar') grammarURIs = [tag.getAttribute('uri') for tag in grammarTags] grammarPaths = [uri.split('servlet/CA/')[1] for uri in grammarURIs] return Set(grammarPaths) def createGrammarNameFromPath(path): """Convert a given path into an appropriate name for the grammar.""" path = path.replace('/', '-') # Strip the directory slashes path = path.replace('.', '_') # and the dot before the extension. return path def loadGrammars(): """Output the statements that will load the required grammars.""" grammarPaths = list(getGrammarPaths()) grammarsToLoad = {} for path in grammarPaths: grammarName = createGrammarNameFromPath(path) grammarsToLoad[grammarName] = "grammars/" + path for grammarName in grammarsToLoad: print "SWIrecGrammarLoad", grammarName, grammarsToLoad[grammarName] def loadGrammarSets(): """Output the statements that will define the grammar sets/contexts. 
Returns a list of the grammar set names.""" grammarSetList = utterancesFile.getElementsByTagName('GrammarSets') grammarSets = grammarSetList[0].getElementsByTagName('GrammarSet') grammarSetNames = [] for gs in grammarSets: grammarSetName = gs.getAttribute('id') print "context_define", grammarSetName, CONFIDENCE_LOW, CONFIDENCE_HIGH for g in gs.getElementsByTagName('Grammar'): path = g.getAttribute('uri').split('servlet/CA/')[1] print "context_add", createGrammarNameFromPath(path), '1000' print "context_end\n" grammarSetNames.append(grammarSetName) return grammarSetNames def buildUtterances(call): """This function takes a call tag, builds the utterances belonging to it and returns a list containing them.""" utts = call.getElementsByTagName('Utt') utterances = [Utterance( utt.getAttribute('audio'), \ utt.getAttribute('grammarSet'), utt.getAttribute('transcribedText') ) \ for utt in utts] return utterances def getUtterances(): """Returns a list of all the utterances we want to test.""" callList = utterancesFile.getElementsByTagName('Calls')[0] calls = callList.getElementsByTagName('Call') utterances = [] for c in calls: utterances.extend(buildUtterances(c)) return utterances def loadData(utterances): """Outputs the statements that will tell rec_test what to test.""" contexts = {} for u in utterances: if u.grammarSet not in contexts: contexts[u.grammarSet] = [] contexts[u.grammarSet].append(u) for c in contexts: print "open errors " + c + ".errors" print "open utd " + c + ".utd" print "context_use", c for u in contexts[c]: print u print "close utd" print "close errors" print "\n" def makeParser(): parser = OptionParser( "usage: %prog -l LOWER CONFIDENCE -h UPPER CONFIDENCE [-f FILTER1 -f FILTER2 ...] 
file" ) parser.add_option("-l", "--low-confidence", dest="lower", \ help="The lower confidence level to test at.", metavar="CONFIDENCE") parser.add_option("-u", "--upper-confidence", dest="upper", \ help="The upper confidence level to test at.", metavar="CONFIDENCE") parser.add_option("-f", "--filter", dest="filter", action="append", \ help="Only test utterances transcribed as WORD.", metavar="WORD") return parser def main(): global utterancesFile, CONFIDENCE_HIGH, CONFIDENCE_LOW parser = makeParser() (options, args) = parser.parse_args() if len(args) != 1: parser.error("One data file must be specified.") elif options.lower == None: parser.error("A lower confidence level must be specified.") elif options.upper == None: parser.error("An upper confidence level must be specified.") DatafilePath = args[0] CONFIDENCE_LOW = options.lower CONFIDENCE_HIGH = options.upper utterancesFile = parse(DatafilePath) print ':ACC\n\n' loadGrammars() print "\n\n" grammarSetNames = loadGrammarSets() utterances = getUtterances() if options.filter != None: utterances = [u for u in utterances if u.text in options.filter] print "\n\n" loadData(utterances) print "report summary summary.txt" print "report oov oov.txt" print "report words words.txt" if __name__ == '__main__': main() Peter Otten wrote: > [EMAIL PROTECTED] wrote: > > > HELP! > > Guy who was here before me wrote a script to parse files in Python. > > > > Includes line: > > print u > > According to your other posts 'u' seems to be an instance of a custom > Utterance class with a __str__() method that accidentally returns unicode. > Try changing the print statement to > > print unicode(u) > > If you're lucky, it works. Otherwise we need a piece of the actual code. To > give you an idea what a self-contained demonstration of your problem might > look like: > > >>> class Utterance(object): > ... def __str__(self): return u"äöü" > ... 
> >>> u = Utterance()
> >>> print u
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2:
> ordinal not in range(128)
> >>> print unicode(u)
> äöü
>
> Peter
--
http://mail.python.org/mailman/listinfo/python-list