Hi Matthew, From your example, it is hard to work out which character or character string is a separator, and which strings need to become separate words when seen in the original file.
# In the example below you need to learn about regular expressions. The split
# is based on the two REs held in the variables 'separators' and 'otherwords':
# the first is used to split a line, the second is used to extract sub-words.
# The output is printed in two formats that could be piped to a file for later
# re-use: as a Python list, then as space-separated lines of words. The csv
# module could be used to create a csv file for reading into spreadsheets and
# databases. No doubt an XML formatted output is just as straightforward (but
# XML is not, yet, my cup of tea).
#
# The prog: word_up.py

import re
import pprint

# Sample input: one record per line.
instring = '''gee fre asd[234
ger dsf asd[243
gwer af as.:^25a
'''

# One or more horizontal whitespace characters separate the words of a line.
separators = r'''[ \t\r\f\v]+'''

# Verbose-mode pattern: text before a delimiter, the delimiter itself
# ('[' or '^'), and the text after it.  The inline (?x) flag must stay at
# the very start of the pattern.
otherwords = r'''(?x)
    (.*)
    (
        \[
      | \^
    )
    (.*)
'''


def word_up(instring, separators, otherwords):
    r"""Split text into lines of words, breaking out embedded sub-words.

    Each line of ``instring`` is split on the ``separators`` regular
    expression.  Every resulting word is then matched (anchored at its
    start) against the ``otherwords`` regular expression; when it matches,
    the word is replaced by the non-empty groups of the match, otherwise
    the word is kept unchanged.

    Empty words, which ``re.split`` yields when a line starts or ends with
    a separator (or is blank), are dropped.  A blank line therefore gives
    an empty list, so the result still has one entry per input line.

    Note: with a greedy leading group such as ``(.*)`` in ``otherwords``,
    a word containing several delimiters is only broken at the last one.

    Args:
        instring: the text to process; it is split with ``splitlines()``.
        separators: regular expression (string or compiled pattern)
            matching the gaps between the words of a line.
        otherwords: regular expression (string or compiled pattern) whose
            groups are the sub-words to extract from a single word.

    Returns:
        A list with one list of words per input line.

    # doctest (the docstring is raw so that the backslashes survive)
    >>> from pprint import pprint as pp
    >>> i = 'gee fre asd[234\nger dsf asd[243\ngwer af as.:^25a\n'
    >>> print(i)
    gee fre asd[234
    ger dsf asd[243
    gwer af as.:^25a
    <BLANKLINE>
    >>> s = r'[ \t\r\f\v]+'
    >>> o = r'(?x) (.*) ( \[ | \^ ) (.*)'
    >>> pp(word_up(i, s, o))
    [['gee', 'fre', 'asd', '[', '234'],
     ['ger', 'dsf', 'asd', '[', '243'],
     ['gwer', 'af', 'as.:', '^', '25a']]
    >>> word_up('  padded line  ', s, o)
    [['padded', 'line']]
    """
    # Compile once, outside the loops; re.compile() also accepts patterns
    # that are already compiled.
    split_line = re.compile(separators).split
    match_word = re.compile(otherwords).match

    line_words = []
    for line in instring.splitlines():
        words = []
        for word in split_line(line):
            if not word:
                # Artifact of a leading/trailing separator, not a real word.
                continue
            matched = match_word(word)
            if matched:
                # Keep only groups that actually captured some text.
                words.extend(group for group in matched.groups() if group)
            else:
                words.append(word)
        line_words.append(words)
    return line_words


line_words = word_up(instring, separators, otherwords)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('\n# Python format extracted words as list of lists')
pprint.pprint(line_words)

print('\n# Unix friendly space separated words')
for words in line_words:
    print(' '.join(words))

# -- Paddy
-- http://mail.python.org/mailman/listinfo/python-list