Steven Bethard wrote: > I have some plain text data and some SGML markup for that text that I > need to align. (The SGML doesn't maintain the original whitespace, so I > have to do some alignment; I can't just calculate the indices directly.) [snip] > Note that the SGML inserts spaces not only within the SGML elements, but > also around punctuation. [snip] > I need to determine the indices in the original text that each SGML > element corresponds to.
# Ok, below is a working version that doesn't use regular expressions.
# It's far from concise, but at least it doesn't fail like re does when
# I have more than 100 words. =)

try:
    # ElementTree has shipped in the standard library since Python 2.5.
    import xml.etree.ElementTree as etree
except ImportError:
    # Fall back to the standalone package the original post used.
    import elementtree.ElementTree as etree


def align(text, sgml):
    """Yield ``(tag, start, end)`` for each top-level SGML element.

    ``sgml`` is the same content as ``text`` but tokenised differently:
    whitespace may have been inserted or collapsed (for example around
    punctuation and inside elements).  The function splits the SGML into
    whitespace-separated words, walks ``text`` to find where each of those
    words sits, and converts every element's word range into a character
    range in ``text``.

    Args:
        text: The original plain text.
        sgml: Markup over the same text.  It is wrapped in a synthetic
            ``<xml>`` root and parsed as XML; only the direct children of
            that root are reported, and only their own ``text`` and
            ``tail`` contribute words.

    Yields:
        ``(tag, start, end)`` tuples such that ``text[start:end]`` spans
        the element's first word through its last word.  An element that
        contains no words yields a zero-width span placed at the end of
        the preceding word (or at 0 when there is none).

    Raises:
        AssertionError: when a word from the SGML does not appear at the
            expected position in ``text``; the argument is the
            ``(found, expected)`` pair.  Because this is a generator, the
            error (like any parse error) surfaces on iteration.
    """
    # Escape bare ampersands so the markup parses as XML.  The parser
    # turns each ``&amp;`` back into ``&``, so the extracted words still
    # match ``text`` character for character.
    sgml = sgml.replace('&', '&amp;')
    tree = etree.fromstring('<xml>%s</xml>' % sgml)

    # Flatten the tree into a word list and remember, per element, the
    # half-open range of word indices it covers.  ``text``/``tail`` are
    # None when absent, hence the ``or ''``.
    words = (tree.text or '').split()
    word_spans = []
    for elem in tree:
        elem_words = (elem.text or '').split()
        first = len(words)
        word_spans.append((first, first + len(elem_words), elem.tag))
        words.extend(elem_words)
        words.extend((elem.tail or '').split())

    # Locate every SGML word in the original text, skipping whatever
    # whitespace the original has between words.
    char_spans = []
    start = 0
    for word in words:
        # Slicing (rather than indexing) is safe at the end of the text:
        # the empty slice is not whitespace, so the loop stops.
        while text[start:start + 1].isspace():
            start += 1
        end = start + len(word)
        if text[start:end] != word:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError((text[start:end], word))
        char_spans.append((start, end))
        start = end

    # Convert each element's word range into a character range.
    for word_start, word_end, label in word_spans:
        if word_start == word_end:
            # Element without any words: report an empty span instead of
            # indexing char_spans with an invalid range.
            anchor = char_spans[word_start - 1][1] if word_start else 0
            yield label, anchor, anchor
            continue
        yield label, char_spans[word_start][0], char_spans[word_end - 1][1]
>>> text = '''TNF binding induces release of AIP1 (DAB2IP) from TNFR1, resulting in cytoplasmic translocation and concomitant formation of an intracellular signaling complex comprised of TRADD, RIP1, TRAF2, and AIPl.''' >>> sgml = '''<PROTEIN> TNF </PROTEIN> binding induces release of <PROTEIN> AIP1 </PROTEIN> ( <PROTEIN> DAB2IP </PROTEIN> ) from <PROTEIN> TNFR1 </PROTEIN> , resulting in cytoplasmic translocation and concomitant formation of an <PROTEIN> intracellular signaling complex </PROTEIN> comprised of <PROTEIN> TRADD </PROTEIN> , <PROTEIN> RIP1 </PROTEIN> , <PROTEIN> TRAF2 </PROTEIN> , and AIPl . ... ''' >>> list(align(text, sgml)) [('PROTEIN', 0, 3), ('PROTEIN', 31, 35), ('PROTEIN', 37, 43), ('PROTEIN', 50, 55), ('PROTEIN', 128, 159), ('PROTEIN', 173, 178), ('PROTEIN', 180, 184), ('PROTEIN', 186, 191)] STeVe -- http://mail.python.org/mailman/listinfo/python-list