Gabriel Genellina wrote: > the idea would be as follows: > > - For each span generate two tuples: (start_offset, 1, end_offset, > element) and (end_offset, 0, -start_offset, element). If start==end use > (start_offset, -1, start_offset, element). > - Collect all tuples in a list, and sort them. The tuple is made so when > at a given offset there is an element that ends and another that starts, > the ending element comes first (because it has a 0). For all the > elements that end at a given point, the shortest comes first. > - Initialize an empty list (will be used as a stack of containers), and > create a root Element as your "current container" (CC), the variable > "last_used" will keep the last position used on the text. > - For each tuple in the sorted list: > - if the second item is a 1, an element is starting. Insert it into > the CC element, push the CC onto the stack, and set the new element as > the new CC. The element data is text[last_used:start_offset], and move > last_used to start_offset. > - if the second item is a 0, an element is ending. Discard the CC, pop > an element from the stack as the new CC. The element data is > text[last_used:end_offset], move last_used up to end_offset. > - if the second item is a -1, it's an element with no content. Insert > it into the CC element.
Thanks a lot! This put me on the right track (though the devil's definitely in the details). It's working now:: >>> tree = xmltools.text_and_spans_to_etree('aaa aaa aaaccc cccaaa', [ ... (etree.Element('a'), 0, 21), ... (etree.Element('b'), 11, 11), ... (etree.Element('c'), 11, 18), ... ]) >>> etree.tostring(tree) '<a>aaa aaa aaa<b /><c>ccc ccc</c>aaa</a>' >>> tree = xmltools.text_and_spans_to_etree('bbb\naaaccc\ncccaaa', [ ... (etree.Element('a'), 0, 17), ... (etree.Element('b'), 0, 4), ... (etree.Element('c'), 7, 14), ... (etree.Element('b'), 14, 14), ... ]) >>> etree.tostring(tree) '<a><b>bbb\n</b>aaa<c>ccc\nccc</c><b />aaa</a>' >>> tree = xmltools.text_and_spans_to_etree('abc', [ ... (etree.Element('a'), 0, 3), ... (etree.Element('b'), 0, 3), ... (etree.Element('c'), 0, 3), ... ]) >>> etree.tostring(tree) '<a><b><c>abc</c></b></a>' And for the sake of any poor soul who runs into a similar problem, here's the code I wrote using Gabriel's hints above:: def text_and_spans_to_etree(text, spans): # Create a list of element starts and ends, sorting them in the # order they would appear in an XML version of the text. So for # example, with XML text like: # <a> a <b> b </b> a </a> # we will see: # starting <a> # starting <b> # ending </b> # ending </a> empty = -1 starting = 0 ending = 1 tuples = [] root_elem = None for i, (elem, start, end) in enumerate(spans): # validate spans if start < 0 or end > len(text): msg = 'spans must be in the range 0-%i' raise ValueError(msg % len(text)) # save the first element that spans the entire text as the root elif root_elem is None and start == 0 and end == len(text): root_elem = elem # insert a single tuple for empty elements elif start == end: tuples.append((start, empty, -end, i, elem)) # insert starts and ends for all other elements else: tuples.append((start, starting, -end, i, elem)) tuples.append((start, ending, -end, -i, elem)) tuples.sort() # make sure we found a root element if root_elem is None: raise ValueError('one element must span the entire text') # iterate through the element starts and ends, # updating element texts, tails and children last_offset = 0 last_elem = root_elem last_type = starting stack = [root_elem] for start, offset_type, neg_end, _, elem in tuples: # start of an element: # add it as a child and add it to the stack # next text chunk goes up to the start offset if offset_type is starting: stack[-1].append(elem) stack.append(elem) offset = start # end of an element: # pop if off the stack # next text chunk goes up to the end offset elif offset_type is ending: if elem is not stack[-1]: print elem, stack[-1] assert False assert elem is stack.pop() offset = -neg_end # empty element: # add it as a child # next text chunk goes up to the start offset elif offset_type is empty: stack[-1].append(elem) offset = start # should never get here else: assert False # calculate the next text chunk, and then determine what to do # with it based on what we did the last time around: # * started an element -> set its .text # * ended an element (or inserted an empty) -> set its .tail last_text = text[last_offset:offset] if last_type is starting: last_elem.text = last_text elif last_type is ending: last_elem.tail = last_text elif last_type is empty: last_elem.tail = last_text else: assert False # save what we did this time for inspection next time last_offset = offset last_type = offset_type last_elem = elem # add any final text before the close of the root element last_elem.tail = text[last_offset:] return root_elem Thanks again, STeVe -- http://mail.python.org/mailman/listinfo/python-list