#!/usr/bin/env python
# grep_for_QA.py
#
# Isolate the unique Q:: and A:: statements from my daily files.
#
# Note: this algorithm will fail if there are any blank lines within the
# Q-and-A area of interest (a paragraph).
# D. Beazley is my fav documentation
import glob
import io
import pprint as pp
import re

# NOTE(review): the line breaks inside this sample were lost when the script
# was pasted; the layout below (two paragraphs separated by one blank line)
# is a reconstruction -- confirm against the real daily files.
sampledata = '''
A:: And Straight Street is playin on the Radio Free Tibet.
What are the chances, DTMB?

Q:: About 1 in 518400, Professor.
A:: Correct! Err, I thought it was 1:410400, but <i>close enough for jazz!</i>
'''

pattern0 = re.compile("Q::")
pattern1 = re.compile("A::")  # objects of interest can start with A::, not always Q::

# Raw string: "\s" is an invalid escape in a normal string literal.  The
# pattern spans a line boundary, so it has to be applied to the whole text --
# it can never match when handed one line at a time.
END_OF_PARAGRAPH_pat = r"\n\s*\n"

path = "/Users/paultaney/dailies2012/0722"  # an example of real data set.


def read_text(filename):
    """Return the contents of *filename* as text.

    The file is decoded as UTF-8 (assumes the daily files are UTF-8, which
    also covers CJK text -- TODO confirm).  Bytes that cannot be decoded are
    replaced with U+FFFD rather than raising UnicodeDecodeError, so one bad
    paste does not abort the whole scan.
    """
    with io.open(filename, encoding="utf-8", errors="replace") as handle:
        return handle.read()


def extract_qa_paragraphs(text):
    """Return the Q::/A:: paragraphs found in *text*, in order of appearance.

    *text* is split into paragraphs wherever END_OF_PARAGRAPH_pat matches
    (one or more blank or whitespace-only lines).  A paragraph is kept only
    if some line in it contains "Q::" or "A::"; lines that precede the first
    such marker are dropped.  Each kept paragraph is returned as a tuple of
    its lines with surrounding whitespace stripped.  Tuples are used instead
    of lists so the result can be put into a set.

    The last paragraph is returned even when no blank line follows it.
    """
    paragraphs = []
    for chunk in re.split(END_OF_PARAGRAPH_pat, text):
        # Whitespace-only lines can survive at the very start/end of the
        # text; they carry no content, so drop them.
        lines = [ln.strip() for ln in chunk.splitlines() if ln.strip()]
        for start, ln in enumerate(lines):
            if pattern0.search(ln) or pattern1.search(ln):
                paragraphs.append(tuple(lines[start:]))
                break
    return paragraphs


def unique_paragraphs(paragraphs):
    """Return the distinct entries of *paragraphs* as a sorted list."""
    # sorted() returns the new list; a bare ``L.sort`` (no parentheses) only
    # names the method and sorts nothing.
    return sorted(set(paragraphs))


# To scan a real daily file instead of the sample:
#     L = unique_paragraphs(extract_qa_paragraphs(read_text(path)))
L = unique_paragraphs(extract_qa_paragraphs(sampledata))

if __name__ == "__main__":
    pp.pprint(L)

# july 23, 131001 hike! -- http://mail.python.org/mailman/listinfo/python-list