Hi all, After sludging my way through many obstacles with this interesting puzzle of a text parsing program, I found myself with one final error:
Traceback (most recent call last): File "C:\Python24\Lib\site-packages\pythonwin\pywin\framework \scriptutils.py", line 310, in RunScript exec codeObject in __main__.__dict__ File "C:\Documents and Settings\Patrick Waldo\My Documents\Python \WORD\try5-2-file-1-all patterns.py", line 77, in ? input = codecs.open(input_text, 'r','utf8') File "C:\Python24\lib\codecs.py", line 666, in open file = __builtin__.open(filename, mode, buffering) IOError: [Errno 13] Permission denied: 'C:\\text_samples\\test\ \output' The error doesn't stop the program from functioning as it should, except the last line of every document gets split with | in between the words, which is just strange. I have no idea why either is happening, but perhaps they are related. Any ideas? #For text files in a directory... #Analyzes a randomly organized UTF8 document with EINECS, CAS, Chemical, and Chemical Formula #into a document structured as EINECS|CAS|Chemical|Chemical Formula. import os import codecs import re path = "C:\\text_samples\\test\\" path2 = "C:\\text_samples\\test\\output\\" EINECS = re.compile(r'^\d\d\d-\d\d\d-\d$') FORMULA = re.compile(r'([A-Z][a-zA-Z0-9]*\.?[A-Za-z0-9]*/?[A-Za- z0-9]*)') FALSE_POS = re.compile(r'^[A-Z][a-z]{4,40}\)?\.?') FALSE_POS1 = re.compile(r'C\.I\..*') FALSE_POS2 = re.compile(r'vit.*') FALSE_NEG = re.compile(r'C\d+\.') def iter_elements(tokens): product = [] for tok in tokens: if EINECS.match(tok) and len(product) >= 3: match = re.match(FORMULA,product[-1]) match_false_pos = re.match(FALSE_POS,product[-1]) match_false_pos1 = re.match(FALSE_POS1,product[-1]) match_false_pos2 = re.match(FALSE_POS2,product[2]) match_false_neg = re.match(FALSE_NEG,product[-1]) if match_false_neg: product[2:-1] = [' '.join(product[2:])] del product[-1] yield product product = [] elif match_false_pos: product[2:-1] = [' '.join(product[2:])] del product[-1] yield product product = [] elif match: product[2:-1] = [' '.join(product[2:-1])] yield product product = [] elif match_false_pos1 or match_false_pos2: product[2:-1] = [' '.join(product[2:])] del product[-1] yield product product = [] else: product[2:-1] = [' '.join(product[2:])] del product[-1] yield product product = [] product.append(tok) yield product for text in os.listdir(path): input_text = os.path.join(path,text) output_text = os.path.join(path2,text) input = codecs.open(input_text, 'r','utf8') output = codecs.open(output_text, 'w', 'utf8') tokens = input.read().split() for element in iter_elements(tokens): output.write('|'.join(element)) output.write("\r\n") input.close() output.close() -- http://mail.python.org/mailman/listinfo/python-list