Fredrik Lundh wrote: > > what does the word "validate" mean here? > Let me explain our module. We receive text files (with comma-separated values, as per some predefined format) from a third party. For example, an account file comes as "abc.acc" {.acc is the extension for an account file, as per our code}; it must contain account_code, account_description, and account_balance, in that order.
So, from the text file("abc.acc") we receive for 2 or more records, will look like A001, test account1, 100000 A002, test account2, 500000 We may have multiple .acc files Our job is to validate the incoming data on the basis of its datatype, field number, etc and copy all the error free records in acc.txt for this, we use a schema as follows ---------------------------------------------------------------------------------------------------------- if account_flg == 1: start = time() # the input fields acct_schema = { 0: Text('AccountCode', 50), 1: Text('AccountDescription', 100), 2: Text('AccountBalance', 50) } validate( schema = acct_schema, primary_keys = [acct_pk], infile = '../data/ACC/*.acc', outfile = '../data/acc.txt', update_freq = 10000) ---------------------------------------------------------------------------------------------------------- In a core.py, we have defined a function validate, which checks for the datatypes & other validations. All the erroneous records are copied in a error log file, and the correct records are copied to a clean acc.text file The validate function is as given below... --------------------------------------------------------------------------------------------------------------------------- def validate(infile, outfile, schema, primary_keys=[], foreign_keys=[], record_checks=[], buffer_size=0, update_freq=0): show("intitalizing ... 
") # find matching input files all_files = glob.glob(infile) if not all_files: raise ValueError('No input files were found.') # initialize data structures freq = update_freq or DEFAULT_UPDATE input = fileinput.FileInput(all_files, bufsize = buffer_size or DEFAULT_BUFFER) output = open(outfile, 'wb+') logs = {} for name in all_files: logs[name] = open(name + DEFAULT_SUFFIX, 'wb+') #logs[name] = open(name + DEFAULT_SUFFIX, 'a+') errors = [] num_fields = len(schema) pk_length = range(len(primary_keys)) fk_length = range(len(foreign_keys)) rc_length = range(len(record_checks)) # initialize the PKs and FKs with the given schema for idx in primary_keys: idx.setup(schema) for idx in foreign_keys: idx.setup(schema) # start processing: collect all lines which have errors for line in input: rec_num = input.lineno() if rec_num % freq == 0: show("processed %d records ... " % (rec_num)) for idx in primary_keys: idx.flush() for idx in foreign_keys: idx.flush() if BLANK_LINE.match(line): continue try: data = csv.parse(line) # check number of fields if len(data) != num_fields: errors.append( (rec_num, LINE_ERROR, 'incorrect number of fields') ) continue # check for well-formed fields fields_ok = True for i in range(num_fields): if not schema[i].validate(data[i]): errors.append( (rec_num, FIELD_ERROR, i) ) fields_ok = False break # check the PKs for i in pk_length: if fields_ok and not primary_keys[i].valid(rec_num, data): errors.append( (rec_num, PK_ERROR, i) ) break # check the FKs for i in fk_length: if fields_ok and not foreign_keys[i].valid(rec_num, data): #print 'here ---> %s, rec_num : %d'%(data,rec_num) errors.append( (rec_num, FK_ERROR, i) ) break # perform record-level checks for i in rc_length: if fields_ok and not record_checks[i](schema, data): errors.append( (rec_num, REC_ERROR, i) ) break except fastcsv.Error, err: errors.append( (rec_num, LINE_ERROR, err.__str__()) ) # finalize the indexes to check for any more errors for i in pk_length: error_list = 
primary_keys[i].finalize() primary_keys[i].save() if error_list: errors.extend( [ (rec_num, PK_ERROR, i) for rec_num in error_list ] ) for i in fk_length: error_list = foreign_keys[i].finalize() if error_list: errors.extend( [ (rec_num, FK_ERROR, i) for rec_num in error_list ] ) # sort the list of errors by the cumulative line number errors.sort( lambda l, r: cmp(l[0], r[0]) ) show("saving output ... ") # reopen input and sort it into either the output file or error log file input = fileinput.FileInput(all_files, bufsize = buffer_size or DEFAULT_BUFFER) error_list = iter(errors) count = input.lineno filename = input.filename line_no = input.filelineno try: line_num, reason, i = error_list.next() except StopIteration: line_num = -1 for line in input: line = line + '\r\n' #print '%d,%d'%(line_num,count()) if line_num == count(): if reason == FIELD_ERROR: logs[filename()].write(ERROR_FORMAT % (line_no(), INVALID_FIELD % (schema[i].name), line)) elif reason == LINE_ERROR: logs[filename()].write(ERROR_FORMAT % (line_no(), i, line)) elif reason == PK_ERROR: logs[filename()].write(ERROR_FORMAT % (line_no(), INVALID_PK % (primary_keys[i].name), line)) elif reason == FK_ERROR: #print 'Test FK %s, rec_num : %d, line : %s'%(foreign_keys[i].name,line_no(),line) logs[filename()].write(ERROR_FORMAT % (line_no(), INVALID_FK % (foreign_keys[i].name), line)) elif reason == REC_ERROR: logs[filename()].write(ERROR_FORMAT % (line_no(), INVALID_REC % (record_checks[i].__doc__), line)) else: raise RuntimeError("shouldn't reach here") try: #print 'CURRENT ITERATION, line_num : %d, line : %s'%(line_num,line) line_num1 = line_num line_num, reason, i = error_list.next() if line_num1 == line_num : line_num, reason, i = error_list.next() #print 'FOR NEXT ITERATION, line_num : %d, line : %s'%(line_num,line) except StopIteration: line_num = -1 continue if not BLANK_LINE.match(line): output.write(line) output.close() for f in logs.values(): f.close() 
----------------------------------------------------------------------------------------------------------------------------- Now, when I open the error log file, it contains the error message for each erroneous record, along with the original record copied from the *.acc file. This record is preceded by a box-like character. Do you want me to post the complete code, just in case? It might help — you might then understand my problem better. Please let me know soon. -- http://mail.python.org/mailman/listinfo/python-list