prakash jp wrote: > Could anyone help to get max and min values from a specified column of a > csv file. The csv file is large and hence the below code did go bad. > Alternate methods would be highly appreciated. > > minimum.py: > import csv > filename = "testLog_4.csv" > f = open(filename) > def col_min(mincname): > with open(filename, 'rb') as f: > reader = csv.reader(f, delimiter=",") > #print reader > #print mincname > col_index = next(reader).index(mincname) > #print col_index > least = min(rec[col_index] for rec in reader) > print least > > col_min(str("Server Latency")) > col_min(str("Client Latency")) > f.close()
I don't see a way to make this much faster as it is probably already i/o- bound. You can try and calculate all four extrema at once, so you have to iterate over the file only once: $ cat csv_minmax_chunked.py #!/usr/bin/env python import csv import locale from itertools import islice def make_key(index, type): def key(row): return type(row[index]) return key class Aggregator(object): def __init__(self, name, index, type, key=None): self.name = name self.index = index self.type = type or float self.getvalue = make_key(index, type) self.key = {"key": key} if key is not None else {} def __repr__(self): return "{0}(name={1}, index={2}, type={3}".format( type(self).__name__, self.name, self.index, self.type) def __str__(self): try: return "%s: %s...%s" % (self.name, self.min, self.max) except AttributeError: return "%s: (no values)" % self.name def update(self, rows): values = map(self.getvalue, rows) chunkmin = min(values, **self.key) chunkmax = max(values, **self.key) try: curmin = self.min curmax = self.max except AttributeError: self.min = chunkmin self.max = chunkmax else: self.min = min(curmin, chunkmin, **self.key) self.max = max(curmax, chunkmax, **self.key) def identity(s): return s convert = { "float": (float,), "int": (int,), "text": (lambda s: s, locale.strxfrm), "": (identity,) } def main(): locale.setlocale(locale.LC_ALL, "") import argparse parser = argparse.ArgumentParser() parser.add_argument("csvfile") parser.add_argument("columns", nargs="+") parser.add_argument("--chunksize", "-s", type=int, default=2**12) args = parser.parse_args() ntpairs = (column.partition(":")[::2] for column in args.columns) with open(args.csvfile, "rb") as instream: rows = csv.reader(instream) header = dict((name, index) for index, name in enumerate(next(rows))) aggregators = [ Aggregator( "{0}({1})".format(name, type or "str"), header[name], *convert[type]) for name, type in ntpairs] chunksize = args.chunksize while True: chunk = list(islice(rows, chunksize)) if not chunk: 
break for agg in aggregators: agg.update(chunk) for agg in aggregators: print agg if __name__ == "__main__": main() $ head sample.csv alpha,beta,gamma,delta 69,-454,460960,ELIJAH 92,796,278885,SUFFICIENCY 42,895,934523,PIONEERED 88,-213,360633,reassert 63,-518,327010,mcmahon 84,604,540764,deprecatory 83,117,621832,VEGETARIANISM'S 61,411,844243,tan 29,-147,982043,plushest $ ./csv_minmax_chunked.py sample.csv alpha:int beta:int beta gamma:float delta delta:text alpha(int): 0...99 beta(int): -1000...999 beta(str): -1...999 gamma(float): 0.0...999997.0 delta(str): AALIYAH...études delta(text): a...ZYUGANOV'S -- http://mail.python.org/mailman/listinfo/python-list