Hi,

Here is my first report. You can also find it on my GitLab [0].

Week 1 - 2014/05/25
-------------------

For this first week, I have written a test script that generates simple
datasets and produces an image containing the output of the MADlib
clustering algorithms. The script can be called like this:

    ./clustering_test.py new ds0 -n 8             # generate a dataset called "ds0" made of 8 clusters
    ./clustering_test.py query ds0 -o output.png  # write the clustering of ds0 to output.png

See ./clustering_test.py -h for all the available options. An example of
output can be found here [1].
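For context, the script is essentially plumbing around one MADlib call per
algorithm. Stripped down, that call looks like the sketch below (an
illustration only, assuming a local PostgreSQL with MADlib installed in the
"madlib" schema and the tmp_point(id, coords int[]) table already populated
by the script):

    import psycopg2

    # Connect with the same parameters the script uses.
    conn = psycopg2.connect(database='madlib', user='madlib',
                            host='localhost', port=5432)
    cur = conn.cursor()

    # Ask MADlib for 8 centroids using k-means++ seeding.
    cur.execute("SELECT * FROM madlib.kmeanspp('tmp_point', 'coords', 8);")
    row = cur.fetchone()
    centroids = row[0]  # remaining columns: objective_fn, frac_reassigned, num_iterations

The script then assigns each point to its nearest centroid and colors it
accordingly, which is what ends up in the generated image.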
Of course, I will keep improving this test script, as it is still far from
perfect; but for now, it does approximately what I want.

Next week, I'll start working on the implementation of k-medoids in MADlib.
As a reminder, according to the timeline I suggested for the project, this
step is due on May 30. Depending on the problems I run into (mostly my lack
of knowledge of the codebase, I guess), it might not be finished on time,
but it should be done a few days later (by the end of next week, hopefully).

Attached is the patch containing everything I have done this week, though
the git log might be more convenient to read.

Regards,
Maxence A.

[0] http://git.viod.eu/viod/gsoc_2014/blob/master/reports.rst
[1] http://git.viod.eu/viod/gsoc_2014/blob/master/clustering_test/example_dataset.png

--
Maxence Ahlouche
06 06 66 97 00
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..97de20e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+**/__pycache__/
+*.png
\ No newline at end of file
diff --git a/autogen_results.py b/autogen_results.py
deleted file mode 100755
index 033c309..0000000
--- a/autogen_results.py
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/python
-
-import os
-
-while(True):
-    os.system("./k-means_test.py --regen -o results/$(date | md5sum | cut -d ' ' -f 1).png")
diff --git a/clustering_test.py b/clustering_test.py
deleted file mode 100755
index 2afc0d1..0000000
--- a/clustering_test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import psycopg2 as pg
-import sys
-
-
-class DatabaseConnection():
-    db_name = 'madlib'
-    user = 'madlib'
-    host = 'localhost'
-    port = 5432
-    table_name = 'tmp_points'
-    field_name = 'coords'
-
-    def __init__(self):
-        self.conn = pg.connect(database=self.db_name, user=self.user, host=self.host, port=5432)
-        self.cur = self.conn.cursor()
-        self.cur.execute('DROP TABLE IF EXISTS %s CASCADE;' % self.table_name)
-        self.cur.execute('CREATE TABLE %s (id SERIAL PRIMARY KEY, coords INT[]);' % self.table_name)
-        self.conn.commit()
-
-    def __del__(self):
-        self.cur.close()
-        self.conn.close()
-
-
-def main(args):
-    parser = argparse.ArgumentParser(description='Visualize output of the clustering algorithms provided by MADlib, in PostgreSQL.')
-    parser.add_argument('-n', metavar='number of clusters', type=int)
-
-    dc = DatabaseConnection()
-
-if __name__ == '__main__':
-    main(sys.argv[1:])
diff --git a/clustering_test/autogen_results.py b/clustering_test/autogen_results.py
new file mode 100755
index 0000000..033c309
--- /dev/null
+++ b/clustering_test/autogen_results.py
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+
+import os
+
+while(True):
+    os.system("./k-means_test.py --regen -o results/$(date | md5sum | cut -d ' ' -f 1).png")
diff --git a/clustering_test/clustering_test.py b/clustering_test/clustering_test.py
new file mode 100755
index 0000000..248b5cf
--- /dev/null
+++ b/clustering_test/clustering_test.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+import argparse
+
+import database as db
+import dataset_generator as ds
+import visualiser as vs
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Visualize the output of the clustering algorithms provided '
+        'by MADlib, in PostgreSQL. Start by adding a dataset. A running '
+        'PostgreSQL server is required.')
+    subparsers = parser.add_subparsers(help='action to perform', dest='action')
+
+    parser_dataset = subparsers.add_parser('new', help='generate a dataset')
+    parser_dataset.add_argument(
+        'dataset_name',
+        help='the name of the dataset to create',
+    )
+    parser_dataset.add_argument(
+        '-n',
+        '--nb_clusters',
+        type=int,
+        help='the number of clusters composing the new dataset. Defaults to a '
+             'random value between 2 and 10.',
+    )
+    parser_dataset.add_argument(
+        '-d',
+        '--distribution',
+        choices=ds.gen_cluster.keys(),
+        default='uniform',
+        help='the distribution of the points in the clusters. Defaults to '
+             'uniform.',
+    )
+
+    parser_query = subparsers.add_parser('query', help='apply clustering algorithms to a dataset')
+    parser_query.add_argument(
+        'dataset_name',
+        help='the name of the dataset to query',
+    )
+    parser_query.add_argument(
+        '-n',
+        '--nb_clusters',
+        type=int,
+        help='the number of clusters in the dataset. '
+             'Defaults to the actual number of clusters of the dataset.',
+    )
+    parser_query.add_argument(
+        '-o',
+        '--output_file',
+        help='the file in which the output will be saved. Defaults to '
+             '<dataset_name>.png.',
+    )
+
+    args = parser.parse_args()
+
+    if args.action == 'new':
+        # nb_clusters may be None, in which case gen_dataset picks a random
+        # number of clusters
+        ds.gen_dataset(args.dataset_name, args.nb_clusters, args.distribution)
+    elif args.action == 'query':
+        vs.gen_image(args.dataset_name, args.output_file)
diff --git a/clustering_test/database.py b/clustering_test/database.py
new file mode 100644
index 0000000..c47256d
--- /dev/null
+++ b/clustering_test/database.py
@@ -0,0 +1,163 @@
+'''
+Create the tables in the database and query them.
+'''
+
+import psycopg2 as pg
+
+
+# database parameters
+db_name = 'madlib'
+user = 'madlib'
+host = 'localhost'
+port = 5432
+table_name = 'tmp_point'
+field_name = 'coords'
+
+# connection and cursor objects
+conn = None
+cur = None
+
+# available clustering algorithms
+clustering_algorithms = [
+    'kmeans_random',
+    'kmeanspp',
+]
+
+
+class DatabaseConnection():
+    '''
+    Decorator: open the connection on first use and commit after each call.
+    '''
+
+    def __init__(self, f):
+        self.f = f
+
+    def __call__(self, *k, **kw):
+        if conn is None:
+            self.connect()
+        result = self.f(*k, **kw)
+        conn.commit()
+        return result
+
+    def connect(self):
+        global conn, cur
+        conn = pg.connect(database=db_name, user=user, host=host, port=port)
+        cur = conn.cursor()
+
+        # create the model, if it doesn't exist (should only be executed once)
+        cur.execute('CREATE TABLE IF NOT EXISTS dataset ('
+                    'id serial PRIMARY KEY, '
+                    'name varchar(80) UNIQUE, '
+                    'nb_clusters int);')
+        cur.execute('CREATE TABLE IF NOT EXISTS point ('
+                    'id serial PRIMARY KEY,'
+                    'coords int[],'
+                    'dataset_id int REFERENCES dataset(id) ON DELETE CASCADE,'
+                    'cluster_id int);')
+
+        # create the temporary points table
+        cur.execute('CREATE TABLE IF NOT EXISTS %s ('
+                    'id int REFERENCES point(id) ON DELETE CASCADE, '
+                    '%s int[]);'
+                    % (table_name, field_name))
+
+
+@DatabaseConnection
+def insert_dataset(dataset_name, clusters):
+    # insert the dataset
+    cur.execute('INSERT INTO dataset (name, nb_clusters) '
+                'VALUES (%s, %s) '
+                'RETURNING id;',
+                [dataset_name, len(clusters)])
+
+    # get the dataset's id
+    dataset_id = cur.fetchone()[0]
+
+    # insert the dataset's points, one multi-row INSERT per cluster
+    for i, cluster in enumerate(clusters):
+        query = 'INSERT INTO point (coords, dataset_id, cluster_id) VALUES\n'
+        for j, point in enumerate(cluster):
+            query += "('{" + str(point[0]) + ', ' + str(point[1]) + "}', " + \
+                str(dataset_id) + ', ' + str(i) + ')'
+            query = query + ',\n' if j < len(cluster) - 1 else query + '\n'
+        query += ';'
+
+        cur.execute(query)
+
+
+@DatabaseConnection
+def load_dataset(dataset_name):
+    '''
+    Load the chosen dataset into the table_name table.
+    '''
+
+    cur.execute('DELETE FROM %s;' % table_name)
+    cur.execute("INSERT INTO %s (id, coords) "
+                "SELECT point.id, point.coords "
+                "FROM point "
+                "JOIN dataset ON point.dataset_id = dataset.id "
+                "WHERE dataset.name = '%s';"
+                % (table_name, dataset_name))
+
+
+@DatabaseConnection
+def get_nb_clusters(dataset):
+    cur.execute(
+        'SELECT nb_clusters '
+        'FROM dataset '
+        'WHERE name = %s;',
+        [dataset]
+    )
+
+    return cur.fetchone()[0]
+
+
+@DatabaseConnection
+def get_centroids(clustering_alg, nb_clusters):
+    '''
+    Apply the clustering algorithm to the dataset currently loaded in the
+    temporary table.
+    '''
+    cur.execute(
+        "SELECT * "
+        "FROM madlib.%s('%s', '%s', %s);"
+        % (clustering_alg, table_name, field_name, nb_clusters)
+    )
+    return cur.fetchone()
+
+
+@DatabaseConnection
+def get_points(dataset=None):
+    '''
+    Get all the points of a specific dataset. If the dataset is not
+    provided, return the points in the temp table instead.
+    '''
+
+    if dataset is None:
+        # the temporary table has no cluster_id column, so join back to point
+        cur.execute(
+            'SELECT tp.id, tp.%s, p.cluster_id '
+            'FROM %s tp '
+            'JOIN point p ON p.id = tp.id;'
+            % (field_name, table_name)
+        )
+
+        return cur.fetchall()
+
+    else:
+        cur.execute(
+            'SELECT id, coords, cluster_id '
+            'FROM point '
+            'WHERE dataset_id IN (SELECT id FROM dataset WHERE name = %s LIMIT 1);',
+            [dataset]
+        )
+
+        return cur.fetchall()
+
+
+@DatabaseConnection
+def get_current_dataset():
+    cur.execute(
+        'SELECT ds.id, ds.name '
+        'FROM dataset ds '
+        'JOIN point p ON ds.id = p.dataset_id '
+        'JOIN %s tp ON p.id = tp.id '
+        'LIMIT 1;'
+        % table_name
+    )
+
+    return cur.fetchone()
diff --git a/clustering_test/dataset_generator.py b/clustering_test/dataset_generator.py
new file mode 100644
index 0000000..0248f4a
--- /dev/null
+++ b/clustering_test/dataset_generator.py
@@ -0,0 +1,51 @@
+'''
+Generate small toy datasets to test clustering algorithms.
+'''
+
+import random
+
+import database as db
+
+
+max_x = 300
+max_y = 300
+
+
+def gen_uniform_cluster(
+    nb_points=None,
+    lower_x=None,
+    upper_x=None,
+    lower_y=None,
+    upper_y=None,
+):
+    '''
+    Generate a cluster of points uniformly distributed in a rectangle.
+    Unspecified bounds are chosen at random.
+    '''
+
+    if lower_x is None:
+        lower_x = random.randint(0, max_x - 1)
+    if upper_x is None:
+        upper_x = random.randint(lower_x, max_x - 1)
+    if lower_y is None:
+        lower_y = random.randint(0, max_y - 1)
+    if upper_y is None:
+        upper_y = random.randint(lower_y, max_y - 1)
+    if nb_points is None:
+        nb_points = random.randint(100, 1000)
+
+    cluster = []
+    for i in range(nb_points):
+        cluster.append((random.randint(lower_x, upper_x),
+                        random.randint(lower_y, upper_y)))
+
+    return cluster
+
+
+def gen_dataset(dataset_name, nb_clusters=None, distribution='uniform'):
+    # draw the random number of clusters at call time: a randint() used as a
+    # default argument would be evaluated only once, at import time
+    if nb_clusters is None:
+        nb_clusters = random.randint(2, 10)
+
+    clusters = []
+    for i in range(nb_clusters):
+        clusters.append(gen_cluster[distribution]())
+
+    db.insert_dataset(dataset_name, clusters)
+
+
+# map each distribution name to its cluster generator
+gen_cluster = {
+    'uniform': gen_uniform_cluster,
+}
diff --git a/clustering_test/requirements.txt b/clustering_test/requirements.txt
new file mode 100644
index 0000000..615e61e
--- /dev/null
+++ b/clustering_test/requirements.txt
@@ -0,0 +1,2 @@
+Pillow==2.4.0
+psycopg2==2.5.3
diff --git a/clustering_test/visualiser.py b/clustering_test/visualiser.py
new file mode 100644
index 0000000..fc69a65
--- /dev/null
+++ b/clustering_test/visualiser.py
@@ -0,0 +1,139 @@
+import math
+from PIL import Image, ImageDraw
+
+import database as db
+import dataset_generator as dsg
+
+
+colors = [
+    (255, 0, 0),    # red
+    (0, 255, 0),    # green
+    (0, 0, 255),    # blue
+    (255, 255, 0),  # yellow
+    (0, 255, 255),  # cyan
+    (255, 0, 255),  # pink
+    (96, 0, 0),     # dark_red
+    (0, 96, 0),     # dark_green
+    (0, 0, 96),     # dark_blue
+    (96, 96, 96),   # grey
+    (0, 0, 0),      # black
+]
+
+
+def distance(p1, p2):
+    '''
+    Compute the distance between p1 and p2.
+    '''
+
+    return math.sqrt(math.pow(p2[0] - p1[0], 2) + math.pow(p2[1] - p1[1], 2))
+
+
+def nearest_centroid(point, centroids):
+    '''
+    Assign a point to its nearest centroid.
+    Returns the index of the nearest centroid.
+    '''
+
+    nearest = 0
+    min_dist = distance(point[1], centroids[nearest])
+
+    for i, centroid in enumerate(centroids):
+        dist = distance(point[1], centroid)
+        if dist < min_dist:
+            min_dist = dist
+            nearest = i
+
+    return nearest
+
+
+def cluster(clustering_alg, dataset=None):
+    '''
+    Return the result of the clustering algorithm applied to dataset.
+    Returns the list of centroids, and a list of (point, cluster_id) pairs.
+
+    dataset defaults to the last one used.
+    '''
+
+    # if no dataset specified, keep the current one, else update the temp table
+    if dataset is not None:
+        db.load_dataset(dataset)
+    else:
+        dataset = db.get_current_dataset()[1]
+
+    nb_clusters = db.get_nb_clusters(dataset)
+
+    # get the centroids and the points
+    centroids = db.get_centroids(clustering_alg, nb_clusters)[0]
+    points = db.get_points(dataset)
+
+    assigned_points = []
+    for point in points:
+        assigned_points.append((point, nearest_centroid(point, centroids)))
+
+    return centroids, assigned_points
+
+
+def gen_image(dataset=None, output_file=None):
+    '''
+    Write the output of the clustering algorithms in output_file.
+    If output_file is not provided, defaults to <dataset>.png.
+    '''
+
+    def draw_centroid(bitmap, centroid, color):
+        # draw a black square...
+        for i in range(max(int(centroid[0]) - 3, 0), min(int(centroid[0]) + 4, dsg.max_x)):
+            for j in range(max(int(centroid[1]) - 3, 0), min(int(centroid[1]) + 4, dsg.max_y)):
+                bitmap[i * dsg.max_x + j] = colors[10]  # black
+
+        # ...and fill it with the cluster's color
+        for i in range(max(int(centroid[0]) - 2, 0), min(int(centroid[0]) + 3, dsg.max_x)):
+            for j in range(max(int(centroid[1]) - 2, 0), min(int(centroid[1]) + 3, dsg.max_y)):
+                bitmap[i * dsg.max_x + j] = color
+
+    def draw_point(bitmap, point, color):
+        # the bitmap is indexed as [x * max_x + y]; x and y are swapped, which
+        # is harmless as long as max_x == max_y
+        bitmap[point[0] * dsg.max_x + point[1]] = color
+
+    def draw_text(img, text):
+        draw = ImageDraw.Draw(img)
+        draw.text((10, 10), text, fill=colors[10])  # black
+
+    # if no dataset specified, get the current dataset
+    if dataset is None:
+        dataset = db.get_current_dataset()[1]
+
+    # if no output_file specified, name it after the dataset
+    if output_file is None:
+        output_file = dataset + '.png'
+
+    # draw the original clustering
+    img = Image.new("RGB", (dsg.max_x, dsg.max_y))
+    bitmap = [(255, 255, 255)] * dsg.max_x * dsg.max_y
+
+    points = db.get_points(dataset)
+    for point in points:
+        draw_point(bitmap, point[1], colors[point[2]])
+
+    img.putdata(bitmap)
+    draw_text(img, 'Original clustering')
+
+    result_img = Image.new('RGB', (dsg.max_x, dsg.max_y * (len(db.clustering_algorithms) + 1)))
+    result_img.paste(img, (0, 0))
+
+    # draw the output of the clustering algorithms, one panel per algorithm
+    for i, clustering_alg in enumerate(db.clustering_algorithms):
+        centroids, points = cluster(clustering_alg, dataset)
+        bitmap = [(255, 255, 255)] * dsg.max_x * dsg.max_y
+
+        for point in points:
+            draw_point(bitmap, point[0][1], colors[point[1]])
+
+        for j, centroid in enumerate(centroids):
+            draw_centroid(bitmap, centroid, colors[j])
+
+        img = Image.new("RGB", (dsg.max_x, dsg.max_y))
+        img.putdata(bitmap)
+        draw_text(img, clustering_alg)
+
+        result_img.paste(img, (0, (i + 1) * dsg.max_y))
+
+    result_img.save(output_file)
diff --git a/k-means_test.py b/k-means_test.py
deleted file mode 100755
index 7d46de7..0000000
--- a/k-means_test.py
+++ /dev/null
@@ -1,350 +0,0 @@
-#!/usr/bin/python
-
-import postgresql
-import random
-import sys
-import getopt
-import math
-import pickle
-import time
-from PIL import Image, ImageDraw
-
-# db informations
-db_name = "madlib"
= "madlib" -db_user = "viod" -db_server = "localhost" -db_port = 5432 -db_table_name = "k_means_test" -db_field_name = "coord" -db = postgresql.open("pq://" + db_user + "@" + db_server + ":" + str(db_port) + "/" + db_name) - -# dataset informations -ds_max_groups = 10 -ds_max_x = 300 -ds_max_y = 300 -group_max_elts = 1000 -group_max_width = 100 -group_max_height = 100 - -default_output_file = "clustered_data.png" -data_file = "clusters.dat" - -colors = [ - (255, 0, 0), # red - (0, 255, 0), # green - (0, 0, 255), # blue - (255, 255, 0), # yellow - (0, 255, 255), # cyan - (255, 0, 255), # pink - (96, 0, 0), # dark_red - (0, 96, 0), # dark_green - (0, 0, 96), # dark_blue - (96, 96, 96), # grey - (0, 0, 0) # black - ] - -def create_test_table(): - """ - Create or replace the data table - """ - try: - db.execute("DROP TABLE IF EXISTS " + db_table_name + " CASCADE;") - except UndefinedTableError: - pass - db.execute("CREATE TABLE " + db_table_name + " (" + - "id SERIAL PRIMARY KEY, " + - db_field_name + " int[]" + - ");") - -def gaussian_random(lower_bound, upper_bound): - """ - Generate a random number between lower_bound and upper_bound, assuming a gaussian repartition - """ - mean = (upper_bound + lower_bound) / 2 - variance = (upper_bound - lower_bound) / 4 - x = random.gauss(mean, variance) - while(x < lower_bound or x > upper_bound): - x = random.gauss(mean, variance) - return int(x) - -def insert_random_data(nb_groups): - """ - Populate the table with groups of points chosen randomly - """ - clusters = [] - - # for each group - for i in range(nb_groups): - width = random.randint(1, group_max_width) - height = random.randint(1, group_max_height) - nb_elts = random.randint(1, group_max_elts) - min_x = random.randint(1, ds_max_x - width) - min_y = random.randint(1, ds_max_y - height) - clusters.append( ((min_x + width/2, min_y + height/2), []) ) - - # points generation - for j in range(nb_elts): - x = gaussian_random(min_x, min_x + width) - y = gaussian_random(min_y, min_y + height) - clusters[i][1].append((x,y)) - db.execute("INSERT INTO " + db_table_name + " (" + db_field_name + ") VALUES (" + - "'{" + str(x) + "," + str(y) + "}');") - - # save clusters informations in a file - data_dump = open(data_file, "wb") - pickle.dump(nb_groups, data_dump) - pickle.dump(clusters, data_dump) - data_dump.close() - return clusters - -def get_points(): - """ - Get back the points previously generated - """ - c = db.prepare("SELECT " + db_field_name + " FROM " + db_table_name + ";").declare() - points = [] - for p in c: - points.append(list(p[0])) - return points - -def apply_clustering_kmeans(nb_groups): - """ - Call to MADlib's k-means clustering function - """ - c = db.prepare("SELECT * FROM madlib.kmeans_random('" + db_table_name + "', '" + - db_field_name + "', " + str(nb_groups) + ");").declare() - result = c.read()[0] - centroids = result[0] - #objective_fn = result[1] - #frac_reassigned = result[2] - #num_iterations = result[3] - - # init clusters - clusters = [] - for c in centroids: - clusters.append((c, [])) - - # assign each point to its cluster - points = get_points() - for p in points: - # compute distances - distances = [] - for c in centroids: - distances.append(math.pow(c[0] - p[0], 2) + math.pow(c[1] - p[1], 2)) - # get the indice of the nearest centroid - nearest = 0 - for i in range(1, len(distances)): - if(distances[i] < distances[nearest]): - nearest = i - clusters[nearest][1].append(p) - - return clusters - -def apply_clustering_kmeanspp(nb_groups): - """ - Call to MADlib's k-means 
clustering function - """ - c = db.prepare("SELECT * FROM madlib.kmeanspp('" + db_table_name + "', '" + - db_field_name + "', " + str(nb_groups) + ");").declare() - result = c.read()[0] - centroids = result[0] - #objective_fn = result[1] - #frac_reassigned = result[2] - #num_iterations = result[3] - - # init clusters - clusters = [] - for c in centroids: - clusters.append((c, [])) - - # assign each point to its cluster - points = get_points() - for p in points: - # compute distances - distances = [] - for c in centroids: - distances.append(math.pow(c[0] - p[0], 2) + math.pow(c[1] - p[1], 2)) - # get the indice of the nearest centroid - nearest = 0 - for i in range(1, len(distances)): - if(distances[i] < distances[nearest]): - nearest = i - clusters[nearest][1].append(p) - - return clusters - -def export_to_png(clusters): - """ - Visualize the result in a PNG file - """ - def display_centroid(bitmap, x, y, color): - """ - Display a big colored square to represent a centroid - """ - # Draw a black square - - # vertical lines - for i in max(0, int(x)-3), min(ds_max_x, int(x)+3): - for j in range(max(0,int(y)-3),min(ds_max_y,int(y)+4)): - bitmap[j * ds_max_x + i] = colors[10] # black - # horizontal lines - for i in range(max(0,int(x)-3), min(ds_max_x,int(x)+4)): - for j in max(0,int(y)-3), min(ds_max_y, int(y)+3): - bitmap[j * ds_max_x + i] = colors[10] # black - - # Fill this square with the color - for i in range(max(0, int(y)-2), min(ds_max_y, int(y)+3)): - for j in range(max(0, int(x)-2), min(ds_max_x, int(x)+3)): - bitmap[i * ds_max_x + j] = color - - bitmap = [(255,255,255)] * ds_max_x * ds_max_y - - i = 0 - for centroid, points in clusters: - # display points - for p in points: - bitmap[p[1] * ds_max_x + p[0]] = colors[i] - # display centroid - display_centroid(bitmap, centroid[0], centroid[1], colors[i]) - i += 1 - - img = Image.new("RGB", (ds_max_x, ds_max_y)) - img.putdata(bitmap) - return img - -def parse_args(argv): - """ - Interpret the command line - """ - try: - opts, args = getopt.getopt(argv, "ho:rn:", - ["regen", "help", "output-file=", "nb-groups="]) - except getopt.GetOptError: - usage() - sys.exit(2) - - regen = False - nb_groups = 0 - output_file = default_output_file - for opt, arg in opts: - if opt in ("-h", "--help"): - usage() - sys.exit(0) - elif opt in ("-o", "--output-file"): - output_file = arg - elif opt in ("-r", "--regen"): - regen = True - elif opt in ("-n", "--nb-groups"): - nb_groups = arg - - return regen, nb_groups, output_file - -def generate_output(output_file, clusters_set): - """ - Display all the clustering results on a single image - """ - def add_title(img, title): - draw = ImageDraw.Draw(img) - draw.text((10, 10), description, fill=colors[10]) # black - - result_img = Image.new("RGB", (ds_max_x * len(clusters_set), ds_max_y)) - - i = 0 - for clusters, description in clusters_set: - tmp_img = export_to_png(clusters) - add_title(tmp_img, description) - result_img.paste(tmp_img, (i * (ds_max_x + 1), 0)) - i += 1 - result_img.save(output_file) - -def print_line(line): - """ - Same as print, but allows to rewrite at the end of the line - """ - print(line, end = "") - sys.stdout.flush() - -def count_points(clusters): - """ - Counts the points in a cluster set - """ - nb_points = 0 - for c in clusters: - nb_points += len(c[1]) - return nb_points - -def usage(): - print(""" -Usage: - ./k-means_test.py -o output_file.png -n 4 -r - -Options: - -o, --output-file output_file.png: - The resulting PNG image. - -r, --regen: - Generate new points. 
You should use it at your first run. - -n, --nb-groups n: - Generate n groups of points. If not generating points, classify in n - clusters. - -h, --help: - Display this help message. -""") - -def main(args): - regen, nb_groups, output_file = parse_args(args) - - if(regen): - nb_groups = random.randint(2, ds_max_groups) - print("Creating test table...") - create_test_table() - print_line("Generating random data... ") - start = time.time() - original_clusters = (insert_random_data(nb_groups), "Original clustering") - finish = time.time() - - # nb_points = 0 - # for cluster in original_clusters[0]: - # nb_points += len(cluster[1]) - print("Generated " + str(count_points(original_clusters[0])) + " points partitioned into " + - str(len(original_clusters[0])) + " clusters in " + - str(finish - start)[:6] + " seconds.") - else: - try: - print_line("Loading data from " + data_file + "... ") - start = time.time() - data_dump = open(data_file, "rb") - nb_groups = pickle.load(data_dump) - original_clusters = (pickle.load(data_dump), "Original clustering") - data_dump.close - finish = time.time() - - print("Data loaded in " + str(finish - start)[:5] + " seconds.") - except FileNotFoundError: - print("Cannot load data, you need to generate some data first. Use --regen argument.") - exit(3) - - # k-means clustering - print_line("Clustering data using k-means algorithm... ") - start = time.time() - kmeans_clusters = (apply_clustering_kmeans(nb_groups), "K-means clustering") - finish = time.time() - print("Data clustered in " + str(finish - start)[:5] + " seconds.") - - # k-means++ clustering - print_line("Clustering data using k-means++ algorithm... ") - start = time.time() - kmeanspp_clusters = (apply_clustering_kmeanspp(nb_groups), "K-means++ clustering") - finish = time.time() - print("Data clustered in " + str(finish - start)[:5] + " seconds.") - - # output generation - print_line("Exporting to " + output_file + "...") - start = time.time() - generate_output(output_file, [ original_clusters, kmeans_clusters, kmeanspp_clusters]) - finish = time.time() - print("File generated in " + str(finish - start)[:5] + " seconds.") - - print("Done.") - -if(__name__ == "__main__"): - main(sys.argv[1:])