Hi! I have Python code for building an ID3 decision tree, and it already runs. However, the output decision tree is only text-based, so I decided to use web2py to give it a more pleasant look. My problem now is how to integrate this code into web2py. Where do I put the code, especially the entropy and information gain computations that run after I upload my dataset? And how do I fit my OOP code into web2py's MVC structure?
By the way, I have already created the GUI for this application, but I still lack a UI for displaying the output decision tree.
Here's the scenario:
<https://lh3.googleusercontent.com/-FfLOuWzGqFI/T8Lprknyu9I/AAAAAAAAAaI/RDkVYOosaT8/s1600/id31.JPG>
That's the Home Page. When I click the ID3 menu, it redirects here:
<https://lh3.googleusercontent.com/-brRNheqzg04/T8Lp8L1oi3I/AAAAAAAAAaQ/oRg3xGw73_0/s1600/id32.JPG>
Then I upload a file as my dataset. The dataset is computed and analyzed so that the program can display the output.
Please help me. Thanks!
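In case it helps, here is what I was thinking of trying (just a rough sketch, not working code; the application name, the 'dataset' field name, and the uploads folder are placeholders I made up). The idea is to drop dtree.py and id3.py into the app's modules/ folder and call them from a controller action:

# controllers/default.py -- rough sketch only, not working code
def id3():
    """Upload a CSV dataset, build the decision tree, and hand it to the view."""
    # dtree.py and id3.py would sit in the app's modules/ folder
    # (older web2py versions import them with local_import instead)
    from dtree import create_decision_tree
    from id3 import gain
    form = SQLFORM.factory(Field('dataset', 'upload',
                                 uploadfolder=request.folder + 'uploads'))
    tree = None
    if form.process().accepted:
        # my understanding is that form.vars.dataset now holds the stored filename
        fin = open(request.folder + 'uploads/' + form.vars.dataset, 'r')
        # parse the file the same way run_test() does below
        lines = [line.strip() for line in fin.readlines()]
        attributes = [a.strip() for a in lines[0].split(',')]
        target_attr = attributes[-1]
        data = [dict(zip(attributes, [d.strip() for d in line.split(',')]))
                for line in lines[1:]]
        fin.close()
        tree = create_decision_tree(data, attributes, target_attr, gain)
    # the dict returned here is what views/default/id3.html receives
    return dict(form=form, tree=tree)

Is that the right way to split it between the controller and the view, or should the parsing live in a model file instead? Below is the ID3 code I currently have (the dtree module, the id3 module, and a small test script, in that order):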
"""
This module holds functions that are responsible for creating a new
decision tree and for using the tree for data classification.
"""
def majority_value(data, target_attr):
"""
Creates a list of all values in the target attribute for each record
in the data list object, and returns the value that appears in this list
the most frequently.
"""
data = data[:]
return most_frequent([record[target_attr] for record in data])
def most_frequent(lst):
"""
Returns the item that appears most frequently in the given list.
"""
lst = lst[:]
highest_freq = 0
most_freq = None
for val in unique(lst):
if lst.count(val) > highest_freq:
most_freq = val
highest_freq = lst.count(val)
return most_freq
def unique(lst):
"""
Returns a list made up of the unique values found in lst. i.e., it
removes the redundant values in lst.
"""
lst = lst[:]
unique_lst = []
# Cycle through the list and add each value to the unique list only once.
for item in lst:
if unique_lst.count(item) <= 0:
unique_lst.append(item)
# Return the list with all redundant values removed.
return unique_lst
def get_values(data, attr):
"""
Creates a list of values in the chosen attribute for each record in data,
prunes out all of the redundant values, and returns the list.
"""
data = data[:]
return unique([record[attr] for record in data])
def choose_attribute(data, attributes, target_attr, fitness):
"""
Cycles through all the attributes and returns the attribute with the
highest information gain (or lowest entropy).
"""
    data = data[:]
    best_gain = 0.0
    best_attr = None
    for attr in attributes:
        gain = fitness(data, attr, target_attr)
        if gain >= best_gain and attr != target_attr:
            best_gain = gain
            best_attr = attr
    return best_attr
def get_examples(data, attr, value):
"""
Returns a list of all the records in <data> with the value of <attr>
matching the given value.
"""
data = data[:]
rtn_lst = []
if not data:
return rtn_lst
else:
record = data.pop()
if record[attr] == value:
rtn_lst.append(record)
rtn_lst.extend(get_examples(data, attr, value))
return rtn_lst
else:
rtn_lst.extend(get_examples(data, attr, value))
return rtn_lst
def get_classification(record, tree):
"""
This function recursively traverses the decision tree and returns a
classification for the given record.
"""
# If the current node is a string, then we've reached a leaf node and
# we can return it as our answer
if type(tree) == type("string"):
return tree
# Traverse the tree further until a leaf node is found.
else:
attr = tree.keys()[0]
t = tree[attr][record[attr]]
return get_classification(record, t)
def classify(tree, data):
"""
Returns a list of classifications for each of the records in the data
list as determined by the given decision tree.
"""
data = data[:]
classification = []
for record in data:
classification.append(get_classification(record, tree))
return classification
def create_decision_tree(data, attributes, target_attr, fitness_func):
"""
Returns a new decision tree based on the examples given.
"""
data = data[:]
vals = [record[target_attr] for record in data]
default = majority_value(data, target_attr)
# If the dataset is empty or the attributes list is empty, return the
# default value. When checking the attributes list for emptiness, we
# need to subtract 1 to account for the target attribute.
if not data or (len(attributes) - 1) <= 0:
return default
# If all the records in the dataset have the same classification,
# return that classification.
elif vals.count(vals[0]) == len(vals):
return vals[0]
else:
# Choose the next best attribute to best classify our data
best = choose_attribute(data, attributes, target_attr,
fitness_func)
# Create a new decision tree/node with the best attribute and an empty
# dictionary object--we'll fill that up next.
tree = {best:{}}
# Create a new decision tree/sub-node for each of the values in the
# best attribute field
for val in get_values(data, best):
# Create a subtree for the current value under the "best" field
subtree = create_decision_tree(
get_examples(data, best, val),
[attr for attr in attributes if attr != best],
target_attr,
fitness_func)
# Add the new subtree to the empty dictionary object in our new
# tree/node we just created.
tree[best][val] = subtree
return tree
"""
This module contains the functions for calculating the information gain of a
dataset as defined by the ID3 (Information Theoretic) heuristic.
"""
import math
def entropy(data, target_attr):
"""
Calculates the entropy of the given data set for the target attribute.
"""
val_freq = {}
data_entropy = 0.0
# Calculate the frequency of each of the values in the target attr
for record in data:
if (val_freq.has_key(record[target_attr])):
val_freq[record[target_attr]] += 1.0
else:
val_freq[record[target_attr]] = 1.0
# Calculate the entropy of the data for the target attribute
for freq in val_freq.values():
data_entropy += (-freq/len(data)) * math.log(freq/len(data), 2)
# print (data_entropy)
return data_entropy
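# Quick sanity check I use for entropy (just comments, not part of the module):
# with 9 records of one class and 5 of another,
#     entropy = -(9/14)*log2(9/14) - (5/14)*log2(5/14) ~= 0.940
# and the entropy drops to 0.0 when every record has the same class.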
def gain(data, attr, target_attr):
"""
Calculates the information gain (reduction in entropy) that would
result by splitting the data on the chosen attribute (attr).
"""
val_freq = {}
subset_entropy = 0.0
    # Calculate the frequency of each of the values in the chosen attribute (attr)
    for record in data:
        if val_freq.has_key(record[attr]):
            val_freq[record[attr]] += 1.0
        else:
            val_freq[record[attr]] = 1.0
# Calculate the sum of the entropy for each subset of records weighted
# by their probability of occurring in the training set.
for val in val_freq.keys():
val_prob = val_freq[val] / sum(val_freq.values())
data_subset = [record for record in data if record[attr] == val]
subset_entropy += val_prob * entropy(data_subset, target_attr)
# Subtract the entropy of the chosen attribute from the entropy of the
# whole data set with respect to the target attribute (and return it)
# print(data,(entropy(data, target_attr) - subset_entropy))
return (entropy(data, target_attr) - subset_entropy)
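# Likewise for gain (again just comments, not code): splitting data on attr gives
#     gain(attr) = entropy(data) - sum_v (|data_v| / |data|) * entropy(data_v)
# where data_v is the subset of records with attr == v, so an attribute whose
# values separate the classes cleanly ends up with the highest gain.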
from dtree import *
from id3 import *
import sys
import math
def get_file():
"""
Tries to extract a filename from the command line. If none is present, it
prompts the user for a filename and tries to open the file. If the file
exists, it returns it, otherwise it prints an error message and ends
execution.
"""
    # Get the name of the data file and open it for reading
if len(sys.argv) < 2:
# Ask the user for the name of the file
print "Filename: ",
filename = sys.stdin.readline().strip()
else:
filename = sys.argv[1]
try:
fin = open(filename, "r")
except IOError:
print "Error: The file '%s' was not found on this system." % filename
sys.exit(0)
return fin
def run_test(fin):
"""
This function creates a list of exmaples data (used to learn the d-tree)
and a list of samples (for classification by the d-tree) from the
designated file. It then creates the d-tree and uses it to classify the
samples. It prints the classification of each record in the samples list
and returns the d-tree.
"""
# Create a list of all the lines in the data file
lines = [line.strip() for line in fin.readlines()]
# Remove the attributes from the list of lines and create a list of
# the attributes.
lines.reverse()
attributes = [attr.strip() for attr in lines.pop().split(",")]
target_attr = attributes[-1]
lines.reverse()
# Create a list of the data in the data file
data = []
for line in lines:
data.append(dict(zip(attributes,
[datum.strip() for datum in line.split(",")])))
# Copy the data list into the examples list for testing
examples = data[:]
# Create the decision tree
tree = create_decision_tree(data, attributes, target_attr, gain)
# Classify the records in the examples list
classification = classify(tree, examples)
# Print out the classification for each record
# for item in classification:
# print item
return tree
def print_tree(tree, prefix):
    """
    This function recursively crawls through the d-tree and prints it out in a
    more readable format than a straight print of the Python dict object.
    """
    if type(tree) == dict:
        print "%s%s" % (prefix, tree.keys()[0])
        for item in tree.values()[0].keys():
            print "%s\t%s" % (prefix, item)
            print_tree(tree.values()[0][item], prefix + "\t")
    else:
        print "%s\t==>\t%s" % (prefix, tree)
if __name__ == "__main__":
fin = get_file()
# print "------------------------\n"
# print "-- Classification --\n"
# print "------------------------\n"
# print "\n"
tree = run_test(fin)
print "\n"
print "------------------------\n"
print "-- Decision Tree --\n"
print "------------------------\n"
print "\n"
print_tree(tree, "")
fin.close()
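For the missing tree display, my rough idea (untested; the helper name tree_to_html and the module name treeview.py are made up) is to walk the same nested dict that print_tree() walks, but build web2py UL/LI helpers instead of printing, and hand the result to the view:

# modules/treeview.py -- untested sketch
from gluon.html import UL, LI, B

def tree_to_html(tree):
    # a leaf node is just a classification string
    if not isinstance(tree, dict):
        return B(tree)
    attr = list(tree.keys())[0]
    items = [LI(B(attr))]
    for value, subtree in tree[attr].items():
        # each branch shows the attribute value followed by its subtree
        items.append(LI(value, ' : ', tree_to_html(subtree)))
    return UL(*items)

The controller would then return dict(form=form, tree_html=tree_to_html(tree)) and the view could simply do {{=tree_html}}. Does that sound like a sensible way to render the tree, or is there a better way to do it in web2py?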