Here is the full code. Look at the last part, marked #(f), for the question being asked in this post:
#(a) Split datapoints into training (70 points) and test (30 points) sets.

#Read in ass4-data.txt
ass4data = read.delim('http://www.moseslab.csb.utoronto.ca/alan/ass4-data.txt', header = FALSE, sep = "\t")

#Separate all positive and negative hits
ass4q1.neg = ass4data[which(ass4data[,1] == 0),]
ass4q1.pos = ass4data[which(ass4data[,1] == 1),]

#Reset row names
rownames(ass4q1.neg) = NULL
rownames(ass4q1.pos) = NULL

#Sample 70% (35 of the 50 rows in each of the positive and negative sets) for the
#training set; the remaining rows go to the test set
ass4q1.negRid = sample(1:nrow(ass4q1.neg), floor(0.7*nrow(ass4q1.neg)))
ass4q1.posRid = sample(1:nrow(ass4q1.pos), floor(0.7*nrow(ass4q1.pos)))

#Combine negative and positive rows to create the training and test sets
ass4q1.trainSet = as.matrix(rbind(ass4q1.neg[ass4q1.negRid,], ass4q1.pos[ass4q1.posRid,]))
ass4q1.testSet = rbind(ass4q1.neg[-(ass4q1.negRid),], ass4q1.pos[-(ass4q1.posRid),])

#Reset row names
rownames(ass4q1.trainSet) = NULL
rownames(ass4q1.testSet) = NULL

#Data-frame copies with the class label stored as a factor
ass4q1.trainSetDF = as.data.frame(ass4q1.trainSet)
ass4q1.trainSetDF$V1 = factor(ass4q1.trainSetDF$V1)
ass4q1.testSetDF = as.data.frame(ass4q1.testSet)
ass4q1.testSetDF$V1 = factor(ass4q1.testSetDF$V1)

##############
#(b) Load MASS, e1071 and glmnet
library(MASS)
library(e1071)
library(glmnet)

#############
#(c) How many features does the data contain?
#The data contains 32 features (the 32 columns after the class label in column 1).

#############
#(d) How does the number of parameters required for Naïve Bayes, LDA, and Logistic
#Regression (unregularized) scale as a function of the number of features?
#With binary Y and features <X1 ... Xp>, count the parameters needed to specify
#P(Y) and P(X|Y):

#Naive Bayes
#One class prior, plus a mean and a variance per feature per class
#(mu(Y=1), mu(Y=0), sigma(Y=1), sigma(Y=0) for each of the p features):
#parameters = 1 + 4p

#Linear Discriminant Analysis
#Have to estimate one shared covariance matrix and p mean values for each class.
#The covariance matrix is p x p, but it is symmetric, so only the upper (or lower)
#triangle plus the diagonal is free: p(p + 1)/2 entries.
#The p mean values for each of the 2 classes of binary Y add 2p.
#Thus: parameters = p(p + 1)/2 + 2p

#Logistic Regression
#One intercept plus one coefficient per feature: parameters = 1 + p

#To plot the relationship:
ass4q1.dVS = matrix(, ncol(ass4q1.trainSet)-1, 3)
for (p in 1:(ncol(ass4q1.trainSet)-1)){
    ass4q1.dVS[p,1] = 1 + 4*p
    ass4q1.dVS[p,2] = p*(p + 1)/2 + 2*p
    ass4q1.dVS[p,3] = 1 + p
}
png('ass4q1.dVS.png')
plot(ass4q1.dVS[,2], type="o", col="blue", ylim=c(0,max(ass4q1.dVS)), ann=FALSE)
lines(ass4q1.dVS[,1], type="o", pch=22, lty=2, col="red")
lines(ass4q1.dVS[,3], type="o", pch=23, lty=3, col="green")
title(main = "Number of parameters as a function of features", col.main="red", font.main=4)
title(xlab= "Features", col.lab="red")
title(ylab= "Parameters", col.lab="red")
legend(1, max(ass4q1.dVS), c("LDA", "Naive Bayes", "Logistic Regression"), cex=0.8, col=c("blue","red","green"), pch=21:23, lty=1:3)
dev.off()

#############
#(e) Train Naïve Bayes, LDA and Logistic Regression to classify the training data
#using the first two, four, eight, 16 or 32 features, starting from the left of the
#file. Plot the classification error (FP + FN)/(TP + FP + TN + FN) on the training
#data as a function of the number of parameters for each method.
#Contingency tables are organized as:
#  TN  FN
#  FP  TP

#Organize tables to store the results; column 1 holds the number of parameters,
#column 2 will hold the classification error
ass4q1.dNBtable = matrix(,5,2)
ass4q1.dLDAtable = matrix(,5,2)
ass4q1.dGLMtable = matrix(,5,2)
i = 1
for(p in c(2,4,8,16,32)){
    ass4q1.dNBtable[i,1] = 1 + 4*p
    ass4q1.dLDAtable[i,1] = p*(p + 1)/2 + 2*p
    ass4q1.dGLMtable[i,1] = 1 + p
    i = i + 1
}

#Copying blank tables for part (f)
ass4q1.dNBtable.testData = ass4q1.dNBtable
ass4q1.dLDAtable.testData = ass4q1.dLDAtable
ass4q1.dGLMtable.testData = ass4q1.dGLMtable

#############
#2 Features

#2 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:3], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:3]), ass4q1.trainSetDF[,1])
ass4q1.dNBtable[1,2] = (ass4q1.dNB.cTable[2,1] + ass4q1.dNB.cTable[1,2])/sum(ass4q1.dNB.cTable)

#2 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:3])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:3])$class, ass4q1.trainSetDF[,1])
ass4q1.dLDAtable[1,2] = (ass4q1.dLDA.cTable[2,1] + ass4q1.dLDA.cTable[1,2])/sum(ass4q1.dLDA.cTable)

#2 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:3], family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:3]), ass4q1.trainSetDF[,1])
ass4q1.dGLMtable[1,2] = ((35 - sum(ass4q1.dGLM.cTable[1:35,1])) + (35 - sum(ass4q1.dGLM.cTable[36:70,2])))/70

#############
#4 Features

#4 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:5], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:5]), ass4q1.trainSetDF[,1])
ass4q1.dNBtable[2,2] = (ass4q1.dNB.cTable[2,1] + ass4q1.dNB.cTable[1,2])/sum(ass4q1.dNB.cTable)

#4 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:5])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:5])$class, ass4q1.trainSetDF[,1])
ass4q1.dLDAtable[2,2] = (ass4q1.dLDA.cTable[2,1] + ass4q1.dLDA.cTable[1,2])/sum(ass4q1.dLDA.cTable)

#4 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:5], family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:5]), ass4q1.trainSetDF[,1])
ass4q1.dGLMtable[2,2] = ((35 - sum(ass4q1.dGLM.cTable[1:35,1])) + (35 - sum(ass4q1.dGLM.cTable[36:70,2])))/70
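An aside on the GLM error lines above: predict() on a glm object returns link-scale scores by default, so table() over 70 distinct values yields a 70 x 2 table sorted by score, and summing rows 1:35 of column 1 treats the 35 lowest scores as class-0 predictions, which leans on the scores separating cleanly. A simpler alternative (a sketch only, assuming the conventional 0.5 cutoff on the response scale and using only the objects defined above) would be:

#Sketch: training error for the current ass4q1.dGLM via a 0.5 probability cutoff
ass4q1.dGLM.prob = predict(ass4q1.dGLM, type = "response")   #fitted P(Y = 1), length 70
ass4q1.dGLM.pred = ifelse(ass4q1.dGLM.prob > 0.5, 1, 0)      #predicted class labels
mean(ass4q1.dGLM.pred != ass4q1.trainSet[,1])                #(FP + FN)/(TP + FP + TN + FN)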
#############
#8 Features

#8 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:9], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:9]), ass4q1.trainSetDF[,1])
ass4q1.dNBtable[3,2] = (ass4q1.dNB.cTable[2,1] + ass4q1.dNB.cTable[1,2])/sum(ass4q1.dNB.cTable)

#8 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:9])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:9])$class, ass4q1.trainSetDF[,1])
ass4q1.dLDAtable[3,2] = (ass4q1.dLDA.cTable[2,1] + ass4q1.dLDA.cTable[1,2])/sum(ass4q1.dLDA.cTable)

#8 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:9], family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:9]), ass4q1.trainSetDF[,1])
ass4q1.dGLMtable[3,2] = ((35 - sum(ass4q1.dGLM.cTable[1:35,1])) + (35 - sum(ass4q1.dGLM.cTable[36:70,2])))/70

#############
#16 Features

#16 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:17], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:17]), ass4q1.trainSetDF[,1])
ass4q1.dNBtable[4,2] = (ass4q1.dNB.cTable[2,1] + ass4q1.dNB.cTable[1,2])/sum(ass4q1.dNB.cTable)

#16 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:17])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:17])$class, ass4q1.trainSetDF[,1])
ass4q1.dLDAtable[4,2] = (ass4q1.dLDA.cTable[2,1] + ass4q1.dLDA.cTable[1,2])/sum(ass4q1.dLDA.cTable)

#16 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:17], family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:17]), ass4q1.trainSetDF[,1])
ass4q1.dGLMtable[4,2] = ((35 - sum(ass4q1.dGLM.cTable[1:35,1])) + (35 - sum(ass4q1.dGLM.cTable[36:70,2])))/70

#############
#32 Features

#32 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:33], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:33]), ass4q1.trainSetDF[,1])
ass4q1.dNBtable[5,2] = (ass4q1.dNB.cTable[2,1] + ass4q1.dNB.cTable[1,2])/sum(ass4q1.dNB.cTable)

#32 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:33])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:33])$class, ass4q1.trainSetDF[,1])
ass4q1.dLDAtable[5,2] = (ass4q1.dLDA.cTable[2,1] + ass4q1.dLDA.cTable[1,2])/sum(ass4q1.dLDA.cTable)

#32 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:33], family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:33]), ass4q1.trainSetDF[,1])
ass4q1.dGLMtable[5,2] = ((35 - sum(ass4q1.dGLM.cTable[1:35,1])) + (35 - sum(ass4q1.dGLM.cTable[36:70,2])))/70

png('ass4q1.dTables.png')
plot(ass4q1.dLDAtable[,1], ass4q1.dLDAtable[,2], type="o", col="blue", ylim=c(0,.4), ann=FALSE)
lines(ass4q1.dNBtable[,1], ass4q1.dNBtable[,2], type="o", pch=22, lty=2, col="red")
lines(ass4q1.dGLMtable[,1], ass4q1.dGLMtable[,2], type="o", pch=23, lty=3, col="green")
title(main = "Classification error as a function of number of parameters", col.main="red", font.main=4)
title(xlab= "Parameters", col.lab="red")
title(ylab= "(FP + FN)/(TP + FP + TN + FN)", col.lab="red")
legend(300, .4, c("LDA", "Naive Bayes", "Logistic Regression"), cex=0.8, col=c("blue","red","green"), pch=21:23, lty=1:3)
dev.off()
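The five blocks above repeat the same three fits with only the column range and the table row changing, so they could be collapsed into a loop. A minimal sketch for the Naive Bayes column of the table, using the same objects and column layout as above (the LDA and GLM cases would follow the same pattern):

#Sketch: loop over feature counts instead of repeating each block by hand
featureCounts = c(2, 4, 8, 16, 32)
for (i in seq_along(featureCounts)) {
    cols = 2:(featureCounts[i] + 1)   #feature columns for this run
    fit = naiveBayes(ass4q1.trainSetDF[, cols], ass4q1.trainSetDF[, 1])
    cT = table(predict(fit, ass4q1.trainSetDF[, cols]), ass4q1.trainSetDF[, 1])
    ass4q1.dNBtable[i, 2] = (cT[2,1] + cT[1,2])/sum(cT)   #(FP + FN)/total
}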
#############
#(f) Plot the classification error as a function of the number of parameters on the
#test data for each method. Does this differ from your answer in part (e)? Explain why.

#############
#2 Features

#2 features for NaiveBayes
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.testSetDF[,2:3]), ass4q1.testSetDF[,1])
ass4q1.dNBtable.testData[1,2] = (ass4q1.dNB.cTable[2,1] + ass4q1.dNB.cTable[1,2])/sum(ass4q1.dNB.cTable)
#WORKS!

#2 features for LDA
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.testSetDF[,2:3])$class, ass4q1.testSetDF[,1]) #DOESN'T WORK!
ass4q1.dLDAtable.testData[1,2] = (ass4q1.dLDA.cTable[2,1] + ass4q1.dLDA.cTable[1,2])/sum(ass4q1.dLDA.cTable)

#2 features for GLM
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.testSetDF[,2:3]), ass4q1.testSetDF[,1]) #DOESN'T WORK!
ass4q1.dGLMtable.testData[1,2] = ((15 - sum(ass4q1.dGLM.cTable[1:15,1])) + (15 - sum(ass4q1.dGLM.cTable[16:30,2])))/30

Uwe Ligges-3 wrote:
> On 22.03.2012 03:24, palanski wrote:
>> Hi!
>>
>> I'm using GLM, LDA and NaiveBayes for binomial classification. My training
>> set is 70 rows long with 32 features, and my test set is 30 rows long with
>> 32 features.
>>
>> Using Naive Bayes, I can train a model, and then predict the test set with
>> it like so:
>>
>> ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:3], ass4q1.trainSetDF[,1])
>> table(predict(ass4q1.dNB, ass4q1.testSetDF[,2:3]), ass4q1.testSetDF[,1])
>>
>> However, when the same is done for LDA or GLM, suddenly it tells me that
>> the number of rows doesn't match and it doesn't predict my test data. The
>> error for GLM, as an example, is:
>>
>> Error in table(predict(ass4q1.dGLM, ass4q1.testSetDF[, 2:3]),
>> ass4q1.testSetDF[, :
>>   all arguments must have the same length
>> In addition: Warning message:
>> 'newdata' had 30 rows but variable(s) found have 70 rows
>>
>> What am I missing?
>
> A correct formula describing the model with separate variables, with the
> data.frame passed to the data argument of the lda() function.
> A reproducible example is missing, hence this is just a guess.
>
> Uwe Ligges
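For reference, a minimal sketch of the fix Uwe describes, shown for the two-feature case: fit against the data frame's own column names (V1 is the label, V2 and V3 are the first two features in the data frames built in part (a)) and pass the data frame through the data argument, so that predict() can match the same names in newdata. Only objects created above are used; the 0.5 cutoff for the GLM is an assumption.

#LDA: fit with a formula over named columns, then predict on the test data frame
ass4q1.dLDA = lda(V1 ~ V2 + V3, data = ass4q1.trainSetDF)
table(predict(ass4q1.dLDA, newdata = ass4q1.testSetDF[,2:3])$class, ass4q1.testSetDF[,1])

#GLM: same idea; type = "response" returns P(Y = 1), thresholded at 0.5 here
ass4q1.dGLM = glm(V1 ~ V2 + V3, data = ass4q1.trainSetDF, family = "binomial")
ass4q1.dGLM.prob = predict(ass4q1.dGLM, newdata = ass4q1.testSetDF[,2:3], type = "response")
table(ifelse(ass4q1.dGLM.prob > 0.5, 1, 0), ass4q1.testSetDF[,1])

With the models fit this way, the "'newdata' had 30 rows but variable(s) found have 70 rows" warning goes away, because predict() resolves V2 and V3 in newdata instead of falling back to the 70-row training vectors baked into the original formula.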