[R] Problem with predict() and factors

Idgarad Thu, 03 Dec 2009 10:02:04 -0800

I am working on a script that takes numeric performance indicators and runs
them against a series of regressors (dummy regressors, yes\no stuff via 0
and 1, e.g. Was is Christmas this week 0=no, 1=yes).


The script is as follows (Written as a function):


-- Begin Script --

doEnv <- function(HOUR,ENVNAME,REPORTNAME) {
library(RODBC)
library(forecast)
library("geneplotter")
library(forecast)
library(fUtilities)
library(TSA)
require(gplots)
library(robfilter)

SOURCEDATA <- paste("Q:/TEST/RSTATS/EPOC ",HOUR," Metrics.xls",sep="")
REGRESSORS <- "Q:/TEST/RSTATS/eventswithholidays.xls"

mypalette=c()
mypalette$background="#FFFFFF"
mypalette$chart="#FFFFFF"
mypalette$forecastRegion="#66CCFF"
mypalette$confidence="#FF9966"
mypalette$limits="#FF0000"
mypalette$major="#000000"
mypalette$minor="#cccccc"
mypalette$actual="#aaaaaa"
mypalette$dp1="#9900FF"
mypalette$dp2="#EEEE00"
mypalette$dp3="#CCFF00"
mypalette$dp4="#00CCFF"
mypalette$dp5="#FF00CC"

#Raw Data
channel1 <- odbcConnectExcel(SOURCEDATA)
sqlTables(channel1)
sh1 <- sqlFetch(channel1, "Actuals$")
close(channel1)
channel2 <- odbcConnectExcel(REGRESSORS)
sqlTables(channel2)
sh2 <- sqlFetch(channel2, "data$")
close(channel2)

#Get Raw Data
tsSource<-ts(sh1[[ENVNAME]],start=c(2004,1),freq=52)

#Data is now a Time Series
#Prep Out-of-sample test ranges
modLength=length(sh1[[ENVNAME]])
modMax=round((modLength/3)*2)
modEndDate=time(tsSource)[modMax]
modStartDate=time(tsSource)[1]

#RAW SUMAMRY WITH OVERLAY OF OUT OF SAMPLE RANGES
summary(tsSource)
modelSource=window(tsSource,modStartDate,end=modEndDate)
verSource=window(tsSource,time(tsSource)[modMax+1])
pdf(paste("Q:/ReleaseMgmt/Environment
Mgmt/Data/Current/Metrics/Mainframe/Test Environment
Projections/RSTATS/images/",ENVNAME,"-",HOUR,"-","Raw Metrics with Test
Range.pdf",sep=""),width=9, height=6.5)
plot(tsSource,col="grey", main=paste("Raw Data for", REPORTNAME),
xlab="Date", ylab="MiPS Used")
points(modelSource,col="red", pch=20)
points(verSource,col="blue", pch=20)
smartlegend( x="left", y= "top", inset=0,
#smartlegend parameters
             legend = c("Actual Data","Data for Model Selection","Data for
In Sample Verification"),
       fill=c(mypalette$actual,"red","blue"),bg = mypalette$background)
print("The Red region is where we are going to develop the model from and
the blue area is where we will evaluate the model (In Sample Testing)")

#Ok our ranges are comfirmed we'll get a better graph later

# This Heavy Voodoo allows us to have a dynamic number of
#dummy variables we can add\remove from the spreadsheet
forecastDistance <- 52
#Grab Existing Regressors (clipping out the data)
cReg <- sh2[1:modLength,-1]
mcReg <- sh2[1:modMax,-1]
#transform the on\offs into proper factors
for(i in names(cReg)) cReg[[i]] <- factor(cReg[[i]])
for(i in names(mcReg)) mcReg[[i]] <- factor(mcReg[[i]])
#Grab X Future Regressors equal to the forecastDistance (gotta double check
if I need a +1 on the start point)
fReg <- sh2[length(tsSource):(length(tsSource)+forecastDistance),-1]
mfReg <-sh2[(modMax+1):modLength,-1]
#fix variable names
names(cReg) <- make.names(names(cReg))
names(mcReg) <- make.names(names(mcReg))
names(fReg) <- make.names(names(fReg))
names(mfReg) <- make.names(names(mfReg))
#print("#####################")
#print("This is the CReg Data")
#print("#####################")
#print(summary(cReg))
#print("######################")
#print("This is the mcReg Data")
#print("######################")
#print(summary(mcReg))
#names(mcReg)
for(i in names(fReg)) fReg[[i]] <- factor(fReg[[i]])
for(i in names(mfReg)) mfReg[[i]] <- factor(mfReg[[i]])
#end heavy voodoo

########
#
# MODEL VERIFICATION FIRST!!!!!
#
# Basic Look at the raw data
hist(modelSource)
plot(density(modelSource,na.rm=TRUE))
plot(sort(modelSource),pch=".")
for(i in names(mcReg)) {
pairs(modelSource ~ .,mcReg[[i]], main=paste("Model - MIPS vs",i))
}
#Build the list to store our results
linearModel <- list()
residuals <- list()
arima_Fit <- list()
arima_AO <- list()
arima_IO <- list()
newcReg <- list()
newfReg <- list()
newmcReg <- list()
newmfReg <- list()
newFit <- list()
newForecast <- list()
# Following won't work until mcReg contains full variety
linearModel[[1]]=lm(modelSource ~ + UNITBUILD + UNITDB + ITBUILD + ITDB +
UATBUILD + UATDB + HOGANCODE + RCF + ReleaseST1 + ReleaseST2 + ReleaseBLA +
Small.Bank.Acquisitions + HLY.NewYear + HLY.MLK + HLY.PRES + HLY.MEMORIAL +
HLY.J4 + HLY.LABOR + HLY.COLUMBUS + HLY.VETS + HLY.THANKS + HLY.XMAS +
HLY.ELECT + HLY.PATRIOT + EOM,mcReg)
linearModel[[2]]=step(linearModel[[1]], trace=1)
linearModel[[3]]=lm(modelSource ~ + UNITBUILD + UNITDB + ITBUILD + ITDB +
UATBUILD + UATDB + HOGANCODE + RCF + ReleaseST1 + ReleaseST2 + ReleaseBLA +
Small.Bank.Acquisitions + HLY.NewYear + HLY.MLK + HLY.PRES + HLY.MEMORIAL +
HLY.J4 + HLY.LABOR + HLY.COLUMBUS + HLY.VETS + HLY.THANKS + HLY.XMAS +
HLY.ELECT + HLY.PATRIOT + EOM - 1,mcReg)
linearModel[[4]]=step(linearModel[[3]],trace=1)
if(ENVNAME=="E081") {linearModel[[5]]=lm(modelSource ~ + UNITBUILD + UNITDB
+ HOGANCODE + RCF + ReleaseST1 + Small.Bank.Acquisitions + HLY.NewYear +
HLY.MLK + HLY.PRES + HLY.MEMORIAL + HLY.J4 + HLY.LABOR + HLY.COLUMBUS +
HLY.VETS + HLY.THANKS + HLY.XMAS + HLY.ELECT + HLY.PATRIOT + EOM,mcReg)}
if(ENVNAME=="I000") {linearModel[[5]]=lm(modelSource ~ + ITBUILD + ITDB +
HOGANCODE + RCF + ReleaseST1 + Small.Bank.Acquisitions + HLY.NewYear +
HLY.MLK + HLY.PRES + HLY.MEMORIAL + HLY.J4 + HLY.LABOR + HLY.COLUMBUS +
HLY.VETS + HLY.THANKS + HLY.XMAS + HLY.ELECT + HLY.PATRIOT + EOM,mcReg)}
if(ENVNAME=="U000") {linearModel[[5]]=lm(modelSource ~ + UATBUILD + UATDB +
HOGANCODE + RCF + ReleaseST2 + ReleaseBLA + Small.Bank.Acquisitions +
HLY.NewYear + HLY.MLK + HLY.PRES + HLY.MEMORIAL + HLY.J4 + HLY.LABOR +
HLY.COLUMBUS + HLY.VETS + HLY.THANKS + HLY.XMAS + HLY.ELECT + HLY.PATRIOT +
EOM,mcReg)}
if(ENVNAME=="U400") {linearModel[[5]]=lm(modelSource ~ + UATBUILD + UATDB +
HOGANCODE + RCF + ReleaseST2 + ReleaseBLA + Small.Bank.Acquisitions +
HLY.NewYear + HLY.MLK + HLY.PRES + HLY.MEMORIAL + HLY.J4 + HLY.LABOR +
HLY.COLUMBUS + HLY.VETS + HLY.THANKS + HLY.XMAS + HLY.ELECT + HLY.PATRIOT +
EOM,mcReg)}
if(ENVNAME=="T000") {linearModel[[5]]=lm(modelSource ~ + UNITBUILD + UNITDB
+ ITBUILD + ITDB + UATBUILD + UATDB + HOGANCODE + RCF + ReleaseST1 +
ReleaseST2 + ReleaseBLA + Small.Bank.Acquisitions + HLY.NewYear + HLY.MLK +
HLY.PRES + HLY.MEMORIAL + HLY.J4 + HLY.LABOR + HLY.COLUMBUS + HLY.VETS +
HLY.THANKS + HLY.XMAS + HLY.ELECT + HLY.PATRIOT + EOM,mcReg)}
if(ENVNAME=="P000") {linearModel[[5]]=lm(modelSource ~ +
Small.Bank.Acquisitions + HLY.NewYear + HLY.MLK + HLY.PRES + HLY.MEMORIAL +
HLY.J4 + HLY.LABOR + HLY.COLUMBUS + HLY.VETS + HLY.THANKS + HLY.XMAS +
HLY.ELECT + HLY.PATRIOT + EOM,mcReg)}
linearModel[[6]]=step(linearModel[[5]],trace=1)

##################
### FOR EACH MODEL
##################
for(i in length(linearModel)) {
print(paste("For linear Model",i))
print(summary(linearModel[[i]]))
residuals[[i]] <- residuals(linearModel[[i]])
arima_Fit[[i]] <- auto.arima(residuals[[i]])

#Ok what is left over after Regression and ARIMA that cannot
#be explained. Stupid outliers....
#AO's can be added to the cReg as a normal dummy variable
# but these are AOs from the model not the original data.
# is it better to handle AOs from the original data?
arima_AO[[i]] <- detectAO(arima_Fit[[i]])

#What do I do with an innovative outlier? Transfer function or what?
#auto.arima doesn't handle the IO=c(...) stuff.... Umm...
#transfer functions, etc. are a deficency in the script at this point....
arima_IO[[i]] <- detectIO(arima_Fit[[i]])

#Get the pdq,PDQs into a variable so we can re-feed it if neccessary
arima_Fit[[i]]$order
=c(arima_Fit[[i]]$arma[1],arima_Fit[[i]]$arma[2],arima_Fit[[i]]$arma[6])
arima_Fit[[i]]$seasonal
=c(arima_Fit[[i]]$arma[3],arima_Fit[[i]]$arma[4],arima_Fit[[i]]$arma[7])
newcReg[[i]]=cReg[match(names(linearModel[[i]]$coeff[-1]),paste(names(cReg),"1",sep=""))]
newfReg[[i]]=fReg[match(names(linearModel[[i]]$coeff[-1]),paste(names(fReg),"1",sep=""))]
newmcReg[[i]]=mcReg[match(names(linearModel[[i]]$coeff[-1]),paste(names(mcReg),"1",sep=""))]
newmfReg[[i]]=mfReg[match(names(linearModel[[i]]$coeff[-1]),paste(names(mfReg),"1",sep=""))]
newFit[[i]] <-
Arima(modelSource,order=arima_Fit[[i]]$order,seasonal=list(order=arima_Fit[[i]]$seasonal),xreg=newmcReg[[i]],include.drift=F)
newForecast[[i]]
<-predict(newFit[[i]],n.ahead=modLength-modMax,newxreg=newmfReg[[i]])
}
####################
# END FOR EACH MODEL
####################
dev.off()
return("AUTOBOT")
}

-- End Script --

The problem exists in the line:
newForecast[[i]]
<-predict(newFit[[i]],n.ahead=modLength-modMax,newxreg=newmfReg[[i]])


resulting in the error:

Error in as.matrix(newxreg) %*% coefs[-(1:narma)] :
  requires numeric matrix/vector arguments
In addition: Warning message:
In if (class(fit) != "try-error") offset <- -2 * fit$loglik - length(x) *  :
  the condition has length > 1 and only the first element will be used


I know that if I do not transform my 0/1 dummy regressors into Factors
(for(i in names(fReg)) fReg[[i]] <- factor(fReg[[i]]),for(i in names(mfReg))
mfReg[[i]] <- factor(mfReg[[i]]))
 this message goes away and it appears that it is attempting to convert the
data (as.matrix).

My question boils down to is there a solution or proper way to address
factors in prediction\forecasting. As I am having to learn this all on my
own I am at a loss on what do do to extract the forecast if I am using
factors. (I still have no clue how to handle a transfer function to account
for policy changes... ufda which I had time to go back to college...)

        [[alternative HTML version deleted]]

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

[R] Problem with predict() and factors

Reply via email to