## compimp compimp <- function(B=100, # nrow of the data N=20, # nb of trials ntree=50, # nb of trees in each forest new.plot=FALSE, # open a new graphic device dpanel=panel.hist, # diagonal panel. panel.dens upanel=panel.pcor, # upper panel. panel.cor rfsel=NULL, # randomForest or RRF imptype=1, # type of importance measure sd2=1, # sd for noise in x2 sdy=0.2, # sd for noise in y ...) { ## compimp -comparing importance- ver.0.9 2015.3.5 T.Yano ## based on the following blogpost. ## It takes much time to run compimp(), even if it is compiled! ## (because of varimp(cf, cond=TRUE) ) ## > system.time(ans <- compimp()) # normal process ## user system ## 99.101 1.252 100.331 ## > system.time(ans <- compimp(B=200,N=40)) ## user system ## 327.863 3.766 331.430 ## library(corpcor) ## > pc <- cor2pcor(cor(ans$df)) ## > pc[1:4,1:4] ## [,1] [,2] [,3] [,4] ## [1,] 1.00000000 0.96075770 0.01780330 -0.068833742 ## [2,] 0.96075770 1.00000000 0.18168173 0.099381081 ## [3,] 0.01780330 0.18168173 1.00000000 -0.125588212 ## [4,] -0.06883374 0.09938108 -0.12558821 1.000000000 ## ## Original post: Random Forest Variable Importance ## http://alandgraf.blogspot.jp/2012/07/random-forest-variable-importance.html ## posted 19th July 2012 by Andrew Landgraf if (is.null(rfsel)) require(randomForest) else require(RRF) require(party) require(car) panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) { ## from the manual on "pairs". NOT USED usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits = digits)[1] txt <- paste0(prefix, txt) if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor * r) } panel.hist <- function(x, ...) { ## from the manual on "pairs". usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- graphics::hist(x, plot=FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, ...) ##, col = "cyan", ...) } ## panel.pcor.digits=3 panel.pcor <- function(x, y, digits=3, cex.cor, cex.pcor, prefix="", ...) { ## from papca.R (Doshisha Univ.) by T.Yano usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- cor(x, y) DaTa <- as.data.frame(DaTa) ncD <- ncol(DaTa) pcor <- partial.cor(DaTa) cortxt <- format(c(r, 0.123456789), digits=digits)[1] cortxt <- paste(prefix, cortxt, sep="") # table <- as.list(DaTa[1:4,]) # xcol <- match(list(x=x[1:4]),table) # ycol <- match(list(y=y[1:4]),table) xcol <- match(list(x[1:4]), divmat(DaTa[1:4,])[[2]]) ycol <- match(list(y[1:4]), divmat(DaTa[1:4,])[[2]]) pr <- round(pcor[xcol,ycol],digits) pcortxt <- format(c(pr, 0.123456789), digits=digits)[1] pcortxt <- paste(prefix, pcortxt, sep="") if(missing(cex.cor)) cex <- 0.7/(strwidth(cortxt)+strwidth("-")*(sign(r)-1)/2) if(missing(cex.pcor)) pcex <- 0.7/(strwidth(pcortxt)+strwidth("-")*(sign(pr)-1)/2) text(0.5, 0.7, cortxt, cex = cex * abs(r)+ifelse(ncD<13,0.8,0.5), col=(-sign(r)+3)/2,family="serif") text(0.5, 0.3, pcortxt, cex = pcex * abs(pr)+ifelse(ncD<13,0.8,0.5), col=(-sign(pr)+3)/2,family="serif") } partial.cor <- function (X, ...) { R <- cor(X, ...) RI <- solve(R) D <- 1/sqrt(diag(RI)) R <- -RI * (D %o% D) diag(R) <- 0 rownames(R) <- colnames(R) <- colnames(X) R } divmat <- function(M){ if (length(dim(M))!=2) stop("M must be 2-dim matrix! \n") ansrow <- "ansr <- list(M[1,]" anscol <- "ansc <- list(M[,1]" if (ncol(M)>1) { for (cnt in 2:ncol(M)){ anscol <- paste(anscol, ",M[,",cnt,"]",sep="") }} if (nrow(M)>1) { for (cnt in 2:nrow(M)){ ansrow <- paste(ansrow, ",M[",cnt,",]",sep="") }} ansrow <- paste(ansrow,")",sep="") anscol <- paste(anscol,")",sep="") eval(parse(text=ansrow)) eval(parse(text=anscol)) return(list(row=ansr,col=ansc)) } panel.dens <- function(x, adjust=1,...){ usr <- par("usr") on.exit(par(usr)) f <- density(x,adjust=adjust, bw="SJ") par(usr = c(usr[1:2], 0, 1.5*max(f$y)) ) lines(f$x,f$y) } mrf <- array(0, c(N,5,6)) dimnames(mrf) <- list(1:N, paste("x",1:5,sep=""), c("rF","rF.corb", "cf.ub","cf.ub.cnd", "cf.AUC","cf.AUC.cnd")) x1=rnorm(B) x2=x1+ rnorm(B,0,sd2) y= x1+ rnorm(B,0,sdy) df <- data.frame(y,x1,x2,x3=rnorm(B),x4=rnorm(B),x5=rnorm(B)) DaTa <- df[,1:4] pairs(df[,1:4], upper.panel=upanel, diag.panel=dpanel) print(summary(anv <- lm(y ~ ., data=df))) print(car::vif(anv)) for (cnt in 1:N) { if (is.null(rfsel)){ rf <- randomForest(y~.,data=df,mtry=2,ntree=ntree,importance=TRUE) ## mtry = #var/3 for regression, sqrt(#var) for classification mrf[cnt,,1] <- randomForest::importance(rf)[,imptype] ## type: 1=mean decrease in accuracy, 2=mean decrease in node impurity(Gini, RSS). rf <- randomForest(y~.,data=df,mtry=2,ntree=ntree,importance=TRUE,corr.bias=TRUE) mrf[cnt,,2] <- randomForest::importance(rf)[,imptype] } else { rf <- RRF(y~.,data=df,mtry=2,ntree=ntree,importance=TRUE) mrf[cnt,,1] <- RRF::importance(rf,type=1)[,1] rf <- RRF(y~.,data=df,mtry=2,ntree=ntree,importance=TRUE,corr.bias=TRUE) mrf[cnt,,2] <- RRF::importance(rf,type=1)[,1] } cf <- cforest(y~.,df,control=cforest_unbiased(mtry=2,ntree=ntree)) ## unbiased default: teststat = "quad", testtype = "Univ", replace = FALSE mrf[cnt,,3] <- varimp(cf) mrf[cnt,,4] <- varimp(cf, conditional=TRUE) ## # cf <- cforest(y~.,df,control=cforest_unbiased(mtry=2,ntree=ntree)) # ## standard default: teststat = "max", testtype = "Teststatistic", replace = TRUE # ## ntree=500, mtry=5 mrf[cnt,,5] <- varimpAUC(cf) mrf[cnt,,6] <- varimpAUC(cf, conditional=TRUE) ## } if (new.plot) dev.new() opar <- par(mfrow=c(1,6), mar=c(2,2,2,.1)) for (pan in 1:6) { boxplot(mrf[,,pan], horizontal=FALSE) title(dimnames(mrf)[[3]][pan]) } par(mfrow=c(1,1), mar=c(4,3,2,1)) ratios <- mrf[,1,] / mrf[,2,] medians <- round(apply(ratios, 2, median), 3) medxlab <- paste("median of ratios :", medians[1], medians[2], medians[3], medians[4], medians[5], medians[6]) means <- round(apply(mrf, c(3,2), mean), 3) meansub <- paste("mean of imp(x1) :", means[1], means[2], means[3], means[4], means[5], means[6]) if (new.plot) dev.new() boxplot( ratios , xlab=medxlab, notch=TRUE) title(main=paste("Ratios imp(x1)/imp(x2), rF:type =", imptype), sub=meansub) return(invisible(list(df=df, mrf=mrf, means=means, ratios=ratios, anv=anv))) }