📄 randomforest.default.r
字号:
## mylevels() returns levels if given a factor, otherwise 0.
mylevels <- function(x) if (is.factor(x)) levels(x) else 0
"randomForest.default" <-
function(x, y=NULL, xtest=NULL, ytest=NULL, ntree=500,
mtry=if (!is.null(y) && !is.factor(y))
max(floor(ncol(x)/3), 1) else floor(sqrt(ncol(x))),
replace=TRUE, classwt=NULL, cutoff, strata,
sampsize = if (replace) nrow(x) else ceiling(.632*nrow(x)),
nodesize = if (!is.null(y) && !is.factor(y)) 5 else 1,
importance=FALSE, localImp=FALSE, nPerm=1,
proximity, oob.prox=proximity,
norm.votes=TRUE, do.trace=FALSE,
keep.forest=!is.null(y) && is.null(xtest), corr.bias=FALSE,
keep.inbag=FALSE, ...) {
addclass <- is.null(y)
classRF <- addclass || is.factor(y)
if (!classRF && length(unique(y)) <= 5) {
warning("The response has five or fewer unique values. Are you sure you want to do regression?")
}
if (classRF && !addclass && length(unique(y)) < 2)
stop("Need at least two classes to do classification.")
n <- nrow(x)
p <- ncol(x)
if (n == 0) stop("data (x) has 0 rows")
x.row.names <- rownames(x)
x.col.names <- if (is.null(colnames(x))) 1:ncol(x) else colnames(x)
## overcome R's lazy evaluation:
keep.forest <- keep.forest
testdat <- !is.null(xtest)
if (testdat) {
if (ncol(x) != ncol(xtest))
stop("x and xtest must have same number of columns")
ntest <- nrow(xtest)
xts.row.names <- rownames(xtest)
}
## Make sure mtry is in reasonable range.
if (mtry < 1 || mtry > p)
warning("invalid mtry: reset to within valid range")
mtry <- max(1, min(p, round(mtry)))
if (!is.null(y)) {
if (length(y) != n) stop("length of response must be the same as predictors")
addclass <- FALSE
} else {
if (!addclass) addclass <- TRUE
y <- factor(c(rep(1, n), rep(2, n)))
x <- rbind(x, x)
}
## Check for NAs.
if (any(is.na(x))) stop("NA not permitted in predictors")
if (testdat && any(is.na(xtest))) stop("NA not permitted in xtest")
if (any(is.na(y))) stop("NA not permitted in response")
if (!is.null(ytest) && any(is.na(ytest))) stop("NA not permitted in ytest")
if (is.data.frame(x)) {
xlevels <- lapply(x, mylevels)
ncat <- sapply(xlevels, length)
## Treat ordered factors as numerics.
ncat <- ifelse(sapply(x, is.ordered), 1, ncat)
x <- data.matrix(x)
if(testdat) {
if(!is.data.frame(xtest))
stop("xtest must be data frame if x is")
xfactor <- which(sapply(xtest, is.factor))
if (length(xfactor) > 0) {
for (i in xfactor) {
if (any(! levels(xtest[[i]]) %in% xlevels[[i]]))
stop("New factor levels in xtest not present in x")
xtest[[i]] <-
factor(xlevels[[i]][match(xtest[[i]], xlevels[[i]])],
levels=xlevels[[i]])
}
}
xtest <- data.matrix(xtest)
}
} else {
ncat <- rep(1, p)
xlevels <- as.list(rep(0, p))
}
maxcat <- max(ncat)
if (maxcat > 32)
stop("Can not handle categorical predictors with more than 32 categories.")
if (classRF) {
nclass <- length(levels(y))
## Check for empty classes:
if (any(table(y) == 0)) stop("Can't have empty classes in y.")
if (!is.null(ytest)) {
if (!is.factor(ytest)) stop("ytest must be a factor")
if (!all(levels(y) == levels(ytest)))
stop("y and ytest must have the same levels")
}
if (missing(cutoff)) {
cutoff <- rep(1 / nclass, nclass)
} else {
if (sum(cutoff) > 1 || sum(cutoff) < 0 || !all(cutoff > 0) ||
length(cutoff) != nclass) {
stop("Incorrect cutoff specified.")
}
if (!is.null(names(cutoff))) {
if (!all(names(cutoff) %in% levels(y))) {
stop("Wrong name(s) for cutoff")
}
cutoff <- cutoff[levels(y)]
}
}
if (!is.null(classwt)) {
if (length(classwt) != nclass)
stop("length of classwt not equal to number of classes")
## If classwt has names, match to class labels.
if (!is.null(names(classwt))) {
if (!all(names(classwt) %in% levels(y))) {
stop("Wrong name(s) for classwt")
}
classwt <- classwt[levels(y)]
}
if (any(classwt <= 0)) stop("classwt must be positive")
ipi <- 1
} else {
classwt <- rep(1, nclass)
ipi <- 0
}
} else addclass <- FALSE
if (missing(proximity)) proximity <- addclass
if (proximity) {
prox <- matrix(0.0, n, n)
proxts <- if (testdat) matrix(0, ntest, ntest + n) else double(1)
} else {
prox <- proxts <- double(1)
}
if (localImp) {
importance <- TRUE
impmat <- matrix(0, p, n)
} else impmat <- double(1)
if (importance) {
if (nPerm < 1) nPerm <- as.integer(1) else nPerm <- as.integer(nPerm)
if (classRF) {
impout <- matrix(0.0, p, nclass + 2)
impSD <- matrix(0.0, p, nclass + 1)
} else {
impout <- matrix(0.0, p, 2)
impSD <- double(p)
names(impSD) <- x.col.names
}
} else {
impout <- double(p)
impSD <- double(1)
}
nsample <- if (addclass) 2 * n else n
Stratify <- length(sampsize) > 1
if ((!Stratify) && sampsize > nrow(x)) stop("sampsize too large")
if (Stratify && (!classRF)) stop("sampsize should be of length one")
if (classRF) {
if (Stratify) {
if (missing(strata)) strata <- y
if (!is.factor(strata)) strata <- as.factor(strata)
nsum <- sum(sampsize)
if (length(sampsize) > nlevels(strata))
stop("sampsize has too many elements.")
if (any(sampsize <= 0) || nsum == 0)
stop("Bad sampsize specification")
## If sampsize has names, match to class labels.
if (!is.null(names(sampsize))) {
sampsize <- sampsize[levels(strata)]
}
if (any(sampsize > table(strata)))
stop("sampsize can not be larger than class frequency")
} else {
nsum <- sampsize
}
nrnodes <- 2 * trunc(nsum / nodesize) + 1
} else {
## For regression trees, need to do this to get maximal trees.
nrnodes <- 2 * trunc(sampsize/max(1, nodesize - 4)) + 1
}
## Compiled code expects variables in rows and observations in columns.
x <- t(x)
storage.mode(x) <- "double"
if (testdat) {
xtest <- t(xtest)
storage.mode(xtest) <- "double"
if (is.null(ytest)) {
ytest <- labelts <- 0
} else {
labelts <- TRUE
}
} else {
xtest <- double(1)
ytest <- double(1)
ntest <- 1
labelts <- FALSE
}
nt <- if (keep.forest) ntree else 1
if (classRF) {
error.test <- if (labelts) double((nclass+1) * ntree) else double(1)
rfout <- .C("classRF",
x = x,
xdim = as.integer(c(p, n)),
y = as.integer(y),
nclass = as.integer(nclass),
ncat = as.integer(ncat),
maxcat = as.integer(maxcat),
sampsize = as.integer(sampsize),
strata = if (Stratify) as.integer(strata) else integer(1),
Options = as.integer(c(addclass,
importance,
localImp,
proximity,
oob.prox,
do.trace,
keep.forest,
replace,
Stratify,
keep.inbag)),
ntree = as.integer(ntree),
mtry = as.integer(mtry),
ipi = as.integer(ipi),
classwt = as.double(classwt),
cutoff = as.double(cutoff),
nodesize = as.integer(nodesize),
outcl = integer(nsample),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -