2015年2月13日金曜日

R言語 ランダムフォレスト






> require("randomForest")
Loading required package: randomForest
randomForest 4.6-7
Type rfNews() to see new features/changes/bug fixes.

> tuneRF(d[,-8],d[,8],doBest=T)
mtry = 2  OOB error = 6.43% 
Searching left ...
mtry = 1  OOB error = 9.23% 
-0.4352332 0.05 
Searching right ...
mtry = 4  OOB error = 6.6% 
-0.02590674 0.05 

Call:
 randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1]) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 6.4%
Confusion matrix:
      No  Yes class.error
No  1399  101  0.06733333
Yes   91 1409  0.06066667
# まずはチューニングする:mtry=2が最適らしい



> d.rf <- randomForest(cv~., d, mtry=2)
# mtry=2を引数にしてrandomForest()関数で分類する

> print(d.rf)

Call:
 randomForest(formula = cv ~ ., data = d, mtry = 2) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 6.37%
Confusion matrix:
      No  Yes class.error
No  1403   97  0.06466667
Yes   94 1406  0.06266667
# OOB誤差が6.37%とまずまず

> importance(d.rf)
   MeanDecreaseGini
a1        20.320854
a2        11.490523
a3         2.380128
a4       203.135651
a5        75.415005
a6       783.553501
a7         2.679649
# 決定木同様に変数重要度が出せる。これもまた重要

> table(d$cv,predict(d.rf,d[,-8]))
     
        No  Yes
  No  1409   91
  Yes   83 1417

# 分類正答率は94.2%とまずまず




> require("randomForest")
> train_dat <- read.csv("Documents/kaggle/train.csv", header=TRUE)
> str(train_dat)

> train_dat$holiday <- as.factor(train_dat$holiday)

> train_dat$workingday <- as.factor(train_dat$workingday)
> train_dat$weather <- as.ordered(train_dat$weather)
> train_dat$season <- as.ordered(train_dat$season)

> train_dat$datetime <- as.POSIXct(train_dat$datetime)


> tuneRF(train_dat[,c(-10,-11,-12)],train_dat[,12],doBest=T)

mtry = 3  OOB error = 16201.56 
Searching left ...
mtry = 2  OOB error = 17056 
-0.05273804 0.05 
Searching right ...
mtry = 6  OOB error = 15210.13 
0.06119329 0.05 
mtry = 9  OOB error = 14763.69 
0.02935166 0.05 

Call:
 randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1]) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 9

          Mean of squared residuals: 14300.74
                    % Var explained: 56.41
> 

# まずはチューニングする。mtry=9が最適らしい

> train_dat.rf <- randomForest(count~., train_dat[,c(-10,-11)], mtry=9)
> print(train_dat.rf)

Call:
 randomForest(formula = count ~ ., data = train_dat[, c(-10, -11)],      mtry = 9) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 9

          Mean of squared residuals: 14306.06
                    % Var explained: 56.4


# テストデータを読み込む

> train_dat <- read.csv("Documents/kaggle/train.csv", header=TRUE)
> str(train_dat)

> test_dat <- read.csv("Documents/kaggle/test.csv", header=TRUE)
> str(test_dat)

> test_dat$holiday <- as.factor(test_dat$holiday)

> test_dat$workingday <- as.factor(test_dat$workingday)
> test_dat$weather <- as.ordered(test_dat$weather)
> test_dat$season <- as.ordered(test_dat$season)






> tuneRF(train_dat[,c(-10,-11,-12)],train_dat[,12],doBest=T)


# Compare a random forest against a single decision tree on a 50/50
# train/test split of the built-in iris data, then inspect variable
# importance and per-class partial dependence on Petal.Length.
# Fix: load the packages this script actually calls (rpart was never
# loaded), and use TRUE instead of the reassignable shortcut T.
library(randomForest)
library(rpart)

n <- nrow(iris)
s <- sample(n, n * 0.5)          # indices for the training half
iris.train <- iris[s, ]
iris.test <- iris[-s, ]

# random forest
forest <- randomForest(Species ~ ., data = iris.train, ntree = 500)
pred.forest <- predict(forest, newdata = iris.test, type = "class")
table(pred.forest, iris.test[, 5])

# decision tree (single CART tree for comparison)
tree <- rpart(Species ~ ., data = iris.train)
pred.rpart <- predict(tree, iris.test, type = "class")
table(pred.rpart, iris.test[, 5])

# importance: inspect the first grown tree and plot variable importance
getTree(forest, 1, labelVar = TRUE)
varImpPlot(forest)

# report: partial dependence of each class on Petal.Length, plus the
# OOB error trace, laid out with split.screen
split.screen(c(2, 1))
split.screen(c(1, 3), screen = 2)
screen(3); partialPlot(forest, iris, Petal.Length, "setosa")
screen(4); partialPlot(forest, iris, Petal.Length, "versicolor")
screen(5); partialPlot(forest, iris, Petal.Length, "virginica")
split.screen(c(2, 1), screen = 1)
screen(1); plot(forest)
close.screen(all = TRUE)

0 件のコメント:

コメントを投稿