Titanic: Kaggle Random Forest Prediction

1. Load packages

# Load packages
library(ggplot2)      # visualization
library(ggthemes)     # visualization
library(scales)       # visualization
library(dplyr)        # data manipulation
library(mice)         # imputation
library(randomForest) # classification algorithm
library(knitr)        # kable

2. Read the data (set the working directory first)

train <- read.csv("train.csv", stringsAsFactors = FALSE, na.strings = c("NA", ""))
test  <- read.csv("test.csv",  stringsAsFactors = FALSE, na.strings = c("NA", ""))

full <- bind_rows(train, test) # combine train and test for feature engineering

str(full) # inspect variable types

sapply(full, function(x) {sum(is.na(x))}) # count missing values per column

We can see that Age and Cabin have the most missing values; Survived is missing for the test rows because it is the target we want to predict.
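A quick numeric check makes the same point; this minimal sketch (the name miss_frac is mine, not from the original) reports the fraction of missing values per column:

miss_frac <- sort(colMeans(is.na(full)), decreasing = TRUE) # fraction missing per column
round(miss_frac[miss_frac > 0], 3)                          # show only columns with missing values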

3. Convert to factors

full$Sex      <- as.factor(full$Sex)
full$Survived <- as.factor(full$Survived)
full$Pclass   <- as.ordered(full$Pclass) # ordered factor, since class 1 < 2 < 3 is meaningful
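As a small illustration of what the ordered conversion buys us (using only the objects above):

levels(full$Pclass)  # "1" "2" "3", stored in order
full$Pclass[1] < "3" # comparisons between classes are now meaningful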

4. Names and titles

full$Title <- gsub("(.*, )|(\\..*)", "", full$Name) # extract the title (Mr, Miss, ...) between the comma and the period

kable(table(full$Sex, full$Title))

rare_title <- c("Dona", "Lady", "the Countess", "Capt", "Col", "Don",
                "Dr", "Major", "Rev", "Sir", "Jonkheer")

full$Title[full$Title == "Mlle"] <- "Miss"
full$Title[full$Title == "Ms"]   <- "Miss"
full$Title[full$Title == "Mme"]  <- "Mrs"
full$Title[full$Title %in% rare_title] <- "Rare Title"

kable(table(full$Sex, full$Title))

full$Title <- as.factor(full$Title)

full$Surname <- sapply(full$Name, function(x) strsplit(x, split = "[,.]")[[1]][1]) # surname is the part before the comma

5. Family size

full$Fsize <- full$SibSp + full$Parch + 1 # siblings/spouses + parents/children + the passenger
full$Family <- paste(full$Surname, full$Fsize, sep = "") # family identifier

ggplot(full[!is.na(full$Survived), ], aes(x = Fsize, fill = Survived)) +
  geom_bar(stat = "count", position = "dodge") +
  labs(x = "Fsize") +
  theme_grey() +
  scale_x_continuous(breaks = c(1:11))

full$FsizeD[full$Fsize == 1] <- "singleton"
full$FsizeD[full$Fsize < 5 & full$Fsize > 1] <- "small"
full$FsizeD[full$Fsize > 4] <- "large"

mosaicplot(table(full$FsizeD, full$Survived), main = "Family Size by Survival", shade = TRUE)
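The cut points (1, 2-4, 5+) follow the bar chart above, where survival drops off for families of five or more. A quick count per bucket confirms the discretization:

table(full$FsizeD, useNA = "ifany") # passengers per family-size bucket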

6. Cabin

full$Deck <- factor(sapply(full$Cabin, function(x) strsplit(x, NULL)[[1]][1])) # first letter of the cabin number is the deck

full[c(62, 830), "Embarked"] # the two passengers whose Embarked value is missing

7. Missing values

embark_fare <- full %>% filter(PassengerId != 62 & PassengerId != 830)

ggplot(embark_fare, aes(x = Embarked, y = Fare, fill = factor(Pclass))) +
  geom_boxplot() +
  geom_hline(aes(yintercept = 80),
             colour = "red", linetype = "dashed", lwd = 2) +
  scale_y_continuous(labels = dollar_format()) +
  theme_few()

full$Embarked[c(62, 830)] <- "C" # their $80 first-class fare matches the Cherbourg median above

full[1044, ] # the passenger whose Fare is missing

ggplot(full[full$Title == "Mr" & full$Embarked == "S", ],
       aes(x = Fare)) +
  geom_density(fill = "#99d6ff", alpha = 0.4) +
  geom_vline(aes(xintercept = median(Fare, na.rm = TRUE)),
             colour = "red", linetype = "dashed", lwd = 1) +
  scale_x_continuous(labels = dollar_format())

full$Fare[1044] <- median(full[full$Title == "Mr" & full$Embarked == "S", ]$Fare, na.rm = TRUE)

sapply(full, function(x) {sum(is.na(x))}) # recount missing values

factor_vars <- c("PassengerId", "Pclass", "Sex", "Embarked", "Title", "Surname", "Family", "FsizeD")

full[factor_vars] <- lapply(full[factor_vars], function(x) as.factor(x))

set.seed(129) # impute Age with the mice package (handles missing data)
mice_mod <- mice(full[, !names(full) %in% c("PassengerId", "Name", "Ticket", "Cabin", "Family", "Surname", "Survived")], method = "rf")
mice_output <- complete(mice_mod) # extract the completed (imputed) data used below

par(mfrow=c(1,2))

hist(full$Age, freq = FALSE, main = "Age: Original Data",
     col = "darkgreen", ylim = c(0, 0.04))
hist(mice_output$Age, freq = FALSE, main = "Age: MICE Output",
     col = "lightgreen", ylim = c(0, 0.04))
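As a numeric companion to the histograms, a short sketch comparing the Age distribution before and after imputation (full$Age still contains NAs at this point):

summary(full$Age)        # original Age, with NAs
summary(mice_output$Age) # imputed Age; the quartiles should be close to the original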

full$Age <- mice_output$Age

ggplot(full[1:891, ], aes(Age, fill = factor(Survived))) +
  geom_histogram() +
  facet_grid(. ~ Sex)

full$Child[full$Age < 18] <- "Child"
full$Child[full$Age >= 18] <- "Adult"

kable(table(full$Child, full$Survived))

full$Mother <- "Not Mother"
full$Mother[full$Sex == "female" & full$Parch > 0 & full$Age > 18 & full$Title != "Miss"] <- "Mother"

table(full$Mother, full$Survived)

full$Child <- factor(full$Child)

full$Mother <- factor(full$Mother)

md.pattern(full)

8. Prediction

train <- full[1:891,]

test <- full[892:1309,]

#use randomForest

set.seed(754)

rf_model <- randomForest(factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch +
                           Fare + Embarked + Title +
                           FsizeD + Child + Mother,
                         data = train)

plot(rf_model, ylim=c(0,0.36))

legend("topright", colnames(rf_model$err.rate), col = 1:3, fill = 1:3)

The black line shows the overall OOB error rate, which falls below 20%. The red and green lines show the error rates for "died" and "survived" respectively. We can see that right now we're much more successful at predicting death than survival.
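To read those numbers directly instead of off the plot, a small sketch using the fitted model's standard components:

tail(rf_model$err.rate, 1) # final OOB error: overall, class 0 (died), class 1 (survived)
rf_model$confusion         # OOB confusion matrix with per-class error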

importance <- importance(rf_model)

varImportance <- data.frame(Variables = row.names(importance),
                            Importance = round(importance[, "MeanDecreaseGini"], 2))

rankImportance <- varImportance %>%
  mutate(Rank = paste0("#", dense_rank(desc(Importance))))

ggplot(rankImportance, aes(x = reorder(Variables, Importance),
                           y = Importance, fill = Importance)) +
  geom_bar(stat = "identity") +
  geom_text(aes(x = Variables, y = 0.5, label = Rank),
            hjust = 0, vjust = 0.55, size = 4, colour = "red") +
  labs(x = "Variables") +
  coord_flip()

prediction <- predict(rf_model, test)

solution <- data.frame(PassengerID = test$PassengerId, Survived = prediction)
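Before writing the file, a quick sanity check of the submission (the Titanic test set has 418 rows):

dim(solution)  # expect 418 rows and 2 columns
head(solution) # preview the first few predictions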

9. Write out the predictions

write.csv(solution, file = "rf_mod_Solution.csv", row.names = FALSE)

