Titanic on Kaggle: Random Forest Prediction
1. Load packages
# Load packages
library(ggplot2) # visualization
library(ggthemes) # visualization
library(scales) # visualization
library(dplyr) # data manipulation
library(mice) # imputation
library(randomForest) # classification algorithm
library(knitr) # kable tables
2. Read the data (set the working directory first)
train <- read.csv("train.csv", stringsAsFactors = F, na.strings = c("NA", ""))
test <- read.csv("test.csv", stringsAsFactors = F, na.strings = c("NA", ""))
full <- bind_rows(train, test) # combine train and test for feature engineering
str(full) # inspect variable types
sapply(full, function(x) {sum(is.na(x))}) # count missing values per column
We can see that Age and Cabin have a large number of missing values. Survived is missing only for the test rows, since it is the target we want to predict.
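The same check can be expressed as proportions, which makes the scale of the problem easier to see (a minimal sketch using base R only):
miss_share <- sort(colMeans(is.na(full)), decreasing = TRUE) # fraction of NAs per column
round(miss_share[miss_share > 0], 3) # show only columns that have missing values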
3. Convert to factors
full$Sex <- as.factor(full$Sex)
full$Survived <- as.factor(full$Survived)
full$Pclass <- as.ordered(full$Pclass) # ordered factor for passenger class
4. Names and titles
full$Title <- gsub('(.*, )|(\\..*)', '', full$Name) # extract the title between the comma and the period
kable(table(full$Sex, full$Title))
rare_title <- c('Dona', 'Lady', 'the Countess', 'Capt', 'Col', 'Don',
                'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer')
full$Title[full$Title == 'Mlle'] <- 'Miss'
full$Title[full$Title == 'Ms'] <- 'Miss'
full$Title[full$Title == 'Mme'] <- 'Mrs'
full$Title[full$Title %in% rare_title] <- 'Rare Title'
kable(table(full$Sex, full$Title))
full$Title <- as.factor(full$Title)
full$Surname <- sapply(full$Name, function(x) strsplit(x, split = '[,.]')[[1]][1]) # surname (text before the comma)
5. Family size
full$Fsize <- full$SibSp + full$Parch + 1
full$Family <- paste(full$Surname, full$Fsize, sep = '_') # family identifier
ggplot(full[!is.na(full$Survived), ], aes(x = Fsize, fill = Survived)) +
  geom_bar(stat = 'count', position = 'dodge') +
  labs(x = 'Family Size') + theme_grey() + scale_x_continuous(breaks = c(1:11))
full$FsizeD[full$Fsize == 1] <- 'singleton'
full$FsizeD[full$Fsize < 5 & full$Fsize > 1] <- 'small'
full$FsizeD[full$Fsize > 4] <- 'large'
mosaicplot(table(full$FsizeD, full$Survived), main = 'Family Size by Survival', shade = TRUE)
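Incidentally, the three FsizeD assignments above can also be written as a single cut() call (a sketch; cut() returns a factor directly, whereas the assignments produce character values that are converted to a factor later):
full$FsizeD <- cut(full$Fsize, breaks = c(0, 1, 4, Inf),
                   labels = c('singleton', 'small', 'large')) # 1 / 2-4 / 5+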
6. Cabin
full$Deck <- factor(sapply(full$Cabin, function(x) strsplit(x, NULL)[[1]][1])) # deck = first letter of the cabin number
full[c(62, 830), 'Embarked'] # passengers 62 and 830 are missing Embarked
7. Missing values
embark_fare <- full %>% filter(PassengerId != 62 & PassengerId != 830)
ggplot(embark_fare, aes(x = Embarked, y = Fare, fill = factor(Pclass))) +
  geom_boxplot() +
  geom_hline(aes(yintercept = 80),
             colour = 'red', linetype = 'dashed', lwd = 2) +
  scale_y_continuous(labels = dollar_format()) +
  theme_few()
full$Embarked[c(62, 830)] <- 'C'
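Both passengers paid $80 for a first-class ticket, and the boxplot shows the median first-class fare at 'C' sits right around $80, which is why 'C' is a reasonable imputation. The figure can be confirmed numerically (a sketch using dplyr, which is already loaded):
embark_fare %>%
  filter(Pclass == 1) %>%
  group_by(Embarked) %>%
  summarise(median_fare = median(Fare, na.rm = TRUE)) # median first-class fare per port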
full[1044, ] # passenger 1044 is missing Fare
ggplot(full[full$Title == 'Mr' & full$Embarked == 'S', ],
       aes(x = Fare)) +
  geom_density(fill = '#99d6ff', alpha = 0.4) +
  geom_vline(aes(xintercept = median(Fare, na.rm = TRUE)),
             colour = 'red', linetype = 'dashed', lwd = 1) +
  scale_x_continuous(labels = dollar_format())
full$Fare[1044] <- median(full[full$Title == 'Mr' & full$Embarked == 'S', ]$Fare, na.rm = TRUE)
sapply(full, function(x) {sum(is.na(x))}) # re-check missing values
factor_vars <- c('PassengerId', 'Pclass', 'Sex', 'Embarked', 'Title', 'Surname', 'Family', 'FsizeD')
full[factor_vars] <- lapply(full[factor_vars], function(x) as.factor(x))
set.seed(129) # impute Age with the mice package, which handles missing data
mice_mod <- mice(full[, !names(full) %in% c('PassengerId', 'Name', 'Ticket', 'Cabin', 'Family', 'Surname', 'Survived')], method = 'rf')
mice_output <- complete(mice_mod) # extract the completed data set (used below)
par(mfrow=c(1,2))
hist(full$Age, freq = FALSE, main = 'Age: Original Data',
     col = 'darkgreen', ylim = c(0, 0.04))
hist(mice_output$Age, freq = FALSE, main = 'Age: MICE Output',
     col = 'lightgreen', ylim = c(0, 0.04))
full$Age <- mice_output$Age
ggplot(full[1:891,], aes(Age, fill = factor(Survived))) +
geom_histogram() + facet_grid(.~Sex)
full$Child[full$Age < 18] <- 'Child'
full$Child[full$Age >= 18] <- 'Adult'
kable(table(full$Child, full$Survived))
full$Mother <- 'Not Mother'
full$Mother[full$Sex == 'female' & full$Parch > 0 & full$Age > 18 & full$Title != 'Miss'] <- 'Mother'
table(full$Mother, full$Survived)
full$Child <- factor(full$Child)
full$Mother <- factor(full$Mother)
md.pattern(full)
8. Prediction
train <- full[1:891, ]
test <- full[892:1309, ] # split back into the original train and test sets
#use randomForest
set.seed(754)
rf_model <- randomForest(factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch +
Fare + Embarked + Title +
FsizeD + Child + Mother,
data = train)
plot(rf_model, ylim=c(0,0.36))
legend('topright', colnames(rf_model$err.rate), col = 1:3, fill = 1:3)
The black line shows the overall out-of-bag (OOB) error rate, which falls below 20%. The red and green lines show the error rates for 'died' and 'survived' respectively. We can see that we are currently much more successful at predicting death than survival.
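The numbers behind the plot can be read directly from the model (a minimal sketch; randomForest stores per-tree error rates in err.rate, with columns OOB, '0' and '1'):
final_err <- rf_model$err.rate[nrow(rf_model$err.rate), ]
round(final_err, 3) # final error rates: overall (OOB), died ('0'), survived ('1')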
importance <- importance(rf_model)
varImportance <- data.frame(Variables = row.names(importance),
                            Importance = round(importance[, 'MeanDecreaseGini'], 2))
rankImportance <- varImportance %>%
  mutate(Rank = paste0('#', dense_rank(desc(Importance))))
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
                           y = Importance, fill = Importance)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(x = Variables, y = 0.5, label = Rank),
            hjust = 0, vjust = 0.55, size = 4, colour = 'red') +
  labs(x = 'Variables') +
  coord_flip()
prediction <- predict(rf_model, test)
solution <- data.frame(PassengerID = test$PassengerId, Survived = prediction)
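Before writing the file, a quick sanity check catches missing predictions (a sketch; the Titanic test set has 418 rows):
stopifnot(nrow(solution) == 418, !any(is.na(solution$Survived)))
table(solution$Survived) # distribution of the predicted classes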
9. Write out the prediction results
write.csv(solution, file = 'rf_mod_Solution.csv', row.names = FALSE)