HR Analysis
數據來源:Human Resources Analytics | Kaggle
變數解釋
satisfaction_level:工作滿意度
last_evaluation:最終評價
number_project:工作項目數
average_montly_hours:平均每月工作時間
time_spend_company:花費在公司中的時間
Work_accident:工作事故
promotion_last_5years:最近五年是否被提升
sales:崗位
salary:薪水
left:是否離職
分析目的:探究各變數之間的關係,通過建立模型預測員工是否會離職。
library(dplyr)
library(ggplot2)

# Work on a shorter alias of the imported data set.
hr <- HR_comma_sep_1_

# Inspect the structure of the data.
str(hr)

# Copy of the data in which the `sales` column is named `post`.
post <- rename(hr, post = sales)

# Number of missing values in each column.
vapply(hr, function(column) sum(is.na(column)), integer(1))

# Count the employees who left, per post, largest count first.
hr_post <- post %>%
  filter(left == 1) %>%
  group_by(post) %>%
  summarize(left = n()) %>%
  arrange(desc(left))

# Horizontal bar chart of leavers per post.
ggplot(hr_post, aes(x = post, y = left, col = post, fill = post)) +
  geom_col(width = .75) +
  coord_flip() +
  scale_x_discrete("Department") +
  scale_y_continuous("Nos of Employees") +
  ggtitle("各部門員工的流動性")
# Salary levels: number of employees at each salary level.
# The column is named `salary`; counting by that name keeps the column the
# plot's aesthetics refer to (grouping by `Salaries` fails because no such
# column exists).
p9 <- hr %>%
  count(salary)

# Bar chart of the head-count per salary level.
ggplot(p9, aes(x = salary, y = n, fill = salary)) +
  geom_bar(stat = "identity", width = .75) +
  scale_x_discrete("salary") +
  scale_y_continuous("n") +
  ggtitle("各水平工資數量")
# Per-post averages: satisfaction (sat), monthly hours (mon) and number of
# projects (num).
# Reads from `post`, the copy of the data that carries the `post` column;
# `hr` still has that column under its original name `sales`, so grouping
# `hr` by `post` fails.
p10 <- post %>%
  group_by(post) %>%
  summarise(
    sat = mean(satisfaction_level, na.rm = TRUE),
    mon = mean(average_montly_hours, na.rm = TRUE),
    num = mean(number_project, na.rm = TRUE)
  )

# Horizontal bar chart of mean satisfaction per post.
ggplot(p10, aes(x = post, y = sat, col = post, fill = post)) +
  geom_bar(stat = "identity", width = .75) +
  coord_flip() +
  scale_x_discrete("Department") +
  scale_y_continuous("level") +
  ggtitle("各部門滿意度")
# Horizontal bar chart of mean monthly working hours (`mon`) per post.
# The value axis shows hours, so it is labelled as such rather than with the
# "level" label used for the satisfaction chart.
ggplot(p10, aes(x = post, y = mon, col = post, fill = post)) +
  geom_bar(stat = "identity", width = .75) +
  coord_flip() +
  scale_x_discrete("Department") +
  scale_y_continuous("Average Monthly Hours") +
  ggtitle("各部平均每月工作時間")
# Relationship between average monthly hours and job satisfaction:
# mean satisfaction for each distinct value of average monthly hours.
p11 <- hr %>%
  group_by(average_montly_hours) %>%
  summarise(
    sat = mean(satisfaction_level, na.rm = TRUE)
  )

# Jittered scatter plot with a smoothed trend line.
# Columns are referenced by bare name inside aes(); `p11$...` bypasses the
# plot's data argument and is discouraged by ggplot2.
ggplot(p11, aes(x = average_montly_hours, y = sat)) +
  geom_point(position = "jitter", color = "blue", alpha = .5) +
  geom_smooth()
# Number of employees at each salary level, one facet per post.
# Plots the `post` data frame because the facet variable `post` exists only
# there; in `hr` the same column is still called `sales`.
ggplot(post, aes(x = salary, col = salary, fill = salary)) +
  geom_bar(width = .7) +
  facet_grid(~post) +
  scale_x_discrete("Salary level ") +
  scale_y_continuous("Nos of Employees") +
  ggtitle("各崗位的薪水水平")
# Histogram of average monthly hours, coloured by post, with one facet for
# each value of `left`.
# Plots the `post` data frame because the colour/fill variable `post` exists
# only there; in `hr` the same column is still called `sales`.
ggplot(post, aes(x = average_montly_hours, color = post, fill = post)) +
  geom_histogram(binwidth = 3) +
  theme(axis.line = element_line(color = "blue", size = 1.25)) +
  theme(legend.position = "bottom") +
  facet_grid(~left) +
  scale_x_continuous("Average Monthly Hours") +
  scale_y_continuous("No of employees ") +
  ggtitle("工作時間與離職的關係")
# Employees with a recorded work accident.
# Built from `post` (the copy of the data that has the `post` column); the
# name `Data` used previously is not defined anywhere in this script.
Data_w <- post %>%
  filter(Work_accident > 0)

# Bar chart of those employees, coloured by post, with one facet for each
# value of `left`.
# `Data_w` is the plot's data and the accident flag is the x variable; it is
# wrapped in factor() because the x scale is discrete.
# NOTE(review): because of the filter above only accident cases are shown;
# confirm whether employees without accidents should be plotted as well.
ggplot(Data_w, aes(x = factor(Work_accident), color = post, fill = post)) +
  geom_bar(width = .7) +
  theme(axis.line = element_line(color = "blue", size = 1.25)) +
  theme(legend.position = "bottom") +
  facet_grid(~left) +
  scale_x_discrete("Work accidents") +
  scale_y_continuous("No of employees ") +
  ggtitle("工作事故與離職")
# Keep the first eight columns and print their correlation matrix.
hr_ <- hr[seq_len(8)]
cor(hr_)

# Shaded corrgram of the same columns (lower triangle only).
library(corrgram)
corrgram(
  hr_,
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt,
  main = "相關關係"
)
# Split the data into two sets: roughly 70% training and 30% test.
# A fixed seed makes the random assignment, and every figure derived from
# it below, reproducible from run to run.
set.seed(1234)
sample_date <- sample(2, nrow(hr), replace = TRUE, prob = c(0.7, 0.3))
train_date <- hr[sample_date == 1, ]  # training data
test_date <- hr[sample_date == 2, ]   # test data

# Distribution of `left` in each set.
table(train_date$left)
table(test_date$left)

# Logistic regression of `left` on all other columns.
glm.model <- glm(left ~ ., data = train_date, family = "binomial")
summary(glm.model)

# Evaluate the model: cross-tabulate the actual value of `left` against
# whether the predicted probability exceeds the 0.41 cut-off.
glm.predict <- predict(glm.model, test_date, type = "response")
table(test_date$left, glm.predict > 0.41)
# Accuracy of the logistic model on the test set: share of rows whose
# prediction at the 0.41 cut-off agrees with the actual value of `left`.
# Computed from the predictions rather than from hand-copied cell counts,
# which are only valid for one particular random split.
glm.accuracy <- sum((glm.predict > 0.41) == (test_date$left == 1),
                    na.rm = TRUE) / nrow(test_date)
paste("Accuracy of glm model is:", glm.accuracy * 100, "%", sep = " ")
# Console output recorded in the original write-up:
# [1] "Accuracy of glm model is: 79.6871508379888 %"

# Decision tree
library(rpart)
library(rpart.plot)

# `dtree.pruned` is not created anywhere earlier in this script, so the tree
# is fitted and pruned here before it is plotted.
# NOTE(review): the fit settings below are a standard default (classification
# tree, pruned at the complexity parameter with the lowest cross-validated
# error); confirm against the settings used for the original analysis.
dtree <- rpart(left ~ ., data = train_date, method = "class")
best_cp <- dtree$cptable[which.min(dtree$cptable[, "xerror"]), "CP"]
dtree.pruned <- prune(dtree, cp = best_cp)

prp(dtree.pruned, type = 2, extra = 104,
    fallen.leaves = TRUE, main = "Decision Tree")
# Classify the test set with the pruned tree.
dtree.pred <- predict(dtree.pruned, newdata = test_date, type = "class")

# Confusion table: actual value of `left` (rows) against the predicted
# class (columns).
dtree.perf <- table(
  Actual = test_date$left,
  Predicted = dtree.pred
)
dtree.perf
# Print the confusion table of the tree model.
dtree.perf

# Accuracy of the tree on the test set: correctly classified rows (the
# diagonal of the confusion table) divided by the number of test rows.
# Derived from the table instead of hand-copied cell counts, which are only
# valid for one particular random split.
# Assumes `dtree.perf` is square with matching row/column order, i.e. both
# outcomes occur among the actual and the predicted classes.
rpart.accuracy <- sum(diag(dtree.perf)) / nrow(test_date)
paste("Accuracy of rpart model is:", rpart.accuracy * 100, "%", sep = " ")
# Console output recorded in the original write-up:
# [1] "Accuracy of rpart model is: 76.7374301675978 %"
推薦閱讀:
※如何判斷一部電影值不值得看?
※R的矩陣相乘/逆矩陣
※第一關:開啟大數據學習之路
※零基礎學習R語言數據分析
※R語言可視化——REmapH(中心熱度圖)
TAG:R编程语言 |