HR Analysis

01-28

數據來源：Human Resources Analytics | Kaggle

變數解釋

satisfaction_level：工作滿意度

last_evaluation：最終評價

number_project：工作項目數

average_montly_hours：平均每月工作時間

time_spend_company：在公司任職的年數

Work_accident：是否發生過工作事故

promotion_last_5years：最近五年是否被提升

sales：部門（崗位）

salary：薪水

left：是否離職

分析目的：探究各變數之間的關係，通過建立模型預測員工是否會離職。

# Packages: dplyr for data wrangling, ggplot2 for plotting.
library(dplyr)
library(ggplot2)

# Work on a shorter alias of the data set.
# NOTE(review): HR_comma_sep_1_ is not created anywhere in this script;
# presumably it was loaded into the session beforehand -- confirm.
hr <- HR_comma_sep_1_

# Inspect the structure of the data.
str(hr)

# Copy of the data in which the `sales` column is called `post`.
post <- rename(hr, post = sales)

# Count the missing values in every column.
sapply(hr, function(column) sum(is.na(column)))

# Number of employees who left (left == 1) per post, largest first.
hr_post <- post %>%
  filter(left == 1) %>%
  group_by(post) %>%
  summarize(left = n()) %>%
  arrange(desc(left))

# Horizontal bar chart of leavers per post.
ggplot(hr_post, aes(x = post, y = left, col = post, fill = post)) +
  geom_col(width = .75) +
  coord_flip() +
  scale_x_discrete("Department") +
  scale_y_continuous("Nos of Employees") +
  ggtitle("各部門員工的流動性")

# Salary levels: number of employees at each salary level.
# The grouping column must be `salary`; that is the name the plot below maps
# to x and fill, and `Salaries` is not a column used anywhere else here.
p9 <- hr %>%
  group_by(salary) %>%
  count()

# Bar chart of the counts (`n` is the column produced by count()).
ggplot(p9, aes(x = salary, y = n, fill = salary)) +
  geom_bar(stat = "identity", width = .75) +
  scale_x_discrete("salary") +
  scale_y_continuous("n") +
  ggtitle("各水平工資數量")

# Per-post averages of satisfaction, monthly hours and number of projects.
# Summarise the `post` data frame: that is the object in which `sales` was
# renamed to `post`; `hr` itself still carries the column as `sales`, so
# grouping `hr` by `post` fails.
p10 <- post %>%
  group_by(post) %>%
  summarise(
    sat = mean(satisfaction_level, na.rm = TRUE),
    mon = mean(average_montly_hours, na.rm = TRUE),
    num = mean(number_project, na.rm = TRUE)
  )

# Horizontal bar chart of mean satisfaction per post.
ggplot(p10, aes(x = post, y = sat, col = post, fill = post)) +
  geom_bar(stat = "identity", width = .75) +
  coord_flip() +
  scale_x_discrete("Department") +
  scale_y_continuous("level") +
  ggtitle("各部門滿意度")

# Horizontal bar chart of mean monthly working hours per post
# (`mon` is the monthly-hours average stored in p10).
ggplot(p10, aes(x = post, y = mon, col = post, fill = post)) +
  geom_bar(stat = "identity", width = .75) +
  coord_flip() +
  scale_x_discrete("Department") +
  # The bars show hours, not a satisfaction level, so label the axis as such.
  scale_y_continuous("Average Monthly Hours") +
  ggtitle("各部平均每月工作時間")

# Relationship between average monthly hours and job satisfaction:
# mean satisfaction for each distinct value of average_montly_hours.
p11 <- hr %>%
  group_by(average_montly_hours) %>%
  summarise(
    sat = mean(satisfaction_level, na.rm = TRUE)
  )

# Scatter plot with a smoothed trend line.
# The x variable is mapped by column name: writing `p11$...` inside aes()
# bypasses the plot's data argument (ggplot2 warns against it) and leaks the
# `p11$` prefix into the axis label.
ggplot(p11, aes(x = average_montly_hours, y = sat)) +
  geom_point(position = "jitter", color = "blue", alpha = .5) +
  geom_smooth()

# Number of employees at each salary level, one panel per post.
# Plots the `post` data frame because the facet variable `post` only exists
# there; in `hr` the same column is still named `sales`.
ggplot(post, aes(x = salary, col = salary, fill = salary)) +
  geom_bar(width = .7) +
  facet_grid(~post) +
  scale_x_discrete("Salary level ") +
  scale_y_continuous("Nos of Employees") +
  ggtitle("各崗位的薪水水平")

# Histogram of average monthly hours, coloured by post, with one panel per
# value of `left`.
# Plots the `post` data frame because the colour/fill variable `post` only
# exists there; in `hr` the same column is still named `sales`.
ggplot(post, aes(x = average_montly_hours, color = post, fill = post)) +
  geom_histogram(binwidth = 3) +
  theme(axis.line = element_line(color = "blue", size = 1.25)) +
  theme(legend.position = "bottom") +
  facet_grid(~left) +
  scale_x_continuous("Average Monthly Hours") +
  scale_y_continuous("No of employees ") +
  ggtitle("工作時間與離職的關係")

# Rows for employees with a recorded work accident.
# Filtered from `post`: the object `Data` used previously is not defined in
# this script, and `post` carries the column needed for colouring below.
Data_w <- post %>%
  filter(Work_accident > 0)

# Bar chart of those employees, coloured by post, one panel per value of
# `left`. The filtered rows are the plot data and Work_accident (as a
# discrete value) is the x variable; a whole data frame cannot be mapped to
# an aesthetic.
# NOTE(review): the intended x variable is inferred from the axis title
# "Work accidents" -- confirm.
ggplot(Data_w, aes(x = factor(Work_accident), color = post, fill = post)) +
  geom_bar(width = .7) +
  theme(axis.line = element_line(color = "blue", size = 1.25)) +
  theme(legend.position = "bottom") +
  facet_grid(~left) +
  scale_x_discrete("Work accidents") +
  scale_y_continuous("No of employees ") +
  ggtitle("工作事故與離職")

library(corrgram)

# Correlation matrix of the first eight columns of hr.
hr_ <- hr[1:8]
cor(hr_)

# Correlogram of the same columns: lower triangle drawn with panel.shade,
# upper triangle left empty.
corrgram(
  hr_,
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt,
  main = "相關關係"
)

# Split the data into two sets: each row is assigned to set 1 with
# probability 0.7 and to set 2 with probability 0.3.
# The seed is fixed so that the split, and every figure derived from it, is
# the same on each run.
set.seed(1234)
sample_date <- sample(2, nrow(hr), replace = TRUE, prob = c(0.7, 0.3))
train_date <- hr[sample_date == 1, ]  # training data
test_date <- hr[sample_date == 2, ]   # test data

# Class balance of `left` in each set.
table(train_date$left)
table(test_date$left)

# Logistic regression of `left` on all remaining columns.
glm.model <- glm(left ~ ., data = train_date, family = "binomial")
summary(glm.model)

# Evaluate the model: predicted probabilities for the test set, classified
# with a 0.41 cut-off and cross-tabulated against the actual outcome.
glm.predict <- predict(glm.model, test_date, type = "response")
table(test_date$left, glm.predict > 0.41)

# Accuracy of the logistic model: share of test rows whose thresholded
# prediction (same 0.41 cut-off as the confusion table) agrees with `left`.
# Computed from the predictions instead of hand-copied cell counts, which
# only hold for one particular random split.
glm.accuracy <- mean((glm.predict > 0.41) == (test_date$left == 1))
paste("Accuracy of glm model is:", glm.accuracy * 100, "%", sep = " ")
# Console output recorded from the original run (hard-coded counts 3020 and
# 546); kept as a comment because it is not valid R code:
# [1] "Accuracy of glm model is: 79.6871508379888 %"

# Decision tree
library(rpart.plot)

# NOTE(review): nothing in this script creates `dtree.pruned`. If it is not
# already in the session, fit a classification tree on the training data and
# prune it at the complexity parameter with the lowest cross-validated error.
# Confirm that this matches the tree the original analysis used.
if (!exists("dtree.pruned")) {
  dtree <- rpart::rpart(left ~ ., data = train_date, method = "class")
  best_cp <- dtree$cptable[which.min(dtree$cptable[, "xerror"]), "CP"]
  dtree.pruned <- rpart::prune(dtree, cp = best_cp)
}

# Plot the pruned tree.
prp(
  dtree.pruned,
  type = 2,
  extra = 104,
  fallen.leaves = TRUE,
  main = "Decision Tree"
)

# Predict a class for every test row with the pruned tree, then
# cross-tabulate the actual values of `left` against the predictions.
dtree.pred <- predict(dtree.pruned, newdata = test_date, type = "class")
dtree.perf <- table(
  Actual = test_date$left,
  Predicted = dtree.pred
)
dtree.perf

# Show the confusion table of the decision tree.
dtree.perf

# Accuracy of the tree: share of test rows whose predicted class equals the
# actual value of `left`. Both sides are compared as text because the
# prediction is a factor. Computed from the predictions instead of
# hand-copied cell counts, which only hold for one particular random split.
rpart.accuracy <- mean(
  as.character(dtree.pred) == as.character(test_date$left)
)
paste("Accuracy of rpart model is:", rpart.accuracy * 100, "%", sep = " ")
# Console output recorded from the original run (hard-coded counts 3335 and
# 99); kept as a comment because it is not valid R code:
# [1] "Accuracy of rpart model is: 76.7374301675978 %"