標籤:

房價預測 (一)(lasso與xgboost)

一、載入所需要的包

library(knitr)

library(ggplot2)

library(plyr)

library(dplyr)

library(corrplot)

library(caret)

library(gridExtra)

library(scales)

library(Rmisc)

library(ggrepel)

library(randomForest)

library(psych)

library(xgboost)

二、讀取觀察、整理數據

test<-read.csv("test.csv",stringsAsFactors = F,na.strings = c("NA",""))

train<-read.csv("train.csv",stringsAsFactors = F,na.strings = c("NA",""))

dim(test)

dim(train)

test_labels <- test$Id

test$Id <- NULL#刪除不需要的序列號

train$Id <- NULL

test$SalePrice <- NA

all <- rbind(train, test)#整合訓練集和測試集的數據,方便處理缺失值和特徵向量

三、初探重要的變數

ggplot(data=all[!is.na(all$SalePrice),], aes(x=SalePrice)) +

geom_histogram(fill="blue", binwidth = 10000) +

scale_x_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

summary(all$SalePrice)

1.數值型變數探究

numericVars <- which(sapply(all, is.numeric))#篩選數據框中數值型變數

numericVarNames <- names(numericVars)#取變數名稱

all_numVar <- all[, numericVars]#取數值型變數的數據

cor_numVar<-cor(all_numVar,use = "pairwise.complete.obs")#展示數值型變數數據的相關係數

view(cor_numVar)

cor_sorted <- as.matrix(sort(cor_numVar[,SalePrice], decreasing = TRUE))#選取跟房價相關的相關係數

view(cor_sorted)#以SalePrice排列

corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")#可視化

2、變數OverallQual

ggplot(data=all[!is.na(all$SalePrice),], aes(x=factor(OverallQual), y=SalePrice))+

geom_boxplot() + labs(x=Overall Quality) +

scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

3、Above Grade (Ground) Living Area (square feet)

ggplot(data=all[!is.na(all$SalePrice),], aes(x=GrLivArea, y=SalePrice))+

geom_point(col=blue) + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +

scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +

geom_text_repel(aes(label = ifelse(all$GrLivArea[!is.na(all$SalePrice)]>4500, rownames(all), )))

all[c(524, 1299), c(SalePrice, GrLivArea, OverallQual)]#離群點

3、缺失值探究

NAcol <- which(colSums(is.na(all)) > 0)

sort(colSums(sapply(all[NAcol], is.na)), decreasing = TRUE)

cat(There are, length(NAcol), columns with missing values)#缺失值探究

(1)poolQC#缺失值最多的變數

all$PoolQC[is.na(all$PoolQC)] <- None

Qualities <- c(None = 0, Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)

all$PoolQC<-as.integer(revalue(all$PoolQC, Qualities))

table(all$PoolQC)

all[all$PoolArea>0 & all$PoolQC==0, c(PoolArea, PoolQC, OverallQual)]

all$PoolQC[2421] <- 2

all$PoolQC[2504] <- 3

all$PoolQC[2600] <- 2

(2)Miscellaneous Feature#缺失第二多、其他條件中包含的部分特性

all$MiscFeature[is.na(all$MiscFeature)] <- None

all$MiscFeature <- as.factor(all$MiscFeature)

ggplot(all[!is.na(all$SalePrice),], aes(x=MiscFeature, y=SalePrice)) +

geom_bar(stat=summary, fun.y = "median", fill=blue) +

scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +

geom_label(stat = "count", aes(label = ..count.., y = ..count..))

table(all$MiscFeature)

(3)Alley 小道路面類型#缺失值第三2721

ggplot(all[!is.na(all$SalePrice),], aes(x=Alley, y=SalePrice)) +

geom_bar(stat=summary, fun.y = "median", fill=blue)+

scale_y_continuous(breaks= seq(0, 200000, by=50000), labels = comma)

(4)Fence quality#圍欄質量 缺失值

all$Fence[is.na(all$Fence)] <- None

table(all$Fence)

ggplot(all[!is.na(all$SalePrice),], aes(x=Fence, y=SalePrice)) +

+ geom_bar(stat=summary, fun.y = "median", fill=blue)+

+ scale_y_continuous(breaks= seq(0, 200000, by=50000), labels = comma)

all$Fence <- as.factor(all$Fence)

(5)Fireplace quality#壁爐質量 (1420)

all$FireplaceQu[is.na(all$FireplaceQu)] <- None

all$FireplaceQu<-as.integer(revalue(all$FireplaceQu, Qualities))

table(all$FireplaceQu)

(6)LotFrontage: Linear feet of street connected to property#房屋同街道(486)

ggplot(all[!is.na(all$LotFrontage),], aes(x=as.factor(Neighborhood), y=LotFrontage)) +

geom_bar(stat=summary, fun.y = "median", fill=blue) +

theme(axis.text.x = element_text(angle = 45, hjust = 1))

(7)LotShape: General shape of property#房屋外形

all$LotShape<-as.integer(revalue(all$LotShape, c(IR3=0, IR2=1, IR1=2, Reg=3)))

table(all$LotShape)

sum(table(all$LotShape))

(8)LotConfig: Lot configuration#配置

ggplot(all[!is.na(all$SalePrice),], aes(x=as.factor(LotConfig), y=SalePrice)) +

geom_bar(stat=summary, fun.y = "median", fill=blue)+

scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +

geom_label(stat = "count", aes(label = ..count.., y = ..count..))

all$LotConfig <- as.factor(all$LotConfig)

table(all$LotConfig)

sum(table(all$LotConfig))

(9)Garage variables#汽車變數(有七個變數和汽車有關)

#Two of those have one NA (GarageCars and GarageArea), one has 157 NAs (GarageType), 4 variables have 159 NAs.

車庫一般和房子是一起修建的所以用房子的修建時間來代替車庫建造時間

all$GarageYrBlt[is.na(all$GarageYrBlt)] <- all$YearBuilt[is.na(all$GarageYrBlt)]

車庫中有157個變數都缺失,應該是沒有車庫所導致的。

kable(all[!is.na(all$GarageType) & is.na(all$GarageFinish), c(GarageCars, GarageArea, GarageType, GarageCond, GarageQual, GarageFinish)])

all$GarageCond[2127] <- names(sort(-table(all$GarageCond)))[1]

all$GarageQual[2127] <- names(sort(-table(all$GarageQual)))[1]

all$GarageFinish[2127] <- names(sort(-table(all$GarageFinish)))[1]

kable(all[2127, c(GarageYrBlt, GarageCars, GarageArea, GarageType, GarageCond, GarageQual, GarageFinish)])

all$GarageCars[2577] <- 0 #結論

all$GarageArea[2577] <- 0

all$GarageType[2577] <- NA

  • GarageType: Garage location#車庫位置

all$GarageType[is.na(all$GarageType)] <- No Garage

all$GarageType <- as.factor(all$GarageType)

table(all$GarageType)

  • GarageFinish: Interior finish of the garage#車庫翻新時間

all$GarageFinish[is.na(all$GarageFinish)] <- None

Finish <- c(None=0, Unf=1, RFn=2, Fin=3)

all$GarageFinish<-as.integer(revalue(all$GarageFinish, Finish))

table(all$GarageFinish)

  • GarageQual: Garage quality#車庫質量

all$GarageQual[is.na(all$GarageQual)] <- None

all$GarageQual<-as.integer(revalue(all$GarageQual, Qualities))

table(all$GarageQual)

。。。。。

  • all$GarageType[is.na(all$GarageType)] <- No Garage

all$GarageType <- as.factor(all$GarageType)

table(all$GarageType)

  • all$GarageFinish[is.na(all$GarageFinish)] <- None

Finish <- c(None=0, Unf=1, RFn=2, Fin=3)

all$GarageFinish<-as.integer(revalue(all$GarageFinish, Finish))

table(all$GarageFinish)

  • all$GarageQual[is.na(all$GarageQual)] <- None

all$GarageQual<-as.integer(revalue(all$GarageQual, Qualities))

table(all$GarageQual)

  • all$GarageCond[is.na(all$GarageCond)] <- None

all$GarageCond<-as.integer(revalue(all$GarageCond, Qualities))

table(all$GarageCond)

(10)Altogether, there are 11 variables that relate to the Basement of a house

length(which(is.na(all$BsmtQual) & is.na(all$BsmtCond) & is.na(all$BsmtExposure) & is.na(all$BsmtFinType1) & is.na(all$BsmtFinType2)))#檢查有多少變數共同缺失值79

  • all[!is.na(all$BsmtFinType1) & (is.na(all$BsmtCond)|is.na(all$BsmtQual)|is.na(all$BsmtExposure)|is.na(all$BsmtFinType2)), c(BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2)]

all$BsmtFinType2[333] <- names(sort(-table(all$BsmtFinType2)))[1]

all$BsmtExposure[c(949, 1488, 2349)] <- names(sort(-table(all$BsmtExposure)))[1]

all$BsmtCond[c(2041, 2186, 2525)] <- names(sort(-table(all$BsmtCond)))[1]

all$BsmtQual[c(2218, 2219)] <- names(sort(-table(all$BsmtQual)))[1]

  • all$BsmtQual[is.na(all$BsmtQual)] <- None

all$BsmtQual<-as.integer(revalue(all$BsmtQual, Qualities))

table(all$BsmtQual)

#BsmtCond: Evaluates the general condition of the basement

all$BsmtCond[is.na(all$BsmtCond)] <- None

all$BsmtCond<-as.integer(revalue(all$BsmtCond, Qualities))

table(all$BsmtCond)

#BsmtExposure: Refers to walkout or garden level walls

all$BsmtExposure[is.na(all$BsmtExposure)] <- None

Exposure <- c(None=0, No=1, Mn=2, Av=3, Gd=4)

all$BsmtExposure<-as.integer(revalue(all$BsmtExposure, Exposure))

table(all$BsmtExposure)

#BsmtFinType1: Rating of basement finished area

all$BsmtFinType1[is.na(all$BsmtFinType1)] <- None

FinType <- c(None=0, Unf=1, LwQ=2, Rec=3, BLQ=4, ALQ=5, GLQ=6)

all$BsmtFinType1<-as.integer(revalue(all$BsmtFinType1, FinType))

table(all$BsmtFinType1)

#BsmtFinType2: Rating of basement finished area (if multiple types)

all$BsmtFinType2[is.na(all$BsmtFinType2)] <- None

FinType <- c(None=0, Unf=1, LwQ=2, Rec=3, BLQ=4, ALQ=5, GLQ=6)

all$BsmtFinType2<-as.integer(revalue(all$BsmtFinType2, FinType))

table(all$BsmtFinType2)

#Remaining Basement variabes with just a few NAs

all[(is.na(all$BsmtFullBath)|is.na(all$BsmtHalfBath)|is.na(all$BsmtFinSF1)|is.na(all$BsmtFinSF2)|is.na(all$BsmtUnfSF)|is.na(all$TotalBsmtSF)), c(BsmtQual, BsmtFullBath, BsmtHalfBath, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF)]

all$BsmtFullBath[is.na(all$BsmtFullBath)] <-0

table(all$BsmtFullBath)

#BsmtHalfBath: Basement half bathrooms

all$BsmtHalfBath[is.na(all$BsmtHalfBath)] <-0

table(all$BsmtHalfBath)

#BsmtFinSF1: Type 1 finished square feet

all$BsmtFinSF1[is.na(all$BsmtFinSF1)] <-0

all$BsmtFinSF2[is.na(all$BsmtFinSF2)] <-0

(11)#Masonry veneer type, and masonry veneer area

length(which(is.na(all$MasVnrType) & is.na(all$MasVnrArea)))

[23]

all[is.na(all$MasVnrType) & !is.na(all$MasVnrArea), c(MasVnrType, MasVnrArea)]

all$MasVnrType[2611] <- names(sort(-table(all$MasVnrType)))[2] #taking the 2nd value as the 1st is none

all[2611, c(MasVnrType, MasVnrArea)]

all$MasVnrType[is.na(all$MasVnrType)] <- None

all[!is.na(all$SalePrice),] %>% group_by(MasVnrType) %>% summarise(median = median(SalePrice), counts=n()) %>% arrange(median)

Masonry <- c(None=0, BrkCmn=0, BrkFace=1, Stone=2)

all$MasVnrType<-as.integer(revalue(all$MasVnrType, Masonry))

table(all$MasVnrType)

all$MasVnrArea[is.na(all$MasVnrArea)] <-0

(12)

all$MSZoning[is.na(all$MSZoning)] <- names(sort(-table(all$MSZoning)))[1]

all$MSZoning <- as.factor(all$MSZoning)

table(all$MSZoning)

sum(table(all$MSZoning))

all$KitchenQual[is.na(all$KitchenQual)] <- TA #replace with most common value

all$KitchenQual<-as.integer(revalue(all$KitchenQual, Qualities))

table(all$KitchenQual)

sum(table(all$KitchenQual))

table(all$Utilities)

kable(all[is.na(all$Utilities) | all$Utilities==NoSeWa, 1:9])

all$Utilities <- NULL

all$Functional[is.na(all$Functional)] <- names(sort(-table(all$Functional)))[1]

all$Functional <- as.integer(revalue(all$Functional, c(Sal=0, Sev=1, Maj2=2, Maj1=3, Mod=4, Min2=5, Min1=6, Typ=7)))

table(all$Functional)

sum(table(all$Functional))

all$Exterior1st[is.na(all$Exterior1st)] <- names(sort(-table(all$Exterior1st)))[1]

all$Exterior1st <- as.factor(all$Exterior1st)

table(all$Exterior1st)

sum(table(all$Exterior1st))

all$Exterior2nd[is.na(all$Exterior2nd)] <- names(sort(-table(all$Exterior2nd)))[1]

all$Exterior2nd <- as.factor(all$Exterior2nd)

table(all$Exterior2nd)

sum(table(all$Exterior2nd))

all$ExterQual<-as.integer(revalue(all$ExterQual, Qualities))

table(all$ExterQual)

sum(table(all$ExterQual))

all$ExterCond<-as.integer(revalue(all$ExterCond, Qualities))

table(all$ExterCond)

sum(table(all$ExterCond))

all$SaleType[is.na(all$SaleType)] <- names(sort(-table(all$SaleType)))[1]

all$SaleType <- as.factor(all$SaleType)

table(all$SaleType)

sum(table(all$SaleType))

all$Electrical[is.na(all$Electrical)] <- names(sort(-table(all$Electrical)))[1]

all$Electrical <- as.factor(all$Electrical)

table(all$Electrical)

sum(table(all$Electrical))

all$SaleCondition <- as.factor(all$SaleCondition)

table(all$SaleCondition)

sum(table(all$SaleCondition))

。。。。

依次檢驗變數


推薦閱讀:

為什麼做stacking ensemble的時候需要固定k-fold?
【持續更新】機器學習特徵工程實用技巧大全
Kaggle入門-泰坦尼克號乘客生存預測
怎麼著手玩kaggle?
我在Kaggle數海獅

TAG:Kaggle |