房價預測 (一)(lasso與xgboost)
一、載入所需要的包
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
二、讀取觀察、整理數據
test<-read.csv("test.csv",stringsAsFactors = F,na.strings = c("NA",""))
train<-read.csv("train.csv",stringsAsFactors = F,na.strings = c("NA",""))
dim(test)
dim(train)
test_labels <- test$Id
test$Id <- NULL#刪除不需要的序列號
train$Id <- NULL
test$SalePrice <- NA
all <- rbind(train, test)#整合訓練集和測試集的數據,方便處理缺失值和特徵向量
三、初探重要的變數
ggplot(data=all[!is.na(all$SalePrice),], aes(x=SalePrice)) +
geom_histogram(fill="blue", binwidth = 10000) +
scale_x_continuous(breaks= seq(0, 800000, by=100000), labels = comma)
summary(all$SalePrice)
1.數值型變數探究
numericVars <- which(sapply(all, is.numeric))#篩選數據框中數值型變數
numericVarNames <- names(numericVars)#取變數名稱
all_numVar <- all[, numericVars]#取數值型變數的數據
cor_numVar<-cor(all_numVar,use = "pairwise.complete.obs")#展示數值型變數數據的相關係數
view(cor_numVar)
cor_sorted <- as.matrix(sort(cor_numVar[,SalePrice], decreasing = TRUE))#選取跟房價相關的相關係數
view(cor_sorted)#以SalePrice排列
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")#可視化
2、變數OverallQual
ggplot(data=all[!is.na(all$SalePrice),], aes(x=factor(OverallQual), y=SalePrice))+
geom_boxplot() + labs(x=Overall Quality) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)
3、Above Grade (Ground) Living Area (square feet)
ggplot(data=all[!is.na(all$SalePrice),], aes(x=GrLivArea, y=SalePrice))+
geom_point(col=blue) + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
geom_text_repel(aes(label = ifelse(all$GrLivArea[!is.na(all$SalePrice)]>4500, rownames(all), )))
all[c(524, 1299), c(SalePrice, GrLivArea, OverallQual)]#離群點
3、缺失值探究
NAcol <- which(colSums(is.na(all)) > 0)
sort(colSums(sapply(all[NAcol], is.na)), decreasing = TRUE)
cat(There are, length(NAcol), columns with missing values)#缺失值探究
(1)poolQC#缺失值最多的變數
all$PoolQC[is.na(all$PoolQC)] <- None
Qualities <- c(None = 0, Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
all$PoolQC<-as.integer(revalue(all$PoolQC, Qualities))
table(all$PoolQC)
all[all$PoolArea>0 & all$PoolQC==0, c(PoolArea, PoolQC, OverallQual)]
all$PoolQC[2421] <- 2
all$PoolQC[2504] <- 3
all$PoolQC[2600] <- 2
(2)Miscellaneous Feature#缺失第二多、其他條件中包含的部分特性
all$MiscFeature[is.na(all$MiscFeature)] <- None
all$MiscFeature <- as.factor(all$MiscFeature)
ggplot(all[!is.na(all$SalePrice),], aes(x=MiscFeature, y=SalePrice)) +
geom_bar(stat=summary, fun.y = "median", fill=blue) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
table(all$MiscFeature)
(3)Alley 小道路面類型#缺失值第三2721
ggplot(all[!is.na(all$SalePrice),], aes(x=Alley, y=SalePrice)) +
geom_bar(stat=summary, fun.y = "median", fill=blue)+
scale_y_continuous(breaks= seq(0, 200000, by=50000), labels = comma)
(4)Fence quality#圍欄質量 缺失值
all$Fence[is.na(all$Fence)] <- None
table(all$Fence)
ggplot(all[!is.na(all$SalePrice),], aes(x=Fence, y=SalePrice)) +
+ geom_bar(stat=summary, fun.y = "median", fill=blue)+
+ scale_y_continuous(breaks= seq(0, 200000, by=50000), labels = comma)
all$Fence <- as.factor(all$Fence)
(5)Fireplace quality#壁爐質量 (1420)
all$FireplaceQu[is.na(all$FireplaceQu)] <- None
all$FireplaceQu<-as.integer(revalue(all$FireplaceQu, Qualities))
table(all$FireplaceQu)
(6)LotFrontage: Linear feet of street connected to property#房屋同街道(486)
ggplot(all[!is.na(all$LotFrontage),], aes(x=as.factor(Neighborhood), y=LotFrontage)) +
geom_bar(stat=summary, fun.y = "median", fill=blue) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
(7)LotShape: General shape of property#房屋外形
all$LotShape<-as.integer(revalue(all$LotShape, c(IR3=0, IR2=1, IR1=2, Reg=3)))
table(all$LotShape)
sum(table(all$LotShape))
(8)LotConfig: Lot configuration#配置
ggplot(all[!is.na(all$SalePrice),], aes(x=as.factor(LotConfig), y=SalePrice)) +
geom_bar(stat=summary, fun.y = "median", fill=blue)+
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..))
all$LotConfig <- as.factor(all$LotConfig)
table(all$LotConfig)
sum(table(all$LotConfig))
(9)Garage variables#汽車變數(有七個變數和汽車有關)
#Two of those have one NA (GarageCars and GarageArea), one has 157 NAs (GarageType), 4 variables have 159 NAs.
車庫一般和房子是一起修建的所以用房子的修建時間來代替車庫建造時間
all$GarageYrBlt[is.na(all$GarageYrBlt)] <- all$YearBuilt[is.na(all$GarageYrBlt)]
車庫中有157個變數都缺失,應該是沒有車庫所導致的。
kable(all[!is.na(all$GarageType) & is.na(all$GarageFinish), c(GarageCars, GarageArea, GarageType, GarageCond, GarageQual, GarageFinish)])
all$GarageCond[2127] <- names(sort(-table(all$GarageCond)))[1]
all$GarageQual[2127] <- names(sort(-table(all$GarageQual)))[1]
all$GarageFinish[2127] <- names(sort(-table(all$GarageFinish)))[1]
kable(all[2127, c(GarageYrBlt, GarageCars, GarageArea, GarageType, GarageCond, GarageQual, GarageFinish)])
all$GarageCars[2577] <- 0 #結論
all$GarageArea[2577] <- 0
all$GarageType[2577] <- NA
- GarageType: Garage location#車庫位置
all$GarageType[is.na(all$GarageType)] <- No Garage
all$GarageType <- as.factor(all$GarageType)
table(all$GarageType)
- GarageFinish: Interior finish of the garage#車庫翻新時間
all$GarageFinish[is.na(all$GarageFinish)] <- None
Finish <- c(None=0, Unf=1, RFn=2, Fin=3)
all$GarageFinish<-as.integer(revalue(all$GarageFinish, Finish))
table(all$GarageFinish)
- GarageQual: Garage quality#車庫質量
all$GarageQual[is.na(all$GarageQual)] <- None
all$GarageQual<-as.integer(revalue(all$GarageQual, Qualities))
table(all$GarageQual)
。。。。。
- all$GarageType[is.na(all$GarageType)] <- No Garage
all$GarageType <- as.factor(all$GarageType)
table(all$GarageType)
- all$GarageFinish[is.na(all$GarageFinish)] <- None
Finish <- c(None=0, Unf=1, RFn=2, Fin=3)
all$GarageFinish<-as.integer(revalue(all$GarageFinish, Finish))
table(all$GarageFinish)
- all$GarageQual[is.na(all$GarageQual)] <- None
all$GarageQual<-as.integer(revalue(all$GarageQual, Qualities))
table(all$GarageQual)
- all$GarageCond[is.na(all$GarageCond)] <- None
all$GarageCond<-as.integer(revalue(all$GarageCond, Qualities))
table(all$GarageCond)
(10)Altogether, there are 11 variables that relate to the Basement of a house
length(which(is.na(all$BsmtQual) & is.na(all$BsmtCond) & is.na(all$BsmtExposure) & is.na(all$BsmtFinType1) & is.na(all$BsmtFinType2)))#檢查有多少變數共同缺失值79
- all[!is.na(all$BsmtFinType1) & (is.na(all$BsmtCond)|is.na(all$BsmtQual)|is.na(all$BsmtExposure)|is.na(all$BsmtFinType2)), c(BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2)]
all$BsmtFinType2[333] <- names(sort(-table(all$BsmtFinType2)))[1]
all$BsmtExposure[c(949, 1488, 2349)] <- names(sort(-table(all$BsmtExposure)))[1]
all$BsmtCond[c(2041, 2186, 2525)] <- names(sort(-table(all$BsmtCond)))[1]
all$BsmtQual[c(2218, 2219)] <- names(sort(-table(all$BsmtQual)))[1]
- all$BsmtQual[is.na(all$BsmtQual)] <- None
all$BsmtQual<-as.integer(revalue(all$BsmtQual, Qualities))
table(all$BsmtQual)
#BsmtCond: Evaluates the general condition of the basement
all$BsmtCond[is.na(all$BsmtCond)] <- None
all$BsmtCond<-as.integer(revalue(all$BsmtCond, Qualities))
table(all$BsmtCond)
#BsmtExposure: Refers to walkout or garden level walls
all$BsmtExposure[is.na(all$BsmtExposure)] <- None
Exposure <- c(None=0, No=1, Mn=2, Av=3, Gd=4)
all$BsmtExposure<-as.integer(revalue(all$BsmtExposure, Exposure))
table(all$BsmtExposure)
#BsmtFinType1: Rating of basement finished area
all$BsmtFinType1[is.na(all$BsmtFinType1)] <- None
FinType <- c(None=0, Unf=1, LwQ=2, Rec=3, BLQ=4, ALQ=5, GLQ=6)
all$BsmtFinType1<-as.integer(revalue(all$BsmtFinType1, FinType))
table(all$BsmtFinType1)
#BsmtFinType2: Rating of basement finished area (if multiple types)
all$BsmtFinType2[is.na(all$BsmtFinType2)] <- None
FinType <- c(None=0, Unf=1, LwQ=2, Rec=3, BLQ=4, ALQ=5, GLQ=6)
all$BsmtFinType2<-as.integer(revalue(all$BsmtFinType2, FinType))
table(all$BsmtFinType2)
#Remaining Basement variabes with just a few NAs
all[(is.na(all$BsmtFullBath)|is.na(all$BsmtHalfBath)|is.na(all$BsmtFinSF1)|is.na(all$BsmtFinSF2)|is.na(all$BsmtUnfSF)|is.na(all$TotalBsmtSF)), c(BsmtQual, BsmtFullBath, BsmtHalfBath, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF)]
all$BsmtFullBath[is.na(all$BsmtFullBath)] <-0
table(all$BsmtFullBath)
#BsmtHalfBath: Basement half bathrooms
all$BsmtHalfBath[is.na(all$BsmtHalfBath)] <-0
table(all$BsmtHalfBath)
#BsmtFinSF1: Type 1 finished square feet
all$BsmtFinSF1[is.na(all$BsmtFinSF1)] <-0
all$BsmtFinSF2[is.na(all$BsmtFinSF2)] <-0
(11)#Masonry veneer type, and masonry veneer area
length(which(is.na(all$MasVnrType) & is.na(all$MasVnrArea)))
[23]
all[is.na(all$MasVnrType) & !is.na(all$MasVnrArea), c(MasVnrType, MasVnrArea)]
all$MasVnrType[2611] <- names(sort(-table(all$MasVnrType)))[2] #taking the 2nd value as the 1st is none
all[2611, c(MasVnrType, MasVnrArea)]
all$MasVnrType[is.na(all$MasVnrType)] <- None
all[!is.na(all$SalePrice),] %>% group_by(MasVnrType) %>% summarise(median = median(SalePrice), counts=n()) %>% arrange(median)
Masonry <- c(None=0, BrkCmn=0, BrkFace=1, Stone=2)
all$MasVnrType<-as.integer(revalue(all$MasVnrType, Masonry))
table(all$MasVnrType)
all$MasVnrArea[is.na(all$MasVnrArea)] <-0
(12)
all$MSZoning[is.na(all$MSZoning)] <- names(sort(-table(all$MSZoning)))[1]
all$MSZoning <- as.factor(all$MSZoning)
table(all$MSZoning)
sum(table(all$MSZoning))
all$KitchenQual[is.na(all$KitchenQual)] <- TA #replace with most common value
all$KitchenQual<-as.integer(revalue(all$KitchenQual, Qualities))
table(all$KitchenQual)
sum(table(all$KitchenQual))
table(all$Utilities)
kable(all[is.na(all$Utilities) | all$Utilities==NoSeWa, 1:9])
all$Utilities <- NULL
all$Functional[is.na(all$Functional)] <- names(sort(-table(all$Functional)))[1]
all$Functional <- as.integer(revalue(all$Functional, c(Sal=0, Sev=1, Maj2=2, Maj1=3, Mod=4, Min2=5, Min1=6, Typ=7)))
table(all$Functional)
sum(table(all$Functional))
all$Exterior1st[is.na(all$Exterior1st)] <- names(sort(-table(all$Exterior1st)))[1]
all$Exterior1st <- as.factor(all$Exterior1st)
table(all$Exterior1st)
sum(table(all$Exterior1st))
all$Exterior2nd[is.na(all$Exterior2nd)] <- names(sort(-table(all$Exterior2nd)))[1]
all$Exterior2nd <- as.factor(all$Exterior2nd)
table(all$Exterior2nd)
sum(table(all$Exterior2nd))
all$ExterQual<-as.integer(revalue(all$ExterQual, Qualities))
table(all$ExterQual)
sum(table(all$ExterQual))
all$ExterCond<-as.integer(revalue(all$ExterCond, Qualities))
table(all$ExterCond)
sum(table(all$ExterCond))
all$SaleType[is.na(all$SaleType)] <- names(sort(-table(all$SaleType)))[1]
all$SaleType <- as.factor(all$SaleType)
table(all$SaleType)
sum(table(all$SaleType))
all$Electrical[is.na(all$Electrical)] <- names(sort(-table(all$Electrical)))[1]
all$Electrical <- as.factor(all$Electrical)
table(all$Electrical)
sum(table(all$Electrical))
all$SaleCondition <- as.factor(all$SaleCondition)
table(all$SaleCondition)
sum(table(all$SaleCondition))
。。。。
依次檢驗變數
推薦閱讀:
※為什麼做stacking ensemble的時候需要固定k-fold?
※【持續更新】機器學習特徵工程實用技巧大全
※Kaggle入門-泰坦尼克號乘客生存預測
※怎麼著手玩kaggle?
※我在Kaggle數海獅
TAG:Kaggle |