一個小案例，教你如何從數據抓取、數據清洗到數據可視化

01-24

一篇小短文了，囊括數據抓取、數據清洗、數據呈現等全過程，數據主要展示2016年我國top100地級市GDP、增長率、及地區分布密度圖三個維度信息。

# Packages for the scraping and cleaning steps: rvest (read_html, html_nodes,
# html_text), stringr (str_split), plyr (ldply) and dplyr (%>%, inner_join).
# plyr is attached before dplyr, so for names both packages export (such as
# arrange) the dplyr version is the one found first.
library(plyr)
library(rvest)
library(stringr)
library("data.table")
library(dplyr)

隨便找的一篇微信短文，複製網址鏈接直接在瀏覽器打開

# Address of the article page that is scraped below.
readurl <- "http://mp.weixin.qq.com/s?__biz=MzI1ODM5NTQ1Mw==&mid=2247484083&idx=1&sn=ba4f4b10af3e4d6ed45f4d04edc30980&chksm=ea099ee1dd7e17f717afffdb3a3ff82c6e4e6bd351251601f0968c792b7e7cb5cdf084fb86a8&mpshare=1&scene=23&srcid=02039mlTmLqMxQEnb4CnUrK3#rd"

用rvest簡單提取文本內容

# Download the page and parse it as UTF-8 encoded HTML.
web <- read_html(readurl, encoding = "UTF-8")

# Collect the text of every <p> node into a character vector, one element
# per paragraph.
a <- web %>%
  html_nodes("p") %>%
  html_text()

網頁抓取階段完畢，以下轉入數據清洗階段：

#------------------------------------------------------------------------------------------------------

仔細觀察該文本向量可以發現，我們需要的城市數據都是以數字開頭（1~3位不等），其中第七行也是以數字開頭（2017年1月20日），但年份有四位數字，因此使用正則表達式進行精準匹配即可將其排除，並將所有標點符號（記得是中文標點）替換成逗號（英文），方便之後作為分列拆分依據（也可以自定義拆分的符號）

# Keep only the paragraphs that begin with a 1-3 digit rank followed by a
# non-digit character; a line starting with a four-digit year does not match.
# Backslashes are doubled because "\d" is not a valid escape inside an R
# string literal.
a <- grep("^\\d{1,3}\\D", a, value = TRUE)

# Turn the full-width parentheses, comma and colon into an ASCII comma so
# every field is separated by the same delimiter.
a <- gsub("[（），：]", ",", a)

由於四個直轄市文本行與其他城市相比，缺失一個省級標籤，為使之後拆分順利完成，需要將其修改與其他城市一致。

# Rows 1, 2, 5 and 6 are rewritten by hand so that they carry a second
# "province + rank" field like every other row and split into the same
# number of fields.
a[1] <- "1.上海,上海1,26688億元,同比增長6.7%,人口,2415萬,"
a[2] <- "2.北京,北京1,24541億元,同比增長6.7%,人口,2171萬,"
a[5] <- "5.天津,天津1,17800億元,同比增長9%,人口,1547萬,"
a[6] <- "6.重慶,重慶1,17010億元,同比增長10.7%,人口,3372萬,"

剔除中文冗餘文字

# Remove the unit and label words so that only the bare values stay between
# the commas.
dataA <- gsub(
  pattern = "((億元)|(同比增長)|(人口)|(萬))",
  replacement = "",
  x = a
)

字元串拆分，使用stringr中的str_split函數進行拆分，使用plyr中的ldply函數進行數據框轉化，然後對列欄位重命名，重排序。

# Split every row on commas; the result is a list with one character vector
# per row.
temp1 <- str_split(dataA, ",")

# Bind the pieces into a data frame. On the scraped data this first attempt
# stops with the error reproduced below, because not every row has the same
# number of fields.
result1 <- ldply(temp1, .fun = NULL)
# Error in list_to_dataframe(res, attr(.data, "split_labels"), .id, id_as_factor) :
#   Results do not have equal lengths

以上語法出錯，提示長度不等，用函數查看具體哪一行出現不等長的情況。

# Number of fields in each split row.
m <- lengths(temp1)

# Positions of the rows that have only 6 fields. An exact numeric comparison
# is used instead of a text match on the digit "6", which would also hit
# counts such as 16.
which(m == 6)
# [1] 35 36

查看具體情況

# Show the two short rows next to a well-formed one for comparison.
dataA[35:37]
# [1] "35.溫州,浙江3,5110,8%,919,"
# [2] "36.紹興,浙江4,4800,5%,501,"
# [3] "37.鄂爾多斯,內蒙古1,4678,7.3%,,201,"

原來是倒數第二個分割點少了一個逗號，重新補全，使其等長。

# Re-insert the missing comma (an empty field before the last value) so rows
# 35 and 36 split into the same number of fields as the others.
dataA[35] <- "35.溫州,浙江3,5110,8%,,919,"
dataA[36] <- "36.紹興,浙江4,4800,5%,,501,"

再次運行拆分函數：

# Split again now that every row has the same number of fields, and bind the
# pieces into a data frame.
temp1 <- str_split(dataA, ",")
result1 <- ldply(temp1, .fun = NULL)

# Name the seven columns, then drop the two placeholder columns (5 and 7).
names(result1) <- c(
  "city", "province", "gdp", "ratio", "blank1", "scale", "blank2"
)
result1 <- result1[, -c(5, 7)]

提取城市名稱欄位：

# Locate the leading rank number (1-3 digits) in the first column.
wh <- regexpr("[0-9]{1,3}", result1[, 1])

# The rank itself: the matched digits.
order <- substring(result1[, 1], wh, wh + attr(wh, "match.length") - 1)

# The city name: everything after the digits plus one further character.
# NOTE(review): this assumes the rank starts the string and is followed by
# exactly one separator character.
city <- substring(result1[, 1], attr(wh, "match.length") + 2)

# Flag names longer than one character; FALSE marks a suspicious extraction.
nchar(city) > 1

為防止提取的城市名稱不全，查看下是否有不多於一個字元的名稱（結果為 FALSE 的位置即名稱不完整）：

[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE [99] TRUE TRUE

第92名宿遷因為序號後少了一個句點，只提取了一個字元，立即補全（之後要使用城市名稱提取經緯度，否則會失敗）

# Overwrite the truncated name at position 92 with the full city name.
city[92L] <- "宿遷"

合併：

# Put the extracted rank and city name in front of the split columns.
result <- data.frame(order, city, result1)

# Column 3 is the raw "rank + city" column from result1; it is no longer
# needed once rank and city have been separated.
result <- result[, -3]

提取省份名稱、省內排名欄位

# Locate the 1-2 digit number in the third column (province name followed by
# a number).
wm <- regexpr("[0-9]{1,2}", result[, 3])

# Province name: everything before the number.
prov <- substring(result[, 3], 1, wm - 1)

# The number itself. Its length comes from this match (wm), not from the rank
# match (wh) computed earlier on a different column.
scope <- substring(result[, 3], wm, wm + attr(wm, "match.length") - 1)

替換百分號（坑爹啊，R不支持百分號顯示，一律識別為文本）

# Strip the percent sign, parse the remainder as a number and rescale it from
# percent to a proportion (e.g. "6.7%" becomes 0.067).
result$ratio <- sub("%", "", result$ratio)
result$ratio <- as.numeric(result$ratio)
result$ratio <- result$ratio / 100

合併並重排序列欄位

# Add the province name and the number extracted with it in front of the
# existing columns.
resultm <- data.frame(prov, scope, result)

# Reorder to order, city, prov, scope, gdp, ratio, scale; column 5 (the raw
# province string) is left out.
resultm <- resultm[, c(3, 4, 1, 2, 6, 7, 8)]

查看數據框欄位屬性，不符合要求的需要重新定義屬性

# Show the current column types.
str(resultm)

# Convert each column to its intended type. Numeric columns go through
# as.character() first: if data.frame() turned a column into a factor,
# as.numeric() alone would return the internal level codes instead of the
# numbers written in the text. For character columns the extra step is a
# no-op.
resultm$order <- as.numeric(as.character(resultm$order))
resultm$city <- as.character(resultm$city)
resultm$prov <- as.character(resultm$prov)
resultm$scope <- as.numeric(as.character(resultm$scope))
resultm$gdp <- as.numeric(as.character(resultm$gdp))
resultm$scale <- as.numeric(as.character(resultm$scale))

按照排名排序

# Sort the rows by rank.
resultm <- arrange(resultm, order)

# Overwrite the rank of rows 92-100 with their row position.
resultm$order[92:100] <- 92:100

至此數據清洗階段完畢，以下轉入數據呈現

#-----------------------------------------------------------------------------------------------

分布密度圖：

# Packages for the plotting stage.
library(ggplot2)
library(plyr)
library(maptools)
library(ggmap)
library(REmap)

# Read the China boundary shapefile from a fixed local path and convert it to
# a data frame that ggplot2 can draw.
china_map <- readShapePoly("c:/rstudy/bou2_4p.shp")
china_map1 <- fortify(china_map)

# Look up coordinates for every city name, then make sure longitude and
# latitude are numeric.
data <- get_geo_position(resultm$city)
data$lon <- as.numeric(data$lon)
data$lat <- as.numeric(data$lat)

# Density map: province outlines with a filled 2-D density layer computed
# from the city coordinates.
ggplot() +
  geom_polygon(
    data = china_map1,
    aes(x = long, y = lat, group = group),
    fill = "white",
    col = "grey60",
    size = .3
  ) +
  geom_polygon(
    data = data,
    aes(x = lon, y = lat, fill = ..level..),
    stat = "density_2d",
    alpha = .3,
    color = NA
  ) +
  coord_map("polyconic") +
  scale_fill_gradient2(low = "white", mid = "yellow", high = "red") +
  theme_nothing()

gdp總量分布圖：

# Attach the coordinates to the cleaned table. No `by` is given, so the join
# uses whatever column names the two tables share.
# NOTE(review): presumably that is `city` - confirm against the columns
# returned by get_geo_position().
data1 <- inner_join(resultm, data)

# GDP map: province outlines with one point per city, point area scaled by
# GDP.
ggplot() +
  geom_polygon(
    data = china_map1,
    aes(x = long, y = lat, group = group),
    fill = "white",
    col = "grey60",
    size = .3
  ) +
  coord_map("polyconic") +
  geom_point(
    data = data1,
    aes(x = lon, y = lat, size = gdp),
    alpha = 0.6,
    shape = 21,
    fill = "red",
    colour = "white"
  ) +
  scale_size_area(max_size = 12) +
  theme_nothing()

增長率分布圖：

# Growth-rate map: one point per city, with both point size and fill colour
# driven by the growth rate; the colour scale is centred on the median rate.
ggplot() +
  geom_polygon(
    data = china_map1,
    aes(x = long, y = lat, group = group),
    fill = "white",
    col = "grey60",
    size = .3
  ) +
  geom_point(
    data = data1,
    aes(x = lon, y = lat, size = ratio, fill = ratio),
    alpha = 0.6,
    shape = 21,
    colour = "white"
  ) +
  scale_fill_gradient2(
    low = "#0E4E75",
    mid = "#BFBEBE",
    high = "red",
    midpoint = median(data1$ratio, na.rm = TRUE)
  ) +
  coord_map("polyconic") +
  scale_size_area(max_size = 6) +
  theme_nothing()

麻雀雖小，五臟俱全，數據可視化也是一樣！地圖素材在魔方學院QQ群貢獻的rstudy文件中，其他作圖數據可通過運行代碼獲得~

聯繫方式：

wechat：ljty1991

Mail:578708965@qq.com

個人公眾號：數據小魔方（datamofang）

團隊公眾號：EasyCharts

qq交流群：[魔方學院]553270834

github:ljtyduyu (RainDu)