簡單數據處理實踐(一)
來自專欄 數據分析成長之路
案例:
1.excel數據讀取
> library("openxlsx")
> readFilePath<-"D:朝陽醫院2016年銷售數據"
>exceldata<-read.xlsx(readFilePath,"Sheet1")
Error in read.xlsx.default(readFilePath,"Sheet1") : File does not exist.#由於少輸入xlsx
> readFilePath<-"D:朝陽醫院2016年銷售數據.xlsx"
>exceldata<-read.xlsx(readFilePath,"Sheet1")
> View(exceldata)
數據如下:
2.列名更換
>names(exceldata)<-c("time","cardno","drugID","drugName","saleNumber","virtualmoney","actualmoney")
3.去除空行
>exceldata<-exceldata[!is.na(exceldata$time),]#由於逗號問題糾結了好久!!!
4.日期分拆
> library(stringr)
>timesplit<-str_split_fixed(exceldata$time," ",n=2)#分號內為空格。
> exceldata$time<-timesplit[,n=1]#取timesplit的第一列
5.數據類型轉換
> class(exceldata$time)
[1] "character"
>exceldata$time<-as.Date(exceldata$time,"%Y-%m-%d")
> class(exceldata$time)
[1] "Date"
> class(exceldata$saleNumber)
[1] "numeric"
> class(exceldata$virtualmoney)
[1] "numeric"
> class(exceldata$actualmoney)
[1] "numeric"
6.排序
>exceldata<-exceldata[order(exceldata$time,decreasing = FALSE),]#逗號依然是包含所有列
> View(exceldata)
7.月均消費次數
月均消費次數=總消費次數/月數
> Kpi1<-exceldata[!duplicated(exceldata[,c("time","cardno")]),]
> Kpi1
> consumeNumber<-nrow(Kpi1)
> consumeNumber
[1] 5394
> starttime<-Kpi1$time[1]
> endtime<-Kpi1$time[nrow(Kpi1)]
> starttime
[1] "2016-01-01"
> endtime
[1] "2016-07-19"
> day<-endtime-starttime
> day
Time difference of 200 days
> month<-day%/%30
Error in Ops.difftime(day, 30) : %/%對"difftime"對象不適用
> day<-as.numeric(day)#將difftime轉換為numeric
> month<-day%/%30
> month
[1] 6
> monthconsume<-consumeNumber/month
> monthconsume
[1] 899
8.月均消費金額=總消費金額/月份
> monthconsume<-totalconsumer/month
> monthconsume
[1] 50776.38
9客單價
> pct<-totalconsumer/consumeNumber
> pct
[1] 56.48095
10消費趨勢
> week<-tapply(exceldata$actualmoney,format(exceldata$time,"%Y-%U"),sum)
> week
2016-00 2016-01 2016-02 2016-03 2016-04 2016-05 2016-06 2016-07 2016-08
1734.80 9639.95 10925.29 8719.73 15457.23 18518.82 3665.70 8243.47 8406.46
2016-09 2016-10 2016-11 2016-12 2016-13 2016-14 2016-15 2016-16 2016-17
9939.38 8507.50 9829.02 10135.23 8536.40 11407.60 14559.17 10620.04 10696.94
2016-18 2016-19 2016-20 2016-21 2016-22 2016-23 2016-24 2016-25 2016-26
9524.06 9728.40 11863.40 11537.63 9558.70 10788.49 11877.43 14130.16 10894.90
2016-27 2016-28 2016-29
8358.10 13369.76 3484.50
> week<-as.data.frame.table(week)
> names(week)<-c("time","actualmoney")#給數據框增加名字
> week
time actualmoney
1 2016-00 1734.80
2 2016-01 9639.95
3 2016-02 10925.29
4 2016-03 8719.73
5 2016-04 15457.23
6 2016-05 18518.82
7 2016-06 3665.70
8 2016-07 8243.47
9 2016-08 8406.46
10 2016-09 9939.38
11 2016-10 8507.50
12 2016-11 9829.02
13 2016-12 10135.23
14 2016-13 8536.40
15 2016-14 11407.60
16 2016-15 14559.17
17 2016-16 10620.04
18 2016-17 10696.94
19 2016-18 9524.06
20 2016-19 9728.40
21 2016-20 11863.40
22 2016-21 11537.63
23 2016-22 9558.70
24 2016-23 10788.49
25 2016-24 11877.43
26 2016-25 14130.16
27 2016-26 10894.90
28 2016-27 8358.10
29 2016-28 13369.76
30 2016-29 3484.50
> week$time<-as.character(week$time)
> week$timeNumber<-c(1:nrow(week))#向數據框中增加一列
> week
time actualmoney timeNumber
1 2016-00 1734.80 1
2 2016-01 9639.95 2
3 2016-02 10925.29 3
4 2016-03 8719.73 4
5 2016-04 15457.23 5
6 2016-05 18518.82 6
7 2016-06 3665.70 7
8 2016-07 8243.47 8
9 2016-08 8406.46 9
10 2016-09 9939.38 10
11 2016-10 8507.50 11
12 2016-11 9829.02 12
13 2016-12 10135.23 13
14 2016-13 8536.40 14
15 2016-14 11407.60 15
16 2016-15 14559.17 16
17 2016-16 10620.04 17
18 2016-17 10696.94 18
19 2016-18 9524.06 19
20 2016-19 9728.40 20
21 2016-20 11863.40 21
22 2016-21 11537.63 22
23 2016-22 9558.70 23
24 2016-23 10788.49 24
25 2016-24 11877.43 25
26 2016-25 14130.16 26
27 2016-26 10894.90 27
28 2016-27 8358.10 28
29 2016-28 13369.76 29
30 2016-29 3484.50 30
> plot(week$timeNumber,week$actualmoney,
+ xlab = "時間(年份-第幾周)",
+ ylab="消費金額",
+ xaxt="n",
+ main="2016朝陽醫院消費曲線",
+ col="blue",
+ type="b")
There were 14 warnings (use warnings() to see them)
> axis(1,at=week$timeNumber,labels = week$time,cex.axis=1.5)
兩個知識點總結
關於[]最後的逗號
readFilePath<-"D:朝陽醫院2016年銷售數據.xlsx"
library("openxlsx")
excelData<-read.xlsx(readFilePath,"Sheet1")
View(excelData)
names(excelData)<-c("time","cardno","drugID","drugName","saleNumber","virtualmoner","actualmoner")
View(excelData)
excelData<-excelData[!is.na(excelData$time),1:5]#顯示所有行的前5列,後面寫1則顯示第一列,後面無數字則默認顯示所有列
2.關於str_split_fixed
1. 字元串分割函數 str_split(),str_split_fixed()
調用公式:
> str_split(string, pattern, n = Inf) ## 返回列表格式
> str_split_fixed(string, pattern, n) ## 返回矩陣格式
# n:指定返回分割的個數
> data <- myxyznamexyzisxyzKobexyzBryant!
> str_split(data,xyz)
> library("stringr")
> str_split(data,xyz)
[[1]]
[1] "my" "name" "is" "Kobe" "Bryant!"
> str_split_fixed(data,xyz,n=2)
[,1] [,2]
[1,] "my" "namexyzisxyzKobexyzBryant!"
> str_split(data,xyz,n=3)
[[1]]
[1] "my" "name" "isxyzKobexyzBryant!"
推薦閱讀:
※糗事百科數據淺析
※轉戰Python,數據分析入門
※使用開源軟體快速搭建數據分析平台
※Nesto - Hulu用戶分析平台的OLAP引擎
※我們用 RNN 分析了最流行的音樂,只想為你寫首歌
TAG:大數據分析 |