標籤:

簡單數據處理實踐(一)

簡單數據處理實踐(一)

來自專欄 數據分析成長之路

案例:

1.excel數據讀取

> library("openxlsx")

> readFilePath<-"D:朝陽醫院2016年銷售數據"

>exceldata<-read.xlsx(readFilePath,"Sheet1")

Error in read.xlsx.default(readFilePath,"Sheet1") : File does not exist.#由於少輸入xlsx

> readFilePath<-"D:朝陽醫院2016年銷售數據.xlsx"

>exceldata<-read.xlsx(readFilePath,"Sheet1")

> View(exceldata)

數據如下:

2.列名更換

>names(exceldata)<-c("time","cardno","drugID","drugName","saleNumber","virtualmoney","actualmoney")

3.去除空行

>exceldata<-exceldata[!is.na(exceldata$time),]#由於逗號問題糾結了好久!!!

4.日期分拆

> library(stringr)

>timesplit<-str_split_fixed(exceldata$time," ",n=2)#分號內為空格。

> exceldata$time<-timesplit[,n=1]#取timesplit的第一列

5.數據類型轉換

  1. > class(exceldata$time)

    [1] "character"

    >exceldata$time<-as.Date(exceldata$time,"%Y-%m-%d")

    > class(exceldata$time)

    [1] "Date"

> class(exceldata$saleNumber)

[1] "numeric"

> class(exceldata$virtualmoney)

[1] "numeric"

> class(exceldata$actualmoney)

[1] "numeric"

6.排序

>exceldata<-exceldata[order(exceldata$time,decreasing = FALSE),]#逗號依然是包含所有列

> View(exceldata)

7.月均消費次數

月均消費次數=總消費次數/月數

> Kpi1<-exceldata[!duplicated(exceldata[,c("time","cardno")]),]

> Kpi1

> consumeNumber<-nrow(Kpi1)

> consumeNumber

[1] 5394

> starttime<-Kpi1$time[1]

> endtime<-Kpi1$time[nrow(Kpi1)]

> starttime

[1] "2016-01-01"

> endtime

[1] "2016-07-19"

> day<-endtime-starttime

> day

Time difference of 200 days

> month<-day%/%30

Error in Ops.difftime(day, 30) : %/%對"difftime"對象不適用

> day<-as.numeric(day)#將difftime轉換為numeric

> month<-day%/%30

> month

[1] 6

> monthconsume<-consumeNumber/month

> monthconsume

[1] 899

8.月均消費金額=總消費金額/月份

> monthconsume<-totalconsumer/month

> monthconsume

[1] 50776.38

9客單價

> pct<-totalconsumer/consumeNumber

> pct

[1] 56.48095

10消費趨勢

> week<-tapply(exceldata$actualmoney,format(exceldata$time,"%Y-%U"),sum)

> week

2016-00 2016-01 2016-02 2016-03 2016-04 2016-05 2016-06 2016-07 2016-08

1734.80 9639.95 10925.29 8719.73 15457.23 18518.82 3665.70 8243.47 8406.46

2016-09 2016-10 2016-11 2016-12 2016-13 2016-14 2016-15 2016-16 2016-17

9939.38 8507.50 9829.02 10135.23 8536.40 11407.60 14559.17 10620.04 10696.94

2016-18 2016-19 2016-20 2016-21 2016-22 2016-23 2016-24 2016-25 2016-26

9524.06 9728.40 11863.40 11537.63 9558.70 10788.49 11877.43 14130.16 10894.90

2016-27 2016-28 2016-29

8358.10 13369.76 3484.50

> week<-as.data.frame.table(week)

> names(week)<-c("time","actualmoney")#給數據框增加名字

> week

time actualmoney

1 2016-00 1734.80

2 2016-01 9639.95

3 2016-02 10925.29

4 2016-03 8719.73

5 2016-04 15457.23

6 2016-05 18518.82

7 2016-06 3665.70

8 2016-07 8243.47

9 2016-08 8406.46

10 2016-09 9939.38

11 2016-10 8507.50

12 2016-11 9829.02

13 2016-12 10135.23

14 2016-13 8536.40

15 2016-14 11407.60

16 2016-15 14559.17

17 2016-16 10620.04

18 2016-17 10696.94

19 2016-18 9524.06

20 2016-19 9728.40

21 2016-20 11863.40

22 2016-21 11537.63

23 2016-22 9558.70

24 2016-23 10788.49

25 2016-24 11877.43

26 2016-25 14130.16

27 2016-26 10894.90

28 2016-27 8358.10

29 2016-28 13369.76

30 2016-29 3484.50

> week$time<-as.character(week$time)

> week$timeNumber<-c(1:nrow(week))#向數據框中增加一列

> week

time actualmoney timeNumber

1 2016-00 1734.80 1

2 2016-01 9639.95 2

3 2016-02 10925.29 3

4 2016-03 8719.73 4

5 2016-04 15457.23 5

6 2016-05 18518.82 6

7 2016-06 3665.70 7

8 2016-07 8243.47 8

9 2016-08 8406.46 9

10 2016-09 9939.38 10

11 2016-10 8507.50 11

12 2016-11 9829.02 12

13 2016-12 10135.23 13

14 2016-13 8536.40 14

15 2016-14 11407.60 15

16 2016-15 14559.17 16

17 2016-16 10620.04 17

18 2016-17 10696.94 18

19 2016-18 9524.06 19

20 2016-19 9728.40 20

21 2016-20 11863.40 21

22 2016-21 11537.63 22

23 2016-22 9558.70 23

24 2016-23 10788.49 24

25 2016-24 11877.43 25

26 2016-25 14130.16 26

27 2016-26 10894.90 27

28 2016-27 8358.10 28

29 2016-28 13369.76 29

30 2016-29 3484.50 30

> plot(week$timeNumber,week$actualmoney,

+ xlab = "時間(年份-第幾周)",

+ ylab="消費金額",

+ xaxt="n",

+ main="2016朝陽醫院消費曲線",

+ col="blue",

+ type="b")

There were 14 warnings (use warnings() to see them)

> axis(1,at=week$timeNumber,labels = week$time,cex.axis=1.5)

兩個知識點總結

關於[]最後的逗號

readFilePath<-"D:朝陽醫院2016年銷售數據.xlsx"

library("openxlsx")

excelData<-read.xlsx(readFilePath,"Sheet1")

View(excelData)

names(excelData)<-c("time","cardno","drugID","drugName","saleNumber","virtualmoner","actualmoner")

View(excelData)

excelData<-excelData[!is.na(excelData$time),1:5]#顯示所有行的前5列,後面寫1則顯示第一列,後面無數字則默認顯示所有列

2.關於str_split_fixed

1. 字元串分割函數 str_split(),str_split_fixed()

調用公式:

> str_split(string, pattern, n = Inf) ## 返回列表格式

> str_split_fixed(string, pattern, n) ## 返回矩陣格式

# n:指定返回分割的個數

> data <- myxyznamexyzisxyzKobexyzBryant!

> str_split(data,xyz)

> library("stringr")

> str_split(data,xyz)

[[1]]

[1] "my" "name" "is" "Kobe" "Bryant!"

> str_split_fixed(data,xyz,n=2)

[,1] [,2]

[1,] "my" "namexyzisxyzKobexyzBryant!"

> str_split(data,xyz,n=3)

[[1]]

[1] "my" "name" "isxyzKobexyzBryant!"


推薦閱讀:

糗事百科數據淺析
轉戰Python,數據分析入門
使用開源軟體快速搭建數據分析平台
Nesto - Hulu用戶分析平台的OLAP引擎
我們用 RNN 分析了最流行的音樂,只想為你寫首歌

TAG:大數據分析 |