《R語言實戰》第7章 筆記

1.描述性統計分析:

> myvars <- c("mpg","hp","wt")> head(mtcars[myvars]) mpg hp wtMazda RX4 21.0 110 2.620Mazda RX4 Wag 21.0 110 2.875Datsun 710 22.8 93 2.320Hornet 4 Drive 21.4 110 3.215Hornet Sportabout 18.7 175 3.440Valiant 18.1 105 3.460> #summary()函數提供了最小值、最大值、四分位數和數值型變數的均值,以及因子向量和邏輯型向量的頻數統計。> summary(mtcars[myvars]) mpg hp wt Min. :10.40 Min. : 52.0 Min. :1.513 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 Median :19.20 Median :123.0 Median :3.325 Mean :20.09 Mean :146.7 Mean :3.217 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 Max. :33.90 Max. :335.0 Max. :5.424 > #通過sapply()計算描述性統計量> mystats <- function(x,na.omit=FALSE){+ if(na.omit)+ x <- x[!is.na(x)]+ m <- mean(x)+ n <- length(x)+ s <- sd(x)+ skew <- sum((x-m)^3/s^3)/n+ kurt <- sum((x-m)^4/s^4)/n - 3+ return (c(n=n,mean=m,stdev=s,skew=skew,kurtosis=kurt))+ }> myvars <- c("mpg","hp","wt")> #如果你只希望單純地忽略缺失值,那麼應當使用sapply(mtcars[myvars], mystats,na.omit=TRUE)。> sapply(mtcars[myvars],mystats) mpg hp wtn 32.000000 32.0000000 32.00000000mean 20.090625 146.6875000 3.21725000stdev 6.026948 68.5628685 0.97845744skew 0.610655 0.7260237 0.42314646kurtosis -0.372766 -0.1355511 -0.02271075

用戶貢獻包:

> library(Hmisc)> myvars <- c("mpg","hp","wt")> describe(mtcars[myvars])mtcars[myvars] 3 Variables 32 Observations------------------------------------------------mpg n missing distinct Info Mean 32 0 25 0.999 20.09 Gmd .05 .10 .25 .50 6.796 12.00 14.34 15.43 19.20 .75 .90 .95 22.80 30.09 31.30 lowest : 10.4 13.3 14.3 14.7 15.0highest: 26.0 27.3 30.4 32.4 33.9------------------------------------------------hp n missing distinct Info Mean 32 0 22 0.997 146.7 Gmd .05 .10 .25 .50 77.04 63.65 66.00 96.50 123.00 .75 .90 .95 180.00 243.50 253.55 lowest : 52 62 65 66 91, highest: 215 230 245 264 335------------------------------------------------wt n missing distinct Info Mean 32 0 29 0.999 3.217 Gmd .05 .10 .25 .50 1.089 1.736 1.956 2.581 3.325 .75 .90 .95 3.610 4.048 5.293 lowest : 1.513 1.615 1.835 1.935 2.140highest: 3.845 4.070 5.250 5.345 5.424------------------------------------------------> library(pastecs)> myvars <- c("mpg","hp","wt")> stat.desc(mtcars[myvars]) mpg hpnbr.val 32.0000000 32.0000000nbr.null 0.0000000 0.0000000nbr.na 0.0000000 0.0000000min 10.4000000 52.0000000max 33.9000000 335.0000000range 23.5000000 283.0000000sum 642.9000000 4694.0000000median 19.2000000 123.0000000mean 20.0906250 146.6875000SE.mean 1.0654240 12.1203173CI.mean.0.95 2.1729465 24.7195501var 36.3241028 4700.8669355std.dev 6.0269481 68.5628685coef.var 0.2999881 0.4674077 wtnbr.val 32.0000000nbr.null 0.0000000nbr.na 0.0000000min 1.5130000max 5.4240000range 3.9110000sum 102.9520000median 3.3250000mean 3.2172500SE.mean 0.1729685CI.mean.0.95 0.3527715var 0.9573790std.dev 0.9784574coef.var 0.3041285> library(psych)> myvars <- c("mpg","hp","wt")> describe(mtcars[myvars]) vars n mean sd median trimmed madmpg 1 32 20.09 6.03 19.20 19.70 5.41hp 2 32 146.69 68.56 123.00 141.19 77.10wt 3 32 3.22 0.98 3.33 3.15 0.77 min max range skew kurtosis sempg 10.40 33.90 23.50 0.61 -0.37 1.07hp 52.00 335.00 283.00 0.73 -0.14 12.12wt 1.51 5.42 3.91 0.42 -0.02 0.17

後載入的包會屏蔽之前載入包的同名函數,如果仍然想使用之前包的函數,使用下列方式:Hmisc::describe(mt)。

分組計算描述性統計量:

> myvars <- c("mpg","hp","wt")> aggregate(mtcars[myvars],by=list(am=mtcars$am),mean) am mpg hp wt1 0 17.14737 160.2632 3.7688952 1 24.39231 126.8462 2.411000> aggregate(mtcars[myvars],by=list(am=mtcars$am),sd) am mpg hp wt1 0 3.833966 53.90820 0.77740012 1 6.166504 84.06232 0.6169816> dstats <- function(x)sapply(x,mystats)> myvars <- c("mpg","hp","wt")> by(mtcars[myvars],mtcars$am,dstats)mtcars$am: 0 mpg hp wtn 19.00000000 19.00000000 19.0000000mean 17.14736842 160.26315789 3.7688947stdev 3.83396639 53.90819573 0.7774001skew 0.01395038 -0.01422519 0.9759294kurtosis -0.80317826 -1.20969733 0.1415676------------------------------------ mtcars$am: 1 mpg hp wtn 13.00000000 13.0000000 13.0000000mean 24.39230769 126.8461538 2.4110000stdev 6.16650381 84.0623243 0.6169816skew 0.05256118 1.3598859 0.2103128kurtosis -1.45535200 0.5634635 -1.1737358

2.頻數表和列聯表:

> #數據> head(Arthritis) ID Treatment Sex Age Improved1 57 Treated Male 27 Some2 46 Treated Male 29 None3 77 Treated Male 30 None4 17 Treated Male 32 Marked5 36 Treated Male 46 Marked6 23 Treated Male 58 Marked

用於創建和處理列聯表的函數

一維列聯表:

> #可以使用table()函數生成簡單的頻數統計表> mytable <- with(Arthritis,table(Improved))> mytableImproved None Some Marked 42 14 28 > #可以用prop.table()將這些頻數轉化為比例值> prop.table(mytable)Improved None Some Marked 0.5000000 0.1666667 0.3333333 > #或使用prop.table()*100轉化為百分比> prop.table(mytable)*100Improved None Some Marked 50.00000 16.66667 33.33333

二維列聯表:

> #方法一> mytable <- with(Arthritis,table(Treatment,Improved)) > mytable ImprovedTreatment None Some Marked Placebo 29 7 7 Treated 13 7 21> #方法二> mytable <- xtabs(~Treatment+Improved,data=Arthritis)> mytable ImprovedTreatment None Some Marked Placebo 29 7 7 Treated 13 7 21> #使用margin.table()和prop.table()函數分別生成邊際頻數和比例> margin.table(mytable,1)TreatmentPlacebo Treated 43 41 > prop.table(mytable,1) ImprovedTreatment None Some Marked Placebo 0.6744186 0.1627907 0.1627907 Treated 0.3170732 0.1707317 0.5121951> margin.table(mytable,2)Improved None Some Marked 42 14 28 > prop.table(mytable,2) ImprovedTreatment None Some Marked Placebo 0.6904762 0.5000000 0.2500000 Treated 0.3095238 0.5000000 0.7500000> #各單元格所佔比例可用如下語句獲取> prop.table(mytable) ImprovedTreatment None Some Marked Placebo 0.34523810 0.08333333 0.08333333 Treated 0.15476190 0.08333333 0.25000000> #可以使用addmargins()函數為這些表格添加邊際和> addmargins(mytable) ImprovedTreatment None Some Marked Sum Placebo 29 7 7 43 Treated 13 7 21 41 Sum 42 14 28 84> addmargins(prop.table(mytable)) ImprovedTreatment None Some Marked Placebo 0.34523810 0.08333333 0.08333333 Treated 0.15476190 0.08333333 0.25000000 Sum 0.50000000 0.16666667 0.33333333 ImprovedTreatment Sum Placebo 0.51190476 Treated 0.48809524 Sum 1.00000000> #僅添加了各行的和> addmargins(prop.table(mytable, 2), 1) ImprovedTreatment None Some Marked Placebo 0.6904762 0.5000000 0.2500000 Treated 0.3095238 0.5000000 0.7500000 Sum 1.0000000 1.0000000 1.0000000

多維列聯表:

table() 和xtabs() 都可以基於三個或更多的類別型變數生成多維列聯表。margin.table()、prop.table()和addmargins()函數可以自然地推廣到高於二維的情況。

> #三維列聯表> mytable <- xtabs(~Treatment+Sex+Improved, data=Arthritis)> mytable, , Improved = None SexTreatment Female Male Placebo 19 10 Treated 6 7, , Improved = Some SexTreatment Female Male Placebo 7 0 Treated 5 2, , Improved = Marked SexTreatment Female Male Placebo 6 1 Treated 16 5> #ftable()函數可以以一種緊湊而吸引人的方式輸出多維列聯表> ftable(mytable) Improved None Some MarkedTreatment Sex Placebo Female 19 7 6 Male 10 0 1Treated Female 6 5 16 Male 7 2 5> margin.table(mytable, c(1, 3)) ImprovedTreatment None Some Marked Placebo 29 7 7 Treated 13 7 21> ftable(prop.table(mytable, c(1, 2))) Improved None Some MarkedTreatment Sex Placebo Female 0.59375000 0.21875000 0.18750000 Male 0.90909091 0.00000000 0.09090909Treated Female 0.22222222 0.18518519 0.59259259 Male 0.50000000 0.14285714 0.35714286> ftable(addmargins(prop.table(mytable, c(1, 2)), 3)) Improved None Some Marked SumTreatment Sex Placebo Female 0.59375000 0.21875000 0.18750000 1.00000000 Male 0.90909091 0.00000000 0.09090909 1.00000000Treated Female 0.22222222 0.18518519 0.59259259 1.00000000 Male 0.50000000 0.14285714 0.35714286 1.00000000

獨立性檢驗:

> #卡方獨立性檢驗> library(vcd)> mytable <- xtabs(~Treatment+Improved,data=Arthritis)> chisq.test(mytable) Pearson"s Chi-squared testdata: mytableX-squared = 13.055, df = 2, p-value = 0.001463> #治療情況和改善情況不獨立> mytable <- xtabs(~Improved+Sex, data=Arthritis)> chisq.test(mytable) Pearson"s Chi-squared testdata: mytableX-squared = 4.8407, df = 2, p-value = 0.08889> #性別和改善情況獨立

相關性的度量:

> library(vcd)> mytable <- xtabs(~Treatment+Improved,data=Arthritis)> assocstats(mytable) X^2 df P(> X^2)Likelihood Ratio 13.530 2 0.0011536Pearson 13.055 2 0.0014626Phi-Coefficient : NA Contingency Coeff.: 0.367 Cramer"s V : 0.394

3.相關:

> states <- state.x77[,1:6]> #方差和協方差> cov(states) Population IncomePopulation 19931683.7588 571229.7796Income 571229.7796 377573.3061Illiteracy 292.8680 -163.7020Life Exp -407.8425 280.6632Murder 5663.5237 -521.8943HS Grad -3551.5096 3076.7690 Illiteracy Life ExpPopulation 292.8679592 -407.8424612Income -163.7020408 280.6631837Illiteracy 0.3715306 -0.4815122Life Exp -0.4815122 1.8020204Murder 1.5817755 -3.8694804HS Grad -3.2354694 6.3126849 Murder HS GradPopulation 5663.523714 -3551.509551Income -521.894286 3076.768980Illiteracy 1.581776 -3.235469Life Exp -3.869480 6.312685Murder 13.627465 -14.549616HS Grad -14.549616 65.237894> #Pearson積差相關係數> cor(states) Population Income IlliteracyPopulation 1.00000000 0.2082276 0.1076224Income 0.20822756 1.0000000 -0.4370752Illiteracy 0.10762237 -0.4370752 1.0000000Life Exp -0.06805195 0.3402553 -0.5884779Murder 0.34364275 -0.2300776 0.7029752HS Grad -0.09848975 0.6199323 -0.6571886 Life Exp Murder HS GradPopulation -0.06805195 0.3436428 -0.09848975Income 0.34025534 -0.2300776 0.61993232Illiteracy -0.58847793 0.7029752 -0.65718861Life Exp 1.00000000 -0.7808458 0.58221620Murder -0.78084575 1.0000000 -0.48797102HS Grad 0.58221620 -0.4879710 1.00000000> #Spearman等級相關係數> cor(states,method="spearman") Population Income IlliteracyPopulation 1.0000000 0.1246098 0.3130496Income 0.1246098 1.0000000 -0.3145948Illiteracy 0.3130496 -0.3145948 1.0000000Life Exp -0.1040171 0.3241050 -0.5553735Murder 0.3457401 -0.2174623 0.6723592HS Grad -0.3833649 0.5104809 -0.6545396 Life Exp Murder HS GradPopulation -0.1040171 0.3457401 -0.3833649Income 0.3241050 -0.2174623 0.5104809Illiteracy -0.5553735 0.6723592 -0.6545396Life Exp 1.0000000 -0.7802406 0.5239410Murder -0.7802406 1.0000000 -0.4367330HS Grad 0.5239410 -0.4367330 1.0000000> #非方形的相關矩陣> x <- states[,c("Population", "Income", "Illiteracy", "HS Grad")]> y <- states[,c("Life Exp", "Murder")]> cor(x,y) Life Exp MurderPopulation -0.06805195 0.3436428Income 0.34025534 -0.2300776Illiteracy -0.58847793 0.7029752HS Grad 0.58221620 -0.4879710> #檢驗某種相關係數的顯著性> cor.test(states[,3], states[,5]) Pearson"s product-moment correlationdata: states[, 3] and states[, 5]t = 6.8479, df = 48, p-value = 1.258e-08alternative hypothesis: true correlation is not equal to 095 percent confidence interval: 0.5279280 0.8207295sample estimates: cor 0.7029752

4.t檢驗:

> library(MASS)> t.test(Prob ~ So, data=UScrime) Welch Two Sample t-testdata: Prob by Sot = -3.8954, df = 24.925, p-value =0.0006506alternative hypothesis: true difference in means is not equal to 095 percent confidence interval: -0.03852569 -0.01187439sample estimates:mean in group 0 mean in group 1 0.03851265 0.06371269 > #你可以拒絕南方各州和非南方各州擁有相同監禁概率的假設(p<0.001)。

總結:本章介紹了一些統計量在R語言中的獲取方法,以及如何構建列聯表,如何使用R語言分析數據的獨立性和相關性。了解大致思想,細節部分當遇到具體問題時再探究。

推薦閱讀:

胡說八道互聯網,他讓馬雲又愛又恨
數據可視化(三)那些好用的在線工具
《機器學習》從零開始學(1) 數據分析之「岩石與水雷」
數據化運營管理_互聯網行業(六)(訂單1)

TAG:数据分析 | R编程语言 |