在數據分析方面,比起python,excel的局限性在哪?

昨天用python處理數據的時候,身邊的人對我說這些excel都能實現,請問事實是這樣的嗎?

我想了解有哪些「非python不可」的理由


因為有了像ipython(jupyter) notebook這樣的工具,輔助以像numpy, scipy, matplotlib, seaborn, scikit-learn等科學計算庫可以很方便的進行數據分析以及可視化。這樣說吧,舉個例子最近因為在Kaggle上水,做了一個比賽數據的可視化分析樓主可以參閱哈(所有的代碼和輸出都在同一個界面是不是很方便)

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import time
import seaborn as sns
%matplotlib inline

df_train = pd.read_csv("E:/facebook/train.csv")
df_test = pd.read_csv("E:/facebook/test.csv")
df_train.head()

df_train_sample = df_train.sample(n=1000000)
df_test_sample = df_test.sample(n=1000000)

counts1, bins1 = np.histogram(df_train["accuracy"], bins=50)
binsc1 = bins1[:-1] + np.diff(bins1)/2.

counts2, bins2 = np.histogram(df_test["accuracy"], bins=50)
binsc2 = bins2[:-1] + np.diff(bins2)/2.

plt.figure(0, figsize=(14,4))
plt.style.use("ggplot")
plt.subplot(121)
plt.bar(binsc1, counts1/(counts1.sum()*1.0), w=np.diff(bins1)[0])
plt.grid(True)
plt.xlabel("Accuracy")
plt.ylabel("Fraction")
plt.title("Train")

plt.subplot(122)
plt.bar(binsc2, counts2/(counts2.sum()*1.0), w=np.diff(bins2)[0])
plt.grid(True)
plt.xlabel("Accuracy")
plt.ylabel("Fraction")
plt.title("Test")

plt.show()

current_palette = sns.color_palette()

counts1, bins1 = np.histogram(df_train["time"], bins=50)
binsc1 = bins1[:-1] + np.diff(bins1)/2.

counts2, bins2 = np.histogram(df_test["time"], bins=50)
binsc2 = bins2[:-1] + np.diff(bins2)/2.

plt.figure(1, figsize=(12,3))

plt.subplot(121)
plt.bar(binsc1, counts1/(counts1.sum()*1.0), w=np.diff(bins1)[0], color=current_palette[0])
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Fraction")
plt.title("Train")

plt.subplot(122)
plt.bar(binsc2, counts2/(counts2.sum()*1.0), w=np.diff(bins2)[0], color=current_palette[1])
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Fraction")
plt.title("Test")

plt.show()

plt.figure(2, figsize=(12,3))
plt.bar(binsc1, counts1/(counts1.sum()*1.0), w=np.diff(bins1)[0], color=current_palette[0], label="Train")
plt.bar(binsc2, counts2/(counts2.sum()*1.0), w=np.diff(bins2)[0], color=current_palette[1], label="Test")
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Fraction")
plt.title("Test")
plt.legend()
plt.show()

df_placecounts = df_train["place_id"].value_counts()

counts, bins = np.histogram(df_placecounts.values, bins=50)
binsc = bins[:-1] + np.diff(bins)/2.

plt.figure(3, figsize=(12,6))
plt.bar(binsc, counts/(counts.sum()*1.0), w=np.diff(bins)[0])
plt.grid(True)
plt.xlabel("Number of place occurances")
plt.ylabel("Fraction")
plt.title("Train")
plt.show()

plt.figure(4, figsize=(12,10))

plt.subplot(211)
plt.scatter(df_train_sample["time"], df_train_sample["accuracy"], s=1, c="k", lw=0, alpha=0.1)
plt.xlabel("Time")
plt.ylabel("Accuracy")
plt.xlim(df_train_sample["time"].min(), df_train_sample["time"].max())
plt.ylim(df_train_sample["accuracy"].min(), df_train_sample["accuracy"].max())
plt.title("Train")

plt.subplot(212)
plt.scatter(df_test_sample["time"], df_test_sample["accuracy"], s=1, c="k", lw=0, alpha=0.1)
plt.xlabel("Time")
plt.ylabel("Accuracy")
plt.xlim(df_test_sample["time"].min(), df_test_sample["time"].max())
plt.ylim(df_test_sample["accuracy"].min(), df_test_sample["accuracy"].max())
plt.title("Test")

plt.show()

df_train_sample["xround"] = df_train_sample["x"].round(decimals=1)
df_train_sample["yround"] = df_train_sample["y"].round(decimals=1)
df_groupxy = df_train_sample.groupby(["xround", "yround"]).agg({"accuracy":[np.mean, np.std]})
df_groupxy.head()

idx = np.asarray(list(df_groupxy.index.values))
plt.figure(5, figsize=(14,6))

plt.subplot(121)
plt.scatter(idx[:,0], idx[:,1], s=20, c=df_groupxy["accuracy", "mean"], marker="s", lw=0, cmap=plt.cm.viridis)
plt.colorbar().set_label("Mean accuracy")
plt.grid(True)
plt.xlabel("X")
plt.ylabel("Y")
plt.xlim(0,10)
plt.ylim(0,10)

plt.subplot(122)
plt.scatter(idx[:,0], idx[:,1], s=20, c=df_groupxy["accuracy", "std"], marker="s", lw=0, cmap=plt.cm.viridis)
plt.colorbar().set_label("Std accuracy")
plt.grid(True)
plt.xlabel("X")
plt.ylabel("Y")
plt.xlim(0,10)
plt.ylim(0,10)

plt.tight_layout()
plt.show()

df_topplaces = df_placecounts.iloc[0:20]
l_topplaces = list(df_topplaces.index)
print(l_topplaces)

plt.figure(6, figsize=(14,10))
for i in range(len(l_topplaces)):
place = l_topplaces[i]

df_place = df_train[df_train["place_id"]==place]

counts, bins = np.histogram(df_place["time"], bins=50, range=[df_train["time"].min(), df_train["time"].max()])
binsc = bins[:-1] + np.diff(bins)/2.

plt.subplot(5,4,i+1)
plt.bar(binsc, counts/(counts.sum()*1.0), w=np.diff(bins)[0])
plt.xlim(df_train["time"].min(), df_train["time"].max())
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Fraction")
plt.gca().get_xaxis().set_ticks([])
plt.title("pid: " + str(place))

plt.tight_layout()
plt.show()

plt.figure(7, figsize=(14,10))
for i in range(len(l_topplaces)):
place = l_topplaces[i]

df_place = df_train[df_train["place_id"]==place]

# Try % 3600*24 to see daily trend assuming it"s in seconds
# Try % 60*24 if minutes
counts, bins = np.histogram(df_place["time"]%(60*24), bins=50)
binsc = bins[:-1] + np.diff(bins)/2.

plt.subplot(5,4,i+1)
plt.bar(binsc, counts/(counts.sum()*1.0), w=np.diff(bins)[0])
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Fraction")
plt.gca().get_xaxis().set_ticks([])
plt.title("pid: " + str(place))

plt.tight_layout()
plt.show()

df_train["hour"] = (df_train["time"]%(60*24))/60.
df_train["dayofweek"] = np.ceil((df_train["time"]%(60*24*7))/(60.*24))
df_train["dayofyear"] = np.ceil((df_train["time"]%(60*24*365))/(60.*24))
df_train.head()

df_train_sample["hour"] = (df_train_sample["time"]%(60*24))/60.
df_train_sample["dayofweek"] = np.ceil((df_train_sample["time"]%(60*24*7))/(60.*24))
df_train_sample["dayofyear"] = np.ceil((df_train_sample["time"]%(60*24*365))/(60.*24))

plt.figure(8, figsize=(14,10))
for i in range(20):
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]

# Group by weekday
df_groupday = df_place.groupby("dayofweek").agg("count")

plt.subplot(5,4,i+1)
plt.bar(df_groupday.index.values-0.5, df_groupday["time"], w=1)
plt.grid(True)
plt.xlabel("Day")
plt.ylabel("Count")
plt.title("pid: " + str(place))

plt.tight_layout()
plt.show()

plt.figure(9, figsize=(14,10))
for i in range(20):
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]

# Add some colums
df_place = df_place[df_place["time"]&<(60*24*365)] # Restrict to 1 year so the counts don"t double up df_groupday = df_place.groupby("dayofyear").agg("count") plt.subplot(5,4,i+1) plt.bar(df_groupday.index.values-0.5, df_groupday["time"], w=1) plt.grid(True) plt.xlabel("Day of year") plt.ylabel("Count") plt.xlim(0,365) plt.title("pid: " + str(place)) plt.tight_layout() plt.show()

plt.figure(10, figsize=(14,16))
cmapm = plt.cm.viridis
cmapm.set_bad("0.5",1.)

for i in range(len(l_topplaces)):
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]
counts, binsX, binsY = np.histogram2d(df_place["x"], df_place["y"], bins=100)
extent = [binsX.min(),binsX.max(),binsY.min(),binsY.max()]

plt.subplot(5,4,i+1)
plt.imshow(np.log10(counts.T),
interpolation="none",
origin="lower",
extent=extent,
aspect="auto",
cmap=cmapm)
plt.grid(True, c="0.6", lw=0.5)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))

plt.tight_layout()
plt.show()

plt.figure(11, figsize=(14,16))

for i in range(len(l_topplaces)):
plt.subplot(5,4,i+1)
plt.gca().set_axis_bgcolor("0.5")
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]
plt.scatter(df_place["x"], df_place["y"], s=10, c=df_place["accuracy"], lw=0, cmap=plt.cm.viridis)
plt.grid(True, c="0.6", lw=0.5)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))

plt.tight_layout()
plt.show()

plt.figure(12, figsize=(14,16))

for i in range(len(l_topplaces)):
plt.subplot(5,4,i+1)
plt.gca().set_axis_bgcolor("0.5")
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]
plt.scatter(df_place["x"], df_place["y"], s=10, c=df_place["hour"], lw=0, cmap=plt.cm.viridis)
plt.grid(True, c="0.6", lw=0.5)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))

plt.tight_layout()
plt.show()

i = 11
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]
xmin, xmax = df_place["x"].min(), df_place["x"].max()
ymin, ymax = df_place["y"].min(), df_place["y"].max()
df_noise = df_train[(df_train["x"]&>xmin)
(df_train["x"]&ymin)
(df_train["y"]&

plt.figure(14, figsize=(12,12))

for i in range(20):
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]
plt.scatter(df_place["x"], df_place["y"], s=3, alpha=0.5, c=plt.cm.viridis(int(i*(255/20.))), lw=0)

plt.grid(True)
plt.xlabel("X")
plt.ylabel("Y")
plt.tight_layout()
plt.xlim(0,10)
plt.ylim(0,10)
plt.show()

df_groupplace = df_train.groupby("place_id").agg({"time":"count", "x":"std", "y":"std"})
df_groupplace.sort_values(by="time", inplace=True, ascending=False)
df_groupplace.head()

gkde_stddevx = gaussian_kde(df_groupplace["x"][~df_groupplace["x"].isnull()].values)
gkde_stddevy = gaussian_kde(df_groupplace["y"][~df_groupplace["y"].isnull()].values)

# Compute the functions
rangeX = np.linspace(0, 3, 100)
x_density = gkde_stddevx(rangeX)
y_density = gkde_stddevy(rangeX)

plt.figure(15, figsize=(12,6))
plt.subplot(111)
plt.plot(rangeX, x_density, c=current_palette[0], ls="-", alpha=0.75)
plt.plot(rangeX, y_density, c=current_palette[1], ls="-", alpha=0.75)
plt.gca().fill_between(rangeX, 0, x_density, facecolor=current_palette[0], alpha=0.2)
plt.gca().fill_between(rangeX, 0, y_density, facecolor=current_palette[1], alpha=0.2)
plt.ylabel("Density")
plt.xlabel("Std dev")
plt.plot([], [], c=current_palette[0], alpha=0.2, linew=10, label="stddev x")
plt.plot([], [], c=current_palette[1], alpha=0.2, linew=10, label="stddev y")
plt.legend()
plt.grid(True)

plt.figure(19, figsize=(12,6))
plt.scatter(df_train_sample["hour"], df_train_sample["accuracy"], s=1, c="k", lw=0, alpha=0.05)
plt.xlabel("Hour")
plt.ylabel("Accuracy")
plt.xlim(df_train_sample["hour"].min(), df_train_sample["hour"].max())
plt.ylim(df_train_sample["accuracy"].min(), df_train_sample["accuracy"].max())
plt.title("Train")
plt.show()

i = 11
place = l_topplaces[i]
df_place = df_train[df_train["place_id"]==place]
xmin, xmax = df_place["x"].min(), df_place["x"].max()
ymin, ymax = df_place["y"].min(), df_place["y"].max()

# Calculate the KDE
res = 200 # resolution
gkde_place = gaussian_kde(np.asarray((df_place["x"], df_place["y"])))
x_flat = np.linspace(xmin, xmax, res)
y_flat = np.linspace(ymin, ymax, res)
x, y = np.meshgrid(x_flat,y_flat)
grid_coords = np.append(x.reshape(-1,1),y.reshape(-1,1),axis=1)
z = gkde_place(grid_coords.T)
z = z.reshape(res,res)

# Plot
extent = [xmin,xmax,ymin,ymax]
plt.figure(20, figsize=(12,6))

# KDE only
plt.subplot(121)
plt.imshow(z[::-1,:],
extent=extent,
aspect="auto",
cmap=plt.cm.viridis,
interpolation="bilinear")
plt.grid(False)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))
plt.xlim(xmin,xmax)
plt.ylim(ymin,ymax)

# Overplot the points
plt.subplot(122)
plt.imshow(z[::-1,:],
extent=extent,
aspect="auto",
cmap=plt.cm.viridis,
interpolation="bilinear")
plt.colorbar().set_label("density")
plt.scatter(df_place["x"], df_place["y"], s=10, c="k", lw=0, alpha=0.5)
plt.grid(False)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))
plt.xlim(xmin,xmax)
plt.ylim(ymin,ymax)

plt.tight_layout()
plt.show()

pids = [0,8,9,10,11,14] # A few places
kdes = []
plt.figure(21, figsize=(14,5))
for i in range(len(pids)):
place = l_topplaces[pids[i]]
df_place = df_train[df_train["place_id"]==place]
xmin, xmax = df_place["x"].min(), df_place["x"].max()
ymin, ymax = df_place["y"].min(), df_place["y"].max()

# Calculate the KDE
res = 50 # resolution
gkde_place = gaussian_kde(np.asarray((df_place["x"], df_place["y"])))
kdes.append(gkde_place) # Keep these KDEs for later
x_flat = np.linspace(xmin, xmax, res)
y_flat = np.linspace(ymin, ymax, res)
x, y = np.meshgrid(x_flat,y_flat)
grid_coords = np.append(x.reshape(-1,1),y.reshape(-1,1),axis=1)
z = gkde_place(grid_coords.T)
z = z.reshape(res,res)

# Plot
extent = [xmin,xmax,ymin,ymax]

# KDE only
plt.subplot(2,6,i+1)
plt.imshow(z[::-1,:],
extent=extent,
aspect="auto",
cmap=plt.cm.viridis,
interpolation="bilinear")
plt.grid(False)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))
plt.xlim(xmin,xmax)
plt.ylim(ymin,ymax)

# Overplot the points
plt.subplot(2,6,i+7)
plt.imshow(z[::-1,:],
extent=extent,
aspect="auto",
cmap=plt.cm.viridis,
interpolation="bilinear")
plt.scatter(df_place["x"], df_place["y"], s=5, c="k", lw=0, alpha=0.5)
plt.grid(False)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("pid: " + str(place))
plt.xlim(xmin,xmax)
plt.ylim(ymin,ymax)

plt.tight_layout()
plt.show()

終於寫完了,呼!如果知乎上不好看的話,代碼已上傳到github,不過由於github中打開.ipynb文件特別慢所以我通過jupyter nbviewer來展示這樣會更快一點http://nbviewer.jupyter.org/github/Fenghuapiao/Machine_learing/blob/master/kaggle_facebook.ipynb

還有我的excel2016版的最多也就只能載入100多萬行記錄,而pandas則可以較為容易的處理5T以下的數據。


先說答案:excel能實現的功能python都可以實現,python能實現的excel不一定能實現。

舉個例子:excel不能很好處理大量數據,比如10萬條。

反例:python處理小批量數據的時候並沒有比excel更快捷方便。

結論:人,是會選擇合適的工具做合適的事情的物種。


10萬條數據,你在搞笑,EXCEL能處理1億條數據,你信嗎?只是你們不知道EXCEL有個PowerPivot


速度是主要的,尤其是載入一個大文件,或者excel有很多sheets需要刷新。其次就是python可以用很簡單的語句,循環去實現一個功能,而excel可能需要產生很多中間值或者套用很多函數。


推薦閱讀:

django+nginx+uwsgi+git有哪些自動化部署工具?
uWSGI 伺服器的 uwsgi 協議究竟用在何處?
使用了Gunicorn或者uWSGI,為什麼還需要Nginx?
如何做到R和python的完美配合?
財務一名,已經工作兩年,現在想轉數據分析師,有沒有r語言和python學習的教程?

TAG:Python | 數據分析 | MicrosoftExcel | Excel圖表繪製 |