U.S. Pollution Data
項目來源:U.S. Pollution Data | Kaggle
數據為:美國2000-2016期間空氣質量的數據(主要收集了4個最主要的污染物,分別是二氧化氮,二氧化硫,一氧化碳,臭氧)
目標:圖形化顯示這些數據的變化
數據量:1746661條記錄,每條記錄包含29個特徵
1、理解數據
1.1導入開發包以及數據
#coding:utf-8#針對分析的是2000-2016年期間California的空氣質量的分析,#主要收集了4個最主要的污染物,分別是二氧化氮,二氧化硫,一氧化碳,臭氧import pandas as pdimport numpy as npfrom datetime import datetime#visualizationimport matplotlib as mplimport matplotlib.pyplot as plt%matplotlib inline# Warningsimport warningswarnings.filterwarnings(ignore)#導入數據df = pd.read_csv("../input/pollution_us_2000_2016.csv")
1.2查看數據的類型,並進行相應的數據轉換
df.info()
# Output recorded in the original notebook:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1746661 entries, 0 to 1746660
# Data columns (total 29 columns):
# Unnamed: 0           int64
# State Code           int64
# County Code          int64
# Site Num             int64
# Address              object
# State                object
# County               object
# City                 object
# Date Local           object
# NO2 Units            object
# NO2 Mean             float64
# NO2 1st Max Value    float64
# NO2 1st Max Hour     int64
# NO2 AQI              int64
# O3 Units             object
# O3 Mean              float64
# O3 1st Max Value     float64
# O3 1st Max Hour      int64
# O3 AQI               int64
# SO2 Units            object
# SO2 Mean             float64
# SO2 1st Max Value    float64
# SO2 1st Max Hour     int64
# SO2 AQI              float64
# CO Units             object
# CO Mean              float64
# CO 1st Max Value     float64
# CO 1st Max Hour      int64
# CO AQI               float64
# dtypes: float64(10), int64(10), object(9)
# memory usage: 386.5+ MB

# "Date Local" holds the time series but is loaded as object; convert it to datetime.
df["Date Local"] = pd.to_datetime(df["Date Local"], format="%Y-%m-%d")

# Split the year out of the date; it is used to analyse how the data changes over time.
df["Year"] = df["Date Local"].dt.year
1.3特徵選擇
# Delete the first column.
df.drop("Unnamed: 0", axis=1, inplace=True)

# Each of the four pollutants is described by: Units, Mean, AQI,
# 1st Max Value and 1st Max Hour.
# AQI alone can represent the pollutant level, so the other features are
# not analysed for now.
# The year is also needed for the trend analysis, together with the names
# of the state, county and city.
# Selected features: State, City, NO2 AQI, O3 AQI, SO2 AQI, CO AQI, Year
2、探索型分析
2.1記錄數據統計
# Plot how many pollution measurements were recorded, per state and per year.
f, ax = plt.subplots(1, 2, figsize=[15, 6])  # one row, two panels

# The feature analysed here is "SO2 AQI".
var = "SO2 AQI"

# Left panel: group by state, count the non-null SO2 AQI records of each
# state, sort the counts in descending order and draw them as a bar chart.
(
    df[[var, "State"]]
    .groupby(["State"])
    .count()
    .sort_values(by=var, ascending=False)
    .plot.bar(ax=ax[0])
)
ax[0].set_title("Number of Measurements, by State")

# temp_df: number of SO2 AQI records per year over all states.
# reset_index() moves Year from the index back into a regular column;
# the rows are then sorted by Year in descending order.
temp_df = (
    df[[var, "Year", "State"]]
    .groupby(["Year"])
    .count()
    .reset_index()
    .sort_values(by="Year", ascending=False)
)

# topstate: the five states with the most SO2 AQI records.
topstate = (
    df[[var, "State"]]
    .groupby(["State"])
    .count()
    .sort_values(by=var, ascending=False)
    .index[:5]
)

# One colour per top state.
state_col = ["green", "red", "yellow", "orange", "purple"]

# Right panel: title and axis labels.
ax[1].set_title("Number of Observations for {} by Year".format(var))
ax[1].set_xlabel("Year")
ax[1].set_ylabel("Observation Count")

# Overall time series: x is the year, y is the number of SO2 AQI records.
# Drawn explicitly on ax[1] and labelled so the legend can be built from
# the line labels.
ax[1].plot(temp_df.Year, temp_df[var], marker="o", linestyle="--",
           color="black", label="All")

# One time series per top-5 state.
for state, col in zip(topstate, state_col):
    state_df = (
        df[df.State == state][[var, "Year"]]
        .groupby(["Year"])
        .count()
        .reset_index()
        .sort_values(by="Year", ascending=False)
    )
    ax[1].plot(state_df.Year, state_df[var], marker="o", linestyle="--",
               color=col, label=state)

ax[1].legend(loc=2, fontsize="large")  # loc=2 is "upper left"
plt.show()
California 的污染物記錄比其他州更充足,所以我們可以對這個州的數據做重點分析
2.2趨勢分析
# Show the yearly trend of the four pollutants.
f, ax = plt.subplots(figsize=[10, 4])  # figure size

# A trend plot needs time on the x axis and an aggregate of each variable
# on the y axis: here the mean AQI of every pollutant per year.
df_trend = df.groupby(["Year"]).agg({
    "SO2 AQI": "mean",
    "CO AQI": "mean",
    "NO2 AQI": "mean",
    "O3 AQI": "mean",
})

# Line plot with point markers, one line per pollutant.
df_trend.plot(lw=2, colormap="jet", marker=".", markersize=10, ax=ax)
ax.set_title("Mean Pollutant AQI Over Time", fontsize=20, fontweight="bold")
ax.set(ylabel="Average AQI", xlabel="Year")  # axis labels
plt.show()
3 California的數據分析
3.1根據時間繪製污染物的趨勢圖
# Extract the records whose state is California and analyse its pollutants.
cal = df[df.State == "California"]
# Drop every row that contains a NaN value.
cal = cal.dropna(axis=0)

# Input
cols = ["black", "blue", "green", "red"]  # one line colour per pollutant
polldata = ["NO2 AQI", "O3 AQI", "SO2 AQI", "CO AQI"]


# Plotter
def row_plots(data, time, rol_window):
    """Draw stacked trend plots of the pollutants in ``cal``.

    Two figures are shown. The first has one row per entry of ``data`` and
    plots the rolling mean of that column averaged per ``time`` value. The
    second has the same layout but, for each pollutant, plots the four
    cities with the highest mean value of that pollutant.

    Reads the module-level ``cal`` DataFrame and ``cols`` colour list.

    Args:
        data: sequence of pollutant column names in ``cal``; ``cols`` must
            provide a colour for each position.
        time: name of the column in ``cal`` to group by (the x axis).
        rol_window: window size passed to ``rolling(window=...)``.
    """
    # One subplot row per pollutant, all sharing the x axis.
    f, axarr = plt.subplots(len(data), sharex=True, figsize=[10, 6])
    for index, x in enumerate(data):
        # Mean of the pollutant per time value, then smoothed.
        plot1 = cal[[x, time]].groupby([time]).mean()
        plot1[x] = plot1[x].rolling(window=rol_window).mean()
        axarr[index].set_ylabel("{}".format(x))
        axarr[index].plot(plot1, color=cols[index], label=x)
        axarr[index].legend(fontsize="large", loc="center left",
                            bbox_to_anchor=(1, 0.5))
    plt.tight_layout(pad=0)
    plt.subplots_adjust(top=0.90)  # leave room for the suptitle
    plt.suptitle("Trend of Average Pollutants by {}".format(time), fontsize=17)
    plt.show()

    # City plots: mean per (City, time), regrouped by city.
    city_poll = (
        cal[["City", time, "NO2 AQI", "O3 AQI", "CO AQI", "SO2 AQI"]]
        .groupby(["City", time])
        .mean()
        .groupby(level="City")
    )
    f, axarr = plt.subplots(len(data), sharex=True, figsize=[10, 6])
    for index, x in enumerate(data):
        pollutant_plot = city_poll[x]
        # The four cities with the highest mean value of this pollutant.
        pollutant_plotTop = pollutant_plot.mean().nlargest(4).index
        for i in pollutant_plotTop:
            lineplot = (
                pollutant_plot.get_group(i)
                .groupby(pd.Grouper(level=time))
                .mean()
                .rolling(window=rol_window)
                .mean()
            )
            axarr[index].plot(lineplot)
        axarr[index].legend(pollutant_plotTop, fontsize="large",
                            loc="center left", bbox_to_anchor=(1, 0.5))
        axarr[index].set_ylabel("{}".format(x))
    plt.tight_layout(pad=0)
    plt.subplots_adjust(top=0.90)
    plt.suptitle(
        "Trend of Average Pollutants of Top 4 Cities by {}".format(time),
        fontsize=17,
    )
    plt.show()


# Show how the pollutants change over time, overall and for the four most
# polluted cities of each pollutant.
row_plots(data=polldata, time="Date Local", rol_window=80)
從上圖可以看出,各項污染物的變化趨勢都存在波動,說明有一定的季節性
3.2 隨著天數的變化趨勢
# Convert each date to its ordinal day of the year, so the data can be
# analysed by day and checked for a dependence on the day of the year.
cal["Date of Year"] = cal["Date Local"].dt.dayofyear


# Plot Mega-Helper
def years_city_plot(time, rol_window):
    """Draw 2x2 trend plots of the four pollutant AQIs in ``cal``.

    Two figures are shown. The first plots, per pollutant, the rolling mean
    of the AQI averaged per ``time`` value. The second plots, per pollutant,
    the same quantity for the four cities with the highest mean AQI.

    Reads the module-level ``cal`` DataFrame.

    Args:
        time: name of the column in ``cal`` to group by (the x axis).
        rol_window: window size passed to ``rolling(window=...)``.
    """
    panels = [("NO2 AQI", 221), ("O3 AQI", 222), ("SO2 AQI", 223), ("CO AQI", 224)]

    plt.figure(figsize=(12, 8))
    for var, plot in panels:
        plt.subplot(plot)  # select the panel of the 2x2 grid
        # Mean per time value, then smoothed.
        plot1 = cal[[var, time]].groupby([time]).mean()
        plot1[var] = plot1[var].rolling(window=rol_window).mean()
        plt.plot(plot1, color="green", label=var)
        plt.title(var)
        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.12),
                   fancybox=True, shadow=True, ncol=4)
        plt.xlabel(time)
        plt.ylabel("Air Quality Index")
    plt.tight_layout(pad=0, w_pad=0.5, h_pad=2.5)
    plt.subplots_adjust(top=0.90)  # leave room for the suptitle
    plt.suptitle("Trend of Average Pollutants by {}".format(time), fontsize=17)
    plt.show()

    # City: mean per (City, time), regrouped by city.
    city_poll = (
        cal[["City", time, "NO2 AQI", "O3 AQI", "CO AQI", "SO2 AQI"]]
        .groupby(["City", time])
        .mean()
        .groupby(level="City")
    )
    plt.figure(figsize=(12, 8))
    for var, plot in panels:
        plt.subplot(plot)
        pollutant_plot = city_poll[var]
        # The four cities with the highest mean value of this pollutant.
        pollutant_plotTop = pollutant_plot.mean().nlargest(4).index
        for i in pollutant_plotTop:
            plot1 = (
                pollutant_plot.get_group(i)
                .groupby(pd.Grouper(level=time))
                .mean()
                .rolling(window=rol_window)
                .mean()
                .plot()
            )
        plt.title(var)
        plt.xlabel(time)
        plt.ylabel("Air Quality Index")
        plt.legend(pollutant_plotTop, loc="upper center",
                   bbox_to_anchor=(0.5, -0.12), fancybox=True, shadow=True,
                   ncol=4)
    plt.tight_layout(pad=0, w_pad=0.5, h_pad=2.5)
    plt.subplots_adjust(top=0.9)
    plt.suptitle(
        "Trend of Average Pollutant by Top 4 City by {}".format(time),
        fontsize=17,
    )
    plt.show()


years_city_plot(time="Date of Year", rol_window=5)
從上圖可以看出CO與NO2的污染物AQI的曲線相似
3.3分析在工作日與休息日污染物排放的變化
# Compare pollutant levels between working days and the weekend with bar charts.
a = 0.80  # bar transparency (alpha)

# pol_bar_plot is called with time="Weekday" below, but no visible cell
# creates that column. Derive it from the date when it is missing:
# Series.dt.weekday yields Monday=0 ... Sunday=6, matching the tick labels.
if "Weekday" not in cal.columns:
    cal["Weekday"] = cal["Date Local"].dt.weekday


def pol_bar_plot(time, rol_window):
    """Draw 2x2 bar charts of the four pollutant AQIs in ``cal``.

    Two figures are shown. The first plots, per pollutant, the rolling mean
    of the AQI averaged per ``time`` value. The second plots, per pollutant,
    grouped bars for the four cities with the highest mean AQI.

    Reads the module-level ``cal`` DataFrame and the alpha value ``a``.
    The x tick labels are the seven weekday names, so ``time`` is expected
    to produce seven groups ordered Monday..Sunday.

    Args:
        time: name of the column in ``cal`` to group by (the x axis).
        rol_window: window size passed to ``rolling(window=...)``.
    """
    pollutants = ["NO2 AQI", "O3 AQI", "SO2 AQI", "CO AQI"]
    panels = [("NO2 AQI", 221), ("O3 AQI", 222), ("SO2 AQI", 223), ("CO AQI", 224)]
    labels = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
              "Saturday", "Sunday"]

    plt.figure(figsize=(12, 8))
    width = .90
    # Average only the pollutant columns: taking the mean of the whole frame
    # would also hit text columns such as Address or City.
    plot1 = cal.groupby([time])[pollutants].mean()
    plot1 = plot1.rolling(window=rol_window).mean()
    X = list(range(len(plot1.index)))
    positions = [p + width for p in X]
    for var, plot in panels:
        plt.subplot(plot)  # select the panel of the 2x2 grid
        # plt.bar takes the bar positions and heights positionally and the
        # bar width as ``width``; bars are centred on the tick positions.
        plt.bar(positions, plot1[var], width=width, label=var, alpha=a)
        plt.title(var)
        plt.ylabel("Air Quality Index")
        plt.xticks(positions, labels)
        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.05),
                   fancybox=True, shadow=True, ncol=4)
    plt.tight_layout(pad=0, w_pad=0.5, h_pad=2.5)
    plt.subplots_adjust(top=0.9)  # leave room for the suptitle
    plt.suptitle("Trend of Average Pollutant by {}".format(time), fontsize=17)
    plt.show()

    # City: mean per (City, time), regrouped by city.
    city_poll = (
        cal[["City", time, "NO2 AQI", "O3 AQI", "CO AQI", "SO2 AQI"]]
        .groupby(["City", time])
        .mean()
        .groupby(level="City")
    )
    width = .22  # width of one bar inside a group
    plt.figure(figsize=(12, 8))
    for var, plot in panels:
        plt.subplot(plot)
        pollutant_plot = city_poll[var]
        # The four cities with the highest mean value of this pollutant.
        pollutant_plotTop = pollutant_plot.mean().nlargest(4).index
        for city_idx, i in enumerate(pollutant_plotTop):
            plot1 = (
                pollutant_plot.get_group(i)
                .groupby(pd.Grouper(level=time))
                .mean()
                .rolling(window=rol_window)
                .mean()
            )
            # align="edge" makes each group span [p, p + width * n_cities],
            # so the tick below sits at the centre of the group.
            plt.bar([p + width * city_idx for p in X], plot1, width=width,
                    align="edge", label=i, alpha=a)
        plt.title(var)  # title of each panel
        plt.ylabel("Air Quality Index")
        plt.xticks([p + (width * len(pollutant_plotTop)) / 2 for p in X], labels)
        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.05),
                   fancybox=True, shadow=True, ncol=4)
    plt.tight_layout(pad=0, w_pad=0.5, h_pad=2.5)
    plt.subplots_adjust(top=0.89)
    plt.suptitle(
        "Trend of Average Pollutant by Top 4 City by {}".format(time),
        fontsize=17,
    )
    plt.show()


# Plot Data
pol_bar_plot(time="Weekday", rol_window=1)
可以看出工作日與休息日污染物對應的AQI沒有明顯的變化
從上圖可以看出,各項污染物AQI最高的城市各不相同,可能是每個城市的工業類型不同導致的,可以對相關城市的相關行業做調查,驗證污染物的排放是否符合標準。
3.4 各項污染物的相關性分析
# Inspect the correlation between the pollutants with a heat map.
import seaborn as sns

# No visible cell defines the frame the heat map is built from.
# NOTE(review): it is rebuilt here from the four AQI columns of the
# California subset `cal`; confirm that this matches the original notebook.
Pollutants = cal[["NO2 AQI", "O3 AQI", "SO2 AQI", "CO AQI"]]

sns.heatmap(Pollutants.corr(), annot=True, fmt=".2f",
            cbar_kws={"label": "Correlation Coefficient"})
plt.title("Correlation Plot")
plt.show()
推薦閱讀:
※Kaggle機器學習之泰坦尼克號生還預測
※SQL初級數據分析(基於Microsoft Access)
※python基礎篇之小白滾躺式入坑
※Python 數據分析學習路線
※用戶畫像學習