Titanic 數據分析
泰坦尼克號沉船事件存活率分析
import re as re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import Series, DataFrame

# IPython/Jupyter magic: renders figures inline; only valid inside a notebook cell.
%matplotlib inline
0. 提出問題
# Load the passenger records from titanic-data.csv into a DataFrame.
filename = 'titanic-data.csv'
df = pd.read_csv(filename)

# info() prints column names, non-null counts and dtypes.
df.info()

# Preview the first 5 rows of the table.
df.head()

# Recorded output of df.info():
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 12 columns):
# PassengerId    891 non-null int64
# Survived       891 non-null int64
# Pclass         891 non-null int64
# Name           891 non-null object
# Sex            891 non-null object
# Age            714 non-null float64
# SibSp          891 non-null int64
# Parch          891 non-null int64
# Ticket         891 non-null object
# Fare           891 non-null float64
# Cabin          204 non-null object
# Embarked       889 non-null object
# dtypes: float64(2), int64(5), object(5)
# memory usage: 83.6+ KB
問題:有哪些因素會讓船上的人生還率更高?
1. 數據清理
1.1 重複
# Check whether the table contains fully duplicated rows.
# duplicated() yields a boolean Series; value_counts() is a Series method
# that tallies how many rows are True / False.
df.duplicated().value_counts()

# Recorded output (no duplicated rows):
# False    891
# dtype: int64
1.2 缺失
# Count missing values per column. Age, Cabin and Embarked have gaps;
# Cabin is missing too often to impute, so it is left untouched here.
df.isnull().sum()

# Recorded output:
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64

# Fill the missing Age values with random integers drawn from
# [mean - std, mean + std).
age_avg = df['Age'].mean()
age_std = df['Age'].std()
age_null_count = df['Age'].isnull().sum()
# randint is documented for integer bounds, so truncate the float
# bounds explicitly instead of relying on implicit conversion.
age_null_random_list = np.random.randint(
    int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
)
df.loc[df['Age'].isnull(), 'Age'] = age_null_random_list
df['Age'] = df['Age'].astype(int)

# Most passengers boarded at port S, so the missing Embarked entries are
# filled with 'S'. Assign the result back to the column: an in-place fillna
# on the selected column is a chained assignment and is not guaranteed to
# write through to df.
df['Embarked'] = df['Embarked'].fillna('S')
1.3 新增
def get_title(name):
    """Return the honorific (e.g. ``Mr``, ``Miss``) found in a passenger name.

    The title is the run of ASCII letters that follows a space and ends
    with a literal period. An empty string is returned when the name
    contains no such run.
    """
    # The period is escaped so it matches a literal '.', not any character.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        # group(1) is the captured title without the space and the period.
        return title_search.group(1)
    return ""


# Derive a Title column from the Name column.
df['Title'] = df['Name'].apply(get_title)

# Collapse uncommon titles into 'Rare' and normalise spelling variants.
df['Title'] = df['Title'].replace(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
     'Jonkheer', 'Dona'],
    'Rare',
)
df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace('Ms', 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

# SibSp: number of siblings / spouses aboard the Titanic.
# Parch: number of parents / children aboard the Titanic.
# FamilySize is the passenger plus those relatives.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# IsAlone flags passengers travelling without any family member.
df['IsAlone'] = 0
df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

# A missing Cabin value is treated as "no cabin": Has_Cabin is 1 when a
# cabin is recorded and 0 otherwise.
df['Has_Cabin'] = df["Cabin"].notna().astype(int)
1.4 數據示例
# Show the first five rows, now including the engineered columns.
df.head(n=5)
2. 數據分析
2.1 艙位
# Passenger count per ticket class.
Pclass_grouped = df.groupby(['Pclass'], as_index=True)
Pclass_grouped['PassengerId'].count()

# Recorded output:
# Pclass
# 1    216
# 2    184
# 3    491
# Name: PassengerId, dtype: int64

# Pie chart of the share of passengers in each class.
labels = ['Pclass 1', 'Pclass 2', 'Pclass 3']
Pclass_grouped['PassengerId'].count().plot(
    kind='pie', labels=labels, autopct='%.0f%%'
)
plt.title('Pclass Rate')
plt.show()
# Survival rate per ticket class: the mean of the 0/1 Survived flag
# within each Pclass group, drawn as a bar chart.
Pclass_grouped[['Survived']].mean().plot(kind='bar')
plt.title('Pclass VS Survival Rate')
plt.ylabel('Survival Rate')
plt.show()
2.2 性別
# Number of passengers of each sex.
df.groupby('Sex', as_index=False)['PassengerId'].count()

sns.countplot(x='Sex', data=df, order=['male', 'female'])
plt.title('Sex')
plt.show()
# Passenger counts broken down by sex and survival outcome.
df.groupby(['Sex', 'Survived'], as_index=False)['PassengerId'].count()

# One bar cluster per outcome (0 = died, 1 = survived), split by sex.
sns.countplot(x='Survived', hue="Sex", data=df, order=[0, 1])
plt.title('Sex Survival Counts')
plt.show()
# Survival rate per sex.
Sex_grouped = df.groupby(['Sex'], as_index=False)
# Only 'Survived' is averaged. With as_index=False the grouping key comes
# back as a regular column, so it must not be re-selected as a value column
# (averaging the text column 'Sex' is an error in recent pandas).
Sex_Survived_Per = Sex_grouped[['Survived']].mean()
Sex_Survived_Per

sns.barplot(x='Sex', y='Survived', data=Sex_Survived_Per,
            order=['male', 'female'])
plt.title('Sex Survival Rate')
plt.show()
2.3 年齡
# Split Age into 5 equal-width bins and compute the survival rate per bin.
df['CategoricalAge'] = pd.cut(df['Age'], 5)
# observed=False keeps every bin in the result, including empty ones,
# and makes the behaviour explicit instead of relying on the default.
Age_grouped = df.groupby(['CategoricalAge'], as_index=False, observed=False)
# Only 'Survived' is averaged; the categorical grouping key is returned
# as a column by as_index=False and cannot itself be averaged.
Age_grouped[['Survived']].mean()
2.4 票價
# Split Fare into 4 quantile-based bins, then compute the survival rate
# per bin.
df['CategoricalFare'] = pd.qcut(df['Fare'], 4)
# observed=False keeps every bin in the result and makes the behaviour
# explicit instead of relying on the default.
Catego_grouped = df.groupby('CategoricalFare', as_index=False, observed=False)
# Only 'Survived' is averaged; the categorical grouping key is returned
# as a column by as_index=False and cannot itself be averaged.
Catego_grouped[['Survived']].mean()
2.4 港口
# Passenger counts broken down by port of embarkation and survival outcome.
df.groupby(['Embarked', 'Survived'], as_index=False)['PassengerId'].count()

# Survivors (1) are drawn first, then non-survivors (0), split by port.
sns.countplot(x='Survived', hue="Embarked", data=df, order=[1, 0])
plt.title('Embarked Survival Counts')
plt.show()
# Survival rate per port of embarkation.
Embarked_grouped = df.groupby(['Embarked'], as_index=False)
# Only 'Survived' is averaged; the text grouping key is returned as a
# column by as_index=False and cannot itself be averaged.
Embarked_Survived_Per = Embarked_grouped[['Survived']].mean()
Embarked_Survived_Per

# factorplot was renamed catplot and its `size` argument renamed `height`
# (seaborn 0.9+). kind='point' reproduces factorplot's default plot type.
# aspect * height gives the width of each facet in inches.
sns.catplot(x='Embarked', y='Survived', data=df, kind='point',
            height=4, aspect=1.5)
plt.title('Embarked Survival Rate')
plt.show()
2.5 抬頭
# Survival rate per title (Mr, Miss, Mrs, Master, Rare, ...).
Title_grouped = df.groupby(['Title'], as_index=False)
# Only 'Survived' is averaged; the text grouping key is returned as a
# column by as_index=False and cannot itself be averaged.
Title_grouped[['Survived']].mean()
2.6 單身
# Survival rate for passengers travelling alone (1) versus with family (0).
IsAlone_grouped = df.groupby(['IsAlone'], as_index=False)
# as_index=False already returns the grouping key as a column, so only
# 'Survived' is selected for averaging.
IsAlone_grouped[['Survived']].mean()
2.7 包廂
# Survival rate for passengers with (1) and without (0) a recorded cabin.
Has_Cabin_grouped = df.groupby(['Has_Cabin'], as_index=False)
# as_index=False already returns the grouping key as a column, so only
# 'Survived' is selected for averaging.
Has_Cabin_grouped[['Survived']].mean()
2.8 相關性分析
# Convert every non-numeric column into a numeric code so that a
# correlation matrix can be computed.

# Sex: female -> 0, male -> 1.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Title: known titles get codes 1-5; anything unmapped becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
df['Title'] = df['Title'].map(title_mapping)
df['Title'] = df['Title'].fillna(0)

# Has_Cabin: 1 when a cabin is recorded, 0 when the value is missing.
df['Has_Cabin'] = df["Cabin"].notna().astype(int)

# Embarked: S -> 0, C -> 1, Q -> 2.
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Fare: replace the amount with a bucket code 0-3.
# The buckets are applied in ascending order on purpose: every later
# condition requires a value above 7.91, so codes already written (0-2)
# are never re-bucketed.
df.loc[df['Fare'] <= 7.91, 'Fare'] = 0
df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare'] = 2
df.loc[df['Fare'] > 31, 'Fare'] = 3
df['Fare'] = df['Fare'].astype(int)

# Age: replace the age with a bucket code 0-4.
# Same ordering argument: later conditions require a value above 16, so
# codes already written (0-3) are never re-bucketed.
df.loc[df['Age'] <= 16, 'Age'] = 0
df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
df.loc[df['Age'] > 64, 'Age'] = 4

# Show one row to check the encoded values.
df.head(1)
# Feature selection: drop identifier/text columns and the columns that
# have already been folded into engineered features.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch',
                 'CategoricalFare', 'CategoricalAge']
df = df.drop(drop_elements, axis=1)

# Show one row to check the remaining columns.
df.head(1)
# Compute the Pearson correlation between all remaining features and
# draw it as an annotated heatmap.
colormap = plt.cm.RdBu
plt.figure(figsize=(12, 10))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
# All columns are cast to float before corr(), whose default method is Pearson.
sns.heatmap(df.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)
plt.show()
3. 總結
- 報告中使用的數據並不是全部乘客數據,這個樣本的大小合適與否取決於其他測試樣本的結論是否一致。樣本可能會存在偏差,因為在數據清洗過程中發現較多缺失項:Age 採用均值上下一個標準差範圍內的隨機整數來填充,Embarked 採用出現次數最多的值來填充。樣本不能夠代表整體人口,我們的分析是否正確,可以採用假設檢驗,也可以通過對樣本重新抽樣形成新的樣本來驗證。
- 數據處理的方法會帶入偏差和不確定性,尤其是缺失項的處理。
- 還存在其他因素但是我們沒有數據,如身體強壯與否,是否會游泳等。
參考
https://www.kaggle.com/omarelgabry/a-journey-through-titanic
https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
《利用Python進行數據分析》
推薦閱讀:
※回顧與展望轉行數據科學路上的點點滴滴(2016-2018)
※大數據有哪些工作崗位,日常工作內容是什麼,需要掌握哪些工具和技能
※Numpy和Pandas---數據分析的梯子
※初識Python
※基於新浪微博的男女性擇偶觀數據分析
TAG:數據分析 |