





# Environment setup: silence library warnings, then load the analysis stack.
import warnings

# Installed before the heavy imports so that import-time warnings are hidden too.
warnings.filterwarnings('ignore')

import pandas as pd
# NOTE(review): numpy is conventionally aliased `np`; the alias `py` is kept
# so any existing reference to `py` elsewhere keeps working.
import numpy as py


# Load the training and test sets and report their shapes.
# NOTE(review): the path text is reproduced exactly as found; the quotes were
# lost in the source and a path separator after the drive letter may be
# missing as well -- confirm against the real file location.
train = pd.read_csv('E:train.csv')
test = pd.read_csv('E:test.csv')
print('訓練數據結構:', train.shape)
print('測試數據結構:', test.shape)
# Recorded output (得到結果):
#   訓練數據結構: (891, 12)
#   測試數據結構: (418, 11)


# Number of rows in each data set (first component of DataFrame.shape).
rowNum_train = train.shape[0]
rowNum_test = test.shape[0]
print('訓練行數:', rowNum_train)
print('測試行數', rowNum_test)
# Recorded output:
#   訓練行數: 891
#   測試行數 418


# Stack the training and test sets into one frame so that every cleaning and
# encoding step is applied to both at once. `DataFrame.append` was removed in
# pandas 2.0; `pd.concat` is the supported equivalent. `ignore_index=True`
# renumbers the rows 0..n-1, which the later label-based slicing relies on.
# NOTE(review): the recorded head() outputs list the columns alphabetically;
# with pd.concat the column order may differ, the column contents do not.
full = pd.concat([train, test], ignore_index=True)
print('總數據集:', full.shape)
# Recorded output:
#   總數據集: (1309, 12)


full.head() Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 211711 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 175992 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 31012823 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 1138034 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450


full.describe() Age Fare Parch PassengerId Pclass SibSp Survivedcount 1046.000000 1308.000000 1309.000000 1309.000000 1309.000000 1309.000000 891.000000mean 29.881138 33.295479 0.385027 655.000000 2.294882 0.498854 0.383838std 14.413493 51.758668 0.865560 378.020061 0.837836 1.041658 0.486592min 0.170000 0.000000 0.000000 1.000000 1.000000 0.000000 0.00000025% 21.000000 7.895800 0.000000 328.000000 2.000000 0.000000 0.00000050% 28.000000 14.454200 0.000000 655.000000 3.000000 0.000000 0.00000075% 39.000000 31.275000 0.000000 982.000000 3.000000 1.000000 1.000000max 80.000000 512.329200 9.000000 1309.000000 3.000000 8.000000 1.000000


full.info()<class pandas.core.frame.DataFrame>RangeIndex: 1309 entries, 0 to 1308Data columns (total 12 columns):Age 1046 non-null float64Cabin 295 non-null objectEmbarked 1307 non-null objectFare 1308 non-null float64Name 1309 non-null objectParch 1309 non-null int64PassengerId 1309 non-null int64Pclass 1309 non-null int64Sex 1309 non-null objectSibSp 1309 non-null int64Survived 891 non-null float64Ticket 1309 non-null objectdtypes: float64(3), int64(4), object(5)memory usage: 97.2+ KB




# Fill the missing numeric values (Age, Fare) with the column mean and show
# the column summary before and after.
# NOTE(review): the mean is taken over `full`, i.e. over training and test
# rows together.
print('處理前')
full.info()
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
print('處理後:')
full.info()
# Recorded output (condensed), RangeIndex of 1309 entries, 12 columns:
#   處理前:  Age 1046 non-null float64, Fare 1308 non-null float64
#   處理後:  Age 1309 non-null float64, Fare 1309 non-null float64
#   Unchanged: Cabin 295 non-null, Embarked 1307 non-null, Survived 891 non-null


full.head() Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 211711 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 175992 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 31012823 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 1138034 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450

對登船港口(Embarked)用S 、船艙號(Cabin)用U來進行填充

# Fill the missing categorical values: port of embarkation with 'S' and
# cabin number with the placeholder 'U'.
full['Embarked'] = full['Embarked'].fillna('S')
full['Cabin'] = full['Cabin'].fillna('U')
full.head()
# Recorded output (condensed): in the first five rows Cabin reads
# U, C85, U, C123, U and Embarked reads S, C, S, S, S.


full.info()<class pandas.core.frame.DataFrame>RangeIndex: 1309 entries, 0 to 1308Data columns (total 12 columns):Age 1309 non-null float64Cabin 1309 non-null objectEmbarked 1309 non-null objectFare 1309 non-null float64Name 1309 non-null objectParch 1309 non-null int64PassengerId 1309 non-null int64Pclass 1309 non-null int64Sex 1309 non-null int64SibSp 1309 non-null int64Survived 891 non-null float64Ticket 1309 non-null objectdtypes: float64(3), int64(5), object(4)memory usage: 102.3+ KB



# Preview the raw Sex column before encoding it.
full['Sex'].head()
# Recorded output:
#   0      male
#   1    female
#   2    female
#   3    female
#   4      male
#   Name: Sex, dtype: object


# Encode Sex numerically: male -> 1, female -> 0.
sex_mapDict = {'male': 1, 'female': 0}
# Series.map looks every element up in the dictionary.
full['Sex'] = full['Sex'].map(sex_mapDict)
full.head()
# Recorded output (condensed): the Sex column of the first five rows reads
# 1, 0, 0, 0, 1.


# One-hot encode the port of embarkation; the new columns are prefixed
# with "Embarked". (The empty DataFrame the original created first was
# overwritten immediately, so it is not needed.)
embarkedDf = pd.get_dummies(full['Embarked'], prefix='Embarked')
embarkedDf.head()
# Recorded output:
#      Embarked_C  Embarked_Q  Embarked_S
#   0           0           0           1
#   1           1           0           0
#   2           0           0           1
#   3           0           0           1
#   4           0           0           1

添加one-hot編碼產生的虛擬變數(dummy variables)到泰坦尼克號數據集full,並把登船港口(Embarked)這一列刪掉

# Append the Embarked dummy columns to `full` (column-wise) and drop the
# original categorical column they replace.
full = pd.concat([full, embarkedDf], axis=1)
full.drop('Embarked', axis=1, inplace=True)
full.head()
# Recorded output (condensed): 1309 rows x 14 columns; Embarked is gone and
# Embarked_C, Embarked_Q, Embarked_S are the last three columns.


# One-hot encode the passenger class; the new columns are prefixed with
# "Pclass". (The empty DataFrame the original created first was overwritten
# immediately, so it is not needed.)
pclassDf = pd.get_dummies(full['Pclass'], prefix='Pclass')
pclassDf.head()
# Recorded output:
#      Pclass_1  Pclass_2  Pclass_3
#   0         0         0         1
#   1         1         0         0
#   2         0         0         1
#   3         1         0         0
#   4         0         0         1

添加one-hot編碼產生的虛擬變數(dummy variables)到泰坦尼克號數據集full,並把客艙等級(Pclass)這一列刪掉

# Append the Pclass dummy columns to `full` (column-wise) and drop the
# original Pclass column they replace.
full = pd.concat([full, pclassDf], axis=1)
full.drop('Pclass', axis=1, inplace=True)
full
# Recorded output (condensed): 1309 rows x 16 columns; Pclass is gone and
# Pclass_1, Pclass_2, Pclass_3 are the last three columns.


# Preview the Name column; each value has the form "Surname, Title. Given names".
full['Name'].head()
# Recorded output:
#   0                              Braund, Mr. Owen Harris
#   1    Cumings, Mrs. John Bradley (Florence Briggs Th...
#   2                               Heikkinen, Miss. Laina
#   3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
#   4                             Allen, Mr. William Henry
#   Name: Name, dtype: object


def getTitle(name: str) -> str:
    """Extract the honorific title from a passenger name.

    The name is expected in the form ``"Surname, Title. Given names"``;
    for example ``"Braund, Mr. Owen Harris"`` yields ``"Mr"``.

    Args:
        name: Full passenger name containing at least one comma.

    Returns:
        The text between the first comma and the following period, with
        surrounding whitespace removed.

    Raises:
        IndexError: If ``name`` contains no comma.
    """
    # Piece between the first and second comma, e.g. " Mr. Owen Harris".
    after_surname = name.split(',')[1]
    # Everything before the first period, e.g. " Mr".
    raw_title = after_surname.split('.')[0]
    # strip() removes the leading/trailing whitespace left by the splits.
    return raw_title.strip()


# Build a one-column frame holding the title extracted from every name.
titleDf = pd.DataFrame()
# Series.map applies getTitle to each element of the Name column.
titleDf['Title'] = full['Name'].map(getTitle)
titleDf.head()
# Recorded output:
#     Title
#   0    Mr
#   1   Mrs
#   2  Miss
#   3   Mrs
#   4    Mr


# Collapse the raw titles into six categories:
# Officer, Royalty, Mr, Mrs, Miss and Master.
title_mapDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}
# Series.map looks every title up in the dictionary; a title that is not a
# key of the dictionary becomes NaN.
titleDf['Title'] = titleDf['Title'].map(title_mapDict)
# One-hot encode the grouped titles.
titleDf = pd.get_dummies(titleDf['Title'])
titleDf.head()
# Recorded output:
#      Master  Miss  Mr  Mrs  Officer  Royalty
#   0       0     0   1    0        0        0
#   1       0     0   0    1        0        0
#   2       0     1   0    0        0        0
#   3       0     0   0    1        0        0
#   4       0     0   1    0        0        0

添加one-hot編碼產生的虛擬變數(dummy variables)到泰坦尼克號數據集full,並把姓名(Name)這一列刪掉

# Append the title dummy columns to `full` (column-wise) and drop the Name
# column they were derived from.
full = pd.concat([full, titleDf], axis=1)
full.drop('Name', axis=1, inplace=True)
full.head()
# Recorded output (condensed): 5 rows x 21 columns; Name is gone and
# Master, Miss, Mr, Mrs, Officer, Royalty are the last six columns.


# Preview the Cabin column before reducing it to its first letter.
full['Cabin'].head()
# Recorded output:
#   0       U
#   1     C85
#   2       U
#   3    C123
#   4       U
#   Name: Cabin, dtype: object

python 使用 lambda 來創建匿名函數,並使用get_dummies進行one-hot編碼,列名前綴是Cabin

# Reduce every cabin number to its first character, then one-hot encode it
# with the column prefix "Cabin". (The empty DataFrame the original created
# first was overwritten immediately, so it is not needed.)
# The lambda indexes each value with [0], so every Cabin value must already
# be a non-empty string at this point.
full['Cabin'] = full['Cabin'].map(lambda c: c[0])
cabinDf = pd.get_dummies(full['Cabin'], prefix='Cabin')
cabinDf.head()
# Recorded output:
#      Cabin_A  Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U
#   0        0        0        0        0        0        0        0        0        1
#   1        0        0        1        0        0        0        0        0        0
#   2        0        0        0        0        0        0        0        0        1
#   3        0        0        1        0        0        0        0        0        0
#   4        0        0        0        0        0        0        0        0        1

添加one-hot編碼產生的虛擬變數(dummy variables)到泰坦尼克號數據集full,並刪掉客艙號(Cabin)這一列

# Append the cabin dummy columns to `full` (column-wise) and drop the
# original Cabin column they replace.
full = pd.concat([full, cabinDf], axis=1)
full.drop('Cabin', axis=1, inplace=True)
full.head()
# Recorded output (condensed): 5 rows x 29 columns; Cabin is gone and
# Cabin_A ... Cabin_U are the last nine columns.


# Holds the family-related features.
familyDf = pd.DataFrame()
# Family size = parents/children + siblings/spouses + the passenger.
familyDf['FamilySize'] = full['Parch'] + full['SibSp'] + 1
# Bucket the size into three 0/1 indicator columns:
#   single: exactly 1, small: 2 to 4 inclusive, large: 5 or more.
familyDf['Family_Single'] = (familyDf['FamilySize'] == 1).astype(int)
familyDf['Family_Small'] = familyDf['FamilySize'].between(2, 4).astype(int)
familyDf['Family_Large'] = (familyDf['FamilySize'] >= 5).astype(int)
familyDf.head()
# Recorded output:
#      FamilySize  Family_Single  Family_Small  Family_Large
#   0           2              0             1             0
#   1           2              0             1             0
#   2           1              1             0             0
#   3           2              0             1             0
#   4           1              1             0             0

添加one-hot編碼產生的虛擬變數(dummy variables)到泰坦尼克號數據集full

# Append the family-size features to `full` (column-wise). Parch and SibSp
# are kept; nothing is dropped here.
full = pd.concat([full, familyDf], axis=1)
full.head()
# Recorded output (condensed): 5 rows x 33 columns; FamilySize,
# Family_Single, Family_Small, Family_Large are the last four columns.


# Size of the fully engineered data set.
full.shape
# Recorded output:
#   (1309, 33)



corrDf=full.corr()corrDfAge Fare Parch PassengerId Sex SibSp Survived Embarked_C Embarked_Q Embarked_S ... Cabin_D Cabin_E Cabin_F Cabin_G Cabin_T Cabin_U FamilySize Family_Single Family_Small Family_LargeAge 1.000000 0.171521 -0.130872 0.025731 0.057397 -0.190747 -0.070323 0.076179 -0.012718 -0.059153 ... 0.132886 0.106600 -0.072644 -0.085977 0.032461 -0.271918 -0.196996 0.116675 -0.038189 -0.161210Fare 0.171521 1.000000 0.221522 0.031416 -0.185484 0.160224 0.257307 0.286241 -0.130054 -0.169894 ... 0.072737 0.073949 -0.037567 -0.022857 0.001179 -0.507197 0.226465 -0.274826 0.197281 0.170853Parch -0.130872 0.221522 1.000000 0.008942 -0.213125 0.373587 0.081629 -0.008635 -0.100943 0.071881 ... -0.027385 0.001084 0.020481 0.058325 -0.012304 -0.036806 0.792296 -0.549022 0.248532 0.624627PassengerId 0.025731 0.031416 0.008942 1.000000 0.013406 -0.055224 -0.005007 0.048101 0.011585 -0.049836 ... 0.000549 -0.008136 0.000306 -0.045949 -0.023049 0.000208 -0.031437 0.028546 0.002975 -0.063415Sex 0.057397 -0.185484 -0.213125 0.013406 1.000000 -0.109609 -0.543351 -0.066564 -0.088651 0.115193 ... -0.057396 -0.040340 -0.006655 -0.083285 0.020558 0.137396 -0.188583 0.284537 -0.255196 -0.077748SibSp -0.190747 0.160224 0.373587 -0.055224 -0.109609 1.000000 -0.035322 -0.048396 -0.048678 0.073709 ... -0.015727 -0.027180 -0.008619 0.006015 -0.013247 0.009064 0.861952 -0.591077 0.253590 0.699681Survived -0.070323 0.257307 0.081629 -0.005007 -0.543351 -0.035322 1.000000 0.168240 0.003650 -0.149683 ... 0.150716 0.145321 0.057935 0.016040 -0.026456 -0.316912 0.016639 -0.203367 0.279855 -0.125147Embarked_C 0.076179 0.286241 -0.008635 0.048101 -0.066564 -0.048396 0.168240 1.000000 -0.164166 -0.778262 ... 0.107782 0.027566 -0.020010 -0.031566 -0.014095 -0.258257 -0.036553 -0.107874 0.159594 -0.092825Embarked_Q -0.012718 -0.130054 -0.100943 0.011585 -0.088651 -0.048678 0.003650 -0.164166 1.000000 -0.491656 ... 
-0.061459 -0.042877 -0.020282 -0.019941 -0.008904 0.142369 -0.087190 0.127214 -0.122491 -0.018423Embarked_S -0.059153 -0.169894 0.071881 -0.049836 0.115193 0.073709 -0.149683 -0.778262 -0.491656 1.000000 ... -0.056023 0.002960 0.030575 0.040560 0.018111 0.137351 0.087771 0.014246 -0.062909 0.093671Pclass_1 0.362587 0.599956 -0.013033 0.026495 -0.107371 -0.034256 0.285904 0.325722 -0.166101 -0.181800 ... 0.275698 0.242963 -0.073083 -0.035441 0.048310 -0.776987 -0.029656 -0.126551 0.165965 -0.067523Pclass_2 -0.014193 -0.121372 -0.010057 0.022714 -0.028862 -0.052419 0.093349 -0.134675 -0.121973 0.196532 ... -0.037929 -0.050210 0.127371 -0.032081 -0.014325 0.176485 -0.039976 -0.035075 0.097270 -0.118495Pclass_3 -0.302093 -0.419616 0.019521 -0.041544 0.116562 0.072610 -0.322308 -0.171430 0.243706 -0.003805 ... -0.207455 -0.169063 -0.041178 0.056964 -0.030057 0.527614 0.058430 0.138250 -0.223338 0.155560Master -0.363923 0.011596 0.253482 0.002254 0.164375 0.329171 0.085221 -0.014172 -0.009091 0.018297 ... -0.042192 0.001860 0.058311 -0.013690 -0.006113 0.041178 0.355061 -0.265355 0.120166 0.301809Miss -0.254146 0.092051 0.066473 -0.050027 -0.672819 0.077564 0.332795 -0.014351 0.198804 -0.113886 ... -0.012516 0.008700 -0.003088 0.061881 -0.013832 -0.004364 0.087350 -0.023890 -0.018085 0.083422Mr 0.165476 -0.192192 -0.304780 0.014116 0.870678 -0.243104 -0.549199 -0.065538 -0.080224 0.108924 ... -0.030261 -0.032953 -0.026403 -0.072514 0.023611 0.131807 -0.326487 0.386262 -0.300872 -0.194207Mrs 0.198091 0.139235 0.213491 0.033299 -0.571176 0.061643 0.344935 0.098379 -0.100374 -0.022950 ... 0.080393 0.045538 0.013376 0.042547 -0.011742 -0.162253 0.157233 -0.354649 0.361247 0.012893Officer 0.162818 0.028696 -0.032631 0.002231 0.087288 -0.013813 -0.031316 0.003678 -0.003212 -0.001202 ... 
0.006055 -0.024048 -0.017076 -0.008281 -0.003698 -0.067030 -0.026921 0.013303 0.003966 -0.034572Royalty 0.059466 0.026214 -0.030197 0.004400 -0.020408 -0.010787 0.033391 0.077213 -0.021853 -0.054250 ... -0.012950 -0.012202 -0.008665 -0.004202 -0.001876 -0.071672 -0.023600 0.008761 -0.000073 -0.017542Cabin_A 0.125177 0.020094 -0.030707 -0.002831 0.047561 -0.039808 0.022287 0.094914 -0.042105 -0.056984 ... -0.024952 -0.023510 -0.016695 -0.008096 -0.003615 -0.242399 -0.042967 0.045227 -0.029546 -0.033799Cabin_B 0.113458 0.393743 0.073051 0.015895 -0.094453 -0.011569 0.175095 0.161595 -0.073613 -0.095790 ... -0.043624 -0.041103 -0.029188 -0.014154 -0.006320 -0.423794 0.032318 -0.087912 0.084268 0.013470Cabin_C 0.167993 0.401370 0.009601 0.006092 -0.077473 0.048616 0.114652 0.158043 -0.059151 -0.101861 ... -0.053083 -0.050016 -0.035516 -0.017224 -0.007691 -0.515684 0.037226 -0.137498 0.141925 0.001362Cabin_D 0.132886 0.072737 -0.027385 0.000549 -0.057396 -0.015727 0.150716 0.107782 -0.061459 -0.056023 ... 1.000000 -0.034317 -0.024369 -0.011817 -0.005277 -0.353822 -0.025313 -0.074310 0.102432 -0.049336Cabin_E 0.106600 0.073949 0.001084 -0.008136 -0.040340 -0.027180 0.145321 0.027566 -0.042877 0.002960 ... -0.034317 1.000000 -0.022961 -0.011135 -0.004972 -0.333381 -0.017285 -0.042535 0.068007 -0.046485Cabin_F -0.072644 -0.037567 0.020481 0.000306 -0.006655 -0.008619 0.057935 -0.020010 -0.020282 0.030575 ... -0.024369 -0.022961 1.000000 -0.007907 -0.003531 -0.236733 0.005525 0.004055 0.012756 -0.033009Cabin_G -0.085977 -0.022857 0.058325 -0.045949 -0.083285 0.006015 0.016040 -0.031566 -0.019941 0.040560 ... -0.011817 -0.011135 -0.007907 1.000000 -0.001712 -0.114803 0.035835 -0.076397 0.087471 -0.016008Cabin_T 0.032461 0.001179 -0.012304 -0.023049 0.020558 -0.013247 -0.026456 -0.014095 -0.008904 0.018111 ... 
-0.005277 -0.004972 -0.003531 -0.001712 1.000000 -0.051263 -0.015438 0.022411 -0.019574 -0.007148Cabin_U -0.271918 -0.507197 -0.036806 0.000208 0.137396 0.009064 -0.316912 -0.258257 0.142369 0.137351 ... -0.353822 -0.333381 -0.236733 -0.114803 -0.051263 1.000000 -0.014155 0.175812 -0.211367 0.056438FamilySize -0.196996 0.226465 0.792296 -0.031437 -0.188583 0.861952 0.016639 -0.036553 -0.087190 0.087771 ... -0.025313 -0.017285 0.005525 0.035835 -0.015438 -0.014155 1.000000 -0.688864 0.302640 0.801623Family_Single 0.116675 -0.274826 -0.549022 0.028546 0.284537 -0.591077 -0.203367 -0.107874 0.127214 0.014246 ... -0.074310 -0.042535 0.004055 -0.076397 0.022411 0.175812 -0.688864 1.000000 -0.873398 -0.318944Family_Small -0.038189 0.197281 0.248532 0.002975 -0.255196 0.253590 0.279855 0.159594 -0.122491 -0.062909 ... 0.102432 0.068007 0.012756 0.087471 -0.019574 -0.211367 0.302640 -0.873398 1.000000 -0.183007Family_Large -0.161210 0.170853 0.624627 -0.063415 -0.077748 0.699681 -0.125147 -0.092825 -0.018423 0.093671 ... -0.049336 -0.046485 -0.033009 -0.016008 -0.007148 0.056438 0.801623 -0.318944 -0.183007 1.00000032 rows × 32 columns


corrDf[Survived].sort_values(ascending =False)Survived 1.000000Mrs 0.344935Miss 0.332795Pclass_1 0.285904Family_Small 0.279855Fare 0.257307Cabin_B 0.175095Embarked_C 0.168240Cabin_D 0.150716Cabin_E 0.145321Cabin_C 0.114652Pclass_2 0.093349Master 0.085221Parch 0.081629Cabin_F 0.057935Royalty 0.033391Cabin_A 0.022287FamilySize 0.016639Cabin_G 0.016040Embarked_Q 0.003650PassengerId -0.005007Cabin_T -0.026456Officer -0.031316SibSp -0.035322Age -0.070323Family_Large -0.125147Embarked_S -0.149683Family_Single -0.203367Cabin_U -0.316912Pclass_3 -0.322308Sex -0.543351Mr -0.549199Name: Survived, dtype: float64



# Assemble the feature matrix from the selected feature groups (column-wise).
full_X = pd.concat(
    [
        titleDf,       # title
        pclassDf,      # passenger class
        familyDf,      # family size
        full['Fare'],  # ticket fare
        cabinDf,       # cabin
        embarkedDf,    # port of embarkation
        full['Sex'],   # sex
    ],
    axis=1,
)
full_X.head()
# Recorded output (condensed): 5 rows x 27 columns, starting with
# Master, Miss, Mr, Mrs, Officer, Royalty and ending with
# Embarked_C, Embarked_Q, Embarked_S, Sex.

4 構建模型


# Split the combined frame back into the labelled rows and the rows to predict.
# 891 is hard-coded; it matches the training row count recorded earlier.
sourceRow = 891
# `.loc` slices by label and includes both end points, hence sourceRow - 1.
source_X = full_X.loc[0:sourceRow - 1, :]        # labelled data: features
source_y = full.loc[0:sourceRow - 1, 'Survived']  # labelled data: labels
pred_X = full_X.loc[sourceRow:, :]                # prediction data: features
print(source_X.shape, source_y.shape, pred_X.shape)
# Recorded output:
#   (891, 27) (891,) (418, 27)


train_test_split是交叉驗證中常用的函數,功能是從樣本中隨機的按比例選取train data和test data




# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation.
# NOTE(review): no random_state is passed, so the split (and therefore the
# score computed later) differs from run to run.
train_X, test_X, train_y, test_y = train_test_split(source_X, source_y, train_size=.8)
print(source_X.shape, train_X.shape, test_X.shape, source_y.shape, train_y.shape, test_y.shape)
# Recorded output:
#   (891, 27) (712, 27) (179, 27) (891,) (712,) (179,)


# Preview the labels of the labelled data set.
source_y.head()
# Recorded output:
#   0    0.0
#   1    1.0
#   2    1.0
#   3    1.0
#   4    0.0
#   Name: Survived, dtype: float64

4 選擇機器學習演算法

這裡我們用邏輯回歸(logistic regression)演算法進行預測


from sklearn.linear_model import LogisticRegression

# Create the model.
model = LogisticRegression()
# Train the model.
model.fit(train_X, train_y)
# Recorded output (reproduced as found; quotes were lost in the source text):
#   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#       intercept_scaling=1, max_iter=100, multi_class=ovr, n_jobs=1, penalty=l2,
#       random_state=None, solver=liblinear, tol=0.0001, verbose=0, warm_start=False)

5 模型評估


# Score of the fitted model on the held-out split.
model.score(test_X, test_y)
# Recorded output:
#   0.8379888268156425

6 方案實施


# Use the trained model to predict survival for the prediction data set.
pred_Y = model.predict(pred_X)
# The predictions come back as floats (0.0, 1.0) but Kaggle requires integer
# values (0, 1) in the submission, so convert the dtype.
pred_Y = pred_Y.astype(int)
# Passenger ids of the rows being predicted.
passenger_id = full.loc[sourceRow:, 'PassengerId']
# Data frame with the passenger id and the predicted survival value.
predDf = pd.DataFrame({'PassengerId': passenger_id, 'Survived': pred_Y})
predDf.shape
predDf.head()
# Recorded output (the id column header is printed as "passengerID" in the
# source text):
#        passengerID  Survived
#   891          892         0
#   892          893         1
#   893          894         0
#   894          895         0
#   895          896         1


# Write the predictions to a CSV file; index=False leaves the row index out
# of the file.
predDf.to_csv('titanic_pred.csv', index=False)




