簡單線性回歸練習：sklearn.linear_model.LinearRegression

03-02

本文使用少量樣本數據進行簡單線性回歸的實戰，主要練習sklearn的線性回歸函數，並且使用了sklearn中的train_test_split（新版位於sklearn.model_selection，舊版位於sklearn.cross_validation）進行測試集與訓練集的劃分。

可視化部分使用plotly進行。

from collections import OrderedDict

import numpy as np
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import sklearn

# train_test_split originally came from sklearn.cross_validation; that module
# is absent from newer scikit-learn releases, so prefer model_selection and
# fall back to the old location only when the new one is unavailable.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Data set: study time and the corresponding exam score (20 samples).
examDict = {
    '學習時間': [
        0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
        2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50,
    ],
    '分數': [
        10, 22, 13, 43, 20, 22, 33, 50, 62, 48,
        55, 75, 62, 73, 81, 76, 64, 82, 90, 93,
    ],
}
examOrderDict = OrderedDict(examDict)
examDf = pd.DataFrame(examOrderDict)

# Extract the feature and the label.
# Feature: study time.
exam_X = examDf.loc[:, '學習時間']
# Label: score.
exam_y = examDf.loc[:, '分數']

# Scatter plot of the full data set.
py.offline.init_notebook_mode()  # plotly offline mode
data = [go.Scatter(x=exam_X, y=exam_y, mode='markers')]
# x carries the study time and y carries the score, so the axis titles must
# follow that order (the original had the two titles swapped).
layout = go.Layout(
    title='學習時間-分數',
    xaxis={'title': '學習時間'},
    yaxis={'title': '分數'},
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

# train_test_split is a function commonly used in cross-validation: it
# randomly selects training data (train) and test data (test) from the samples
# according to a given proportion.
#   1st argument: the sample features to split
#   2nd argument: the sample labels to split
#   train_size:   proportion of training data; if it is an integer it is the
#                 number of samples instead

# Build the training data and the test data.
X_train, X_test, y_train, y_test = train_test_split(
    exam_X, exam_y, train_size=.8
)

# Print the sizes of the resulting data sets.
print('原始數據特徵：', exam_X.shape,
      '，訓練數據特徵：', X_train.shape,
      '，測試數據特徵：', X_test.shape)
print('原始數據標籤：', exam_y.shape,
      '訓練數據標籤：', y_train.shape,
      '測試數據標籤：', y_test.shape)

# Plot the training set and the test set as two separate scatter traces.
scatter1 = go.Scatter(x=X_train, y=y_train, mode='markers', name='train')
scatter2 = go.Scatter(x=X_test, y=y_test, mode='markers', name='test')
data = [scatter1, scatter2]
layout = go.Layout(
    title='學習時間-分數',
    xaxis={'title': '學習時間'},
    yaxis={'title': '分數'},
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

# sklearn requires the input features to be a two-dimensional array. Because
# there is currently only one feature, follow the error message and use
# reshape to convert it into a two-dimensional array.
# Error message: "Reshape your data either using array.reshape(-1, 1) if your
# data has a single feature"

# Convert the training features into a 2-D array: (number of rows) x 1 column.
X_train1 = X_train.values.reshape(-1, 1)
# print(X_train)
# Convert the test features into a 2-D array: (number of rows) x 1 column.
X_test1 = X_test.values.reshape(-1, 1)
# print(X_test)

# Step 1: import linear regression.
from sklearn.linear_model import LinearRegression

# Step 2: create the model: linear regression.
model = LinearRegression()
# Step 3: train the model.
model.fit(X_train1, y_train)
# Output recorded in the original run:
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Best-fit line: y = a + b * x
#   intercept: a
#   regression coefficient: b

# Intercept
a = model.intercept_
# Regression coefficient
b = model.coef_
print('最佳擬合線：截距a=', a, '，回歸係數b=', b)
# Output recorded in the original run:
# 最佳擬合線：截距a= 8.49197038177 ，回歸係數b= [ 15.91537648]

# Draw the figure.
# Predicted values for the training data.
y_train_pred = model.predict(X_train1)
# Plot the training set and the test set as scatter traces, plus the fit line.
scatter1 = go.Scatter(x=X_train, y=y_train, mode='markers', name='train')
scatter2 = go.Scatter(x=X_test, y=y_test, mode='markers', name='test')
line = go.Scatter(x=X_train, y=y_train_pred, mode='lines', name='predict')
data = [scatter1, scatter2, line]
# x carries the study time and y carries the score, so the axis titles must
# follow that order (the original had the two titles swapped).
layout = go.Layout(
    title='學習時間-分數',
    xaxis={'title': '學習時間'},
    yaxis={'title': '分數'},
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

繪製出數據集散點圖及最佳擬合線

# Correlation coefficient: corr returns a data frame that holds the
# correlation-coefficient matrix.
rDf = examDf.corr()
# The original text says the correlation matrix is then plotted; no plotting
# code for it is present here.

# The score method of linear regression returns the coefficient of
# determination, R squared.
# Evaluate the model: coefficient of determination R squared on the test set.
model.score(X_test1, y_test)
# Output recorded in the original run:
# 0.64961574398099398