[Repost] "Machine Learning Foundations" Homework 2
From the column: Antenna的機器學習筆記
Original author: 海綿醬
Original link: Machine Learning Foundations - Homework 2 (Problems 16-20 in Python) - CSDN Blog
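A quick recap of the setting for problems 17-18, reconstructed from the code below rather than from the homework sheet: the hypothesis is a 1-D decision stump h_{s,theta}(x) = s * sign(x - theta), the inputs x are drawn uniformly from [-1, 1], the target is sign(x), and each label is flipped with probability 0.2. Against the noiseless target sign(x), the stump errs with probability |theta|/2 when s = +1 and 1 - |theta|/2 when s = -1, so mixing in the 20% label noise gives

    E_out(h_{s,theta}) = 0.8 * err + 0.2 * (1 - err) = 0.5 + 0.3 * s * (|theta| - 1)

which is exactly the closed form used for e_out in q17_18() below.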
import numpy as np
from numpy import random


def sign(x):
    # Custom sign function that only returns -1 or +1 (0 is treated as +1).
    ret = np.ones(x.shape)
    for i, each in enumerate(x):
        if each < 0:
            ret[i] = -1
    return ret


def getTheta(x):
    # Generate all candidate thetas of the hypothesis set from the input x:
    # the midpoints of adjacent sorted values, plus 1 as the right boundary.
    n = len(x)
    l1 = sorted(x)
    theta = np.zeros(n)
    for i in range(n - 1):
        theta[i] = (l1[i] + l1[i + 1]) / 2
    theta[-1] = 1
    return theta


def q17_18():
    data_size = 20
    expes = 5000
    E_in = 0
    E_out = 0
    for _ in range(expes):
        x = random.uniform(-1, 1, data_size)
        noise_rate = 0.2
        # Draw uniform values in [-0.2, 0.8]; taking sign() yields an array that
        # is -1 with probability 20%, i.e. the label noise.
        noise = sign(random.uniform(size=data_size) - noise_rate)
        y = sign(x) * noise  # flip 20% of the labels of y
        theta = getTheta(x)
        # One error_in per theta: row 0 is s = +1, row 1 is s = -1.
        e_in = np.zeros((2, data_size))
        for i in range(len(theta)):
            a1 = y * sign(x - theta[i])
            # The array only contains -1 and +1, so the fraction of errors (-1)
            # can be computed directly from the sum.
            e_in[0][i] = (data_size - np.sum(a1)) / (2 * data_size)
            e_in[1][i] = (data_size - np.sum(-a1)) / (2 * data_size)
        s = 0
        theta_best = 0
        min0, min1 = np.min(e_in[0]), np.min(e_in[1])
        if min0 < min1:
            s = 1
            theta_best = theta[np.argmin(e_in[0])]
        else:
            s = -1
            theta_best = theta[np.argmin(e_in[1])]
        e_out = 0.5 + 0.3 * s * (np.abs(theta_best) - 1)
        E_in += np.min(e_in)
        E_out += e_out
    ave_in = E_in / expes
    ave_out = E_out / expes
    print(ave_in, ave_out)


def deciStump(x, y):
    # x: the values of one feature dimension, y: the labels.
    data_size = len(x)
    theta = getTheta(x)
    e_in = np.zeros((2, data_size))
    for i in range(len(theta)):
        a1 = y * sign(x - theta[i])
        e_in[0][i] = (data_size - np.sum(a1)) / (2 * data_size)
        e_in[1][i] = (data_size - np.sum(-a1)) / (2 * data_size)
    s = 0
    min0, min1 = np.min(e_in[0]), np.min(e_in[1])
    if min0 < min1:
        s = 1
        theta_best = theta[np.argmin(e_in[0])]
    else:
        s = -1
        theta_best = theta[np.argmin(e_in[1])]
    E_in = np.min(e_in)
    return s, theta_best, E_in


def mkDateSet(datPath):
    dataSet = open(datPath, 'r').readlines()
    m = len(dataSet)
    X_train = np.zeros((m, 9))
    Y_train = np.zeros(m)
    for i, item in enumerate(dataSet):
        each = item.strip().split()
        X_train[i] = [float(a) for a in each[:-1]]
        Y_train[i] = int(each[-1])
    return (X_train, Y_train)


def getData_i(X_train, i):
    # Get the i-th feature column, reshaped from a 2-D ndarray into a 1-D array.
    return np.reshape(X_train[:, i], len(X_train))


def q19_20():
    (X_train, Y_train) = mkDateSet('hw2_train.dat')
    e_in = np.zeros(9)
    s = np.zeros(9)
    theta = np.zeros(9)
    for i in range(9):
        s[i], theta[i], e_in[i] = deciStump(getData_i(X_train, i), Y_train)
    E_in = np.min(e_in)
    dimension = np.argmin(e_in)
    theta_best = theta[dimension]
    s_best = s[dimension]
    (X_test, Y_test) = mkDateSet('hw2_test.dat')
    test_len = len(Y_test)
    X_i = getData_i(X_test, dimension)
    q = Y_test * s_best * sign(X_i - theta_best)
    E_out = (test_len - np.sum(q)) / (2 * test_len)
    print(E_in, E_out)


if __name__ == '__main__':
    q17_18()  # 0.16737 0.257180978031
    q19_20()  # 0.25 0.355
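As a minimal sanity check of deciStump, here is a sketch on a hand-made 1-D sample (hypothetical toy data, not the homework's hw2_train.dat), assuming the definitions above are in scope:

    import numpy as np

    x = np.array([-0.9, -0.4, 0.1, 0.3, 0.6, 0.8])
    y = np.array([-1., -1., -1., -1., 1., 1.])  # perfectly separable between 0.3 and 0.6
    s, theta, e_in = deciStump(x, y)
    print(s, theta, e_in)  # expected roughly: 1, 0.45 (midpoint of 0.3 and 0.6), 0.0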