A Two-Stage Predictive Model for Precision Marketing: the Customer Response Model, with Code
Here is the code:
# coding: utf-8
# A case study from 《Python數據科學實戰》 (Python Data Science in Action): a marketing prediction model for a charity
# # Model training workflow
# - Data extraction
# - Data exploration
# - Modeling data preparation
# - Variable selection
# - Model development and validation
# - Model deployment
# - Model monitoring
# In[86]:
# ## Step 1: Build the marketing response model
# ### 1. The S (Sample) stage: data acquisition and import
# - Tidy the dataset
# In[87]:
import pandas as pd
model_data = pd.read_csv("donations2.csv").drop(["ID", "TARGET_D"], axis=1)
model_data.head()
# In[88]:
model_data.dtypes
# In[89]:
#model_data["TARGET_B"]=pd.Categorical(model_data["TARGET_B"],ordered=False)
model_data["StatusCat96NK"]=pd.Categorical(model_data["StatusCat96NK"],ordered=False)
model_data["DemCluster"]=pd.Categorical(model_data["DemCluster"],ordered=False)
model_data["DemGender"]=pd.Categorical(model_data["DemGender"],ordered=False)
model_data["DemHomeOwner"]=pd.Categorical(model_data["DemHomeOwner"],ordered=False)
# The official pandas documentation discusses categorical data in detail, comparing it with R's factor type where relevant.
# Categorical Data
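# A quick illustration (a minimal sketch, not part of the original notebook): converting
# to Categorical changes the dtype, not the values, and each value is backed by an integer code.
# In[ ]:
s = pd.Series(["A", "B", "A", "C"])
s_cat = pd.Categorical(s, ordered=False)
print(s_cat.categories)   # Index(['A', 'B', 'C'], dtype='object')
print(s_cat.codes)        # [0 1 0 2]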
# In[90]:
model_data.dtypes
# In[91]:
y = "TARGET_B"
var_c = ["GiftCnt36", "GiftCntAll", "GiftCntCard36", "GiftCntCardAll", "GiftTimeLast",
         "GiftTimeFirst", "PromCnt12", "PromCnt36", "PromCntAll", "PromCntCard12",
         "PromCntCard36", "PromCntCardAll", "StatusCatStarAll", "DemAge", "DemMedHomeValue",
         "DemPctVeterans", "DemMedIncome", "GiftAvgLast", "GiftAvg36", "GiftAvgAll",
         "GiftAvgCard36"]
var_d = list(set(model_data.columns) - set(var_c) - set([y]))
# In[92]:
X = model_data[var_c+var_d]
Y = model_data[y]
# - Screen for variables with strong predictive power
# In[93]:
from woe import WoE
# **WoE class parameters**:
# + **qnt_num**: int, number of equal-frequency bins, default 16
# + **min_block_size**: int, minimum number of observations per bin, default 16
# + **spec_values**: dict, replacement values to assign for a categorical predictor
# + **v_type**: str, predictor type: 'd' for categorical, 'c' for continuous, default 'c'
# + **bins**: list, predefined bin boundaries for a continuous variable
# + **t_type**: str, target type: 'b' for binary, 'c' for continuous, default 'b'
# **Key WoE methods**:
#
# + **plot**: plot the WOE chart
# + **transform**: transform data into WOE values
# + **fit_transform**: fit the binning, then transform data into WOE values
# + **optimize**: optimal binning for continuous variables
# **Key WoE attributes**:
# + **bins**: binning summary
# + **iv**: information value of the variable
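# For reference: for bin i, let p1_i be the bin's share of all events (responders) and
# p0_i its share of non-events; then WOE_i = ln(p1_i / p0_i), and the variable's
# information value is IV = sum_i (p1_i - p0_i) * WOE_i. A minimal usage sketch,
# assuming the companion `woe` module behaves as documented above:
# In[ ]:
w = WoE(v_type="c", t_type="b", qnt_num=3)   # 3 equal-frequency bins for a continuous variable
w.fit(X["GiftCnt36"], Y)
print(w.bins)   # binning summary
print(w.iv)     # information value of GiftCnt36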
# + Screening variables by IV: categorical variables
# In[94]:
from woe import WoE
iv_d = {}
for i in var_d:
    iv_d[i] = WoE(v_type="d", t_type="b").fit(X[i], Y).iv
pd.Series(iv_d).sort_values(ascending=False)
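# A common rule of thumb from credit-scorecard practice: IV < 0.02 is essentially
# unpredictive, 0.02-0.1 weak, 0.1-0.3 medium, > 0.3 strong -- which is why the two
# weakest categorical variables are dropped next.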
# In[95]:
var_d_s = list(set(var_d)-set(["DemHomeOwner","DemGender"]))
# + Screening variables by IV: continuous variables
# In[96]:
iv_c = {}
for i in var_c:
    iv_c[i] = WoE(v_type="c", t_type="b", qnt_num=3).fit(X[i], Y).iv
pd.Series(iv_c).sort_values(ascending=False)
# In[97]:
var_c_s = list(set(var_c) - set(["PromCntCard12", "PromCnt12", "DemMedHomeValue", "PromCnt36",
                                 "DemAge", "DemPctVeterans", "DemMedIncome", "StatusCatStarAll",
                                 "GiftCntCard36"]))
# In[98]:
X = model_data[var_c_s+var_d_s]
Y = model_data[y]
# ### 2. The E (Explore) stage: examine each variable
# - Statistical exploration of the continuous variables
# In[99]:
X[var_c_s].describe().T
# In[100]:
import matplotlib.pyplot as plt
plt.hist(X["PromCntAll"], bins=20)
plt.show()
# In[101]:
abs((X[var_c_s].mode().iloc[0] - X[var_c_s].median()) / (X[var_c_s].quantile(0.75) - X[var_c_s].quantile(0.25)))
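# This ratio, |mode - median| / IQR, is a quick robust screen for asymmetry: a large value
# means the most frequent value sits far from the center relative to the spread, flagging
# skewed or spiky distributions worth a closer look.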
# - Statistical exploration of the categorical variables
# In[102]:
X["DemCluster"].value_counts()
# ### 3. The M (Modify) stage: fix problematic variables
# - Recode erroneous values of continuous variables as missing
# In[103]:
X.isnull().sum()/(X.count()+X.isnull().sum())
# - Impute missing values of continuous variables with the median
# In[104]:
gift_avg_median = X.GiftAvgCard36.median()
X.GiftAvgCard36.fillna(value=gift_avg_median, inplace=True)
# - Merge (generalize) levels of categorical variables that have too many of them
# In[105]:
X.DemCluster.value_counts()
# In[106]:
# Collapse the rarest DemCluster codes into a single level coded 100
X_rep = X.replace({"DemCluster": [37, 48, 5, 47, 7, 29, 32, 9, 50, 6, 33]}, 100)
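# The cluster codes merged above were presumably read off the frequency table; a more
# general sketch (the cutoff and names below are illustrative, not from the original)
# merges every level whose count falls under a threshold:
# In[ ]:
counts = X["DemCluster"].value_counts()
rare_levels = list(counts[counts < 100].index)   # cutoff of 100 is an assumption
X_rep_alt = X.replace({"DemCluster": rare_levels}, 100)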
# In[107]:
for i in var_d_s:
    X_rep[i] = WoE(v_type="d").fit_transform(X[i], Y)
# In[108]:
import sklearn.ensemble as ensemble
rfc = ensemble.RandomForestClassifier(criterion="entropy", n_estimators=3,
                                      max_features=0.5, min_samples_split=5)
rfc_model = rfc.fit(X_rep, Y)
rfc_model.feature_importances_
rfc_fi = pd.DataFrame()
rfc_fi["features"] = list(X.columns)
rfc_fi["importance"] = list(rfc_model.feature_importances_)
rfc_fi=rfc_fi.set_index("features",drop=True)
rfc_fi.sort_values(by="importance", ascending=False).plot(kind="bar")
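# With only three trees the importance ranking is fairly noisy from run to run; a larger
# forest (an adjustment, not in the original notebook) gives a more stable ordering:
# In[ ]:
rfc_stable = ensemble.RandomForestClassifier(criterion="entropy", n_estimators=100,
                                             max_features=0.5, min_samples_split=5)
rfc_stable.fit(X_rep, Y)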
# In[109]:
var_x = ["GiftAvgAll", "DemCluster", "PromCntAll", "GiftTimeFirst", "GiftAvg36", "GiftTimeLast",
         "GiftAvgCard36", "GiftCntAll", "PromCntCardAll", "GiftAvgLast", "PromCntCard36",
         "GiftCntCardAll", "GiftCnt36"]
# - 3) Transform the distributions of the explanatory variables
# In[25]:
import matplotlib.pyplot as plt
for i in var_x:
    plt.hist(X_rep[i], bins=20)
    plt.show()
# In[110]:
skew_var_x = {}
for i in var_x:
    skew_var_x[i] = abs(X_rep[i].skew())
skew = pd.Series(skew_var_x).sort_values(ascending=False)
skew
# In[111]:
# A common rule of thumb: |skewness| >= 1 marks a highly skewed variable, so those get log-transformed
var_x_ln = skew.index[skew >= 1]
import numpy as np
for i in var_x_ln:
    # shift variables that take negative values so the log argument stays positive
    if min(X_rep[i]) < 0:
        X_rep[i] = np.log(X_rep[i] + abs(min(X_rep[i])) + 0.01)
    else:
        X_rep[i] = np.log(X_rep[i] + 0.01)
# In[112]:
skew_var_x = {}
for i in var_x:
    skew_var_x[i] = abs(X_rep[i].skew())
skew = pd.Series(skew_var_x).sort_values(ascending=False)
skew
# - 4) Variable reduction (compression)
# In[146]:
from VarSelec import *          # Var_Select_auto ships with the book's companion code
X_rep_reduc = Var_Select_auto(X_rep)
X_rep_reduc.head()
# In[155]:
X_rep_reduc.corr()
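# Var_Select_auto comes from the companion VarSelec module and is not shown here. As a
# rough stand-in, a common correlation-pruning approach -- keep a variable only if it is
# weakly correlated with everything already kept -- can be sketched as follows (the 0.7
# cutoff is an assumption):
# In[ ]:
def drop_correlated(df, cutoff=0.7):
    corr = df.corr().abs()
    keep = []
    for col in df.columns:
        # keep this column only if its |correlation| with every kept column is below the cutoff
        if all(corr.loc[col, k] < cutoff for k in keep):
            keep.append(col)
    return df[keep]
# X_rep_reduc = drop_correlated(X_rep)   # would play the same role as Var_Select_auto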
# ### 4. The M (Model) stage: build the logistic regression model
# - Split into training and test sets at a 7:3 ratio (test_size=0.3)
# In[147]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = train_test_split(
    X_rep_reduc, Y, test_size=0.3, random_state=0)
# - Model training
# - Logistic regression on all variables
# In[148]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
train_data = min_max_scaler.fit_transform(train_data)
test_data = min_max_scaler.transform(test_data)   # transform only: reuse the training-set scaling
train_data
# In[149]:
import sklearn.linear_model as linear_model
logistic_model = linear_model.LogisticRegression(
    class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1,
    penalty="l1", solver="liblinear",   # liblinear supports the L1 penalty
    random_state=None, tol=0.001)
# In[150]:
from sklearn.model_selection import GridSearchCV
C = np.logspace(-3, 0, 20, base=10)
param_grid = {"C": C}
clf_cv = GridSearchCV(estimator=logistic_model,
                      param_grid=param_grid,
                      cv=5,
                      scoring="roc_auc")
clf_cv.fit(train_data, train_target)
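# It is worth inspecting what the search settled on before refitting:
# In[ ]:
clf_cv.best_params_, clf_cv.best_score_   # the chosen C and its cross-validated AUC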
# In[151]:
import sklearn.linear_model as linear_model
logistic_model = linear_model.LogisticRegression(
    C=clf_cv.best_params_["C"], class_weight=None, dual=False, fit_intercept=True,
    intercept_scaling=1, penalty="l1", solver="liblinear", random_state=None, tol=0.001)
logistic_model.fit(train_data, train_target)
# In[152]:
logistic_model.coef_
# In[156]:
model = X_rep_reduc.join(train_target)   # rows outside the training split carry a missing target and drop out of the fit
import statsmodels.api as sm
import statsmodels.formula.api as smf
formula = ("TARGET_B ~ GiftAvgLast + GiftTimeFirst + GiftTimeLast + GiftCnt36"
           " + GiftAvgCard36 + PromCntCard36 + StatusCat96NK")
lg_m = smf.glm(formula=formula, data=model,
               family=sm.families.Binomial()).fit()   # logit is the default Binomial link
lg_m.summary().tables[1]
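# GLM coefficients are on the log-odds scale; exponentiating them gives odds ratios,
# which are usually easier to communicate (a small addition, not in the original notebook):
# In[ ]:
np.exp(lg_m.params)   # e.g. 1.2 means the odds rise about 20% per unit increase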
# ### 5. The A (Assess) stage: model validation
# - Evaluate the logistic regression model
# In[56]:
test_est = logistic_model.predict(test_data)
train_est = logistic_model.predict(train_data)
# In[57]:
test_est_p = logistic_model.predict_proba(test_data)[:,1]
train_est_p = logistic_model.predict_proba(train_data)[:,1]
# In[58]:
import sklearn.metrics as metrics
print(metrics.classification_report(test_target, test_est))
# In[59]:
print(metrics.classification_report(train_target, train_est))
# In[60]:
metrics.zero_one_loss(test_target, test_est)
# In[61]:
metrics.zero_one_loss(train_target, train_est)
# - Score distributions of target vs. non-target samples
# In[65]:
import seaborn as sns
red, blue = sns.color_palette("Set1",2)
# In[66]:
sns.kdeplot(test_est_p[test_target==1], shade=True, color=red)
sns.kdeplot(test_est_p[test_target==0], shade=True, color=blue)
# - ROC curve
# In[67]:
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)
plt.figure(figsize=[6,6])
plt.plot(fpr_test, tpr_test, color=blue)
plt.plot(fpr_train, tpr_train, color=red)
plt.title("ROC curve")
print("AUC = %6.4f" % metrics.auc(fpr_test, tpr_test))
# In[68]:
test_x_axis = np.arange(len(fpr_test))/float(len(fpr_test))
train_x_axis = np.arange(len(fpr_train))/float(len(fpr_train))
plt.figure(figsize=[6,6])
plt.plot(fpr_test, test_x_axis, color=blue)
plt.plot(tpr_test, test_x_axis, color=blue)
plt.plot(fpr_train, train_x_axis, color=red)
plt.plot(tpr_train, train_x_axis, color=red)
plt.title("KS curve")
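# The KS statistic itself is the largest vertical gap between the two cumulative curves,
# i.e. max(TPR - FPR) over all thresholds (a small addition, not in the original):
# In[ ]:
print("KS(test)  = %6.4f" % max(tpr_test - fpr_test))
print("KS(train) = %6.4f" % max(tpr_train - fpr_train))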
# - Build and evaluate a neural network
# In[69]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_data)
scaled_train_data = scaler.transform(train_data)
scaled_test_data = scaler.transform(test_data)
# In[70]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(10,),
                    activation="logistic", alpha=0.1, max_iter=1000)
# In[71]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
param_grid = {
    "hidden_layer_sizes": [(10,), (15,), (20,), (5, 5)],
    "activation": ["logistic", "tanh", "relu"],
    "alpha": [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]
}
mlp = MLPClassifier(max_iter=1000)
gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring="roc_auc", cv=4, n_jobs=-1)
gcv.fit(scaled_train_data, train_target)
# In[72]:
gcv.best_params_
# In[73]:
mlp = MLPClassifier(hidden_layer_sizes=gcv.best_params_["hidden_layer_sizes"],
                    activation=gcv.best_params_["activation"],
                    alpha=gcv.best_params_["alpha"], max_iter=1000)
mlp.fit(scaled_train_data, train_target)
# In[74]:
train_predict = mlp.predict(scaled_train_data)
test_predict = mlp.predict(scaled_test_data)
# In[75]:
train_proba = mlp.predict_proba(scaled_train_data)[:, 1]
test_proba = mlp.predict_proba(scaled_test_data)[:, 1]
# In[76]:
from sklearn import metrics
print(metrics.confusion_matrix(test_target, test_predict, labels=[0, 1]))
print(metrics.classification_report(test_target, test_predict))
# In[77]:
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_proba)
plt.figure(figsize=[4, 4])
plt.plot(fpr_test, tpr_test, "b-")
plt.plot(fpr_train, tpr_train, "r-")
plt.title("ROC curve")
plt.show()
print("AUC = %6.4f" % metrics.auc(fpr_test, tpr_test))
# ### Persisting the model
# In[78]:
import pickle
model_file = open("logistic.model", "wb")
pickle.dump(logistic_model, model_file)
model_file.close()
# In[79]:
model_load_file = open("logistic.model", "rb")
model_load = pickle.load(model_load_file)
model_load_file.close()
# In[80]:
test_est_load = model_load.predict(test_data)
# In[81]:
pd.crosstab(test_est_load, test_est)   # the reloaded model should reproduce the original predictions exactly
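# A deployment caveat: the pickled model expects min-max scaled inputs, so the fitted
# scaler (and any WOE mappings) must be persisted alongside it. A sketch, with a
# hypothetical file name:
# In[ ]:
with open("scaler.model", "wb") as f:   # file name is an assumption
    pickle.dump(min_max_scaler, f)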