A Two-Stage Prediction Model for Precision Marketing: Customer Response Model + Code

On to the code:

# coding: utf-8

# Case study from 《Python數據科學實戰》 (Python Data Science in Action): a marketing prediction model for a charity

# # Model training and usage workflow

# - Data extraction

# - Data exploration

# - Modeling data preparation

# - Variable selection

# - Model development and validation

# - Model deployment

# - Model monitoring

# In[86]:

# ## Step 1: Build the marketing response model

# ### 1. The S (Sample) stage: data acquisition and import

# - Tidy the dataset

# In[87]:

import pandas as pd

model_data = pd.read_csv("donations2.csv").drop(["ID", "TARGET_D"], axis=1)

model_data.head()

# In[88]:

model_data.dtypes

# In[89]:

#model_data["TARGET_B"]=pd.Categorical(model_data["TARGET_B"],ordered=False)

model_data["StatusCat96NK"]=pd.Categorical(model_data["StatusCat96NK"],ordered=False)

model_data["DemCluster"]=pd.Categorical(model_data["DemCluster"],ordered=False)

model_data["DemGender"]=pd.Categorical(model_data["DemGender"],ordered=False)

model_data["DemHomeOwner"]=pd.Categorical(model_data["DemHomeOwner"],ordered=False)

# The official pandas online documentation discusses categorical variables in detail, with comparisons to R where appropriate.

# Categorical Data

# In[90]:

model_data.dtypes

# In[91]:

y = "TARGET_B"

var_c = ["GiftCnt36", "GiftCntAll", "GiftCntCard36", "GiftCntCardAll", "GiftTimeLast", "GiftTimeFirst",
         "PromCnt12", "PromCnt36", "PromCntAll", "PromCntCard12", "PromCntCard36", "PromCntCardAll",
         "StatusCatStarAll", "DemAge", "DemMedHomeValue", "DemPctVeterans", "DemMedIncome", "GiftAvgLast",
         "GiftAvg36", "GiftAvgAll", "GiftAvgCard36"]

var_d = list(set(model_data.columns) - set(var_c) - set([y]))

# In[92]:

X = model_data[var_c+var_d]

Y = model_data[y]

# - Select variables with strong predictive power

# In[93]:

from woe import WoE

# **WoE class parameters**:

# + **qnt_num**: int, number of equal-frequency bins, default 16

# + **min_block_size**: int, minimum number of observations per bin, default 16

# + **spec_values**: dict, replacement values to assign when the predictor is categorical

# + **v_type**: str, predictor type, categorical: 'd', continuous: 'c', default 'c'

# + **bins**: list, predefined bin boundaries for continuous predictors

# + **t_type**: str, target variable type, binary: 'b', continuous: 'c', default 'b'

# **Key WoE methods** (a usage sketch follows this list):

#

# + **plot**: draw the WOE chart

# + **transform**: convert data to WOE values

# + **fit_transform**: fit the binning and convert data to WOE values

# + **optimize**: optimal binning for continuous predictors

# **Key WoE attributes**:

# + **bins**: summary of the binning result

# + **iv**: information value of the variable
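# A minimal usage sketch of the WoE class based on the parameter/method list above; `woe_demo` and `gift_cnt_woe` are illustrative names, and the call pattern is assumed from that list rather than taken from the original notebook:

woe_demo = WoE(v_type="c", t_type="b", qnt_num=3)    # continuous predictor, binary target, 3 equal-frequency bins

woe_demo.fit(X["GiftCntAll"], Y)                     # learn the binning and the WOE value of each bin

woe_demo.iv                                          # information value of GiftCntAll

woe_demo.bins                                        # binning summary

gift_cnt_woe = woe_demo.transform(X["GiftCntAll"])   # replace the raw values with their WOE values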

# + Screen variables by IV: categorical variables

# In[94]:

from woe import WoE

iv_d = {}

for i in var_d:
    iv_d[i] = WoE(v_type="d", t_type="b").fit(X[i], Y).iv

pd.Series(iv_d).sort_values(ascending=False)

# In[95]:

var_d_s = list(set(var_d)-set(["DemHomeOwner","DemGender"]))

# + Screen variables by IV: continuous variables

# In[96]:

iv_c = {}

for i in var_c:
    iv_c[i] = WoE(v_type="c", t_type="b", qnt_num=3).fit(X[i], Y).iv

pd.Series(iv_c).sort_values(ascending=False)

# In[97]:

var_c_s = list(set(var_c)-set(["PromCntCard12","PromCnt12","DemMedHomeValue","PromCnt36", "DemAge","DemPctVeterans","DemMedIncome","StatusCatStarAll","GiftCntCard36"]))

# In[98]:

X = model_data[var_c_s+var_d_s]

Y = model_data[y]

# ### 2. The E (Explore) stage: examine each variable

# - Statistical exploration of continuous variables

# In[99]:

X[var_c_s].describe().T

# In[100]:

import matplotlib.pyplot as plt

plt.hist(X["PromCntAll"], bins=20)

plt.show()

# In[101]:

abs((X[var_c_s].mode().iloc[0] - X[var_c_s].median()) / (X[var_c_s].quantile(0.75) - X[var_c_s].quantile(0.25)))

# - Statistical exploration of categorical variables

# In[102]:

X["DemCluster"].value_counts()

# ### 3. The M (Modify) stage: fix problematic variables

# - Recode erroneous values of continuous variables as missing

# In[103]:

X.isnull().sum()/(X.count()+X.isnull().sum())
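# The cell above only reports each variable's missing-value ratio; a sketch of the recoding step this heading describes, under the purely hypothetical rule that negative gift averages are data-entry errors:

import numpy as np

X.loc[X["GiftAvgCard36"] < 0, "GiftAvgCard36"] = np.nan   # hypothetical error rule; replace with the checks appropriate to the real data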

# - Impute missing values of continuous variables with the median

# In[104]:

gift_avg_card36_median = X.GiftAvgCard36.median()

X.GiftAvgCard36.fillna(value=gift_avg_card36_median, inplace=True)

# - Merge (generalize) levels of categorical variables that have too many categories

# In[105]:

X.DemCluster.value_counts()

# In[106]:

X_rep = X.replace({"DemCluster": [37, 48, 5, 47, 7, 29, 32, 9, 50, 6, 33]}, 100)
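# The cell above hard-codes the DemCluster codes to merge into level 100; a sketch of one way to pick such low-frequency levels automatically, using a hypothetical minimum-count threshold:

min_count = 100   # hypothetical threshold, not taken from the original analysis

cluster_counts = X["DemCluster"].value_counts()

rare_levels = cluster_counts[cluster_counts < min_count].index.tolist()   # candidate levels for merging

rare_levels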

# In[107]:

for i in var_d_s:
    X_rep[i] = WoE(v_type="d").fit_transform(X[i], Y)

# In[108]:

import sklearn.ensemble as ensemble

rfc = ensemble.RandomForestClassifier(criterion="entropy", n_estimators=3, max_features=0.5, min_samples_split=5)

rfc_model = rfc.fit(X_rep, Y)

rfc_model.feature_importances_

rfc_fi = pd.DataFrame()

rfc_fi["features"] = list(X.columns)

rfc_fi["importance"] = list(rfc_model.feature_importances_)

rfc_fi=rfc_fi.set_index("features",drop=True)

rfc_fi.sort_values(by="importance", ascending=False).plot(kind="bar")

# In[109]:

var_x = ["GiftAvgAll","DemCluster","PromCntAll","GiftTimeFirst","GiftAvg36","GiftTimeLast", "GiftAvgCard36","GiftCntAll","PromCntCardAll","GiftAvgLast","PromCntCard36","GiftCntCardAll", "GiftCnt36"]

# - Transform the distributions of the explanatory variables

# In[25]:

import matplotlib.pyplot as plt

for i in var_x:
    plt.hist(X_rep[i], bins=20)
    plt.show()

# In[110]:

skew_var_x = {}

for i in var_x:
    skew_var_x[i] = abs(X_rep[i].skew())

skew = pd.Series(skew_var_x).sort_values(ascending=False)

skew

# In[111]:

var_x_ln = skew.index[skew>=1]

import numpy as np

for i in var_x_ln:
    if min(X_rep[i]) < 0:
        X_rep[i] = np.log(X_rep[i] + abs(min(X_rep[i])) + 0.01)
    else:
        X_rep[i] = np.log(X_rep[i] + 0.01)

# In[112]:

skew_var_x = {}

for i in var_x:
    skew_var_x[i] = abs(X_rep[i].skew())

skew = pd.Series(skew_var_x).sort_values(ascending=False)

skew

# - Variable reduction

# In[146]:

from VarSelec import *

X_rep_reduc=Var_Select_auto(X_rep)

X_rep_reduc.head()

# In[155]:

X_rep_reduc.corr()
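# `Var_Select_auto` is imported from VarSelec, which appears to be a helper module distributed with the course material. If it is unavailable, a minimal stand-in that drops one variable from each highly correlated pair (hypothetical 0.7 threshold) might look like this sketch, which is not the original routine:

import numpy as np

def reduce_by_correlation(df, threshold=0.7):
    """Drop one column from every pair whose absolute correlation exceeds the threshold."""
    corr = df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))   # keep only the upper triangle
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)

# X_rep_reduc = reduce_by_correlation(X_rep)   # only as a fallback when VarSelec is unavailable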

# ### 4. The M (Model) stage: build a logistic regression model

# - Split the data into training and test sets (70% / 30%, per test_size=0.3 below)

# In[147]:

import sklearn.model_selection as model_selection

train_data, test_data, train_target, test_target = model_selection.train_test_split(X_rep_reduc, Y, test_size=0.3, random_state=0)

# - Model training

# - Logistic regression using all the variables

# In[148]:

from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

train_data = min_max_scaler.fit_transform(train_data)

test_data = min_max_scaler.transform(test_data)   # apply the scaler fitted on the training data; do not re-fit on the test set

train_data

# In[149]:

import sklearn.linear_model as linear_model

logistic_model = linear_model.LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
                                                 intercept_scaling=1, penalty="l1", solver="liblinear",
                                                 random_state=None, tol=0.001)   # liblinear supports the L1 penalty

# In[150]:

from sklearn.model_selection import ParameterGrid, GridSearchCV

C = np.logspace(-3, 0, 20, base=10)

param_grid = {"C": C}

clf_cv = GridSearchCV(estimator=logistic_model,
                      param_grid=param_grid,
                      cv=5,
                      scoring="roc_auc")

clf_cv.fit(train_data, train_target)

# In[151]:

import sklearn.linear_model as linear_model

logistic_model = linear_model.LogisticRegression(C=clf_cv.best_params_["C"], class_weight=None, dual=False,
                                                 fit_intercept=True, intercept_scaling=1, penalty="l1",
                                                 solver="liblinear", random_state=None, tol=0.001)

logistic_model.fit(train_data, train_target)

# In[152]:

logistic_model.coef_

# In[156]:

model=X_rep_reduc.join(train_target)

import statsmodels.api as sm

import statsmodels.formula.api as smf

formula = "TARGET_B ~ GiftAvgLast + GiftTimeFirst + GiftTimeLast + GiftCnt36 + GiftAvgCard36 + PromCntCard36 + StatusCat96NK"

lg_m = smf.glm(formula=formula, data=model,
               family=sm.families.Binomial(sm.families.links.logit())).fit()

lg_m.summary().tables[1]

# ### 5. The A (Assess) stage: model validation

# - Evaluate the logistic regression model

# In[56]:

test_est = logistic_model.predict(test_data)

train_est = logistic_model.predict(train_data)

# In[57]:

test_est_p = logistic_model.predict_proba(test_data)[:,1]

train_est_p = logistic_model.predict_proba(train_data)[:,1]

# In[58]:

import sklearn.metrics as metrics

print(metrics.classification_report(test_target, test_est))

# In[59]:

print(metrics.classification_report(train_target, train_est))

# In[60]:

metrics.zero_one_loss(test_target, test_est)

# In[61]:

metrics.zero_one_loss(train_target, train_est)

# - Score distributions for target and non-target samples

# In[65]:

import seaborn as sns

red, blue = sns.color_palette("Set1",2)

# In[66]:

sns.kdeplot(test_est_p[test_target==1], shade=True, color=red)

sns.kdeplot(test_est_p[test_target==0], shade=True, color=blue)

# - ROC curve

# In[67]:

fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_est_p)

fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_est_p)

plt.figure(figsize=[6,6])

plt.plot(fpr_test, tpr_test, color=blue)

plt.plot(fpr_train, tpr_train, color=red)

plt.title("ROC curve")

print("AUC = %6.4f" % metrics.auc(fpr_test, tpr_test))

# In[68]:

test_x_axis = np.arange(len(fpr_test))/float(len(fpr_test))

train_x_axis = np.arange(len(fpr_train))/float(len(fpr_train))

plt.figure(figsize=[6,6])

plt.plot(fpr_test, test_x_axis, color=blue)

plt.plot(tpr_test, test_x_axis, color=blue)

plt.plot(fpr_train, train_x_axis, color=red)

plt.plot(tpr_train, train_x_axis, color=red)

plt.title("KS curve")
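# The plot above only overlays the cumulative TPR and FPR curves; the KS statistic itself is simply the maximum gap between them, e.g.:

print("KS (test)  = %6.4f" % (tpr_test - fpr_test).max())

print("KS (train) = %6.4f" % (tpr_train - fpr_train).max())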

# - Build and evaluate a neural network

# In[69]:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit(train_data)

scaled_train_data = scaler.transform(train_data)

scaled_test_data = scaler.transform(test_data)

# In[70]:

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10,),
                    activation="logistic", alpha=0.1, max_iter=1000)

# In[71]:

from sklearn.model_selection import GridSearchCV

from sklearn import metrics

param_grid = {
    "hidden_layer_sizes": [(10,), (15,), (20,), (5, 5)],
    "activation": ["logistic", "tanh", "relu"],
    "alpha": [0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]
}

mlp = MLPClassifier(max_iter=1000)

gcv = GridSearchCV(estimator=mlp, param_grid=param_grid,
                   scoring="roc_auc", cv=4, n_jobs=-1)

gcv.fit(scaled_train_data, train_target)

# In[72]:

gcv.best_params_

# In[73]:

mlp = MLPClassifier(hidden_layer_sizes=gcv.best_params_["hidden_layer_sizes"],

activation=gcv.best_params_["activation"], alpha=gcv.best_params_["alpha"], max_iter=1000)

mlp.fit(scaled_train_data, train_target)

# In[74]:

train_predict = mlp.predict(scaled_train_data)

test_predict = mlp.predict(scaled_test_data)

# In[75]:

train_proba = mlp.predict_proba(scaled_train_data)[:, 1]

test_proba = mlp.predict_proba(scaled_test_data)[:, 1]

# In[76]:

from sklearn import metrics

print(metrics.confusion_matrix(test_target, test_predict, labels=[0, 1]))

print(metrics.classification_report(test_target, test_predict))

# In[77]:

fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_proba)

fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_proba)

plt.figure(figsize=[4, 4])

plt.plot(fpr_test, tpr_test, "b-")

plt.plot(fpr_train, tpr_train, "r-")

plt.title("ROC curve")

plt.show()

print("AUC = %6.4f" % metrics.auc(fpr_test, tpr_test))

# ### Model persistence

# In[78]:

import pickle

model_file = open("logistic.model", "wb")

pickle.dump(logistic_model, model_file)

model_file.close()

# In[79]:

model_load_file = open("logistic.model", "rb")

model_load = pickle.load(model_load_file)

model_load_file.close()

# In[80]:

test_est_load = model_load.predict(test_data)

# In[81]:

pd.crosstab(test_est_load,test_est)

# In[ ]:

