随着市场的不断变化和技术的不断发展,营销人员的工作内容和结构也不再拘泥于过去传统的理论概念,但是如何依靠最新的技术帮助企业实现销量增长、用户留存等问题,是当下很多企业打破困境的焦点问题。
站在企业的角度,他们渴望知晓:如果进行促销,折扣和买赠哪种方式更合适、如何通过营销增加自身的销售额等问题。其中,产品营销响应预测出现后对企业的营销产生了很大的影响,包括对用户进行细分并预测他们的生命周期价值(LTV)以达到精准定位的目的。在细分客户的基础上进行 A/B 测试,使企业能够尝试许多不同的想法来增加销量。
这里将从产品营销响应预测出发,详细介绍什么是产品营销响应预测,以及企业可以如何应用这一技术优化营销策略,增加收益。
xgboost实现产品营销响应预测
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import plotly.offline as pyoff
import plotly.graph_objs as go
import sklearn
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
pyoff.init_notebook_mode()
加载数据和自定义函数
# Renumber cluster labels so that they follow the ordering of a target column.
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel clusters by the rank of their mean ``target_field_name``.

    The mean of ``target_field_name`` is computed per cluster, the clusters are
    sorted by that mean, and each cluster id is replaced by its position in the
    sorted order (0, 1, 2, ...).

    Args:
        cluster_field_name: Name of the column holding the current cluster ids.
        target_field_name: Name of the column whose per-cluster mean defines
            the ordering.
        df: Input DataFrame; it is not modified.
        ascending: If True the cluster with the smallest mean gets label 0,
            otherwise the cluster with the largest mean does.

    Returns:
        A new DataFrame in which ``cluster_field_name`` holds the ordered
        labels. The relabelled column is placed last.
    """
    # Pick a temporary column name that cannot clash with an existing column;
    # a fixed name such as 'index' would be suffixed by the merge and the final
    # rename would then silently miss it.
    rank_column = '__cluster_rank__'
    while rank_column in df.columns:
        rank_column += '_'

    cluster_means = (
        df.groupby(cluster_field_name)[target_field_name]
        .mean()
        .reset_index()
        .sort_values(by=target_field_name, ascending=ascending)
        .reset_index(drop=True)
    )
    # After the sort + reset, the positional index is the rank of each cluster.
    cluster_means[rank_column] = cluster_means.index

    df_final = pd.merge(
        df,
        cluster_means[[cluster_field_name, rank_column]],
        on=cluster_field_name,
    )
    df_final = df_final.drop([cluster_field_name], axis=1)
    return df_final.rename(columns={rank_column: cluster_field_name})
# Load the campaign data set from data.csv in the working directory
df_data = pd.read_csv('data.csv')
# Preview the first rows (rendered only when this is the last expression of a notebook cell)
df_data.head()
数据指标说明
- recency: 自上次购买以来的几个月
- history: 历史购买的金额值
- used_discount/used_bogo: 是否使用过折扣/买一送一
- zip_code: 邮政编码类别
- is_referral: 客户是否来自推荐渠道
- channel: 客户下单渠道,电话/网络/多渠道
- offer: 发送给客户的优惠类别,折扣/买一送一/无优惠
计算公式说明
- 转化提升:test组转化率 - 对照组转化率
- 订单提升:转化提升 * test组的客户数
- 收入提升:订单提升 * 平均订单价值
转化率函数
def calc_uplift(df, avg_order_value=10):
    """Print the conversion, order and revenue uplift of each offer type.

    Each offer group ('Discount', 'Buy One Get One') is compared against the
    'No Offer' group, which serves as the control.

    Args:
        df: DataFrame with an ``offer`` column containing the labels
            'No Offer', 'Discount' and 'Buy One Get One', and a numeric
            ``conversion`` column.
        avg_order_value: Revenue attributed to a single order. Defaults to 10.

    Returns:
        None. The results are written to standard output.
    """
    # Row masks for the two treatment groups, computed once and reused.
    is_discount = df['offer'] == 'Discount'
    is_bogo = df['offer'] == 'Buy One Get One'

    # Mean of `conversion` per group, i.e. the conversion rate of that group.
    base_conv = df.loc[df['offer'] == 'No Offer', 'conversion'].mean()
    disc_conv = df.loc[is_discount, 'conversion'].mean()
    bogo_conv = df.loc[is_bogo, 'conversion'].mean()

    # Conversion uplift: treatment rate minus control rate.
    disc_conv_uplift = disc_conv - base_conv
    bogo_conv_uplift = bogo_conv - base_conv

    # Order uplift: conversion uplift scaled by the number of rows in the group.
    disc_order_uplift = disc_conv_uplift * is_discount.sum()
    bogo_order_uplift = bogo_conv_uplift * is_bogo.sum()

    # Revenue uplift: order uplift times the value of one order.
    disc_rev_uplift = disc_order_uplift * avg_order_value
    bogo_rev_uplift = bogo_order_uplift * avg_order_value

    print('折扣转化提升: {0}%'.format(np.round(disc_conv_uplift * 100, 2)))
    print('折扣订单提升: {0}'.format(np.round(disc_order_uplift, 2)))
    print('折扣收入提升: {0}\n'.format(np.round(disc_rev_uplift, 2)))
    print('-------------- \n')
    print('买一送一转换提升: {0}%'.format(np.round(bogo_conv_uplift * 100, 2)))
    print('买一送一订单提升: {0}'.format(np.round(bogo_order_uplift, 2)))
    print('买一送一收入提升: {0}'.format(np.round(bogo_rev_uplift, 2)))
# Print the uplift report for the full data set
calc_uplift(df_data)
从结果可以看出来折扣的转化率优于买一送一的转化率。
折扣转化提升: 7.66%
折扣订单提升: 1631.89
折扣收入提升: 16318.94
--------------
买一送一转换提升: 4.52%
买一送一订单提升: 967.4
买一送一收入提升: 9674.0
数据预处理特征工程
recency新近况
理论上,当 recency 上升时转化率应该下降,因为非活跃客户再次购买的可能性较低。
# Mean conversion for every distinct recency value.
df_plot = df_data.groupby('recency')['conversion'].mean().reset_index()

# A single bar trace: recency on x, mean conversion on y.
plot_data = [go.Bar(x=df_plot['recency'], y=df_plot['conversion'])]

# Treat the x axis as categorical; same grey for plot and paper background.
plot_layout = go.Layout(
    title='新近况与转化率',
    xaxis=dict(type='category'),
    paper_bgcolor='rgb(243,243,243)',
    plot_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
整体来看,转化率随 recency 的变化基本符合预期;各月之间的涨跌幅度有所不同,这与实际运营中的多种因素有关。
history历史消费
用聚类的方法进行观察历史消费金额,这里也可以使用pd.cut划分不同的区间操作。最终目标将 history 列数据进行离散化处理。
# Discretise `history` into 5 clusters. random_state pins the centroid
# initialisation so that re-running the notebook yields the same clusters
# (the split and the XGBoost models are already seeded); n_init is passed
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
df_data['history_cluster'] = kmeans.fit_predict(df_data[['history']])
# Renumber the clusters by ascending mean `history` (label 0 = lowest mean)
df_data = order_cluster('history_cluster', 'history', df_data, True)
# Per-cluster summary: mean/min/max of history, row count and mean conversion
df_data.groupby('history_cluster').agg({'history': ['mean', 'min', 'max'], 'conversion': ['count', 'mean']})
# Plot the mean conversion of every history cluster.
df_plot = df_data.groupby('history_cluster')['conversion'].mean().reset_index()

plot_data = [go.Bar(x=df_plot['history_cluster'], y=df_plot['conversion'])]

plot_layout = go.Layout(
    title='历史数据和转化率',
    xaxis=dict(type='category'),
    paper_bgcolor='rgb(243,243,243)',
    plot_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
使用折扣/买一送一
对不同类别购买方式进行聚合计算平均的购买率。
# Mean conversion for every (used_discount, used_bogo, offer) combination
df_data.groupby(['used_discount','used_bogo','offer']).agg({'conversion':'mean'})
发现使用买一送一和优惠券的转化率较高。
zip_code邮编
按照不同地区查看消费的转化率。
# Mean conversion per zip_code category.
df_plot = df_data.groupby('zip_code')['conversion'].mean().reset_index()

# One colour per bar, in the order the categories appear in df_plot.
bar_colors = dict(color=['green', 'blue', 'orange'])
plot_data = [
    go.Bar(x=df_plot['zip_code'], y=df_plot['conversion'], marker=bar_colors),
]

plot_layout = go.Layout(
    title='邮政编码和转化率',
    xaxis=dict(type='category'),
    paper_bgcolor='rgb(243,243,243)',
    plot_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Referral推荐
# Mean conversion for each value of is_referral.
df_plot = df_data.groupby('is_referral')['conversion'].mean().reset_index()

# One colour per bar, in the order the values appear in df_plot.
bar_colors = dict(color=['green', 'blue'])
plot_data = [
    go.Bar(x=df_plot['is_referral'], y=df_plot['conversion'], marker=bar_colors),
]

plot_layout = go.Layout(
    title='推荐转化率',
    xaxis=dict(type='category'),
    paper_bgcolor='rgb(243,243,243)',
    plot_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Channel渠道
# Mean conversion per ordering channel.
df_plot = df_data.groupby('channel')['conversion'].mean().reset_index()

# One colour per bar, in the order the channels appear in df_plot.
bar_colors = dict(color=['green', 'blue', 'orange'])
plot_data = [
    go.Bar(x=df_plot['channel'], y=df_plot['conversion'], marker=bar_colors),
]

plot_layout = go.Layout(
    title='Channel vs Conversion',
    xaxis=dict(type='category'),
    paper_bgcolor='rgb(243,243,243)',
    plot_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
offer优惠政策
# Mean conversion per offer type.
df_plot = df_data.groupby('offer')['conversion'].mean().reset_index()

# One colour per bar, in the order the offers appear in df_plot.
bar_colors = dict(color=['green', 'blue', 'orange'])
plot_data = [
    go.Bar(x=df_plot['offer'], y=df_plot['conversion'], marker=bar_colors),
]

plot_layout = go.Layout(
    title='折扣类别和转化率',
    xaxis=dict(type='category'),
    paper_bgcolor='rgb(243,243,243)',
    plot_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
特征工程转化
应用.get_dummies()将分类列转换为离散哑变量。
# One-hot encode the categorical columns on a copy, leaving df_data untouched.
df_model = pd.get_dummies(df_data.copy())
# Show the encoded frame (rendered only as the last expression of a notebook cell).
df_model
模型的构建和预测
创建特征集、标签及基础参数
# Label vector and feature matrix (everything except the label column).
y = df_model['conversion']
X = df_model.drop(columns=['conversion'])
# Hold out 30% of the rows for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

# Settings shared by every grid search below.
cv_num, verbose, n_jobs = 5, 1, 8
scoring_fuction = 'accuracy'  # candidates are ranked by accuracy
参数调优组合:max_depth / min_child_weight
- max_depth:树的最大深度。值越大,树越深,模型越复杂,也越容易过拟合;适当限制该值可以防止过拟合。
- min_child_weight一个子集的所有观察值的最小权重和。
%%time
# Candidate grid: depth 4..8 crossed with three minimum child weights.
cv_params = {'max_depth': list(range(4, 9)), 'min_child_weight': [1, 3, 6]}
# Fixed values for every parameter; the searched ones are overridden per candidate.
other_params = dict(
    learning_rate=0.1, n_estimators=500, max_depth=5, min_child_weight=1, seed=0,
    subsample=0.8, colsample_bytree=0.8, gamma=0, reg_alpha=0, reg_lambda=1,
    objective='reg:logistic',  # learning objective
)
model = xgb.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(
    estimator=model, param_grid=cv_params, scoring=scoring_fuction, cv=cv_num,
    return_train_score=True, verbose=verbose, n_jobs=n_jobs,
)
optimized_GBM.fit(X_train, y_train.values)

# One line per candidate: parameters, mean and std of the test score, and rank.
print("每次运行的结果如下:")
cv_results = optimized_GBM.cv_results_
for rank, params, mean_score, std_score in zip(
    cv_results['rank_test_score'],
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print(f" params:{params!s} mean:{mean_score!s}", f" std:{std_score!s} rank:{rank!s}")
print("---------------------------")
print(f'参数的最佳取值:{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

# Keep the winning values for the following tuning rounds.
max_depth = optimized_GBM.best_params_['max_depth']
min_child_weight = optimized_GBM.best_params_['min_child_weight']
Fitting 5 folds for each of 15 candidates, totalling 75 fits
每次运行的结果如下:
params:{'max_depth': 4, 'min_child_weight': 1} mean:0.8525669642857144 std:0.0003076796596448719 rank:3
params:{'max_depth': 4, 'min_child_weight': 3} mean:0.8526562499999999 std:0.0003694853874385374 rank:2
params:{'max_depth': 4, 'min_child_weight': 6} mean:0.8527901785714285 std:0.00038012023138230233 rank:1
params:{'max_depth': 5, 'min_child_weight': 1} mean:0.8513839285714286 std:0.0008945296834604386 rank:6
params:{'max_depth': 5, 'min_child_weight': 3} mean:0.8519642857142857 std:0.0010483945148051293 rank:5
params:{'max_depth': 5, 'min_child_weight': 6} mean:0.8521428571428572 std:0.0007582717746188941 rank:4
params:{'max_depth': 6, 'min_child_weight': 1} mean:0.8499553571428571 std:0.0013141237526370594 rank:9
params:{'max_depth': 6, 'min_child_weight': 3} mean:0.8506473214285715 std:0.00088275535416452 rank:8
params:{'max_depth': 6, 'min_child_weight': 6} mean:0.8510267857142857 std:0.0011124942673551054 rank:7
params:{'max_depth': 7, 'min_child_weight': 1} mean:0.8476116071428571 std:0.002034313819654372 rank:12
params:{'max_depth': 7, 'min_child_weight': 3} mean:0.8481249999999999 std:0.0016424082267095638 rank:11
params:{'max_depth': 7, 'min_child_weight': 6} mean:0.8491071428571428 std:0.0015023498812914348 rank:10
params:{'max_depth': 8, 'min_child_weight': 1} mean:0.8447991071428571 std:0.0016244111963089499 rank:15
params:{'max_depth': 8, 'min_child_weight': 3} mean:0.8454910714285713 std:0.0012795455935684494 rank:14
params:{'max_depth': 8, 'min_child_weight': 6} mean:0.8469866071428571 std:0.0014620109726618308 rank:13
---------------------------
参数的最佳取值:{'max_depth': 4, 'min_child_weight': 6}
最佳模型得分:0.8527901785714285
CPU times: total: 51.7 s
Wall time: 3min 8s
# Mean test score of every candidate, in the order the grid search evaluated them.
grid_visualization = np.array(optimized_GBM.cv_results_['mean_test_score'])
# Rows follow max_depth, columns follow min_child_weight.
n_depths = len(cv_params['max_depth'])
n_weights = len(cv_params['min_child_weight'])
scores = grid_visualization.reshape(n_depths, n_weights)

plt.figure(figsize=(10, 6))
cp = plt.contourf(cv_params['min_child_weight'], cv_params['max_depth'], scores, cmap='BrBG')
plt.colorbar(cp)
plt.title('max_depth / min_child_weight 优化图', fontsize=20)
plt.xlabel('min_child_weight', fontsize=20)
plt.ylabel('max_depth', fontsize=20)
plt.grid(True)
参数调优:gamma
gamma 分裂节点时,损失函数减小值只有大于等于gamma节点才分裂,gamma值越大,算法越保守,越不容易过拟合,但性能就不一定能保证,需要平衡。
%%time
# Candidate grid: gamma = 0.1 * step for step 0..4.
cv_params = {'gamma': [0.1 * step for step in range(5)]}
# Fixed values; max_depth and min_child_weight come from the previous round.
other_params = dict(
    learning_rate=0.1, n_estimators=500, max_depth=max_depth,
    min_child_weight=min_child_weight, seed=0,
    subsample=0.8, colsample_bytree=0.8, gamma=0, reg_alpha=0, reg_lambda=1,
    gpu_id=0, tree_method='gpu_hist',  # GPU acceleration
    objective='reg:logistic',  # learning objective
)
model = xgb.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(
    estimator=model, param_grid=cv_params, scoring=scoring_fuction, cv=cv_num,
    return_train_score=True, verbose=verbose, n_jobs=n_jobs,
)
optimized_GBM.fit(X_train, y_train.values)

# One line per candidate: parameters, mean and std of the test score, and rank.
print("每次运行的结果如下:")
cv_results = optimized_GBM.cv_results_
for rank, params, mean_score, std_score in zip(
    cv_results['rank_test_score'],
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print(f" params:{params!s} mean:{mean_score!s}", f" std:{std_score!s} rank:{rank!s}")
print("---------------------------")
print(f'参数的最佳取值:{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

# Keep the winning value for the following tuning rounds.
gamma = optimized_GBM.best_params_['gamma']
Fitting 5 folds for each of 5 candidates, totalling 25 fits
每次运行的结果如下:
params:{'gamma': 0.0} mean:0.8528348214285714 std:0.00025055741429287176 rank:5
params:{'gamma': 0.1} mean:0.8528794642857143 std:0.00030278258853238025 rank:3
params:{'gamma': 0.2} mean:0.8529687500000002 std:0.00016703827619527948 rank:1
params:{'gamma': 0.30000000000000004} mean:0.8529464285714287 std:0.0002961272134245882 rank:2
params:{'gamma': 0.4} mean:0.8528571428571429 std:0.00032037276996444076 rank:4
---------------------------
参数的最佳取值:{'gamma': 0.2}
最佳模型得分:0.8529687500000002
CPU times: total: 2.7 s
Wall time: 37 s
# Mean test score of every gamma candidate, in grid order.
grid_visualization = np.array(optimized_GBM.cv_results_['mean_test_score'])

plt.figure(figsize=(10, 6))
plt.plot(cv_params['gamma'], grid_visualization)
plt.title('gamma优化')
plt.xlabel('gamma', fontsize=20)
plt.ylabel('score', fontsize=20)
plt.grid(True)
plt.show()
参数调优组合:subsample / colsample_bytree
- subsample 构建每棵树对样本的采样率,如果设置成0.5,XGBoost会随机选择一半的样本作为训练集。
- colsample_bytree 列采样率,也就是特征采样率。
%%time
# Candidate grid: 0.1 * step for step 6..8, for both sampling ratios.
sampling_ratios = [0.1 * step for step in range(6, 9)]
cv_params = {'subsample': list(sampling_ratios), 'colsample_bytree': list(sampling_ratios)}
# Fixed values; tree shape and gamma come from the previous rounds.
other_params = dict(
    learning_rate=0.1, n_estimators=500, max_depth=max_depth,
    min_child_weight=min_child_weight, seed=0,
    subsample=0.8, colsample_bytree=0.8, gamma=gamma, reg_alpha=0, reg_lambda=1,
    gpu_id=0, tree_method='gpu_hist',  # GPU acceleration
    objective='reg:logistic',  # learning objective
)
model = xgb.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(
    estimator=model, param_grid=cv_params, scoring=scoring_fuction, cv=cv_num,
    verbose=verbose, n_jobs=n_jobs,
)
optimized_GBM.fit(X_train, y_train.values)

# One line per candidate: parameters, mean and std of the test score, and rank.
print("每次运行的结果如下:")
cv_results = optimized_GBM.cv_results_
for rank, params, mean_score, std_score in zip(
    cv_results['rank_test_score'],
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print(f" params:{params!s} mean:{mean_score!s}", f" std:{std_score!s} rank:{rank!s}")
print("---------------------------")
print(f'参数的最佳取值:{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

# Keep the winning values for the following tuning rounds.
subsample = optimized_GBM.best_params_['subsample']
colsample_bytree = optimized_GBM.best_params_['colsample_bytree']
Fitting 5 folds for each of 9 candidates, totalling 45 fits
每次运行的结果如下:
params:{'colsample_bytree': 0.6000000000000001, 'subsample': 0.6000000000000001} mean:0.8527901785714287 std:0.0003076796596448559 rank:4
params:{'colsample_bytree': 0.6000000000000001, 'subsample': 0.7000000000000001} mean:0.8529017857142858 std:0.0004054888867094886 rank:3
params:{'colsample_bytree': 0.6000000000000001, 'subsample': 0.8} mean:0.8530803571428571 std:0.00020700041284588372 rank:1
params:{'colsample_bytree': 0.7000000000000001, 'subsample': 0.6000000000000001} mean:0.8527455357142857 std:0.0005167784331602867 rank:5
params:{'colsample_bytree': 0.7000000000000001, 'subsample': 0.7000000000000001} mean:0.8526562500000001 std:0.00041997517237116 rank:9
params:{'colsample_bytree': 0.7000000000000001, 'subsample': 0.8} mean:0.8527455357142857 std:0.00044976432320555326 rank:5
params:{'colsample_bytree': 0.8, 'subsample': 0.6000000000000001} mean:0.8527232142857143 std:0.00025055741429289154 rank:8
params:{'colsample_bytree': 0.8, 'subsample': 0.7000000000000001} mean:0.8527455357142857 std:0.0004920849929698202 rank:5
params:{'colsample_bytree': 0.8, 'subsample': 0.8} mean:0.8529687500000002 std:0.00016703827619527948 rank:2
---------------------------
参数的最佳取值:{'colsample_bytree': 0.6000000000000001, 'subsample': 0.8}
最佳模型得分:0.8530803571428571
CPU times: total: 2.77 s
Wall time: 1min 3s
# Mean test score of every candidate, in the order the grid search evaluated them.
grid_visualization = np.array(optimized_GBM.cv_results_['mean_test_score'])
# The grid search enumerates candidates with the parameter names sorted
# alphabetically and the last name varying fastest, so in the flat score list
# colsample_bytree is the slow axis and subsample the fast one. Reshape
# accordingly, then transpose so that rows follow subsample (the y axis below)
# and columns follow colsample_bytree (the x axis), as contourf expects.
scores = grid_visualization.reshape(
    len(cv_params['colsample_bytree']), len(cv_params['subsample'])
).T
plt.figure(figsize=(10, 6))
cp = plt.contourf(cv_params['colsample_bytree'], cv_params['subsample'], scores, cmap='BrBG')
plt.colorbar(cp)
plt.title('subsample / colsample_bytree 优化图', fontsize=20)
plt.xlabel('colsample_bytree', fontsize=20)
plt.ylabel('subsample', fontsize=20)
plt.grid(True)
参数调优组合:reg_alpha / reg_lambda
- reg_alpha:L1正则化项的权重,增大该值会让模型更加保守。
- reg_lambda L2正则化,这个参数是用来控制XGBoost的正则化部分的。虽然大部分数据科学家很少用到这个参数,但是这个参数在减少过拟合上还是可以挖掘出更多用处的。
%%time
# Candidate grid: the same five strengths for the L1 and the L2 penalty.
cv_params = {
    'reg_alpha': [0.05, 0.1, 1, 2, 3],
    'reg_lambda': [0.05, 0.1, 1, 2, 3],
}
# Fixed values; tree shape, gamma and sampling ratios come from the previous rounds.
other_params = dict(
    learning_rate=0.1, n_estimators=500, max_depth=max_depth,
    min_child_weight=min_child_weight, seed=0,
    subsample=subsample, colsample_bytree=colsample_bytree, gamma=gamma,
    reg_alpha=0, reg_lambda=1,
    gpu_id=0, tree_method='gpu_hist',  # GPU acceleration
    objective='reg:logistic',  # learning objective
)
model = xgb.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(
    estimator=model, param_grid=cv_params, scoring=scoring_fuction, cv=cv_num,
    verbose=verbose, n_jobs=n_jobs,
)
optimized_GBM.fit(X_train, y_train.values)

# One line per candidate: parameters, mean and std of the test score, and rank.
print("每次运行的结果如下:")
cv_results = optimized_GBM.cv_results_
for rank, params, mean_score, std_score in zip(
    cv_results['rank_test_score'],
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print(f" params:{params!s} mean:{mean_score!s}", f" std:{std_score!s} rank:{rank!s}")
print("---------------------------")
print(f'参数的最佳取值:{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

# Keep the winning values for the following tuning rounds.
reg_alpha = optimized_GBM.best_params_['reg_alpha']
reg_lambda = optimized_GBM.best_params_['reg_lambda']
Fitting 5 folds for each of 25 candidates, totalling 125 fits
每次运行的结果如下:
params:{'reg_alpha': 0.05, 'reg_lambda': 0.05} mean:0.8529017857142858 std:0.00043512474753611006 rank:22
params:{'reg_alpha': 0.05, 'reg_lambda': 0.1} mean:0.8530580357142858 std:0.0003500979272624529 rank:15
params:{'reg_alpha': 0.05, 'reg_lambda': 1} mean:0.8530803571428571 std:0.00032037276996442834 rank:14
params:{'reg_alpha': 0.05, 'reg_lambda': 2} mean:0.8529017857142855 std:0.00035293277457238185 rank:23
params:{'reg_alpha': 0.05, 'reg_lambda': 3} mean:0.8529687499999999 std:0.0003640514828192227 rank:19
params:{'reg_alpha': 0.1, 'reg_lambda': 0.05} mean:0.8528348214285714 std:0.00039679439363016943 rank:24
params:{'reg_alpha': 0.1, 'reg_lambda': 0.1} mean:0.8528125 std:0.0004596263455797903 rank:25
params:{'reg_alpha': 0.1, 'reg_lambda': 1} mean:0.8529464285714287 std:0.0003203727699644578 rank:21
params:{'reg_alpha': 0.1, 'reg_lambda': 2} mean:0.8530133928571428 std:0.0003076796596448558 rank:18
params:{'reg_alpha': 0.1, 'reg_lambda': 3} mean:0.8530357142857143 std:0.0003414521995709424 rank:17
params:{'reg_alpha': 1, 'reg_lambda': 0.05} mean:0.8529687499999999 std:0.0003280566619798615 rank:19
params:{'reg_alpha': 1, 'reg_lambda': 0.1} mean:0.8530580357142858 std:0.00029612721342458153 rank:15
params:{'reg_alpha': 1, 'reg_lambda': 1} mean:0.8531473214285714 std:0.0003109015240442773 rank:11
params:{'reg_alpha': 1, 'reg_lambda': 2} mean:0.8531026785714285 std:0.000258389216580127 rank:12
params:{'reg_alpha': 1, 'reg_lambda': 3} mean:0.8532366071428571 std:0.0003076796596448558 rank:8
params:{'reg_alpha': 2, 'reg_lambda': 0.05} mean:0.85328125 std:0.00026031035244841955 rank:5
params:{'reg_alpha': 2, 'reg_lambda': 0.1} mean:0.8531026785714285 std:0.0002944398651400014 rank:12
params:{'reg_alpha': 2, 'reg_lambda': 1} mean:0.8532142857142858 std:0.00021641427934893621 rank:9
params:{'reg_alpha': 2, 'reg_lambda': 2} mean:0.8532142857142857 std:0.0002583892165801232 rank:10
params:{'reg_alpha': 2, 'reg_lambda': 3} mean:0.85328125 std:0.0002070004128458813 rank:5
params:{'reg_alpha': 3, 'reg_lambda': 0.05} mean:0.85328125 std:0.000240409143175625 rank:5
params:{'reg_alpha': 3, 'reg_lambda': 0.1} mean:0.8533035714285715 std:0.00020700041284588372 rank:1
params:{'reg_alpha': 3, 'reg_lambda': 1} mean:0.8533035714285715 std:0.00020700041284587653 rank:1
params:{'reg_alpha': 3, 'reg_lambda': 2} mean:0.8533035714285713 std:0.0001670382761952884 rank:4
params:{'reg_alpha': 3, 'reg_lambda': 3} mean:0.8533035714285715 std:0.00020700041284587653 rank:1
---------------------------
参数的最佳取值:{'reg_alpha': 3, 'reg_lambda': 0.1}
最佳模型得分:0.8533035714285715
CPU times: total: 3.22 s
Wall time: 2min 49s
# Mean test score of every candidate, in the order the grid search evaluated them.
grid_visualization = np.array(optimized_GBM.cv_results_['mean_test_score'])
# Rows follow reg_alpha, columns follow reg_lambda.
n_alphas = len(cv_params['reg_alpha'])
n_lambdas = len(cv_params['reg_lambda'])
scores = grid_visualization.reshape(n_alphas, n_lambdas)

plt.figure(figsize=(10, 6))
cp = plt.contourf(cv_params['reg_lambda'], cv_params['reg_alpha'], scores, cmap='BrBG')
plt.colorbar(cp)
plt.title('reg_alpha / reg_lambda 优化图', fontsize=20)
plt.xlabel('reg_lambda', fontsize=20)
plt.ylabel('reg_alpha', fontsize=20)
plt.grid(True)
参数调优:learning_rate
%%time
# Candidate grid: five learning rates.
cv_params = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2]}
# Fixed values; everything except learning_rate and n_estimators is already tuned.
other_params = dict(
    learning_rate=0.1, n_estimators=500, max_depth=max_depth,
    min_child_weight=min_child_weight, seed=0,
    subsample=subsample, colsample_bytree=colsample_bytree, gamma=gamma,
    reg_alpha=reg_alpha, reg_lambda=reg_lambda,
    gpu_id=0, tree_method='gpu_hist',  # GPU acceleration
    objective='reg:logistic',  # learning objective
)
model = xgb.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(
    estimator=model, param_grid=cv_params, scoring=scoring_fuction, cv=cv_num,
    verbose=verbose, n_jobs=n_jobs,
)
optimized_GBM.fit(X_train, y_train.values)

# One line per candidate: parameters, mean and std of the test score, and rank.
print("每次运行的结果如下:")
cv_results = optimized_GBM.cv_results_
for rank, params, mean_score, std_score in zip(
    cv_results['rank_test_score'],
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print(f" params:{params!s} mean:{mean_score!s}", f" std:{std_score!s} rank:{rank!s}")
print("---------------------------")
print(f'参数的最佳取值:{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

# Keep the winning value for the final tuning round.
learning_rate = optimized_GBM.best_params_['learning_rate']
Fitting 5 folds for each of 5 candidates, totalling 25 fits
每次运行的结果如下:
params:{'learning_rate': 0.01} mean:0.8533705357142857 std:4.464285714287363e-05 rank:1
params:{'learning_rate': 0.05} mean:0.8533482142857143 std:0.00015783633508627845 rank:3
params:{'learning_rate': 0.07} mean:0.8533705357142857 std:0.00014806360671228236 rank:1
params:{'learning_rate': 0.1} mean:0.8533035714285715 std:0.00020700041284588372 rank:4
params:{'learning_rate': 0.2} mean:0.8529910714285714 std:0.0004430677062785608 rank:5
---------------------------
参数的最佳取值:{'learning_rate': 0.01}
最佳模型得分:0.8533705357142857
CPU times: total: 2.73 s
Wall time: 35.2 s
# Mean test score of every learning_rate candidate, in grid order.
grid_visualization = np.array(optimized_GBM.cv_results_['mean_test_score'])

plt.figure(figsize=(10, 6))
plt.plot(cv_params['learning_rate'], grid_visualization)
plt.title('learning_rate优化')
plt.xlabel('learning_rate', fontsize=20)
plt.ylabel('score', fontsize=20)
plt.grid(True)
plt.show()
参数调优:n_estimators
%%time
# Candidate grid: 100, 200, ..., 1900 boosting rounds.
cv_params = {'n_estimators': list(range(100, 2000, 100))}
# Fixed values; every other parameter carries its tuned value.
other_params = dict(
    learning_rate=learning_rate, n_estimators=500, max_depth=max_depth,
    min_child_weight=min_child_weight, seed=0,
    subsample=subsample, colsample_bytree=colsample_bytree, gamma=gamma,
    reg_alpha=reg_alpha, reg_lambda=reg_lambda,
    gpu_id=0, tree_method='gpu_hist',  # GPU acceleration
    objective='reg:logistic',  # learning objective
)
model = xgb.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(
    estimator=model, param_grid=cv_params, scoring=scoring_fuction, cv=cv_num,
    verbose=verbose, n_jobs=n_jobs,
)
optimized_GBM.fit(X_train, y_train.values)

# One line per candidate: parameters, mean and std of the test score, and rank.
print("每次运行的结果如下:")
cv_results = optimized_GBM.cv_results_
for rank, params, mean_score, std_score in zip(
    cv_results['rank_test_score'],
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print(f" params:{params!s} mean:{mean_score!s}", f" std:{std_score!s} rank:{rank!s}")
print("---------------------------")
print(f'参数的最佳取值:{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

# Keep the winning value for the final model.
n_estimators = optimized_GBM.best_params_['n_estimators']
Fitting 5 folds for each of 19 candidates, totalling 95 fits
每次运行的结果如下:
params:{'n_estimators': 100} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 200} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 300} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 400} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 500} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 600} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 700} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 800} mean:0.8518945312499999 std:4.7841596538744825e-05 rank:1
params:{'n_estimators': 900} mean:0.8518945312499999 std:4.7841596538744825e-05 rank:1
params:{'n_estimators': 1000} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 1100} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 1200} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 1300} mean:0.851875 std:3.9062500000008884e-05 rank:3
params:{'n_estimators': 1400} mean:0.85185546875 std:6.17632355501426e-05 rank:16
params:{'n_estimators': 1500} mean:0.85185546875 std:6.17632355501426e-05 rank:16
params:{'n_estimators': 1600} mean:0.85185546875 std:6.17632355501426e-05 rank:16
params:{'n_estimators': 1700} mean:0.85185546875 std:6.17632355501426e-05 rank:16
params:{'n_estimators': 1800} mean:0.8518749999999999 std:7.307924583540956e-05 rank:14
params:{'n_estimators': 1900} mean:0.8518749999999999 std:7.307924583540956e-05 rank:14
---------------------------
参数的最佳取值:{'n_estimators': 800}
最佳模型得分:0.8518945312499999
CPU times: total: 3.95 s
Wall time: 4min 33s
# Mean test score of every n_estimators candidate, in grid order.
grid_visualization = np.array(optimized_GBM.cv_results_['mean_test_score'])

plt.figure(figsize=(10, 6))
plt.plot(cv_params['n_estimators'], grid_visualization)
plt.title('n_estimators优化')
plt.xlabel('n_estimators', fontsize=20)
plt.ylabel('score', fontsize=20)
plt.grid(True)
plt.show()
import os

# Train XGBoost with the best parameter combination found by the searches above.
print("最佳learning_rate:"+str(learning_rate))
print("最佳n_estimators:"+str(n_estimators))
print("最佳max_depth:"+str(max_depth))
print("最佳min_child_weight:"+str(min_child_weight))
print("最佳subsample:"+str(subsample))
print("最佳colsample_bytree:"+str(colsample_bytree))
print("最佳gamma:"+str(gamma))
print("最佳reg_alpha:"+str(reg_alpha))
print("最佳reg_lambda:"+str(reg_lambda))
model = xgb.XGBClassifier(
    learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth,
    min_child_weight=min_child_weight, seed=0,
    subsample=subsample, colsample_bytree=colsample_bytree, gamma=gamma,
    reg_alpha=reg_alpha, reg_lambda=reg_lambda,
    objective='reg:logistic',  # learning objective
)
model.fit(X_train, y_train.values)
# save_model does not create missing directories, so make sure the target
# folder exists before writing the booster.
os.makedirs('model', exist_ok=True)
model.get_booster().save_model('model/xgbc.model')
print("模型保存完毕!!!")
最佳learning_rate:0.01
最佳n_estimators:100
最佳max_depth:4
最佳min_child_weight:6
最佳subsample:0.8
最佳colsample_bytree:0.6000000000000001
最佳gamma:0.2
最佳reg_alpha:3
最佳reg_lambda:0.1
模型保存完毕!!!
特征重要性评分
# Per-feature scores reported by the booster, highest first.
booster_scores = model.get_booster().get_fscore()
feat_imp = pd.Series(booster_scores).sort_values(ascending=False)

feat_imp.plot(kind='bar', figsize=(16, 12), fontsize=10)
plt.title('特征的价值', fontdict={'size': 10})
plt.ylabel('特征重要性评分', fontdict={'size': 10})
最优参数模型训练
# Build the feature matrix and the label vector, then a seeded 70/30 split.
X = df_model.drop(['conversion'],axis=1)
y = df_model.conversion
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
# Fit the final classifier with hard-coded hyper-parameter values.
xgb_model = xgb.XGBClassifier(n_estimators=3000,
                              max_depth=4,
                              min_child_weight=6,
                              subsample=0.8,
                              learning_rate=0.01,
                              colsample_bytree=0.6,
                              gamma=0.2,
                              reg_alpha=3,
                              reg_lambda=0.1,
                              objective='reg:logistic'
                              ).fit(X_train, y_train)
# Untouched copy of the test features, taken before extra columns are appended.
X_test_data = X_test.copy()
# Score the test set with the classifier fitted in this block (xgb_model).
# Scoring with the earlier `model` variable would leave this fit unused.
X_test['proba'] = xgb_model.predict_proba(X_test)[:,1]
# Append the true labels so predicted and actual conversion sit side by side.
X_test = pd.concat([X_test,y_test],axis=1)
测试集上的结果
折扣的结果预测和实际单数比对。
# Test-set rows that received the discount offer, and those that received no offer.
disc_rows = X_test[X_test['offer_Discount'] == 1]
no_offer_rows = X_test[X_test['offer_No Offer'] == 1]

# Difference of group means scaled by the test-set size: actual vs predicted.
real_disc_uptick = len(X_test) * (disc_rows['conversion'].mean() - no_offer_rows['conversion'].mean())
pred_disc_uptick = len(X_test) * (disc_rows['proba'].mean() - no_offer_rows['proba'].mean())
real_disc_uptick, pred_disc_uptick
(930.0479140598773, 832.8819274902344)
买一送一的结果预测和实际单数比对。
# Test-set rows that received the buy-one-get-one offer, and those that received no offer.
bogo_rows = X_test[X_test['offer_Buy One Get One'] == 1]
no_offer_rows = X_test[X_test['offer_No Offer'] == 1]

# Difference of group means scaled by the test-set size: actual vs predicted.
real_bogo_uptick = len(X_test) * (bogo_rows['conversion'].mean() - no_offer_rows['conversion'].mean())
pred_bogo_uptick = len(X_test) * (bogo_rows['proba'].mean() - no_offer_rows['proba'].mean())
real_bogo_uptick, pred_bogo_uptick
(491.74662391152333, 537.530517578125)