
    Imports and data loading

    import pandas as pd
    import numpy as np
    import lightgbm as lgb
    import warnings
    from tqdm import tqdm
    # %matplotlib inline
    from sklearn.preprocessing import LabelEncoder
    from scipy import sparse
    from sklearn.model_selection import StratifiedKFold
    warnings.filterwarnings("ignore")
    
    train = pd.read_csv('./train_set.csv')
    test = pd.read_csv('./test_set.csv')
    data = pd.concat([train, test])
    

    Feature engineering

    feature = data.columns.tolist()
    feature.remove('ID')
    feature.remove('y')
    sparse_feature = ['campaign', 'contact', 'default', 'education',
                      'housing', 'job', 'loan', 'marital', 'month', 'poutcome']
    dense_feature = list(set(feature) - set(sparse_feature))
    
    

    Feature processing 1

    # count the frequency of each group
    def feature_count(data, features):
        feature_name = 'count'
        for i in features:
            feature_name += '_' + i
        temp = data.groupby(features).size().reset_index().rename(
            columns={0: feature_name})
            # rename replaces the default column label 0 with the new count-feature name
        data = data.merge(temp, 'left', on=features)
        return data, feature_name
        
        
    ll = []
    for f in ['campaign', 'contact', 'default', 'education', 'housing', 'job', 'loan', 'marital', 'poutcome']:
        data, _ = feature_count(data, ['day', 'month', f])
        ll.append(_)
    

    Feature processing 2

    def get_new_columns(name, aggs):
        l = []
        for k in aggs.keys():
            for agg in aggs[k]:
                if str(type(agg)) == "<class 'function'>":
                    l.append(name + '_' + k + '_' + 'other')
                else:
                    l.append(name + '_' + k + '_' + agg)
        return l
    
    
    for d in tqdm(sparse_feature):
        aggs = {}
        for s in sparse_feature:
            aggs[s] = ['count', 'nunique']
        for den in dense_feature:
            aggs[den] = ['mean', 'max', 'min', 'std']
        aggs.pop(d)
        # remove d from the dict before grouping, since we group by d itself
        temp = data.groupby(d).agg(aggs).reset_index()
        # flatten the column names; before renaming they form a MultiIndex
        temp.columns = [d] + get_new_columns(d, aggs)
        data = pd.merge(data, temp, on=d, how='left')
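
    As an aside, the renaming done by get_new_columns can also be achieved by flattening the MultiIndex directly. A minimal sketch (not from the original; the toy frame and column names are made up for illustration):

    import pandas as pd

    df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1, 2, 3], 'y': [4.0, 5.0, 6.0]})
    temp = df.groupby('g').agg({'x': ['count', 'nunique'], 'y': ['mean', 'max']})
    # after agg the columns are a MultiIndex such as ('x', 'count'); join the levels into flat names
    temp.columns = ['g_' + '_'.join(col) for col in temp.columns]
    temp = temp.reset_index()
    print(temp.columns.tolist())   # ['g', 'g_x_count', 'g_x_nunique', 'g_y_mean', 'g_y_max']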
    

    Testing agg

    Dataset link

    data = pd.read_csv('./seaborn-data-master/tips.csv')
    data['tip_pct'] = data['tip'] / data['total_bill']
    grouped = data.groupby('day')
    
    functions = ['count', 'mean', 'max']
    # pick two columns and apply the same list of functions to both
    result = grouped[['tip_pct', 'total_bill']].agg(functions)
    print(type(result))
    print(result)
    
    # a dict applies different functions to specific columns; a list applies the same functions to every column
    grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'], 'size' : 'sum'})
    

    Feature processing 3: one-hot encoding

    for s in ['campaign', 'contact', 'default', 'education', 'housing', 'job', 'loan', 'marital', 'month', 'poutcome']:
        data = pd.concat([data, pd.get_dummies(data[s], prefix=s + '_')], axis=1)
        data.drop(s, axis=1, inplace=True)
    

    Splitting the dataset

    df_train = data[data['y'].notnull()]
    df_test = data[data['y'].isnull()]
    target = df_train['y']
    df_train_columns = df_train.columns.tolist()
    df_train_columns.remove('ID')
    df_train_columns.remove('y')
    

    Standardization

    # standardization does not seem to help much here
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    mms = StandardScaler()
    df_train[df_train_columns] = mms.fit_transform(df_train[df_train_columns])
    df_train[df_train_columns].describe()
    df_test[df_train_columns]= mms.transform(df_test[df_train_columns])
    
    

    Feature selection

    Note

    df_train[df_train[df_train_columns].isnull().values==True]
    # NaN values show up here, and the feature-selection step cannot handle NaN
    # lgb itself can deal with missing values, but RFE cannot
    
    # fill the missing values with the column means
    df_train[df_train_columns] = df_train[df_train_columns].fillna(df_train[df_train_columns].mean())
    
    
    
    import lightgbm as lgb
    from sklearn.feature_selection import RFE
    
    model = lgb.LGBMClassifier(
            boosting_type="gbdt", num_leaves=30, reg_alpha=0, reg_lambda=0.,
        max_depth=-1, n_estimators=600, objective='binary',metric= 'auc',
        subsample=0.85, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.02, random_state=2019
        )
    from sklearn.model_selection import cross_val_score
    score = []
    for i in range(1, 500, 50):
        X_wrapper = RFE(model, n_features_to_select=i, step=50).fit_transform(df_train[df_train_columns], target)
        once = cross_val_score(model, X_wrapper, target, cv=5, scoring='roc_auc').mean()
        print('n_features:', i, ', auc:', once)
        score.append(once)
    
    
    Without standardization:
    n_features: 1 , auc: 0.7935873254768542
    n_features: 51 , auc: 0.9410002321749318
    n_features: 101 , auc: 0.9405527394912129
    n_features: 151 , auc: 0.9405328823623723
    n_features: 201 , auc: 0.9408284102248168
    n_features: 251 , auc: 0.9405677873055179
    n_features: 301 , auc: 0.9407322636247646
    n_features: 351 , auc: 0.940533122705326
    n_features: 401 , auc: 0.9406939289039478
    n_features: 451 , auc: 0.9404385124993364
    
    After standardization the results are actually slightly worse:
    n_features: 1 , auc: 0.7944185772558552
    n_features: 51 , auc: 0.9404873787210775
    n_features: 101 , auc: 0.9401977211266516
    n_features: 151 , auc: 0.9403600475119557
    n_features: 201 , auc: 0.9401212847240394
    n_features: 251 , auc: 0.9402594711308122
    n_features: 301 , auc: 0.9401619530151825
    

    Finer-grained selection

    score = []
    for i in range(30, 76, 5):
        X_wrapper = RFE(model, n_features_to_select=i, step=20).fit_transform(df_train[df_train_columns], target)
        once = cross_val_score(model, X_wrapper, target, cv=5, scoring='roc_auc').mean()
        print('n_features:', i, ', auc:', once)
        score.append(once)
    # without standardization
    # n_features: 25 , auc: 0.9392600590373865
    # n_features: 30 , auc: 0.9409825802863147
    # n_features: 35 , auc: 0.9412280205803961
    # n_features: 40 , auc: 0.9410969648019115
    # n_features: 45 , auc: 0.9409206267843725
    # n_features: 50 , auc: 0.9410097738663016
    # n_features: 55 , auc: 0.9408808260329253
    # n_features: 60 , auc: 0.9407293019878438
    # n_features: 65 , auc: 0.9405286498745273
    # n_features: 70 , auc: 0.940852444429453
    

    Final selection

    rfe = RFE(model, n_features_to_select=40, step=10)
    # step defaults to 1 (drop one feature per iteration), which would be very slow
    rfe = rfe.fit(df_train[df_train_columns], target)
    
    
    print(rfe.support_)
    print(rfe.ranking_)
    
    df_train[df_train_columns].iloc[:, rfe.support_].head()
    X_wrapper = rfe.transform(df_train[df_train_columns])
    X_wrapper
    # the two are in fact identical:
    # X_wrapper and df_train[df_train_columns].iloc[:, rfe.support_]
    
    num_features = df_train[df_train_columns].iloc[:,rfe.support_].columns.tolist()
    

    xgboost

    
    df_train = pd.read_csv('./train_std.csv')
    df_test = pd.read_csv('./test_std.csv')
    
    df_train_columns = num_features
    # num_features holds the names of the features kept by the RFE step
    
    import xgboost as xgb
    import matplotlib.pyplot as plt
    from xgboost import XGBClassifier as XGBC
    
    
    
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=666)
    predictions = np.zeros(len(df_test))
    
    param={'booster':'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth':5,
        'gamma': 0.1,
        'lambda':1,
        'subsample':0.75,
        'colsample_bytree':0.75,
        'min_child_weight':2,
        'eta': 0.025,
        'seed':888,
        'nthread':8,
        'silent':1,
        "scale_pos_weight":1
    }
    # note: StratifiedKFold.split() requires the label array as its second argument
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, df_train['y'].values)):
        print("fold {}".format(fold_))
        trn_data = xgb.DMatrix(
            df_train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx])
        val_data = xgb.DMatrix(
            df_train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx])
    
        num_round = 10000
        watchlist = [(trn_data,'train'),(val_data,'val')]
        clf = xgb.train(param, trn_data, num_round, evals=watchlist, verbose_eval=100, early_stopping_rounds=100)
        # verbose_eval=100 prints the eval metric every 100 rounds
    
        predictions += clf.predict(xgb.DMatrix(df_test[df_train_columns])) / folds.n_splits
    
        
    sub = df_test[['ID']]
    sub['pred'] = predictions
    sub.to_csv('./xgb-10-std-Result.csv', index=False)
    

    lgb

    train_x = df_train[num_features]
    test_x = df_test[num_features]
    
    res = test[['ID']]
    res['pred'] = 0
    
    # clf is an LGBMClassifier, configured the same way as in the feature-selection step above
    clf = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=30, reg_alpha=0, reg_lambda=0.,
        max_depth=-1, n_estimators=600, objective='binary', metric='auc',
        subsample=0.85, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.02, random_state=2019
    )
    
    from sklearn.model_selection import KFold
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_idx, val_idx in kfold.split(train_x):
        clf.random_state = clf.random_state + 1
        train_x1 = train_x.iloc[train_idx]
        train_y1 = target.iloc[train_idx]
        test_x1 = train_x.iloc[val_idx]
        test_y1 = target.iloc[val_idx]
        clf.fit(train_x1, train_y1, eval_set=[
                (train_x1, train_y1), (test_x1, test_y1)], eval_metric='auc', early_stopping_rounds=100, verbose=False)
        res['pred'] += clf.predict_proba(test_x)[:, 1]
    
    # StratifiedKFold is used like KFold, but it samples in a stratified way so each fold keeps the same class proportions as the original data
    from sklearn.model_selection import StratifiedKFold
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    res['pred'] = 0
    for train_idx, val_idx in kfold.split(train_x, df_train['y'].values):
        clf.random_state = clf.random_state + 1
        train_x1 = train_x.iloc[train_idx]
        train_y1 = target.iloc[train_idx]
        test_x1 = train_x.iloc[val_idx]
        test_y1 = target.iloc[val_idx]
        clf.fit(train_x1, train_y1,
                eval_set=[(train_x1, train_y1), (test_x1, test_y1)],
                eval_metric='auc', early_stopping_rounds=100, verbose=100)
        res['pred'] += clf.predict_proba(test_x)[:, 1]
    
    res['pred'] = res['pred'] / 10
    res.to_csv('./res/lgb-50-10-SK.csv', index=False)
    

    Native lgb API

    param = {'num_leaves': 31,
             'min_data_in_leaf': 30, 
             'objective':'binary',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 20,
             "boosting": "gbdt",
             "feature_fraction": 0.9,
             "bagging_freq": 1,
             "bagging_fraction": 0.9 ,
             "bagging_seed": 11,
             "metric": 'auc',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             "random_state": 666}
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=666)
    oof = np.zeros(len(df_train))
    predictions = np.zeros(len(df_test))
    feature_importance_df = pd.DataFrame()
    
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['y'].values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
        val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)
    
        num_round = 2000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
        oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = df_train_columns
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        predictions += clf.predict(df_test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits
        
    
    from sklearn.metrics import roc_auc_score
    roc_auc_score(target,oof)
    
    sub=df_test[['ID']]
    sub['pred']=predictions
    sub.to_csv('./Result.csv',index=False)
    

    Word-count features

    # build a text feature for vectorization
    data['new_con'] = data['job'].astype(str)
    
    for i in ['marital', 'education', 'contact','month','poutcome']:
        data['new_con'] = data['new_con'].astype(str) + '_' + data[i].astype(str)
    data['new_con']
    #0          management_married_tertiary_unknown_may_unknown
    #1           technician_divorced_primary_cellular_apr_other
    #2            admin._married_secondary_cellular_jul_unknown
    #3         management_single_secondary_cellular_jul_unknown
    #4        technician_divorced_secondary_unknown_may_unknown
    #5          services_divorced_secondary_unknown_jun_unknown
    data['new_con'] = data['new_con'].apply(lambda x: ' '.join(x.split('_')))
    data['new_con']
    # replace the underscores with spaces
    
    #0          management married tertiary unknown may unknown
    #1           technician divorced primary cellular apr other
    #2            admin. married secondary cellular jul unknown
    #3         management single secondary cellular jul unknown
    #4        technician divorced secondary unknown may unknown
    #5          services divorced secondary unknown jun unknown
    
    train_x = df_train[num_features]
    test_x = df_test[num_features]
    # at this point there are 50 selected features
    
    from sklearn.feature_extraction.text import CountVectorizer
    
    vector_feature = ['new_con']
    cv = CountVectorizer()
    for feature in vector_feature:
        cv.fit(data[feature])
        # 'new_con' only exists on the concatenated frame, so split it back by the label
        train_a = cv.transform(data.loc[data['y'].notnull(), feature])
        test_a = cv.transform(data.loc[data['y'].isnull(), feature])
    
    
        
    train_a
    <25317x87 sparse matrix of type '<class 'numpy.float64'>'
    	with 1392880 stored elements in Compressed Sparse Row format>
    
    cv.vocabulary_
    # the vocabulary, stored as a dict
    cv.get_feature_names()
    len(cv.get_feature_names())
    # 37
    train_a.toarray()
    # 37 words in total; each row holds the count of every word in one sample
    
    df_a = pd.DataFrame(train_a.toarray())
    # build the column names
    fis = cv.get_feature_names()
    sec = []
    for i in fis:
        i = 'count_' + i
        sec.append(i)
    df_a.columns = sec
    
    df_b = pd.DataFrame(test_a.toarray(), columns=sec)
    # pitfall: df_b's index differs from test_x's index, so they cannot be concatenated directly
    # first make df_b's index match test_x's
    df_b.index = test_x.index
    # concatenate the count features onto the original features
    train_x = pd.concat([train_x, df_a], axis=1)
    test_x = pd.concat([test_x, df_b], axis=1)
    
    # tried it; the results did not really improve
    
    import lightgbm as lgb
    from sklearn.model_selection import StratifiedKFold
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=666)
    
    res['pred'] = 0
    clf = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=30, reg_alpha=0, reg_lambda=0.,
        max_depth=-1, n_estimators=1000, objective='binary', metric='auc',
        subsample=0.85, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.02, random_state=2019
    )
    
    for train_idx, val_idx in kfold.split(train_x, df_train['y'].values):
        clf.random_state = clf.random_state + 1
        train_x1 = train_x.iloc[train_idx]
        train_y1 = target.iloc[train_idx]
        test_x1 = train_x.iloc[val_idx]
        test_y1 = target.iloc[val_idx]
        clf.fit(train_x1, train_y1,
                eval_set=[(train_x1, train_y1), (test_x1, test_y1)],
                eval_metric='auc', early_stopping_rounds=100, verbose=100)
        # verbose=100 prints the metric every 100 rounds
        res['pred'] += clf.predict_proba(test_x)[:, 1]
    

    Background

    The dataset has 41,188 records and comes from the UCI Machine Learning Repository's Bank Marketing Data Set. The data relate to marketing campaigns run by a Portuguese banking institution. The campaigns were conducted by phone; in general a bank agent had to contact a client at least once to find out whether the client would subscribe to the bank's product (a term deposit).
    Using data from the bank's direct (phone) marketing campaigns, the goal is to predict whether a client will subscribe to a term deposit (variable y). This is very useful in practice: the predictions can guide the initial planning of future campaigns and serve as a reference for whether a particular client is likely to subscribe.

    Data description

    Client information:
    Age: age
    Job: type of job (categorical: "admin.", "blue-collar", "entrepreneur", "housemaid", "management", "retired", "self-employed", "services", "student", "technician", "unemployed", "unknown")
    Marital: marital status (categorical: divorced, married, single, unknown; "divorced" means divorced or widowed)
    Education: education (categorical: 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown')
    Default: has credit in default? (categorical: "no", "yes", "unknown")
    Housing: has a housing loan? (categorical: "no", "yes", "unknown")
    Loan: has a personal loan? (categorical: "no", "yes", "unknown")

    Other data relevant to prediction:
    Contact: contact communication type (categorical: "cellular", "telephone")
    Month: last contact month of the year (categorical: 'mar', ..., 'nov', 'dec')
    Day_of_week: last contact day of the week (categorical: "mon", "tue", "wed", "thu", "fri")
    Duration: last contact duration, in seconds
    Campaign: number of contacts performed during this campaign for this client
    Pdays: number of days since the client was last contacted in a previous campaign (numeric; 999 means the client was not previously contacted)
    Previous: number of contacts performed before this campaign for this client (numeric)
    Poutcome: outcome of the previous marketing campaign (categorical: failure, nonexistent, success)

    Social and economic context attributes:
    emp.var.rate: employment variation rate, quarterly indicator (numeric)
    cons.price.idx: consumer price index, monthly indicator (numeric)
    cons.conf.idx: consumer confidence index, monthly indicator (numeric)
    euribor3m: euribor 3-month rate, daily indicator (numeric)
    nr.employed: number of employees, quarterly indicator (numeric)

    Output variable:
    Y: will the client subscribe to a term deposit? "yes" / "no"

    Data preprocessing

    1. Drop the customer ID.
    2. Handle missing values. Inspecting the data shows that the numeric variables have no missing values, while the non-numeric variables may contain the value "unknown". Options:
      (1) drop the "unknown" values
      (2) analyse the missing-value pattern
      (3) impute the missing values with a random forest
      The missing values are fairly severe in some columns, so simply deleting them would noticeably hurt the predictions; the better-performing random-forest imputation is used instead. On a dataset of this size the imputation takes quite a while. (missing-value plot omitted)
     # impute missing values with a random forest
    > library(missForest)
    > data=missForest(mydata)
      missForest iteration 1 in progress...done!
      missForest iteration 2 in progress...done!
      missForest iteration 3 in progress...done!
      missForest iteration 4 in progress...done!
    > intact=data$ximp
    > md.pattern(intact)
         job marital education default housing loan contact month day_of_week duration campaign
    [1,]   1       1         1       1       1    1       1     1           1        1        1
    [2,]   0       0         0       0       0    0       0     0           0        0        0
         pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y  
    [1,]     1        1        1            1              1             1         1           1 1 0
    [2,]     0        0        0            0              0             0         0           0 0 0
    

    Descriptive analysis

    Continuous variables

    Younger clients are more willing to buy the bank's term deposit.
    Clients with very long contact durations tend not to buy, and neither do those who were contacted many times before.

    Compared with clients who have a loan, clients without one are more willing to take part in the campaign, possibly because clients with loans simply cannot afford a term-deposit product.

    Existing customers are much more likely to buy again. Compared with other investment products a term deposit is low-risk; most buyers are unwilling to take on the higher risk of other products in exchange for higher returns, so among existing customers there are fewer risk-seekers and the purchase rate is higher.

    Single clients have a higher purchase rate, possibly because they have more financial freedom.

    Contacting clients on a mobile phone works better, probably because clients are easier to reach that way.

    Across education levels, illiterate clients have the highest purchase rate, followed by university graduates, possibly because illiterate clients know less about other ways to invest.

    Across occupations, the highest purchase rates belong, in order, to students, retirees and administrative staff.

    Social and economic context: (figures omitted)

    Handling class imbalance

    First, check the class imbalance: only 11% of the samples in the dataset are positive and the remaining 89% are negative, so the imbalance is fairly severe.

    > prop.table(table(intact$y))
    
           no       yes 
    0.8873458 0.1126542 
    

    In a classification model, class imbalance makes the learner tend to assign samples to the majority class, whereas we usually care more about how well the minority class is predicted.

    To reduce the harm caused by class imbalance, there are two fairly simple data-level methods: oversampling and undersampling.

    Undersampling draws a random, smaller subset from the majority class and combines it with the minority class to form a new dataset. Its advantage is that it balances the data while shrinking it, which speeds up training, especially when the dataset is large. That is also the root of its drawback: throwing data away weakens the model's ability to learn features and to generalize.

    Oversampling improves minority-class performance by adding minority samples; the simplest approach is to duplicate them. Compared with undersampling it loses no information and in practice usually works better, but duplicating the minority class increases the risk of overfitting.
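
    For reference, a minimal Python sketch of plain over-/under-sampling (the analysis here uses R's ROSE package instead; this sketch assumes the imbalanced-learn library and uses a synthetic toy dataset):

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler

    # toy imbalanced data standing in for the bank set (~11% positives)
    X, y = make_classification(n_samples=5000, weights=[0.89], random_state=0)

    X_over, y_over = RandomOverSampler(random_state=1).fit_resample(X, y)     # duplicate minority rows
    X_under, y_under = RandomUnderSampler(random_state=1).fit_resample(X, y)  # drop majority rows
    print(Counter(y), Counter(y_over), Counter(y_under))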

    Here, synthetic data generation is used to get a better estimate of the positive class. Synthetic Data Generation is also an oversampling technique, but it creates artificial samples instead of repeating existing observations. The ROSE (Random Over Sampling Examples) package can generate such artificial samples based on sampling and the smoothed bootstrap.

    library(ROSE)
    data.rose=ROSE(y~ ., data =intact, seed = 1)$data
    table(data.rose$y)
    str(data.rose)
    > table(data.rose$y)
    
       no   yes 
    20633 20555 
    

    The data are then split into training and test sets with stratified sampling: 70% for training and 30% for testing.
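
    A minimal Python equivalent of that stratified split (the original uses R's sampling::strata; scikit-learn is assumed here, and the toy data are made up):

    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(20).reshape(10, 2)     # toy feature matrix
    y = np.array([0]*7 + [1]*3)          # imbalanced toy labels
    # stratify=y keeps the class ratio identical in the 70/30 train and test splits
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)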

    Models and evaluation

    In this case study, a support vector machine and a random forest were both used for classification, and the models were evaluated with confusion matrices. Both models predict very well, but the random forest does slightly better.

    Overall, the accuracy is already quite satisfactory.

    Model            Train accuracy   Test accuracy   Kappa
    SVM              0.9731411        0.9691882       0.9692
    Random forest    1                0.9749161       0.9479
    ## Support vector machine
    s_mod=svm(y~ .,train)
    s_predict=predict(s_mod,test)
    s_train=predict(s_mod,train)
    > sum(diag(table(train$y,s_train)))/length(s_train)
    [1] 0.9731411
    > sum(diag(table(test$y,s_predict)))/length(s_predict)
    [1] 0.9691882
    > confusionMatrix(data=s_predict,reference=test$y)
    Confusion Matrix and Statistics
    
              Reference
    Prediction    0    1
             0 2476    5
             1  151 2431
                                              
                   Accuracy : 0.9692          
                     95% CI : (0.9641, 0.9738)
        No Information Rate : 0.5189          
        P-Value [Acc > NIR] : < 2.2e-16       
                                              
                      Kappa : 0.9384          
     Mcnemar's Test P-Value : < 2.2e-16       
                                              
                Sensitivity : 0.9425          
                Specificity : 0.9979          
             Pos Pred Value : 0.9980          
             Neg Pred Value : 0.9415          
                 Prevalence : 0.5189          
             Detection Rate : 0.4890          
       Detection Prevalence : 0.4900          
          Balanced Accuracy : 0.9702          
                                              
           'Positive' Class : 0               
                                     
    
    ## Random forest classification
    u_tree=randomForest(y~.,data=train,importance=TRUE,ntree=1000)
    tt_predict=predict(u_tree,test)
    tt_train=predict(u_tree,train)
    > sum(diag(table(train$y,tt_train)))/length(tt_train)
    [1] 1
    > sum(diag(table(test$y,tt_predict)))/length(tt_predict)
    [1] 0.9749161
    > confusionMatrix(data=t_predict,reference=test$y)
    Confusion Matrix and Statistics
    
              Reference
    Prediction    0    1
             0 2517   22
             1  110 2414
                                              
                   Accuracy : 0.9739          
                     95% CI : (0.9692, 0.9781)
        No Information Rate : 0.5189          
        P-Value [Acc > NIR] : < 2.2e-16       
                                              
                      Kappa : 0.9479          
     Mcnemar's Test P-Value : 3.665e-14       
                                              
                Sensitivity : 0.9581          
                Specificity : 0.9910          
             Pos Pred Value : 0.9913          
             Neg Pred Value : 0.9564          
                 Prevalence : 0.5189          
             Detection Rate : 0.4971          
       Detection Prevalence : 0.5015          
          Balanced Accuracy : 0.9745          
                                              
           'Positive' Class : 0               
                                         
    

    Summary

    The variable importances show that the overall social and economic environment has a fairly large influence on sales.
    Factors related to the contact between the salesperson and the client also matter a lot: for a good result the contact duration should exceed roughly 200 seconds, and a given client should be handled by a fixed contact person, since the purchase rate drops when the contact person changes frequently.
    The repeat-customer rate is also very high, so the bank can put more effort into existing customers; among new clients, older retired people deserve particular attention.

    > importance(u_tree)
                           no        yes MeanDecreaseAccuracy MeanDecreaseGini
    age             53.711350   8.978287            53.725930        373.24257
    job             47.306396  35.411977            55.629825        368.86332
    marital          9.291924  14.580736            17.553094         65.00866
    education       22.979616  34.214832            40.700215        195.76237
    default          0.000000   0.000000             0.000000          0.00000
    housing          2.034791   8.999713             8.003469         35.02138
    loan             3.963241  10.168745            10.259191         27.48620
    contact         17.776201  30.642910            21.255039        106.75061
    month          115.115649   9.044732           119.975682        818.66878
    day_of_week     17.576547  27.451160            31.162160        194.83529
    duration       379.240942 256.048129           389.136231       3784.04379
    campaign         4.377285  64.684120            56.223016        386.30141
    pdays          270.409581 125.766426           268.350699       3963.38617
    previous        70.901029  40.610378            78.494658       1311.44458
    poutcome        30.703517 -12.585281            29.563154        246.26971
    emp.var.rate    56.851807   4.849061            59.580893        864.87450
    cons.price.idx  81.356412 -38.014952            77.520281        478.51227
    cons.conf.idx  102.206176 -37.005703            99.402561        655.01121
    euribor3m       62.765737  24.612920            66.908571       1281.83690
    nr.employed     79.374570   2.185290            84.120966       1314.70020
    > 
    

    Overall this case study worked out well and the accuracy is satisfactory, and most of the machine-learning workflow was exercised. There are still shortcomings: for example, the date fields were not processed further, and because the year is missing it is hard to analyse the dates in more depth, which may have hurt the results somewhat.

    Complete code

    file.choose()
    # pick the file path interactively
    # note: this dataset uses ";" as the field separator
    install.packages("readr")
    library(readr)
    mydata=read.csv("F:\\新建文件夹 (6)\\新建文件夹\\bank-additional\\bank-additional-full.csv", head=T,sep=";",stringsAsFactors = T)
    str(mydata)
    head(mydata)
    summary(mydata)
    mydata[mydata=="unknown"] = NA 
    library("mice")
    md.pattern(mydata)
    str(mydata)
    head(mydata)
    summary(mydata)
    mydata=mydata[,-1]
    ## explore missing values graphically
    library(VIM)
    aggr(mydata,prop=FALSE,number=TRUE)
    # impute missing values with a random forest
    library(missForest)
    data=missForest(mydata) 
    intact=data$ximp
    md.pattern(intact)
    summary(intact)
    
    # descriptive analysis
    library(ggplot2) 
    library(gridExtra) 
    # continuous variables
    g1 =ggplot(intact, aes(x=age ,fill= y ))+geom_density(alpha = 0.5)
    g2= ggplot(intact, aes(x= previous,fill= y ))+geom_density(alpha = 0.5)
    g3 = ggplot(intact, aes(x=campaign ,fill= y ))+geom_density(alpha = 0.5)
    g4 = ggplot(intact, aes(x= duration ,fill= y ))+geom_density(alpha = 0.5)
    g5 = ggplot(intact, aes(x= pdays,fill= y ))+geom_density(alpha = 0.5)
    grid.arrange(g1,g2,g3,g4,g5, ncol = 2, nrow = 3)
    # categorical variables
    g6 = ggplot(intact, aes(x= poutcome,fill= y ))+geom_bar(alpha = 0.5,position = "fill") 
    g7 = ggplot(intact, aes(x=marital,fill= y ))+geom_bar(alpha = 0.5,position = "fill")
    g8 = ggplot(intact, aes(x=education,fill= y ))+geom_bar(alpha = 0.5,position = "fill")
    g9 =ggplot(intact, aes(x=default,fill= y ))+geom_bar(alpha = 0.5,position = "fill")
    g10 = ggplot(intact, aes(x= housing,fill= y ))+geom_bar(alpha = 0.5,position = "fill")
    g11= ggplot(intact, aes(x=loan,fill= y ))+geom_bar(alpha = 0.55,position = "fill")
    g12 = ggplot(intact, aes(x= contact,fill= y ))+geom_bar(alpha = 0.5,position = "fill")
    g13 =ggplot(intact, aes(x=job  ,fill= y ))+geom_bar(alpha = 0.5,position = "fill") 
    grid.arrange(g6,g7,g8,g9,g10,g11,g12,g13,ncol = 3, nrow = 3)
                 
    ## handle the imbalanced classes
    table(intact$y)
    prop.table(table(intact$y))   
    # undersampling loses information; oversampling tends to overfit
    library(ROSE)
    data.rose=ROSE(y~ ., data =intact, seed = 1)$data
    table(data.rose$y)
    str(data.rose)
    # load the package for stratified sampling
    library(sampling)
    n=round(0.8*nrow(data.rose)/2)
    sub_train=strata(data.rose,stratanames=("y"),size=rep(n,2),method="srswor")
    train=data.rose[sub_train$ID_unit,]
    test=data.rose[-sub_train$ID_unit,]
    str(train)
    str(test)
    
    library("caret")
    library("e1071")
    ## SVM classification
    set.seed(0)
    s_mod=svm(y~ .,train)
    s_predict=predict(s_mod,test)
    # classification performance
    s_train=predict(s_mod,train)
    sum(diag(table(train$y,s_train)))/length(s_train)
    sum(diag(table(test$y,s_predict)))/length(s_predict)
    confusionMatrix(data=s_predict,reference=test$y)
    
    
    ### random forest
    set.seed(123)
    library("randomForest")
    users_tree=randomForest(y~.,data=train,importance=TRUE,ntree=100)
    t_predict=predict(users_tree,test)
    t_train=predict(users_tree,train)
    sum(diag(table(train$y,t_train)))/length(t_train)
    sum(diag(table(test$y,t_predict)))/length(t_predict)
    confusionMatrix(data=t_predict,reference=test$y)
    # increase the number of trees
    u_tree=randomForest(y~.,data=train,importance=TRUE,ntree=1000)
    tt_predict=predict(u_tree,test)
    tt_train=predict(u_tree,train)
    sum(diag(table(train$y,tt_train)))/length(tt_train)
    sum(diag(table(test$y,tt_predict)))/length(tt_predict)
    confusionMatrix(data=tt_predict,reference=test$y)
    
    
    
  • WeChat marketing case study: China Merchants Bank's 爱心漂流瓶 ("Love Drift Bottle") campaign (.doc)
  • The bank's declared strategic goal is to provide a full range of banking services to everyone through a single marketing channel. This case contains a large number of risk issues, most of which have not yet been examined by 开普斯's senior management. After the case has been laid out, we will carry out an IS risk-management ...


    Link to the data and code — access code: 1234


    1. Data description and preprocessing

    import pandas as pd
    import matplotlib.pyplot as plt
    
    # load the data
    bank = pd.read_csv('data/bank-full.csv',delimiter=';')
    # take a quick look at the first five rows
    print(bank.head(5))
    
    # use describe() and info() to inspect the data
    # describe() is applied separately to the numeric and the categorical features
    # distribution of the numeric features
    print(bank.describe())
    # distribution of the categorical features
    print(bank.describe(include=['O']))
    
    # info() shows the missing-value situation; there are no NaN values in this dataset
    print(bank.info())
    
    # some missing values are stored as the string 'unknown' in the categorical features; the code below counts them
    # select the object-typed columns and count the 'unknown' entries per column
    for col in bank.select_dtypes(include=['object']).columns:
         print(col,':',bank[bank[col] == 'unknown'][col].count())
    
    
    # class distribution of the target variable
    print('Class distribution of y:\n', bank['y'].value_counts())
    # plot it
    plt.rcParams['font.sans-serif'] = ['SimHei']
    
    fig,ax = plt.subplots(1,1,figsize=(4,4))
    colors = ["#FA5858", "#64FE2E"]
    labels ="no", "yes"
    ax.set_title('Term deposit subscription',fontsize = 16)
    # pie chart
    bank['y'].value_counts().plot.pie(explode=[0,0.25],autopct='%.2f%%',ax = ax,shadow=True,colors = colors,labels=labels,fontsize=14,startangle=25)
    plt.axis('off')
    plt.show()
    
       age           job  marital  education  ... pdays  previous poutcome   y
    0   58    management  married   tertiary  ...    -1         0  unknown  no
    1   44    technician   single  secondary  ...    -1         0  unknown  no
    2   33  entrepreneur  married  secondary  ...    -1         0  unknown  no
    3   47   blue-collar  married    unknown  ...    -1         0  unknown  no
    4   33       unknown   single    unknown  ...    -1         0  unknown  no
    [5 rows x 17 columns]
                    age        balance  ...         pdays      previous
    count  45211.000000   45211.000000  ...  45211.000000  45211.000000
    mean      40.936210    1362.272058  ...     40.197828      0.580323
    std       10.618762    3044.765829  ...    100.128746      2.303441
    min       18.000000   -8019.000000  ...     -1.000000      0.000000
    25%       33.000000      72.000000  ...     -1.000000      0.000000
    50%       39.000000     448.000000  ...     -1.000000      0.000000
    75%       48.000000    1428.000000  ...     -1.000000      0.000000
    max       95.000000  102127.000000  ...    871.000000    275.000000
    [8 rows x 7 columns]
                    job  marital  education  ...  month poutcome      y
    count         45211    45211      45211  ...  45211    45211  45211
    unique           12        3          4  ...     12        4      2
    top     blue-collar  married  secondary  ...    may  unknown     no
    freq           9732    27214      23202  ...  13766    36959  39922
    [4 rows x 10 columns]
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 45211 entries, 0 to 45210
    Data columns (total 17 columns):
    age          45211 non-null int64
    job          45211 non-null object
    marital      45211 non-null object
    education    45211 non-null object
    default      45211 non-null object
    balance      45211 non-null int64
    housing      45211 non-null object
    loan         45211 non-null object
    contact      45211 non-null object
    day          45211 non-null int64
    month        45211 non-null object
    duration     45211 non-null int64
    campaign     45211 non-null int64
    pdays        45211 non-null int64
    previous     45211 non-null int64
    poutcome     45211 non-null object
    y            45211 non-null object
    dtypes: int64(7), object(10)
    memory usage: 5.9+ MB
    None
    job : 288
    marital : 0
    education : 1857
    default : 0
    housing : 0
    loan : 0
    contact : 13020
    month : 0
    poutcome : 36959
    y : 0
    Class distribution of y:
     no     39922
    yes     5289
    Name: y, dtype: int64
    


    2. Exploratory analysis

    # exploratory analysis
    import numpy as np
    import seaborn as sns
    from sklearn.preprocessing import LabelEncoder
    
    # 1. distribution of the numeric features
    # DataFrame.hist() plots every numeric column; categorical (str) columns cannot be plotted this way and are skipped automatically
    bank.hist(bins=25,figsize=(14,10))
    plt.show()
    plt.show()
    
    # 2. effect of a categorical feature on the outcome
    # use barplot() to see how education relates to the outcome (whether a term deposit is made)
    fig,ax = plt.subplots(1,1,figsize=(9,7))
    colors = ["#64FE2E", "#FA5858"]
    # bar plot
    sns.barplot(x='education',y='balance',hue='y',data=bank,palette=colors,estimator=lambda x:len(x)/len(bank)*100)
    # annotate the bars
    for p in ax.patches:
        # p.get_x() is the bar's x position
        # p.get_height() is the bar's height
        ax.annotate('{:.2f}%'.format(p.get_height()),(p.get_x()*1.02,p.get_height()*1.02),fontsize = 15)
    
    ax.set_xticklabels(bank['education'].unique(),fontsize=15)
    ax.set_title('Education vs. term deposit subscription',fontsize=15)
    ax.set_xlabel("Education",fontsize=15)
    ax.set_ylabel("(%)",fontsize=15)
    plt.show()
    
    # 3. correlations between features
    # use a heatmap of the correlation matrix to inspect the relationships between features
    fig, ax = plt.subplots(figsize=(12, 8))
    bank['y'] = LabelEncoder().fit_transform(bank['y'])
    # print(bank.head())
    numeric_bank = bank.select_dtypes(exclude="object")
    # corr() returns the correlation matrix
    corr_numeric = numeric_bank.corr()
    
    # heatmap of the correlation matrix
    sns.heatmap(corr_numeric, annot=True, vmax=1, vmin=-1, cmap="Blues",annot_kws={"size":15})
    ax.set_title("Correlation Matrix", fontsize=24)
    ax.tick_params(axis='y',labelsize=11.5)
    ax.tick_params(axis='x',labelsize=11.5)
    plt.show()
    
    
    # 4. split duration into below_average / above_average (relative to its mean) and compare the willingness to buy in the two groups
    sns.set(rc={'figure.figsize':(11.7,8.27)})
    # white-grid plotting style
    sns.set_style('whitegrid')
    # mean duration
    avg_duration = bank['duration'].mean()
    # create a new feature that marks whether duration is above or below the mean
    bank['duration_status'] = np.nan
    lst = [bank]
    for col in lst:
        col.loc[col['duration'] < avg_duration,'duration_status'] = 'below_average'
        col.loc[col['duration'] > avg_duration,'duration_status'] = 'above_average'
    
    # pd.crosstab builds a cross-tabulation of two variables, another way to analyse a pair of variables; round() rounds the percentages
    pct_term = pd.crosstab(bank['duration_status'],bank['y']).apply(lambda r: round(r/r.sum(), 2) * 100, axis=1)
    # bar chart from the crosstab
    ax = pct_term.plot(kind='bar',stacked = False,cmap='RdBu')
    ax.set_xticklabels(['below_average','above_average'],rotation=0,rotation_mode='anchor',fontsize=18)
    plt.title('The Influence of Duration',fontsize=18)
    plt.xlabel('Duration Status',fontsize=18)
    plt.ylabel('Percentage(%)',fontsize=18)
    
    for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()),(p.get_x(),p.get_height()*1.02))
    
    plt.show()
    # drop the helper feature; inplace=True modifies bank in place
    bank.drop(['duration_status'],axis=1,inplace=True)
    


    3. Preprocessing and feature engineering

    # preprocessing and feature engineering
    # 1. handling missing values
        # common options:
        # 1. for features with only a few 'unknown' values (job and education), drop the rows where the value is 'unknown';
        # 2. if the feature is not expected to matter much for the model (here the missing values are all categorical), replace 'unknown' with the mode (or with the mean for numeric features);
        # 3. train a model on the complete rows to predict the missing values; contact and poutcome can be handled this way;
        # 4. or simply keep 'unknown' as one more possible value of the feature.
    print('Outcome of the previous campaign:\n', bank['poutcome'].value_counts())
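    # Aside (not part of the original pipeline): a sketch of option 2, replacing 'unknown' with the column
    # mode on a copy of the frame, so the pipeline below, which keeps 'unknown' as its own category, is unaffected.
    bank_imputed = bank.copy()
    for col in ['job', 'education']:
        mode_val = bank_imputed.loc[bank_imputed[col] != 'unknown', col].mode()[0]
        bank_imputed[col] = bank_imputed[col].replace('unknown', mode_val)
    print(bank_imputed['job'].value_counts().head())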
    # 2. type conversion
    # the table mixes numeric and categorical columns; apart from decision trees, most models only accept numeric input, so the categorical columns must be converted
    # one option is LabelEncoder followed by OneHotEncoder, but that handles one categorical column at a time and the code gets tedious
    # CategoricalEncoder can convert several categorical columns at once; it is not shipped with the sklearn version used here, so its implementation is provided below
    
    import numpy as np
    from scipy import sparse
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import check_array
    
    class CategoricalEncoder(BaseEstimator, TransformerMixin):
        def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                     handle_unknown='error'):
            self.encoding = encoding
            self.categories = categories
            self.dtype = dtype
            self.handle_unknown = handle_unknown
        # fit is used the same way as with the other sklearn encoders
        def fit(self, X, y=None):
            """Fit the CategoricalEncoder to X.
            Parameters
            ----------
            X : array-like, shape [n_samples, n_feature]
                The data to determine the categories of each feature.
            Returns
            -------
            self
            """
            # three encodings are supported, in this order: sparse one-hot, dense one-hot, and ordinal
            if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
                template = ("encoding should be either 'onehot', 'onehot-dense' "
                            "or 'ordinal', got %s")
                raise ValueError(template % self.handle_unknown)
    
            if self.handle_unknown not in ['error', 'ignore']:
                template = ("handle_unknown should be either 'error' or "
                            "'ignore', got %s")
                raise ValueError(template % self.handle_unknown)
    
            if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
                raise ValueError("handle_unknown='ignore' is not supported for"
                                 " encoding='ordinal'")
            # validate the input features
            X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
            n_samples, n_features = X.shape
            self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
            # CategoricalEncoder works as follows:
            # LabelEncoder() first turns each column into integer codes, and one-hot columns are then built from those codes
            # fit only records the categories of each column, to be used later in transform
            for i in range(n_features):
                le = self._label_encoders_[i]
                Xi = X[:, i]
                if self.categories == 'auto':
                    le.fit(Xi)
                else:
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        if self.handle_unknown == 'error':
                            diff = np.unique(Xi[~valid_mask])
                            msg = ("Found unknown categories {0} in column {1}"
                                   " during fit".format(diff, i))
                            raise ValueError(msg)
                    le.classes_ = np.array(np.sort(self.categories[i]))
    
            self.categories_ = [le.classes_ for le in self._label_encoders_]
    
            return self
    
        def transform(self, X):
            """Transform X using one-hot encoding.
            Parameters
            ----------
            X : array-like, shape [n_samples, n_features]
                The data to encode.
            Returns
            -------
            X_out : sparse matrix or a 2-d array
                Transformed input.
            """
            # validate the input features
            X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
            n_samples, n_features = X.shape
            X_int = np.zeros_like(X, dtype=np.int)
            X_mask = np.ones_like(X, dtype=np.bool)
            # map each categorical column to integer codes before building the one-hot encoding
            for i in range(n_features):
                valid_mask = np.in1d(X[:, i], self.categories_[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(X[~valid_mask, i])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during transform".format(diff, i))
                        raise ValueError(msg)
                    else:
                        # Set the problematic rows to an acceptable value and
                        # continue `The rows are marked `X_mask` and will be
                        # removed later.
                        X_mask[:, i] = valid_mask
                        X[:, i][~valid_mask] = self.categories_[i][0]
                X_int[:, i] = self._label_encoders_[i].transform(X[:, i])
            # for ordinal encoding, return the integer codes directly
            if self.encoding == 'ordinal':
                return X_int.astype(self.dtype, copy=False)
            # below: build the sparse one-hot representation
            mask = X_mask.ravel()
            n_values = [cats.shape[0] for cats in self.categories_]
            n_values = np.array([0] + n_values)
            indices = np.cumsum(n_values)
            column_indices = (X_int + indices[:-1]).ravel()[mask]
            row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                    n_features)[mask]
            data = np.ones(n_samples * n_features)[mask]
            # the default output is a sparse matrix, which saves memory
            out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                    shape=(n_samples, indices[-1]),
                                    dtype=self.dtype).tocsr()
            # convert the sparse matrix to a dense array when requested
            if self.encoding == 'onehot-dense':
                return out.toarray()
            else:
                return out
    
    # convert job and marital as an example
    a = CategoricalEncoder().fit_transform(bank[['job','marital']])
    # convert the sparse matrix to a dense array
    print(a.toarray())
    print(a.shape)          # (45211, 15)
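    
    # Aside (not in the original): in scikit-learn >= 0.20 the built-in OneHotEncoder accepts string data
    # and several columns at once, so a hand-rolled CategoricalEncoder is no longer needed:
    from sklearn.preprocessing import OneHotEncoder
    a2 = OneHotEncoder(handle_unknown='ignore').fit_transform(bank[['job', 'marital']])
    print(a2.shape)         # the same (45211, 15) one-hot matrix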
    
    
    # DataFrameSelector picks a given set of columns out of a DataFrame, which keeps the pipelines below tidy
    class DataFrameSelector(BaseEstimator, TransformerMixin):
        def __init__(self, attribute_names):
            self.attribute_names = attribute_names
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return X[self.attribute_names]
    
    # build the pipelines
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import StandardScaler
    
    # pipeline for the numeric features
    numerical_pipline = Pipeline([
        ('select_numeric',DataFrameSelector(["age", "balance", "day", "campaign", "pdays", "previous","duration"])),
        ('std_scaler',StandardScaler())
    ])
    # pipeline for the categorical features
    categorical_pipline = Pipeline([
        ('select_cat',DataFrameSelector(["job", "education", "marital", "default", "housing", "loan", "contact", "month","poutcome"])),
        ('cat_encoder',CategoricalEncoder(encoding='onehot-dense'))
    ])
    # combine the two pipelines
    preprocess_pipline = FeatureUnion(transformer_list=[
        ('numerical_pipline',numerical_pipline),
        ('categorical_pipline',categorical_pipline)
    ])
    
    Outcome of the previous campaign:
     unknown    36959
    failure     4901
    other       1840
    success     1511
    Name: poutcome, dtype: int64
    [[0. 0. 0. ... 0. 1. 0.]
     [0. 0. 0. ... 0. 0. 1.]
     [0. 0. 1. ... 0. 1. 0.]
     ...
     [0. 0. 0. ... 0. 1. 0.]
     [0. 1. 0. ... 0. 1. 0.]
     [0. 0. 1. ... 0. 1. 0.]]
    (45211, 15)
    
    

    4. Model training

    # model training
    import time
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.naive_bayes import GaussianNB
    
    # 1. split the dataset
    X = bank.drop(['y'],axis=1)
    y = bank['y']
    X = preprocess_pipline.fit_transform(X)
    # train/test split
    X_train,X_test,y_train,y_test = train_test_split(X,y.ravel(),train_size=0.8,random_state=44)
    
    # put the array back into a DataFrame for a quick look
    preprocess_bank = pd.DataFrame(X)
    print('Preprocessed data:\n',preprocess_bank.head(5))
    
    # 2. build the models
    t_diff=[]
    # logistic regression
    log_reg = LogisticRegression()
    t_start = time.process_time()
    log_scores = cross_val_score(log_reg, X_train, y_train, cv=3,scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    log_reg_mean = log_scores.mean()
    
    # support vector machine
    svc_clf = SVC()
    t_start = time.process_time()
    svc_scores = cross_val_score(svc_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    svc_mean = svc_scores.mean()
    
    # k-nearest neighbors
    knn_clf = KNeighborsClassifier()
    t_start = time.process_time()
    knn_scores = cross_val_score(knn_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    knn_mean = knn_scores.mean()
    
    # decision tree
    tree_clf = DecisionTreeClassifier()
    t_start = time.process_time()
    tree_scores = cross_val_score(tree_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    tree_mean = tree_scores.mean()
    
    # gradient boosting
    grad_clf = GradientBoostingClassifier()
    t_start = time.process_time()
    grad_scores = cross_val_score(grad_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    grad_mean = grad_scores.mean()
    
    # random forest
    rand_clf = RandomForestClassifier()
    t_start = time.process_time()
    rand_scores = cross_val_score(rand_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    rand_mean = rand_scores.mean()
    
    # neural network
    neural_clf = MLPClassifier(alpha=0.01)
    t_start = time.process_time()
    neural_scores = cross_val_score(neural_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    neural_mean = neural_scores.mean()
    
    # naive Bayes
    nav_clf = GaussianNB()
    t_start = time.process_time()
    nav_scores = cross_val_score(nav_clf, X_train, y_train, cv=3, scoring='roc_auc')
    t_end = time.process_time()
    t_diff.append((t_end - t_start))
    nav_mean = nav_scores.mean()
    
    d = {'Classifiers': ['Logistic Reg.', 'SVC', 'KNN', 'Dec Tree', 'Grad B CLF', 'Rand FC', 'Neural Classifier', 'Naives Bayes'],
        'Crossval Mean Scores': [log_reg_mean, svc_mean, knn_mean, tree_mean, grad_mean, rand_mean, neural_mean, nav_mean],
        'time':t_diff}
    
    result_df = pd.DataFrame(d)
    result_df = result_df.sort_values(by=['Crossval Mean Scores'], ascending=False)
    print(result_df)
    
             Classifiers  Crossval Mean Scores        time
    4         Grad B CLF              0.925986   11.968750
    5            Rand FC              0.925082    7.031250
    6  Neural Classifier              0.918507  315.625000
    7       Naives Bayes              0.918507    0.843750
    1                SVC              0.906926   44.968750
    0      Logistic Reg.              0.905810    3.468750
    2                KNN              0.829798   35.828125
    3           Dec Tree              0.702140    0.687500
    

    5. Model evaluation

    from sklearn.metrics import roc_auc_score, roc_curve
    
    # returns a classifier's AUC on the test set together with its ROC-curve parameters
    def get_auc(clf):
        clf=clf.fit(X_train, y_train)
        prob=clf.predict_proba(X_test)
        prob=prob[:, 1]
        return roc_auc_score(y_test, prob),roc_curve(y_test, prob)
    
    # plot the ROC curves on the test set and annotate the AUC values
    grad_roc_scores,grad_roc_curve = get_auc(grad_clf)
    neural_roc_scores,neural_roc_curve = get_auc(neural_clf)
    naives_roc_scores,naives_roc_curve = get_auc(nav_clf)
    
    grd_fpr, grd_tpr, grd_thresold = grad_roc_curve
    neu_fpr, neu_tpr, neu_threshold = neural_roc_curve
    nav_fpr, nav_tpr, nav_threshold = naives_roc_curve
    
    def graph_roc_curve_multiple(grd_fpr, grd_tpr, neu_fpr, neu_tpr, nav_fpr, nav_tpr):
        plt.figure(figsize=(8,6))
        plt.title('ROC Curve \n Top 3 Classifiers', fontsize=18)
        plt.plot(grd_fpr, grd_tpr, label='Gradient Boosting Classifier (Score = {:.2%})'.format(grad_roc_scores))
        plt.plot(neu_fpr, neu_tpr, label='Neural Classifier (Score = {:.2%})'.format(neural_roc_scores))
        plt.plot(nav_fpr, nav_tpr, label='Naives Bayes Classifier (Score = {:.2%})'.format(naives_roc_scores))
        plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line (a random classifier)
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate', fontsize=16)
        plt.ylabel('True Positive Rate', fontsize=16)
        plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3), arrowprops=dict(facecolor='#6E726D', shrink=0.05),)
        plt.legend()  # show the legend
        
    graph_roc_curve_multiple(grd_fpr, grd_tpr, neu_fpr, neu_tpr, nav_fpr, nav_tpr)
    plt.show()
    






    A while ago I watched a SAS training video on precision marketing on Bilibili. The cases it presents are fairly practical and actionable, so I captured some of the slides for future study and work.

    Video links:

    The value and implementation of analytics in precision marketing (part 1)

    The value and implementation of analytics in precision marketing (part 2) (mostly about how to build the models in SAS; I did not watch much of it)


    (slide screenshots omitted)
  • A case analysis of precise customer segmentation at Royal Bank of Canada
  • With the advance of the informatization wave, data has grown explosively, and data... In the Phoenix Plan project launched by China Minsheng Bank for its bank-wide transformation and upgrade, IT capability is treated as the foundation of the transformation, and the big-data strategy is a dedicated research topic, aiming to give products, marketing and decision-making strong support from big-data analytics.
  • Taking Postal Savings Bank of China as an example, in the first half of 2019 it cut 5,540 counter positions and redeployed 3,384 tellers, 2,372 of whom moved to branch marketing teams, while self-service machines increased by 9,900. Meanwhile, in Shenzhen, where the off-counter rate is as high as 95%, the banking industry is re-examining the value of its branches, and some banks are actively applying for new...
  • Some notes on writing bank test cases

    Before writing a test case, first understand the requirement behind the transaction and grasp its basic flow and alternative flows. The basic flow mainly checks whether the transaction's basic functions are implemented correctly; the alternative flows cover optional inputs that lead to pages different from the basic flow, as well as some negative cases. ...
  • Big data: a banking application case. A Portuguese banking institution ran a marketing campaign to convince potential customers to invest in bank term deposits. Information related to direct marketing campaigns of the...
