For LightGBM parameter tuning, global Bayesian optimization is a practical approach:
import lightgbm as lgb
from bayes_opt import BayesianOptimization
import warnings
warnings.filterwarnings("ignore")

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=10000,n_features=20,n_classes=2,random_state=2)
data = lgb.Dataset(X,y)

def lgb_cv(feature_fraction, bagging_fraction, min_data_in_leaf, max_depth,
           min_split_gain, num_leaves, lambda_l1, lambda_l2, num_iterations=1000):
    params = {'objective': 'binary', 'num_iterations': num_iterations,
              'early_stopping_round': 50, 'metric': 'l1'}
    # Clip/round the optimizer's continuous suggestions into valid LightGBM values
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['min_data_in_leaf'] = int(round(min_data_in_leaf))
    params['max_depth'] = int(round(max_depth))
    params['min_split_gain'] = min_split_gain
    params['num_leaves'] = int(round(num_leaves))
    params['lambda_l1'] = max(lambda_l1, 0)
    params['lambda_l2'] = max(lambda_l2, 0)

    cv_result = lgb.cv(params, data, nfold=5, seed=2, stratified=True, verbose_eval=50)
    # BayesianOptimization maximizes, so return the negated (minimized) CV error
    return -min(cv_result['l1-mean'])

lgb_bo = BayesianOptimization(
    lgb_cv,
    {'feature_fraction': (0.5, 1),
     'bagging_fraction': (0.5, 1),
     'min_data_in_leaf': (1, 100),
     'max_depth': (3, 15),
     'min_split_gain': (0, 5),
     'num_leaves': (16, 128),
     'lambda_l1': (0, 100),
     'lambda_l2': (0, 100)}
)

lgb_bo.maximize(init_points=21, n_iter=90)  # init_points: number of initial random probes; n_iter: number of optimization iterations (sampled points)
print(lgb_bo.max)
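Once the search finishes, lgb_bo.max holds the best score and the raw (continuous) parameter suggestions. A minimal sketch of converting them into a usable parameter dict, reusing the rounding rules from lgb_cv above (the final training call is illustrative, not part of the original):

best = lgb_bo.max['params']
best_params = {
    'objective': 'binary',
    'feature_fraction': best['feature_fraction'],
    'bagging_fraction': best['bagging_fraction'],
    'min_data_in_leaf': int(round(best['min_data_in_leaf'])),
    'max_depth': int(round(best['max_depth'])),
    'min_split_gain': best['min_split_gain'],
    'num_leaves': int(round(best['num_leaves'])),
    'lambda_l1': best['lambda_l1'],
    'lambda_l2': best['lambda_l2'],
}
final_model = lgb.train(best_params, data, num_boost_round=1000)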

Table of Contents

1  Data import
2  Model selection
3  Model tuning
   3.1  Initial parameters
   3.2  Tuning n_estimators
   3.3  max_depth/num_leaves
   3.4  min_child_samples/min_child_weight
   3.5  subsample/colsample_bytree (0.6, 1)
   3.6  reg_alpha/reg_lambda
   3.7  Learning rate
4  Generating predictions on the test set
5  Feature selection

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi']  # default font (supports CJK labels)
plt.rcParams['axes.unicode_minus'] = False   # render the minus sign correctly in saved figures
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

Data import
op = pd.read_csv('op_done.csv', index_col='user')

# label, base, tr and sumbit are DataFrames prepared in earlier preprocessing (not shown)
train = label.join(base).join(op).join(tr)

train.fillna(0, inplace=True)

train_X = train.iloc[:, 1:].values
train_y = train.iloc[:, 0].values
stand = StandardScaler()
train_X = stand.fit_transform(train_X)

test = sumbit.join(base).join(op).join(tr)
test.fillna(0, inplace=True)
test_X = test.iloc[:, 1:].values
test_X = stand.transform(test_X)  # reuse the scaler fitted on the training data

Model selection
lr = LogisticRegression(random_state=2018)  # logistic regression
svm = SVC(probability=True, random_state=2018)  # SVM
forest = RandomForestClassifier(n_estimators=100, random_state=2018)  # random forest
Gbdt = GradientBoostingClassifier(random_state=2018)  # GBDT
Xgbc = XGBClassifier(random_state=2018)  # XGBoost
gbm = lgb.LGBMClassifier(random_state=2018)  # LightGBM
model_name = ["lr", "svm", "forest", "Gbdt", "Xgbc", "gbm"]

def muti_score(model):
    auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=3)
    return auc.mean()

scores = []
for name in model_name:
    model = eval(name)
    score = muti_score(model)
    scores.append((name, score))

scores

[('lr', 0.6374291913334925),
('svm', 0.42584336157620334),
('forest', 0.6732019222635085),
('Gbdt', 0.6995580705824883),
('Xgbc', 0.6890128512134231),
('gbm', 0.7027585172289985)]

Model tuning
The comparison above shows GBDT and LightGBM performing best, so we tune the LightGBM model.
Tuning order: n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree -- reg_alpha/reg_lambda -- learning rate. Each step follows the same search-then-update pattern, sketched below.
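A hypothetical helper capturing that repeated pattern (the tune name and signature are illustrative, not part of the original notebook):

# Illustrative convenience wrapper around the grid-search steps below:
# search a grid, report the best result, and fold the winners into params.
def tune(params, grid, cv=4):
    gbm = lgb.LGBMClassifier(**params)
    search = GridSearchCV(gbm, param_grid=grid, scoring='roc_auc', cv=cv)
    search.fit(train_X, train_y)
    print(search.best_score_, search.best_params_)
    params.update(search.best_params_)
    return params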

Initial parameters
params = {'boosting_type':'gbdt','objective': 'binary','subsample': 0.8,'colsample_bytree': 0.8}
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()

{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}

Tuning n_estimators
param_1 = {'n_estimators':range(50,150,5)}
cv = GridSearchCV(gbm,param_grid=param_1,scoring='roc_auc',cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'],result['mean_test_score'])

0.718750789233066 {'n_estimators': 80}

(Figure: mean cross-validated AUC versus n_estimators; the curve peaks around n_estimators = 80.)
params.update(grid_result.best_params_)
gbm=lgb.LGBMClassifier(**params)

max_depth/num_leaves
param_2 = {'max_depth': range(5, 9), 'num_leaves': range(20, 50, 2)}
cv = GridSearchCV(gbm,param_grid=param_2,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)

0.7191457708890046 {'max_depth': 8, 'num_leaves': 20}

params.update({'max_depth': 8, 'num_leaves': 20})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()

{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 20,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}

min_child_samples/min_child_weight
param_3 = {'min_child_samples':range(10,30,2),'min_child_weight':[i/1000 for i in range(0,20,2)]}
cv = GridSearchCV(gbm,param_grid=param_3,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)

0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}

params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()

{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.0,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 20,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}

subsample/colsample_bytree (0.6, 1)
param_4 = {'subsample':[i/10 for i in range(6,10,1)],'colsample_bytree':[i/10 for i in range(6,10,1)]}
cv = GridSearchCV(gbm,param_grid=param_4,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)

0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}

reg_alpha/reg_lambda
param_5 = {'reg_alpha': [i/10 for i in range(10)], 'reg_lambda': [i/10 for i in range(10)]}
cv = GridSearchCV(gbm,param_grid=param_5,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)

0.7191457708890046 {'reg_alpha': 0.0, 'reg_lambda': 0.0}

Learning rate
param_6 = {'learning_rate': [i/100 for i in range(1, 20)]}  # a learning rate of 0 is invalid, so start at 0.01
cv = GridSearchCV(gbm,param_grid=param_6,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)

0.7191457708890046 {'learning_rate': 0.1}

Generating predictions on the test set
gbm.fit(train_X, train_y)

LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
               n_estimators=80, num_leaves=20, objective='binary',
               subsample=0.8)

gbm.feature_importances_

array([ 17,  82,  16,  13,   8,  86,  16,  12,  13,  15,   0,  25,  33,
12,  25,   0,   6,   1,  11,  12,   6,  49,  87,  61,  56,  51,
58,  36,  21,  20,  38,  20,  24,   7,  17,  14,   7,   2,  34,
5,   0,   0,  39,  60, 121,  41,  67,  35,  30,  68,  66,  53,
12,  49,   3,  12,   1,  18,   2,   9,   1,  24,  26,   8,  13,
1,  28,  18,  24,   9,   0,   5,   1,   0,   0,   0,   1,  23,
11,  19,   5,   0,   8,   5,  31,  11,   4,   6,   7,  92,  26,
0,   0,  12,   0,   0,   0,   1,   0,  16,   0,   0,   0,  35,
0,   5,   0,  10,   9,  16,   0,   0,   0,   3,   5,   0,  23,
107,  49])

train.iloc[:,1:].columns

Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
...
'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
dtype='object', length=119)

feature_importance = pd.DataFrame({'feature':train.iloc[:,1:].columns,'importance':gbm.feature_importances_})

feature_importance.sort_values(by='importance',ascending=False).head(20)


y_pre = gbm.predict_proba(train_X)  # class probabilities; column 1 is P(y=1)

roc_auc_score(train_y,y_pre[:,1])

0.7876456295250149

y = gbm.predict_proba(test_X)
y[:,1]

array([0.02967902, 0.44846496, 0.02377314, ..., 0.21914047, 0.28423991,
0.16758796])

test['prob'] = y[:,1]

test[['prob']].to_csv('result.csv')

auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)
auc

array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])

auc.mean()

0.7211841722940205

Feature selection
list_feature = feature_importance.sort_values(by='importance', ascending=False)['feature'].to_list()

list_score = []
for i in range(50, 120, 10):
    feature = list_feature[:i]
    train_X = stand.fit_transform(train.loc[:, feature].values)
    auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
    list_score.append((i, auc.mean()))
list_score

[(50, 0.7164324796787287),
(60, 0.7178106094882282),
(70, 0.7200468611823796),
(80, 0.7193456575143582),
(90, 0.7190751868013574),
(100, 0.7190497035344566),
(110, 0.7182153617821309)]
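Keeping the top 70 features gives the best CV AUC, so we retrain on those. A quick optional plot of the sweep above (illustrative, not in the original):

ns, aucs = zip(*list_score)
plt.plot(ns, aucs, marker='o')
plt.xlabel('number of top features kept')
plt.ylabel('mean CV AUC')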

train_X = stand.fit_transform(train.loc[:, list_feature[:70]].values)
test_X = stand.transform(test.loc[:, list_feature[:70]].values)  # fit on train, then transform test

gbm.fit(train_X,train_y)

LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
               n_estimators=80, num_leaves=20, objective='binary',
               subsample=0.8)



## lgb parameters

Parameter notes, based on the LightGBM Chinese documentation:
params = {
    # default=0.1, type=double, alias=shrinkage_rate
    'learning_rate': 0.2,

    # default=regression; task type
    'application': 'binary',

    # number of leaves per tree
    'num_leaves': 31,

    # default=1, type=int, alias=verbose | log verbosity: < 0: Fatal, = 0: Error (Warn), > 0: Info
    'verbosity': -1,

    'data_random_seed': 2,

    # default=1.0, type=double, 0.0 < bagging_fraction < 1.0, alias=sub_row, subsample
    # Like feature_fraction, but randomly selects part of the data without resampling.
    # Can be used to speed up training and to reduce overfitting.
    # Note: for bagging to take effect, bagging_freq must be set to a non-zero value.
    'bagging_fraction': 0.8,
    'bagging_freq': 5,  # assumed example value; the original snippet had only the note above, not the key

    # default=1.0, type=double, 0.0 < feature_fraction < 1.0, alias=sub_feature, colsample_bytree
    # If feature_fraction is below 1.0, LightGBM randomly selects that share of features
    # on each iteration; e.g. 0.8 selects 80% of the features before training each tree.
    # Can be used to speed up training and to reduce overfitting.
    'feature_fraction': 0.6,

    # number of threads for LightGBM
    # For best speed, set this to the number of physical CPU cores, not threads
    # (most CPUs use hyper-threading to expose 2 threads per core).
    # Do not set it too large on small datasets (e.g. avoid 64 threads for 10,000 rows).
    # Note: task manager or similar CPU monitors may report under-utilized cores; this is normal.
    # For distributed learning, do not use all CPU cores, as this degrades network performance.
    'num_threads': 4,  # assumed example value; the original snippet had only the comments, not the key

    'lambda_l1': 1,  # default=0, type=double, alias=reg_alpha; L1 regularization
    'lambda_l2': 1}  # default=0, type=double, alias=reg_lambda; L2 regularization

rounds_lookup = {'toxic': 140,
                 'severe_toxic': 50,
                 'obscene': 80,
                 'threat': 80,
                 'insult': 70,
                 'identity_hate': 80}
model = lgb.train(params,
                  train_set=d_train,      # d_train and watchlist come from the elided surrounding code
                  valid_sets=watchlist,
                  verbose_eval=10)
# num_iterations, default=100, type=int, alias=num_iteration, num_tree, num_trees, num_round, num_rounds, num_boost_round
# Number of boosting iterations.
# Note: for the Python/R packages this parameter is ignored; use the train/cv argument
# num_boost_round (Python) or nrounds (R) instead.
# Note: internally, LightGBM builds num_class * num_iterations trees for multiclass problems.
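Given the note above, a sketch of how rounds_lookup would typically feed num_boost_round when training one booster per label (the per-label d_train/watchlist construction is assumed, not shown here):

for label, rounds in rounds_lookup.items():
    # the Python API ignores 'num_iterations' in params, so the per-label
    # round count is passed through num_boost_round instead
    model = lgb.train(params,
                      train_set=d_train,      # assumed: Dataset for this label
                      num_boost_round=rounds,
                      valid_sets=watchlist,   # assumed: validation set(s) for this label
                      verbose_eval=10)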


Another example parameter set:
params = {
    "objective": "binary",
    'metric': {'auc'},

    # boosting, default=gbdt, type=enum, options=gbdt, rf, dart, goss, alias=boost, boosting_type
    # gbdt: traditional gradient boosting decision trees
    # rf: random forest
    # dart: Dropouts meet Multiple Additive Regression Trees
    # goss: Gradient-based One-Side Sampling
    "boosting_type": "gbdt",

    "verbosity": -1,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "learning_rate": 0.1,
    "num_leaves": 31,
    "verbose": -1,

    # min_split_gain, default=0, type=double, alias=min_gain_to_split
    # minimum gain required to perform a split
    "min_split_gain": .1,

    # L1 regularization
    "reg_alpha": .1
}

# lgb parameters
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

lgb_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    'metric': {'binary_logloss', 'auc'},  # binary log loss and AUC
    "learning_rate": 0.01,
    "max_depth": 7,
    "num_leaves": 105,
    "feature_fraction": 1,
    "bagging_fraction": 1,
    'min_data_in_leaf': 100,
    'bagging_freq': 6,
}

# train, train_feature and test_feature are DataFrames from the elided preprocessing
def pred_select(label_name, column_name):
    if label_name == 'favorite':
        labels = train.favorite
    else:
        labels = train.purchase
    x = train_feature.values
    y = labels
    y_val = np.zeros((train_feature.shape[0]))   # out-of-fold predictions on the training set
    y_test = np.zeros((test_feature.shape[0]))   # averaged predictions on the test set
    score_valid = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, valid_index in skf.split(x, y):
        # skf.split yields index arrays; rows are selected via x[indices]
        x_train, x_valid, y_train, y_valid = x[train_index], x[valid_index], y[train_index], y[valid_index]

        train_data = lgb.Dataset(x_train, label=y_train)
        valid_data = lgb.Dataset(x_valid, label=y_valid)
        model = lgb.train(lgb_params, train_data, valid_sets=[valid_data], verbose_eval=1)
        y_val[valid_index] = model.predict(x_valid)
        score_valid.append(roc_auc_score(y_valid, y_val[valid_index]))
        y_test += np.array(model.predict(test_feature) / 5)  # average over the 5 folds
    score_valid = np.array(score_valid)
    y_test = pd.DataFrame(y_test, columns=[column_name])
    print(label_name + ' validation score: {}'.format(score_valid.mean()))
    return y_test

fav_test = pred_select('favorite', 'pred_favorite')
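The purchase label would presumably be scored the same way:

pur_test = pred_select('purchase', 'pred_purchase')  # assumed companion call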
