import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# load the data
data = pd.read_csv("https://cdn.coggle.club/kaggle-flight-delays/flights_10k.csv.zip")
# keep only the useful columns
data = data[["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT",
             "ORIGIN_AIRPORT", "AIR_TIME", "DEPARTURE_TIME", "DISTANCE", "ARRIVAL_DELAY"]]
data.dropna(inplace=True)
# binarize the label: flights arriving more than 10 minutes late count as delayed
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1
# integer-encode the categorical columns
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes + 1
# split into train and test sets
train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1),
                                                data["ARRIVAL_DELAY"],
                                                random_state=10, test_size=0.25)
data  # inspect the processed dataframe
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
def auc(m, train, test):
    return (metrics.roc_auc_score(y_train, m.predict_proba(train)[:, 1]),
            metrics.roc_auc_score(y_test, m.predict_proba(test)[:, 1]))
# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [10, 30, 50],
              "min_child_weight": [1, 3, 6],
              "n_estimators": [200],
              "learning_rate": [0.05, 0.1, 0.16]}
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3,
verbose=10, n_jobs=-1)
grid_search.fit(train, y_train)
grid_search.best_estimator_
model = xgb.XGBClassifier(max_depth=3, min_child_weight=1, n_estimators=20,
                          n_jobs=-1, verbosity=1, learning_rate=0.16)
model.fit(train,y_train)
print(auc(model, train, test))
import lightgbm as lgb
from sklearn import metrics
def auc2(m, train, test):
    return (metrics.roc_auc_score(y_train, m.predict(train)),
            metrics.roc_auc_score(y_test, m.predict(test)))
lg = lgb.LGBMClassifier(silent=False)
param_dist = {"max_depth": [25, 50, 75],
              "learning_rate": [0.01, 0.05, 0.1],
              "num_leaves": [300, 900, 1200],
              "n_estimators": [200]}
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=5)
grid_search.fit(train,y_train)
grid_search.best_estimator_
d_train = lgb.Dataset(train, label=y_train, free_raw_data=False)
params = {"max_depth": 3, "learning_rate" : 0.1, "num_leaves": 900, "n_estimators": 20}
# Without Categorical Features
model2 = lgb.train(params, d_train)
print(auc2(model2, train, test))
# With categorical features
cate_features_name = ["MONTH","DAY","DAY_OF_WEEK","AIRLINE","DESTINATION_AIRPORT",
"ORIGIN_AIRPORT"]
model2 = lgb.train(params, d_train, categorical_feature = cate_features_name)
print(auc2(model2, train, test))
import catboost
cat_features_index = [0,1,2,3,4,5,6]
def auc(m, train, test):
    return (metrics.roc_auc_score(y_train, m.predict_proba(train)[:, 1]),
            metrics.roc_auc_score(y_test, m.predict_proba(test)[:, 1]))
params = {'depth': [4, 7, 10],
          'learning_rate': [0.03, 0.1, 0.15],
          'l2_leaf_reg': [1, 4, 9],
          'iterations': [20],
          'silent': [True]}
cb = catboost.CatBoostClassifier()
cb_model = GridSearchCV(cb, params, scoring="roc_auc", cv = 3)
cb_model.fit(train, y_train)
# Without categorical features
clf = catboost.CatBoostClassifier(eval_metric="AUC", depth=3, silent=True,
iterations= 20, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(train,y_train)
print(auc(clf, train, test))
# With Categorical features
clf = catboost.CatBoostClassifier(eval_metric="AUC",one_hot_max_size=31, silent=True,
depth=3, iterations= 20, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(train,y_train, cat_features= cat_features_index)
print(auc(clf, train, test))
LightGBM tuning tips (adapted from the official docs); a combined example follows the lists.

For faster speed:
- Use bagging by setting bagging_fraction and bagging_freq
- Use feature sub-sampling by setting feature_fraction
- Use small max_bin
- Use save_binary to speed up data loading in future runs
- Use parallel learning (see the LightGBM Parallel Learning Guide)

For better accuracy:
- Use large max_bin (may be slower)
- Use small learning_rate with large num_iterations
- Use large num_leaves (may cause over-fitting)
- Use bigger training data
- Try dart

To deal with over-fitting:
- Use small max_bin
- Use small num_leaves
- Use min_data_in_leaf and min_sum_hessian_in_leaf
- Use bagging by setting bagging_fraction and bagging_freq
- Use feature sub-sampling by setting feature_fraction
- Use bigger training data
- Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
- Try max_depth to avoid growing deep trees
- Try extra_trees
- Try increasing path_smooth
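As a rough sketch of how the over-fitting knobs combine, here is a params dict biased toward regularization; the concrete values below are illustrative assumptions, not tuned recommendations.

# illustrative over-fitting-oriented settings (values are assumptions, not tuned)
overfit_safe_params = {
    "objective": "binary",
    "metric": "auc",
    "max_bin": 63,            # small max_bin
    "num_leaves": 31,         # small num_leaves
    "min_data_in_leaf": 50,   # require enough samples per leaf
    "bagging_fraction": 0.8,  # row sub-sampling ...
    "bagging_freq": 5,        # ... re-drawn every 5 iterations
    "feature_fraction": 0.8,  # column sub-sampling
    "lambda_l1": 0.1,         # L1 regularization
    "lambda_l2": 0.1,         # L2 regularization
    "max_depth": 6,           # cap tree depth
}
# e.g. model = lgb.train(overfit_safe_params, d_train)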
# coding: utf-8
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
try:
    import cPickle as pickle  # Python 2
except BaseException:
    import pickle  # Python 3
print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.test.weight', header=None)[0]
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])
print('Finished first 10 rounds...')
# check feature name
print('7th feature name is:', lgb_train.feature_name[6])
print('Saving model...')
# save model to file
gbm.save_model('model.txt')
print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
# feature names
print('Feature names:', gbm.feature_name())
# feature importances
print('Feature importances:', list(gbm.feature_importance()))
print('Loading model to predict...')
# load model to predict
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)
print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)
print('Finished 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)
print('Finished 20 - 30 rounds with decay learning rates...')
# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finished 30 - 40 rounds with changing bagging_fraction...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # sigmoid: raw margin -> probability
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
# NOTE: with a customized objective, the default prediction is the raw margin
# (for log-likelihood loss, the score before the logistic transformation),
# which can make built-in evaluation metrics compute wrong results.
# Keep this in mind when using customization.
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # convert margins to probabilities first
    return 'error', np.mean(labels != (preds > 0.5)), False
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
d_train = lgb.Dataset(train, label=y_train)
params = {"max_depth": 4, "learning_rate": 0.05, "num_leaves": 250}
# the number of boosting rounds is set via num_boost_round below
cv_results = lgb.cv(params, d_train, num_boost_round=350, nfold=5, metrics='auc')
print(pd.DataFrame(cv_results))
lg = lgb.LGBMClassifier(silent=False)
param_dist = {"max_depth": [4, 5, 7],
              "learning_rate": [0.01, 0.05, 0.1],
              "num_leaves": [300, 900, 1200],
              "n_estimators": [50, 100, 150]}
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 5, scoring="roc_auc", verbose=5)
grid_search.fit(train,y_train)
grid_search.best_estimator_, grid_search.best_score_
import warnings
import time
warnings.filterwarnings("ignore")
from bayes_opt import BayesianOptimization
def lgb_eval(max_depth, learning_rate, num_leaves, n_estimators):
    params = {"metric": 'auc'}
    params['max_depth'] = int(max(max_depth, 1))
    params['learning_rate'] = np.clip(learning_rate, 0, 1)  # keep the rate in [0, 1]
    params['num_leaves'] = int(max(num_leaves, 1))
    params['n_estimators'] = int(max(n_estimators, 1))
    cv_result = lgb.cv(params, d_train, nfold=5, seed=0, verbose_eval=200, stratified=False)
    return 1.0 * np.array(cv_result['auc-mean']).max()
lgbBO = BayesianOptimization(lgb_eval, {'max_depth': (4, 8),
                                        'learning_rate': (0.05, 0.2),
                                        'num_leaves': (20, 1500),
                                        'n_estimators': (5, 200)}, random_state=0)
lgbBO.maximize(init_points=5, n_iter=50, acq='ei')
print(lgbBO.max)
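To reuse the best configuration found by the search, a minimal sketch along these lines works; it assumes lgbBO.max['params'] (the record bayes_opt keeps of the best probe) and casts the float-valued dimensions back to ints, as the objective above does.

# refit on the training set with the best parameters found above
best = lgbBO.max['params']
final_model = lgb.LGBMClassifier(max_depth=int(best['max_depth']),
                                 learning_rate=best['learning_rate'],
                                 num_leaves=int(best['num_leaves']),
                                 n_estimators=int(best['n_estimators']))
final_model.fit(train, y_train)
print(metrics.roc_auc_score(y_test, final_model.predict_proba(test)[:, 1]))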