{ "cells": [ { "cell_type": "markdown", "id": "94c4297f-a896-4a8a-946b-0b971b910ee3", "metadata": {}, "source": [ "# 🍀 XGBoost\n", "\n", "XGBoost（Extreme Gradient Boosting），极致提升树/极限提升树，由陈天奇开发。在GBDT基础上改进的算法。\n", "\n", "官网：https://xgboost.readthedocs.io/en/stable/\n", "\n", "安装：`pip install xgboost`" ] }, { "cell_type": "code", "execution_count": 1, "id": "cc4733d1-8aa4-4022-804c-59761aa6f40b", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])\n" ] } ], "source": [ "# Load the California housing dataset\n", "from sklearn.datasets import fetch_california_housing\n", "from sklearn.model_selection import train_test_split as TTS\n", "\n", "data = fetch_california_housing()\n", "print(data.keys())\n", "\n", "# Split features/target into train and test sets (30% held out, fixed seed)\n", "x, y = data['data'], data['target']\n", "train_x, test_x, train_y, test_y = TTS(x, y, test_size=0.3, random_state=22)" ] }, { "cell_type": "markdown", "id": "2d37839f-ceb2-4c41-b957-c428d1466a2e", "metadata": {}, "source": [ "## 壹丨XGBoost原生API调用" ] }, { "cell_type": "code", "execution_count": 2, "id": "58f23458-b181-42d2-861b-6163d9d9dbf8", "metadata": { "tags": [] }, "outputs": [], "source": [ "import xgboost as xgb\n", "from sklearn.metrics import mean_squared_error as MSE" ] }, { "cell_type": "markdown", "id": "58578e30-582e-4f38-a63c-6593964af071", "metadata": {}, "source": [ "### 1. 数据转换" ] }, { "cell_type": "code", "execution_count": 3, "id": "1c1af626-4dac-4459-bad7-dc8c6229e1f9", "metadata": { "tags": [] }, "outputs": [], "source": [ "# The native XGBoost API works on DMatrix objects, so wrap both splits\n", "train_data = xgb.DMatrix(train_x, label=train_y)\n", "test_data = xgb.DMatrix(test_x, label=test_y)" ] }, { "cell_type": "markdown", "id": "50bd6b34-ebcf-4626-9c37-83a72894cc7a", "metadata": {}, "source": [ "### 2. 
XGB训练测试" ] }, { "cell_type": "code", "execution_count": 4, "id": "18f00b03-faae-4ec1-9b3c-722955d4f562", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE = 0.5131600787859798\n" ] } ], "source": [ "# Booster parameters: only the seed is pinned, everything else stays at its default\n", "param = {'seed': 22}\n", "\n", "# Train a booster with the native API\n", "reg = xgb.train(param, train_data)\n", "\n", "# Predict on the held-out test DMatrix\n", "y_pred = reg.predict(test_data)\n", "\n", "# Report RMSE (square root of MSE). The root is taken explicitly instead of passing\n", "# `squared=False`, which is deprecated and rejected by recent scikit-learn releases.\n", "rmse = MSE(test_y, y_pred) ** 0.5\n", "print(f'RMSE = {rmse}')" ] }, { "cell_type": "markdown", "id": "4e458ad2-f759-4d1f-8e15-8ab4e5d5f34c", "metadata": {}, "source": [ "### 3. 交叉验证" ] }, { "cell_type": "code", "execution_count": 5, "id": "365c72d3-5e68-4bd9-b8d7-4f5f7cc75608", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	train-rmse-mean	train-rmse-std	test-rmse-mean	test-rmse-std
0	0.942639	0.004828	0.950416	0.015797
1	0.795500	0.004464	0.810545	0.015531
2	0.698609	0.003671	0.722387	0.019121
3	0.633481	0.002354	0.665639	0.020278
4	0.587897	0.003754	0.627098	0.018831
5	0.557956	0.004437	0.604098	0.016470
6	0.533969	0.004048	0.585822	0.018141
7	0.511882	0.004187	0.568054	0.016490
8	0.495614	0.005177	0.556252	0.016304
9	0.480777	0.005718	0.545853	0.012671

\n", "

" ], "text/plain": [ " train-rmse-mean train-rmse-std test-rmse-mean test-rmse-std\n", "0 0.942639 0.004828 0.950416 0.015797\n", "1 0.795500 0.004464 0.810545 0.015531\n", "2 0.698609 0.003671 0.722387 0.019121\n", "3 0.633481 0.002354 0.665639 0.020278\n", "4 0.587897 0.003754 0.627098 0.018831\n", "5 0.557956 0.004437 0.604098 0.016470\n", "6 0.533969 0.004048 0.585822 0.018141\n", "7 0.511882 0.004187 0.568054 0.016490\n", "8 0.495614 0.005177 0.556252 0.016304\n", "9 0.480777 0.005718 0.545853 0.012671" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb.cv(param, train_data, nfold=5, seed=22)" ] }, { "cell_type": "markdown", "id": "5502448e-04c4-470c-9f9d-20342aa15d2b", "metadata": {}, "source": [ "输出为生成树过程的n折平均结果，即最终结果为最后一行，且为n折交叉验证的平均" ] }, { "cell_type": "markdown", "id": "4e6085b1-dc05-4f8e-9cf9-bd3e3fb52dd2", "metadata": {}, "source": [ "## 贰丨XGBoost的sklearn API调用" ] }, { "cell_type": "code", "execution_count": 6, "id": "cdb6bb52-57dc-4372-802e-e214648fe667", "metadata": { "tags": [] }, "outputs": [], "source": [ "from xgboost import XGBRegressor" ] }, { "cell_type": "markdown", "id": "7bd64879-713e-4713-9ff8-d63d6127ed43", "metadata": {}, "source": [ "### 1. 
训练测试" ] }, { "cell_type": "code", "execution_count": 7, "id": "3fdc1fcc-f1a3-4d9e-83e1-395f66d2c543", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0.833294092003024" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build the estimator through the scikit-learn wrapper\n", "reg = xgb.XGBRegressor(n_estimators=100, random_state=22)\n", "\n", "# Fit on the training split\n", "reg.fit(train_x, train_y)\n", "\n", "# Evaluate on the test split; score() reports R^2 by default\n", "reg.score(test_x, test_y)" ] }, { "cell_type": "code", "execution_count": 8, "id": "ea4a803a-0a3d-446f-9fea-ac0977353001", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE = 0.4609130051083081\n" ] } ], "source": [ "# Predict on the held-out test split\n", "y_pred = reg.predict(test_x)\n", "\n", "# Report RMSE (square root of MSE). The root is taken explicitly instead of passing\n", "# `squared=False`, which is deprecated and rejected by recent scikit-learn releases.\n", "rmse = MSE(test_y, y_pred) ** 0.5\n", "print(f'RMSE = {rmse}')" ] }, { "cell_type": "code", "execution_count": 9, "id": "e962b521-d94b-4c25-a9d6-ef1fa9b0a391", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MedInc: 0.4889\n", "HouseAge: 0.0658\n", "AveRooms: 0.0363\n", "AveBedrms: 0.0228\n", "Population: 0.0245\n", "AveOccup: 0.1429\n", "Latitude: 0.1079\n", "Longitude: 0.1109\n" ] } ], "source": [ "# Print the fitted model's importance value for every feature\n", "for fn, fi in zip(data['feature_names'], reg.feature_importances_):\n", "    print(f'{fn}: {fi:.4f}')" ] }, { "cell_type": "markdown", "id": "d0abcd65-6fe4-49dd-8c82-de35c211935a", "metadata": {}, "source": [ "### 2. 
交叉验证" ] }, { "cell_type": "code", "execution_count": 10, "id": "c3489f68-7701-4120-8731-eb814dba82f0", "metadata": { "tags": [] }, "outputs": [], "source": [ "from sklearn.model_selection import KFold\n", "from sklearn.model_selection import cross_val_score as CVS" ] }, { "cell_type": "code", "execution_count": 11, "id": "c07756ff-3de6-4e05-88b9-47ba92072268", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean(RMSE) = 0.4830401686217102\n", "Var(RMSE) = 0.0002604404270682063\n" ] } ], "source": [ "# 5-fold shuffled cross-validation on the training split, scored with negated MSE\n", "reg = xgb.XGBRegressor(n_estimators=100, random_state=22)\n", "cv = KFold(n_splits=5, shuffle=True, random_state=22)\n", "res = CVS(reg, train_x, train_y, cv=cv, scoring='neg_mean_squared_error')\n", "\n", "# Per-fold RMSE, computed once and reused for both summary statistics\n", "fold_rmse = abs(res) ** 0.5\n", "rmse = fold_rmse.mean()\n", "vrmse = fold_rmse.var()\n", "print(f'Mean(RMSE) = {rmse}\\n'\n", "      f'Var(RMSE) = {vrmse}')" ] }, { "cell_type": "markdown", "id": "3246e8ed-1864-4be7-8ea6-62b51e171065", "metadata": {}, "source": [ "## 叁丨XGBoost参数\n", "\n", "### 1. 查看XGBoost参数\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c7cfba61-7da3-494a-9220-9b2d06941e03", "metadata": {}, "outputs": [], "source": [ "# Display a default estimator to inspect the parameters of the sklearn API\n", "from xgboost import XGBRegressor as XGBR\n", "\n", "reg = XGBR()\n", "reg" ] }, { "cell_type": "markdown", "id": "74a3324f-6e85-43a1-8af2-5feca3d500fa", "metadata": {}, "source": [ "### 2. 
sklearn API和原生API对照\n", "\n", "| 参数相关流程 | 原生库参数 | sklearn API参数 |\n", "| -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n", "| 损失函数 | `objective`，`lambda`，`alpha` | `objective`，`reg_alpha`，`reg_lambda` |\n", "| 集成规则 | `eta`，`base_score`，`eval_metric`，`subsample`，`sampling_method`，`colsample_bytree`，`colsample_bylevel`，`colsample_bynode` | `learning_rate`，`base_score`，`eval_metric`，`subsample`，`colsample_bytree`，`colsample_bylevel`，`colsample_bynode` |\n", "| 弱评估器 | `num_boost_round`，`booster`，`tree_method`，`sketch_eps`，`updater`，`grow_policy` | `n_estimators`，`booster`，`tree_method` |\n", "| 训练流程（抗过拟合） | `num_feature`，`max_depth`，`gamma`，`min_child_weight`，`max_delta_step`，`max_leaves`，`max_bin` | `max_depth`，`gamma`，`min_child_weight`，`max_delta_step` |\n", "| 训练流程（结果监控） | `verbosity` | `verbosity` |\n", "| 训练流程（提前停止） | `early_stopping_rounds` | `early_stopping_rounds` |\n", "| 训练流程（增量学习） | | `warm_start` |\n", "| 随机性控制 | `seed` | `random_state` |\n", "| 其他流程 | `missing`，`scale_pos_weight`，`predictor`，`num_parallel_tree` | `n_jobs`，`scale_pos_weight`，`num_parallel_tree`，`enable_categorical`，`importance_type` |\n", "\n", "\n", "### 3. 
参数解析\n", "\n", "`objective`：目标函数（损失函数+模型复杂度）\n", "\n", "$$\n", "\\begin{aligned}\n", "Obj &= \\sum_{i=1}^{m} l(y_i, f(x_i)) + \\sum_{k=1}^{K} \\Omega(g_k) \\\\\n", "\\Omega(g_k) &= \\gamma T + \\text{正则项（Regularization）}\n", "\\end{aligned}\n", "$$\n", "\n", "\n", "实际填写`objective`参数时，填写的是损失函数名，不包括正则项（该参数默认值为平方损失）\n", "\n", "| 任务 | 损失函数 | 概念 |\n", "| ---- | --------------------- | -------------------------------------------- |\n", "| 回归 | `reg:squarederror` | 平方损失 |\n", "| 回归 | `reg:squaredlogerror` | 平方对数损失 |\n", "| 分类 | `binary:logistic` | 二分类交叉熵损失，输出概率 |\n", "| 分类 | `binary:logitraw` | 二分类交叉熵损失，输出Sigmoid之前的值 |\n", "| 分类 | `multi:softmax` | 多分类交叉熵损失，输出具体的类别 |\n", "| 分类 | `multi:softprob` | 多分类交叉熵损失，输出每个样本每个类别下概率 |\n", "| …… | …… | …… |\n", "\n", "`lambda`：决定完整目标函数，L2正则项系数，默认为1\n", "\n", "`alpha`：决定完整目标函数，L1正则项系数，默认为0\n", "\n", "`num_boost_round`：树的数量，默认10。（`n_estimators`：默认100）\n", "\n", "`eta`：学习率，默认0.3（`learning_rate`：默认0.3）\n", "\n", "`verbosity`：打印相关信息。\n", "\n", "| 参数 | 操作 |\n", "| ---- | ----------------- |\n", "| 0 | 不打印任何内容 |\n", "| 1 | 仅打印警告 |\n", "| 2 | 打印树的全部信息 |\n", "| 3 | 打印更多debug信息 |\n", "\n", "`num_parallel_tree`：每轮迭代中并行建立的树的数量，可用于让XGBoost建立随机森林\n", "\n", "`missing`：指定数据中代表缺失值的取值（默认为`np.nan`），XGBoost可以容忍缺失的数据\n", "\n", "## 肆丨调参\n", "\n", "以学习率为例" ] }, { "cell_type": "code", "execution_count": 12, "id": "4b6de43b-3fc3-4c60-836f-23d051b73521", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "learning_rate = 0.1000, res = 0.4868025769066027\n", "learning_rate = 0.2000, res = 0.4793697761831933\n", "learning_rate = 0.3000, res = 0.4830401686217102\n", "learning_rate = 0.4000, res = 0.5026510715198398\n", "learning_rate = 0.5000, res = 0.512333817939106\n" ] } ], "source": [ "# 🔟 scikit-learn API\n", "from xgboost import XGBRegressor\n", "from sklearn.model_selection import KFold, cross_val_score as CVS\n", "\n", "import numpy as np\n", "\n", "for lr in np.linspace(0.1, 0.5, 5):\n", " reg = XGBRegressor(objective='reg:squarederror',\n", " learning_rate=lr,\n", " random_state=22)\n", " cv = 
KFold(n_splits=5, shuffle=True, random_state=22)\n", " res = CVS(reg, train_x, train_y, cv=cv, scoring='neg_mean_squared_error')\n", "\n", " print(f'learning_rate = {lr:.4f}, res = {(abs(res) ** 0.5).mean()}')" ] }, { "cell_type": "code", "execution_count": 13, "id": "ec4a0499-5d79-442c-b456-3b7edece54c9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "eta = 0.1000, res = 0.4873787736549489\n", "eta = 0.2000, res = 0.4836093275141261\n", "eta = 0.3000, res = 0.4846305601993509\n", "eta = 0.4000, res = 0.49834340288402323\n", "eta = 0.5000, res = 0.5076573688820785\n" ] } ], "source": [ "# Native API: sweep the learning rate (eta) with 5-fold cross-validation\n", "import xgboost as xgb\n", "\n", "train_data = xgb.DMatrix(train_x, train_y)\n", "\n", "for eta in np.linspace(0.1, 0.5, 5):\n", "    param = {'objective': 'reg:squarederror',\n", "             'eta': eta,\n", "             'seed': 22}\n", "    res = xgb.cv(param, train_data,\n", "                 num_boost_round=100, nfold=5, seed=22,\n", "                 shuffle=True, metrics='rmse')\n", "    # Take the last boosting round's mean test RMSE by column name rather than by\n", "    # position, so the lookup cannot silently pick a different column\n", "    test_rmse = res['test-rmse-mean'].iloc[-1]\n", "    print(f'eta = {eta:.4f}, res = {test_rmse}')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 5 }