{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "94c4297f-a896-4a8a-946b-0b971b910ee3",
   "metadata": {},
   "source": [
    "# 🍀 XGBoost\n",
    "\n",
    "XGBoost（Extreme Gradient Boosting），极致提升树/极限提升树，由陈天奇开发。在GBDT基础上改进的算法。\n",
    "\n",
    "官网：https://xgboost.readthedocs.io/en/stable/\n",
    "\n",
    "安装：`pip install xgboost`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cc4733d1-8aa4-4022-804c-59761aa6f40b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])\n"
     ]
    }
   ],
   "source": [
    "# 导入加利福尼亚房价数据\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "from sklearn.model_selection import train_test_split as TTS\n",
    "\n",
    "data = fetch_california_housing()\n",
    "print(data.keys())\n",
    "\n",
    "# 划分数据集\n",
    "x = data['data']\n",
    "y = data['target']\n",
    "train_x, test_x, train_y, test_y = TTS(x, y, test_size=0.3, random_state=22)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d37839f-ceb2-4c41-b957-c428d1466a2e",
   "metadata": {},
   "source": [
    "## 壹丨XGBoost原生API调用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "58f23458-b181-42d2-861b-6163d9d9dbf8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "from sklearn.metrics import mean_squared_error as MSE"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58578e30-582e-4f38-a63c-6593964af071",
   "metadata": {},
   "source": [
    "### 1. 数据转换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1c1af626-4dac-4459-bad7-dc8c6229e1f9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# XGBoost原生调用需要把数据转换为特定的数据格式\n",
    "train_data = xgb.DMatrix(train_x, train_y)\n",
    "test_data = xgb.DMatrix(test_x, test_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50bd6b34-ebcf-4626-9c37-83a72894cc7a",
   "metadata": {},
   "source": [
    "### 2. XGB训练测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "18f00b03-faae-4ec1-9b3c-722955d4f562",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MSE = 0.5131600787859798\n"
     ]
    }
   ],
   "source": [
    "# 设置参数\n",
    "param = {'seed': 22}\n",
    "\n",
    "# 训练模型\n",
    "reg = xgb.train(param, train_data)\n",
    "\n",
    "# 预测结果\n",
    "y_pred = reg.predict(test_data)\n",
    "\n",
    "# 计算mse\n",
    "mse = MSE(test_y, y_pred, squared=False)\n",
    "print(f'MSE = {mse}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e458ad2-f759-4d1f-8e15-8ab4e5d5f34c",
   "metadata": {},
   "source": [
    "### 3. 交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "365c72d3-5e68-4bd9-b8d7-4f5f7cc75608",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>train-rmse-mean</th>\n",
       "      <th>train-rmse-std</th>\n",
       "      <th>test-rmse-mean</th>\n",
       "      <th>test-rmse-std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.942639</td>\n",
       "      <td>0.004828</td>\n",
       "      <td>0.950416</td>\n",
       "      <td>0.015797</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.795500</td>\n",
       "      <td>0.004464</td>\n",
       "      <td>0.810545</td>\n",
       "      <td>0.015531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.698609</td>\n",
       "      <td>0.003671</td>\n",
       "      <td>0.722387</td>\n",
       "      <td>0.019121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.633481</td>\n",
       "      <td>0.002354</td>\n",
       "      <td>0.665639</td>\n",
       "      <td>0.020278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.587897</td>\n",
       "      <td>0.003754</td>\n",
       "      <td>0.627098</td>\n",
       "      <td>0.018831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.557956</td>\n",
       "      <td>0.004437</td>\n",
       "      <td>0.604098</td>\n",
       "      <td>0.016470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.533969</td>\n",
       "      <td>0.004048</td>\n",
       "      <td>0.585822</td>\n",
       "      <td>0.018141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.511882</td>\n",
       "      <td>0.004187</td>\n",
       "      <td>0.568054</td>\n",
       "      <td>0.016490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.495614</td>\n",
       "      <td>0.005177</td>\n",
       "      <td>0.556252</td>\n",
       "      <td>0.016304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.480777</td>\n",
       "      <td>0.005718</td>\n",
       "      <td>0.545853</td>\n",
       "      <td>0.012671</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std\n",
       "0         0.942639        0.004828        0.950416       0.015797\n",
       "1         0.795500        0.004464        0.810545       0.015531\n",
       "2         0.698609        0.003671        0.722387       0.019121\n",
       "3         0.633481        0.002354        0.665639       0.020278\n",
       "4         0.587897        0.003754        0.627098       0.018831\n",
       "5         0.557956        0.004437        0.604098       0.016470\n",
       "6         0.533969        0.004048        0.585822       0.018141\n",
       "7         0.511882        0.004187        0.568054       0.016490\n",
       "8         0.495614        0.005177        0.556252       0.016304\n",
       "9         0.480777        0.005718        0.545853       0.012671"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xgb.cv(param, train_data, nfold=5, seed=22)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5502448e-04c4-470c-9f9d-20342aa15d2b",
   "metadata": {},
   "source": [
    "输出为生成树过程的n折平均结果，即最终结果为最后一行，且为n折交叉验证的平均"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e6085b1-dc05-4f8e-9cf9-bd3e3fb52dd2",
   "metadata": {},
   "source": [
    "## 贰丨XGBoost的sklearn API调用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cdb6bb52-57dc-4372-802e-e214648fe667",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from xgboost import XGBRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bd64879-713e-4713-9ff8-d63d6127ed43",
   "metadata": {},
   "source": [
    "### 1. 训练测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3fdc1fcc-f1a3-4d9e-83e1-395f66d2c543",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.833294092003024"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 创建对象\n",
    "reg = xgb.XGBRegressor(n_estimators=100, random_state=22)\n",
    "\n",
    "# 训练\n",
    "reg.fit(train_x, train_y)\n",
    "\n",
    "# 测试, 默认返回R2\n",
    "reg.score(test_x, test_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ea4a803a-0a3d-446f-9fea-ac0977353001",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MSE = 0.4609130051083081\n"
     ]
    }
   ],
   "source": [
    "# 预测\n",
    "y_pred = reg.predict(test_x)\n",
    "\n",
    "# 计算mse\n",
    "mse = MSE(test_y, y_pred, squared=False)\n",
    "print(f'MSE = {mse}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e962b521-d94b-4c25-a9d6-ef1fa9b0a391",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MedInc: 0.4889\n",
      "HouseAge: 0.0658\n",
      "AveRooms: 0.0363\n",
      "AveBedrms: 0.0228\n",
      "Population: 0.0245\n",
      "AveOccup: 0.1429\n",
      "Latitude: 0.1079\n",
      "Longitude: 0.1109\n"
     ]
    }
   ],
   "source": [
    "# 输出贡献率\n",
    "for fn, fi in zip(data['feature_names'], reg.feature_importances_):\n",
    "    print(f'{fn}: {fi:.4f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0abcd65-6fe4-49dd-8c82-de35c211935a",
   "metadata": {},
   "source": [
    "### 2. 交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c3489f68-7701-4120-8731-eb814dba82f0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import cross_val_score as CVS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c07756ff-3de6-4e05-88b9-47ba92072268",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean(RMSE) = 0.4830401686217102\n",
      "Var(RMSE) = 0.0002604404270682063\n"
     ]
    }
   ],
   "source": [
    "reg = xgb.XGBRegressor(n_estimators=100, random_state=22)\n",
    "# 交叉验证\n",
    "cv = KFold(n_splits=5, shuffle=True, random_state=22)\n",
    "res = CVS(reg, train_x, train_y, cv=cv, scoring='neg_mean_squared_error')\n",
    "rmse = (abs(res) ** 0.5).mean()\n",
    "vrmse = (abs(res) ** 0.5).var()\n",
    "print(f'Mean(RMSE) = {rmse}\\n'\n",
    "      f'Var(RMSE) = {vrmse}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3246e8ed-1864-4be7-8ea6-62b51e171065",
   "metadata": {},
   "source": [
    "## 叁丨XGBoost参数\n",
    "\n",
    "### 1. 查看XGBoost参数\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7cfba61-7da3-494a-9220-9b2d06941e03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 查看sklearn API参数\n",
    "from xgboost import XGBRegressor as XGBR\n",
    "\n",
    "reg = XGBR()\n",
    "reg"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74a3324f-6e85-43a1-8af2-5feca3d500fa",
   "metadata": {},
   "source": [
    "### 2. sklearn API和原生API对照\n",
    "\n",
    "| 参数相关流程         | 原生库参数                                                   | sklearn API参数                                              |\n",
    "| -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n",
    "| 损失函数             | <b>`objective`</b>，`lambda`，`alpha`                           | <b>`objective`</b>，`reg_alpha`，`reg_lambda`                   |\n",
    "| 集成规则             | <b>`eta`</b>，`base_score`，`eval_metric`，`subsample`，`sampling_method`，`colsample_bytree`，`colsample_bylevel`，`colsample_bynode` | <b>`learning_rate`</b>，`base_score`，`eval_metric`，`subsample`，`colsample_bytree`，`colsample_bylevel`，`colsample_bynode` |\n",
    "| 弱评估器             | <b>`num_boost_round`</b>，`booster`，`tree_method`，`sketch_eps`，`updater`，`grow_policy` | <b>`n_estimators`</b>，`booster`，`tree_method`                 |\n",
    "| 训练流程（抗过拟合） | `num_feature`，`max_depth`，`gamma`，`min_child_weight`，`max_delta_step`，`max_leaves`，`max_bin` | `max_depth`，`gamma`，`min_child_weight`，`max_delta_step`   |\n",
    "| 训练流程（结果监控） | <b>`verbosity`</b>                                              | <b>`verbosity`</b>                                              |\n",
    "| 训练流程（提前停止） | `early_stopping_rounds`                                      | `early_stopping_rounds`                                      |\n",
    "| 训练流程（增量学习） |                                                              | `warm_start`                                                 |\n",
    "| 随机性控制           | `seed`                                                       | `random_state`                                               |\n",
    "| 其他流程             | <b>`missing`</b>，`scale_pos_weight`，`predictor`，<b>`num_parallel_tree`</b> | `n_jobs`，`scale_pos_weight`，`num_parallel_tree`，`enable_categorical`，`importance_type` |\n",
    "\n",
    "\n",
    "### 3. 参数解析\n",
    "\n",
    "`objective`：目标函数（损失函数+模型复杂\n",
    "\n",
    "度）\n",
    "$$\n",
    "Obj = \\Sigma^m_{i=1}l(y, f(x)) + \\Sigma^K_{k=1} \\Omega(g_k) \\\\\n",
    "\n",
    "\\Omega(g_k) = \\gamma T+正则项(Regularization)\n",
    "$$\n",
    "\n",
    "\n",
    "实际填写`objective`参数时，填写的是损失函数名，不包括正则项（该参数默认值为平方损失）\n",
    "\n",
    "| 任务 | 损失函数              | 概念                                         |\n",
    "| ---- | --------------------- | -------------------------------------------- |\n",
    "| 回归 | `reg:squarederror`    | 平方损失                                     |\n",
    "| 回归 | `reg:squaredlogerror` | 平放对数损失                                 |\n",
    "| 分类 | `binary:logistic`     | 二分类交叉熵损失，输出概率                   |\n",
    "| 分类 | `binary:logitraw`     | 二分类交叉熵损失，输出Sigmoid之前的值        |\n",
    "| 分类 | `multi:softmax`       | 多分类交叉熵损失，输出具体的类别             |\n",
    "| 分类 | `multi:softprob`      | 多分类交叉熵损失，输出每个样本每个类别下概率 |\n",
    "| ……   | ……                    | ……                                           |\n",
    "\n",
    "`lambda`：决定完整目标函数，L2正则项系数，默认为1\n",
    "\n",
    "`alpha`：决定完整目标函数，L1正则项系数，默认为0\n",
    "\n",
    "`num_boost_round`：树的数量，默认10。（`n_estimators`：默认100）\n",
    "\n",
    "`eta`：学习率，默认0.3（`learning_rate`：默认0.3）\n",
    "\n",
    "`verbosity`：打印相关信息。\n",
    "\n",
    "| 参数 | 操作              |\n",
    "| ---- | ----------------- |\n",
    "| 0    | 不打印任何内容    |\n",
    "| 1    | 仅打印警告        |\n",
    "| 2    | 打印树的全部信息  |\n",
    "| 3    | 打印更多debug信息 |\n",
    "\n",
    "`num_parallel_tree`：允许并行建立的树的oost建立随机森林\n",
    "\n",
    "`missing`：容忍缺失的数据\n",
    "\n",
    "## 肆丨调参\n",
    "\n",
    "以学习率为例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4b6de43b-3fc3-4c60-836f-23d051b73521",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "learning_rate = 0.1000, res = 0.4868025769066027\n",
      "learning_rate = 0.2000, res = 0.4793697761831933\n",
      "learning_rate = 0.3000, res = 0.4830401686217102\n",
      "learning_rate = 0.4000, res = 0.5026510715198398\n",
      "learning_rate = 0.5000, res = 0.512333817939106\n"
     ]
    }
   ],
   "source": [
    "# 🔟 scikit-learn API\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.model_selection import KFold, cross_val_score as CVS\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "for lr in np.linspace(0.1, 0.5, 5):\n",
    "    reg = XGBRegressor(objective='reg:squarederror',\n",
    "                       learning_rate=lr,\n",
    "                       random_state=22)\n",
    "    cv = KFold(n_splits=5, shuffle=True, random_state=22)\n",
    "    res = CVS(reg, train_x, train_y, cv=cv, scoring='neg_mean_squared_error')\n",
    "\n",
    "    print(f'learning_rate = {lr:.4f}, res = {(abs(res) ** 0.5).mean()}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ec4a0499-5d79-442c-b456-3b7edece54c9",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eta = 0.1000, res = 0.4873787736549489\n",
      "eta = 0.2000, res = 0.4836093275141261\n",
      "eta = 0.3000, res = 0.4846305601993509\n",
      "eta = 0.4000, res = 0.49834340288402323\n",
      "eta = 0.5000, res = 0.5076573688820785\n"
     ]
    }
   ],
   "source": [
    "# 原生API\n",
    "import xgboost as xgb\n",
    "\n",
    "train_data = xgb.DMatrix(train_x, train_y)\n",
    "\n",
    "for eta in np.linspace(0.1, 0.5, 5):\n",
    "    param = {'objective': 'reg:squarederror',\n",
    "             'eta': eta,\n",
    "             'seed': 22}\n",
    "    res = xgb.cv(param, train_data,\n",
    "                 num_boost_round=100, nfold=5, seed=22,\n",
    "                 shuffle=True, metrics='rmse')\n",
    "    print(f'eta = {eta:.4f}, res = {res.iloc[-1, -2]}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
