repo-sync-2024-06-19T10:45:51+0800 (#72)

asterinas · Jun 20, 2024 · 0215a9a · 0215a9a
1 parent a10a98d
commit 0215a9a
Show file tree

Hide file tree

Showing 9 changed files with 923 additions and 86 deletions.
diff --git a/docs/architecture/apps/index.rst b/docs/architecture/apps/index.rst
@@ -31,6 +31,8 @@ TrustedFlow内置了多种可信APP，每一个可信APP在执行计算逻辑之
    lr_train
    xgb_predict
    lr_predict
+   lgbm_train
+   lgbm_predict
    binary_evaluation
    prediction_bias_eval
 

diff --git a/docs/architecture/apps/lgbm_predict.md b/docs/architecture/apps/lgbm_predict.md
@@ -0,0 +1,123 @@
+# LightGBM预测
+
+使用给定的LightGBM模型对数据进行预测。
+
+## 组件定义
+
+1. 参数
+    1. pred_name: 预测值的列名，默认为“pred”。
+    2. save_label: 输出结果是否包含标签列，true表示保存。
+    3. label_name: 标签列的名称，默认为“label”。
+    4. save_id: 输出结果是否保存ID列，true表示保存。
+    5. id_name: ID列的名称，默认为“id”。
+    6. col_names: 可选，输出指定的列到结果中，默认为空。
+2. 输入：待预测的数据以及LightGBM模型。
+3. 输出：预测结果。
+
+```json
+{
+    "domain": "ml.predict",
+    "name": "lgbm_predict",
+    "desc": "Predict using the lgbm model.",
+    "version": "0.0.1",
+    "attrs": [
+        {
+            "name": "pred_name",
+            "desc": "Column name for predictions.",
+            "type": "AT_STRING",
+            "atomic": {
+                "is_optional": true,
+                "default_value": {
+                    "s": "pred"
+                }
+            }
+        },
+        {
+            "name": "save_label",
+            "desc": "Whether or not to save real label column into output pred table. If true, input feature_dataset must contain label column.",
+            "type": "AT_BOOL",
+            "atomic": {
+                "is_optional": true,
+                "default_value": {}
+            }
+        },
+        {
+            "name": "label_name",
+            "desc": "Column name for label.",
+            "type": "AT_STRING",
+            "atomic": {
+                "is_optional": true,
+                "default_value": {
+                    "s": "label"
+                }
+            }
+        },
+        {
+            "name": "save_id",
+            "desc": "Whether to save id column into output pred table. If true, input feature_dataset must contain id column.",
+            "type": "AT_BOOL",
+            "atomic": {
+                "is_optional": true,
+                "default_value": {}
+            }
+        },
+        {
+            "name": "id_name",
+            "desc": "Column name for id.",
+            "type": "AT_STRING",
+            "atomic": {
+                "is_optional": true,
+                "default_value": {
+                    "s": "id"
+                }
+            }
+        },
+        {
+            "name": "col_names",
+            "desc": "Extra column names into output pred table.",
+            "type": "AT_STRINGS",
+            "atomic": {
+                "list_max_length_inclusive": "-1",
+                "is_optional": true
+            }
+        }
+    ],
+    "inputs": [
+        {
+            "name": "feature_dataset",
+            "desc": "Input feature dataset.",
+            "types": [
+                "sf.table.individual"
+            ],
+            "attrs": [
+                {
+                    "name": "ids",
+                    "desc": "Id columns.",
+                    "col_max_cnt_inclusive": "1"
+                },
+                {
+                    "name": "label",
+                    "desc": "Label column.",
+                    "col_max_cnt_inclusive": "1"
+                }
+            ]
+        },
+        {
+            "name": "model",
+            "desc": "Input model.",
+            "types": [
+                "sf.model.lgbm"
+            ]
+        }
+    ],
+    "outputs": [
+        {
+            "name": "pred",
+            "desc": "Output prediction.",
+            "types": [
+                "sf.table.individual"
+            ]
+        }
+    ]
+}
+```
diff --git a/docs/architecture/apps/lgbm_train.md b/docs/architecture/apps/lgbm_train.md
@@ -0,0 +1,141 @@
+# LightGBM训练
+
+使用LightGBM对数据集进行训练，得到LightGBM模型，支持二分类和回归。
+
+## 组件定义
+
+```json
+{
+  "domain": "ml.train",
+  "name": "lgbm_train",
+  "desc": "LightGBM train component for individual dataset.",
+  "version": "0.0.1",
+  "attrs": [
+      {
+          "name": "n_estimators",
+          "desc": "Number of boosted trees to fit.",
+          "type": "AT_INT",
+          "atomic": {
+              "is_optional": true,
+              "default_value": {
+                  "i64": "10"
+              },
+              "lower_bound_enabled": true,
+              "lower_bound": {
+                  "i64": "1"
+              },
+              "lower_bound_inclusive": true,
+              "upper_bound_enabled": true,
+              "upper_bound": {
+                  "i64": "1024"
+              },
+              "upper_bound_inclusive": true
+          }
+      },
+      {
+          "name": "objective",
+          "desc": "Specify the learning objective.",
+          "type": "AT_STRING",
+          "atomic": {
+              "is_optional": true,
+              "default_value": {
+                  "s": "binary"
+              },
+              "allowed_values": {
+                  "ss": [
+                      "regression",
+                      "binary"
+                  ]
+              }
+          }
+      },
+      {
+          "name": "boosting_type",
+          "desc": "Boosting type.",
+          "type": "AT_STRING",
+          "atomic": {
+              "is_optional": true,
+              "default_value": {
+                  "s": "gbdt"
+              },
+              "allowed_values": {
+                  "ss": [
+                      "gbdt",
+                      "rf",
+                      "dart"
+                  ]
+              }
+          }
+      },
+      {
+          "name": "learning_rate",
+          "desc": "Learning rate.",
+          "type": "AT_FLOAT",
+          "atomic": {
+              "is_optional": true,
+              "default_value": {
+                  "f": 0.1
+              },
+              "lower_bound_enabled": true,
+              "lower_bound": {},
+              "upper_bound_enabled": true,
+              "upper_bound": {
+                  "f": 1
+              },
+              "upper_bound_inclusive": true
+          }
+      },
+      {
+          "name": "num_leaves",
+          "desc": "Max number of leaves in one tree.",
+          "type": "AT_INT",
+          "atomic": {
+              "is_optional": true,
+              "default_value": {
+                  "i64": "31"
+              },
+              "lower_bound_enabled": true,
+              "lower_bound": {
+                  "i64": "2"
+              },
+              "lower_bound_inclusive": true,
+              "upper_bound_enabled": true,
+              "upper_bound": {
+                  "i64": "1024"
+              },
+              "upper_bound_inclusive": true
+          }
+      }
+  ],
+  "inputs": [
+      {
+          "name": "train_dataset",
+          "desc": "Input table.",
+          "types": [
+              "sf.table.individual"
+          ],
+          "attrs": [
+              {
+                  "name": "ids",
+                  "desc": "Id columns will not be trained."
+              },
+              {
+                  "name": "label",
+                  "desc": "Label column.",
+                  "col_min_cnt_inclusive": "1",
+                  "col_max_cnt_inclusive": "1"
+              }
+          ]
+      }
+  ],
+  "outputs": [
+      {
+          "name": "output_model",
+          "desc": "Output model.",
+          "types": [
+              "sf.model.lgbm"
+          ]
+      }
+  ]
+}
+```
diff --git a/docs/architecture/index.rst b/docs/architecture/index.rst
@@ -1,4 +1,4 @@
-核心功能
+架构设计
 ========================
 想了解TrustedFlow原理和功能，欢迎阅读下列文章！