From e12c66a4a02eca26d47028853318dfdc60c5381a Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 18:19:35 +0800 Subject: [PATCH 01/22] change_feature_pdf_to_md --- .git_bin_url | 2 +- docs/source/feature/fg_docs/ComboFeature.md | 33 +++ docs/source/feature/fg_docs/IdFeature.md | 35 ++++ docs/source/feature/fg_docs/LookupFeature.md | 121 +++++++++++ docs/source/feature/fg_docs/MatchFeature.md | 113 ++++++++++ docs/source/feature/fg_docs/OverLapFeature.md | 64 ++++++ docs/source/feature/fg_docs/RawFeature.md | 71 +++++++ .../source/feature/fg_docs/SequenceFeature.md | 198 ++++++++++++++++++ docs/source/feature/fg_docs/mutiValues.md | 25 +++ docs/source/feature/rtp_fg.md | 18 +- 10 files changed, 670 insertions(+), 10 deletions(-) create mode 100644 docs/source/feature/fg_docs/ComboFeature.md create mode 100644 docs/source/feature/fg_docs/IdFeature.md create mode 100644 docs/source/feature/fg_docs/LookupFeature.md create mode 100644 docs/source/feature/fg_docs/MatchFeature.md create mode 100644 docs/source/feature/fg_docs/OverLapFeature.md create mode 100644 docs/source/feature/fg_docs/RawFeature.md create mode 100644 docs/source/feature/fg_docs/SequenceFeature.md create mode 100644 docs/source/feature/fg_docs/mutiValues.md diff --git a/.git_bin_url b/.git_bin_url index 18eeef0d3..22c8ef187 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -38,7 +38,7 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": "data/test/tb_data", "sig": "b1579db090d72b3b70b59ba3c7692701", "remote_path": 
"data/git_oss_sample_data/data_test_tb_data_b1579db090d72b3b70b59ba3c7692701"} +{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md new file mode 100644 index 000000000..184148592 --- /dev/null +++ b/docs/source/feature/fg_docs/ComboFeature.md @@ -0,0 +1,33 @@ +# 6.3 Combo Feature + +combo feature是多个字段(或表达式)的组合(即笛卡尔积),id feature可以看成是一种特殊的combo feature,即参与交叉字段只有一个的combo feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 + +配置: + +``` +{ + "feature_type" : "combo_feature", + "feature_name" : "comb_u_age_item", + "expression" : ["user:age_class", "item:item_id"] +} +``` + + + +## 例子 + +^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号 + +| user:age_class的取值 | item:item_id的取值 | 输出的feature | +| -------------------- | ------------------ | ------------------------------------------------------------ | +| 123 | 45678 | comb_u_age_item_123_45678 | +| abc, bcd | 45678 | comb_u_age_item_abc_45678, comb_u_age_item_bcd_45678 | +| abc, bcd | 12345^]45678 | comb_u_age_item_abc_12345, comb_u_age_item_abc_45678, comb_u_age_item_bcd_12345, comb_u_age_item_bcd_45678 | + +输出的feature个数等于 + +``` +|F1| * |F2| * 
... * |Fn| +``` + +其中Fn指依赖的第n个字段的值的个数。 \ No newline at end of file diff --git a/docs/source/feature/fg_docs/IdFeature.md b/docs/source/feature/fg_docs/IdFeature.md new file mode 100644 index 000000000..e3deec7a5 --- /dev/null +++ b/docs/source/feature/fg_docs/IdFeature.md @@ -0,0 +1,35 @@ +# 6.1 Id Feature + +功能介绍 + +id feature是一个sparse feature,是一种最简单的离散特征,只是简单的将某个字段的值与用户配置的feature名字拼接。 + +配置方法 + +```json +{ + "feature_type" : "id_feature", + "feature_name" : "item_is_main", + "expression" : "item:is_main" +} +``` + +| 字段名 | 含义 | +| -------------- | ------------------------------------------------------------ | +| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 | +| expression | 必选项,expression描述该feature所依赖的字段来源 | +| need_prefix | 可选项,true表示会拼上feature_name作为前缀,false表示不拼,默认为true,通常在shared_embedding的场景会用false | +| invalid_values | 可选项,表示这些values都会被输出成null。list string,例如[""],表示将所有的空字符串输出变成null。 | + + + +例子 ( ^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号) + +| 类型 | item:is_main的取值 | 输出的feature | +| ---------- | ------------------ | ------------------------------------------- | +| int64_t | 100 | (item_is_main_100, 1) | +| double | 5.2 | (item_is_main_5, 1)(小数部分会被截取) | +| string | abc | (item_is_main_abc, 1) | +| 多值string | abc^]bcd | (item_is_main_abc, 1),(item_is_main_bcd, 1) | +| 多值int | 123^]456 | (item_is_main_123, 1),(item_is_main_456, 1) | + diff --git a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md new file mode 100644 index 000000000..8f6feabde --- /dev/null +++ b/docs/source/feature/fg_docs/LookupFeature.md @@ -0,0 +1,121 @@ +# 6.5 Lookup Feature + + + +## 功能简介 + +如果离线生成不符合预期 请先使用最新的离线fg包 + +lookup feature 和 match feature类似,是从一组kv中匹配到自己需要的结果。 + +lookup feature 依赖 map 和 key 两个字段,map是一个多值string(MultiString)类型的字段,其中每一个string的样子如"k1:v2"。;key可以是一个任意类型的字段。生成特征时,先是取出key的值,将其转换成string类型,然后在map字段所持有的kv对中进行匹配,获取最终的特征。 + +map 和 key 源可以是 item,user,context 
的任意组合。在线输入的时候item的多值用多值分隔符char(29)分隔,user和context的多值在tpp访问时用list表示。该特征仅支持json形式的配置方式。 + + + +## 实例 + +```json +{ + "features" : [ + { + "feature_type" : "lookup_feature", + "feature_name" : "item_match_item", + "map" : "item:item_attr", + "key" : "item:item_value", + "needDiscrete" : true + } + ] +} +``` + +对于上面的配置,假设对于某个 doc: + +``` +item_attr : "k1:v1^]k2:v2^]k3:v3" +``` + +^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号。该字符在emacs中的输入方式是C-q C-5, 在vi中的输入方式是C-v C-5。 这里item_attr是个多值string。需要切记,当map用来表征多个kv对时,是个多值string,而不是string! + +``` +item_value : "k2" +``` + +特征结果为 item_match_item_k2_v2。由于needDiscrete的值为true,所以特征结果为离散化后的结果。 + + + +## 其它 + +match feature 和 lookup feature都是匹配类型的特征,即从kv对中匹配到相应的结果。两者的区别是: match feature的被匹配字段user 必须是qinfo中传入的字段,即一次查询中对所有的doc来说这个字段的值都是一致的。而 lookup feature 的 key 和 map 没有来源的限制。 + + + +## 配置详解 + +默认情况的配置为 `needDiscrete == true, needWeighting = false, needKey = true, combiner = "sum"` + +### 默认输出 + +### needWeighting == true + +``` +feature_name:fg +map:{{"k1:123", "k2:234", "k3:3"}} +key:{"k1"} +结果:feature={"fg_k1", 123} +``` + +此时会用 string 部分查 weight 表,然后乘对应 feature value 用于 LR 模型。 + +### needDiscrete == true + +``` +feature_name:fg +map:{{"k1:123", "k2:234", "k3:3"}} +key:{"k1"} +结果:feature={"fg_123"} +``` + +### needDiscrete == false + +``` +map:{{"k1:123", "k2:234", "k3:3"}} +key:{"k1"} +结果:feature={123} +``` + +如果存在多个 key 时,可以通过配置 combiner 来组合多个查到的值。可能的配置有 `sum, mean, max, min`。 ps:如果要使用combiner的话需要将needDiscrete设置为false,只有dense类才能做combiner,生成的value会是数值类的 + +一个配置样例 update on 2021.04.15 + +```json +"kv_fields_encode": [ + { + "name": "cnty_dense_features", + "dimension": 99, + "min_hash_type": 0, + "use_sparse": true + }, + { + "name": "cross_a_tag", + "dimension": 12, + "min_hash_type": 0, + "use_sparse": true + }, + { + "name": "cross_gender", + "dimension": 12, + "min_hash_type": 0, + "use_sparse": true + }, + { + "name": "cross_purchasing_power", + "dimension": 12, + "min_hash_type": 0, + "use_sparse": true + } + ] +``` + diff --git 
a/docs/source/feature/fg_docs/MatchFeature.md b/docs/source/feature/fg_docs/MatchFeature.md new file mode 100644 index 000000000..9410e9fda --- /dev/null +++ b/docs/source/feature/fg_docs/MatchFeature.md @@ -0,0 +1,113 @@ +# 6.4 Match Feature + + + +## Match feature使用说明 + +match feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。 +match feature支持两种类型,hit和multi hit。 +match feature本质上是一个两层map的匹配,user字段使用string的方式描述了一个两层map,|为第一层map的item之间的分隔符,^为第一层map的key与value之间的分隔符。,为第二层map的item之间的分隔符,:为第二层map的key与value之间的分隔符。例如对于50011740^50011740:0.2,36806676:0.3,122572685:0.5|50006842^16788:0.1这样的一个string,转化为二层map就是 + +```json +{ + "50011740" : { + "50011740" : 0.2, + "36806676" : 0.3, + "122572685" : 0.5 + }, + "50006842" : { + "16788" : 0.1 + } +} +``` + +对于hit match 匹配的方式,就是用category的值在第一层map中查找,然后使用item的值在第二层map中查找,最终得到一个结果。 如果不需要使用两层匹配,只需要一层匹配,则可以在map的第一层key中填入ALL, 然后在fg配置的category一项中也填成"ALL"即可。具体见实例一。 + + + +## 配置方式 + +json格式配置文件: + +```json +{ + "feature_name": "user__l1_ctr_1", + "feature_type": "match_feature", + "category": "ALL", + "needDiscrete": false, + "item": "item:category_level1", + "user": "user:l1_ctr_1", + "matchType": "hit" +} +``` + +needDiscrete:true 时,模型使用 match feature 输出的特征名,忽略特征值。默认为 true。 +needDiscrete:false 时,模型取 match feature 输出的特征值,而忽略特征名。 + +matchType: +hit:输出命中的feature + +xml配置文件: + +```xml + + + + +``` + +dependencie:需要做Match 的两个特征 + +category: 类目的feature 字段。category="ALL"不需要分类目匹配 + + + +## Normalizer + +match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_feature](https://yuque.alibaba-inc.com/rtp/wtm2oh/chapter6-raw_feature#normalizer)。 + +## 配置详解 + + + +### hit + + + +对于下面的配置 + +```json +{ + "feature_name": "brand_hit", + "feature_type": "match_feature", + "category": "item:auction_root_category", + "needDiscrete": true, + "item": "item:brand_id", + "user": "user:user_brand_tags_hit", + "matchType": "hit" +} +``` + +假设各字段的值如下: + +| user_brand_tags_hit | 
50011740^107287172:0.2,36806676:0.3,122572685:0.5\|50006842^16788816:0.1,10122:0.2,29889:0.3,30068:19 | +| --------------------- | ------------------------------------------------------------ | +| brand_id | 30068 | +| auction_root_category | 50006842 | + +如果 needDiscrete=true,结果为: +如果 needDiscrete=false,结果为: +如果只需要使用一层匹配,则需要将上面配置里的 category 的值改为 ALL。这种情况,用户也可以考虑使用 lookup_feature。 假设各字段的值如下 + +| user_brand_tags_hit | ALL^16788816:40,10122:40,29889:20,30068:20 | +| ------------------- | ------------------------------------------ | +| brand_id | 30068 | + +如果 needDiscrete=true,结果: 如果 needDiscrete=false,结果: + + + +### multihit + +允许用户 category 和 item 两个值为 ALL(注意,不是配置的值,是传入的值),进行 wildcard 匹配,可以匹配出多个值。输出结果类似于 hit。 + diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md new file mode 100644 index 000000000..168a7ec2f --- /dev/null +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -0,0 +1,64 @@ +# 6.7 OverLap Feature + + + +## 功能简介 + +用来输出一些字符串字词匹配信息的feature + +离线推荐使用1.3.56-SNAPSHOT这个版本,或者1.3.28(不支持参数need_prefix) ps: 写fg的时候注意维度,title的维度要大于或等于query的问题(简单来说就是如果title是user特征,那query也只能是user特征,user特征的batch size为1,商品特征的batch size为商品数) + +| 方式 | 描述 | 备注 | +| ------------------- | ----------------------------------------------------------- | ------------------------------ | +| common_word | 计算query与title间重复term,并输出为fg_common1_common2 | 重复数不超过query term数 | +| diff_word | 计算query与title间不重复term,并输出为fg_diff1_diff2 | 不重复数不超过query term数 | +| query_common_ratio | 计算query与title间重复term数占query中term比例,乘以10取下整 | 取值为[0,10] | +| title_common_ratio | 计算query与title间重复term数占title中term比例,乘以100取下整 | 取值为[0,100] | +| is_contain | 计算query是否全部包含在title中,保持顺序 | 0表示未包含,1表示包含 | +| is_equal | 计算query是否与title完全相同 | 0表示不完全相同,1表示完全相同 | +| common_word_divided | 计算query与title间重复term,并输出为fg_common1, fg_common2 | 重复数不超过query term数 | +| diff_word_divided | 计算query与title间不重复term,并输出为fg_diff1, fg_diff2 | 重复数不超过query term数 | + + + + + +## 配置方法 + +```json + { + 
"feature_type" : "overlap_feature", + "feature_name" : "is_contain", + "query" : "user:attr1", + "title" : "item:attr2", + "method" : "is_contain", + "separator" : " " + } +``` + +| 字段名 | 含义 | +| ------------ | ------------------------------------------------------------ | +| feature_type | 必选项,描述该feature的类型 | +| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 | +| query | 必选项,query依赖的表, attr1是一个多值string, 多值string的分隔符使用chr(29) | +| title | 必选项,title依赖的表, attr2是一个多值string | +| method | 可填common_word, diff_word, query_common_ratio, title_common_ratio, is_contain, 对应上图五种方式 | +| separator | 输出结果中的分割字符,不填写我们默认为_ ,但也可以用户自己定制,具体看例子 | + + + +## 例子 + +query为high,high2,fiberglass,abc +title为high,quality,fiberglass,tube,for,golf,bag + +| method | separator | feature | +| ------------------- | --------- | -------------------------- | +| common_word | | name_high_fiberglass | +| diff_word | " " | name high2 abc | +| query_common_ratio | | name_5 | +| title_common_ratio | | name_28 | +| is_contain | | name_0 | +| is_equal | | name_0 | +| common_word_divided | | name_high, name_fiberglass | +| diff_word_divided | | name_high2, name_abc | \ No newline at end of file diff --git a/docs/source/feature/fg_docs/RawFeature.md b/docs/source/feature/fg_docs/RawFeature.md new file mode 100644 index 000000000..8775b0a81 --- /dev/null +++ b/docs/source/feature/fg_docs/RawFeature.md @@ -0,0 +1,71 @@ +# 6.2 Raw Feature + + + +## 功能介绍 + +raw feature是一种dense的feature,是直接引用原始feature的字段值作为feature的value。raw feature仅支持数值int、float、double等数值类型,对非数值类型的feature需使用id feature。 + +## 配置方法 + +```json +{ + "feature_type" : "raw_feature", + "feature_name" : "ctr", + "expression" : "item:ctr", + "normalizer" : "method=log10" +} +``` + + + +| 字段名 | 含义 | +| --------------- | ------------------------------------------------------------ | +| feature_name | 必选项,在正常使用时该选项是没用处的,因为实际参与接下来运算的主要是feature value,但是在debug的情况下,可以看到对应feature name的值。 | +| expression | 必选项,expression描述该feature所依赖的字段来源 | +| value_dimension | 
可选项,默认值为1,表示输出的字段的维度。 | +| normalizer | 可选项,归一化方法,详见后文 | + + + +## 例子 + +^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号 + +| 类型 | item:ctr的取值 | 输出的feature | +| ------- | -------------- | ------------------------------------------------------------ | +| int64_t | 100 | (ctr, 100) | +| double | 100.1 | (ctr, 100.1) | +| 多值int | 123^]456 | (ctr, (123,456)) (注意,输入字段必须与配置的dimension维度一致) | + + + + + +## Normalizer + +raw_feature 和 match_feature 支持 normalizer,共三种,`minmax,zscore,log10`。配置和计算方法如下: + +### log10 + +``` +配置例子:method=log10,threshold=1e-10,default=-10 +计算公式:x = x > threshold ? log10(x) : default; +``` + +### zscore + +``` +配置例子:method=zscore,mean=0.0,standard_deviation=10.0 +计算公式:x = (x - mean) / standard_deviation +``` + +### minmax + +``` +配置例子:method=minmax,min=2.1,max=2.2 +计算公式:x = (x - min) / (max - min) +``` + + + diff --git a/docs/source/feature/fg_docs/SequenceFeature.md b/docs/source/feature/fg_docs/SequenceFeature.md new file mode 100644 index 000000000..febd2f92c --- /dev/null +++ b/docs/source/feature/fg_docs/SequenceFeature.md @@ -0,0 +1,198 @@ + + +# 6.8 sequence 类 feature + +## 基本场景 + +⽤户的历史⾏为也是⼀个很重要的 feature。历史⾏为通常是⼀个序列,例如点击序列、购买序列等,组成这个序列的实体可能是商品本身。 +例如我们需要对⽤户的点击序列进⾏ fg,序列⻓度为 30,每个序列提取 nid 和 price, seq_context 特征。正常 item 维度有⼀个 feat0 特征。配置如下: + +```json +{ + "features":[ + { + "feature_type":"raw_feature", + "feature_name":"feat0", + "expression":"user:feat0" + }, + { + "sequence_name":"click", + "sequence_column":"click_field", + "sequence_length":10, + "sequence_delim":";", + "attribute_delim":"#", + "sequence_table":"item", + "sequence_pk":"user:user_behavior_seq", + "features":[ + { + "feature_name":"nid", + "feature_type":"id_feature", + "value_type":"String", + "expression":"item:nid" + }, + { + "feature_name":"price", + "feature_type":"raw_feature", + "expression":"item:price" + }, + { + "feature_name":"seq_context", + "feature_type":"raw_feature", + "expression":"user:seq_context" + } + ] + } + ] +} +``` + +## 在线 FG + 
+我们⽀持两种⽅式获取⾏为序列,⼀种如例⼦所示,我们以 sequence_pk 配置的字段为主键,RTP 会帮忙从 item 表中查到序列的对应字段值;另⼀种⽤户需要在 qinfo 中准备好所有的字段。 + +### RTP 取 sequence 字段 + +第⼀种情况,`sequence_pk` 的⻓度应该⼩于等于 `sequence_length` 。如果 `sequence_pk` 指定的值不⾜ `sequence_length` 个会补⻬到 `sequence_length` ⻓度,fg 的结果会出默认值(dense 类是 0,sparse 类为空)。 +qinfo 例⼦: + +```json + { + "user:user_behavior_seq" : ["item_id_1", "item_id_2"] + } +``` + +### qinfo 传递 sequence 字段 + +第⼆种情况,sequence feature 也⽀持所有的序列内容都从 qinfo 中传递。例如这⾥的user:seq_context 数组,他的值分别对应 click_0 和 click_1 。这种情况下⽤户可以忽略sequence_table 和 sequence_pk 。 +qinfo 例⼦: + +```json + { + "user:feat0" : 1.0, + "user:user_behavior_seq" : [0, 1], + "user:seq_context" : [2, 3] + } +``` + +### context seq使⽤ + +``` +{ + "features": [{ + "sequence_name": "click", + "sequence_column": "click_field", + "sequence_length": 30, + "sequence_delim": ";", + "attribute_delim": "#", + "sequence_table": "context_table", + "sequence_pk": "context:context_seq_id", + "features": [{ + "feature_name": "cid", + "feature_type": "id_feature", + "value_type": "String", + "expression": "context_table:cid" + }, + { + "feature_name": "price", + "feature_type": "raw_feature", + "expression": "context_table:price" + }, + { + "feature_name": "seq_context", + "feature_type": "raw_feature", + "expression": "context:seq_context" + } + ] + }] +} +``` + +context seq特征与user seq类似,区别是每个context是batch size维度的,user seq是⼀维的 +配置如上,context_seq_id为输⼊的context字段 +第⼀类特征:需要查context_table,如price特征,会根据context_seq_id查询context_table中的price,然后做fg, +第⼆类特征:不需要context_table,如seq_context特征,会直接取seq_context做fg, + +### item seq使⽤ + +增加"is_item_seq": true配置,如下, + +```json +{ + "features": [{ + "sequence_name": "item_pic_seq", + "sequence_column": "item__pic_vec_seq", + "sequence_table": "pic_table", + "sequence_pk": "item:pic_sop_id_list", + "attribute_delim": "#", + "feature_name": "item_pic_seq", + "sequence_length": 10, + "is_item_seq": true, + "features": [{ + "normalizer": "method=log10", + "feature_type": "id_feature", + "shared_name": 
"pic_pv", + "hash_bucket_size": 10, + "need_prefix": false, + "embedding_dimension": 8, + "value_type": "String", + "feature_name": "pic_pv", + "expression": "pic_table:pv" + }, + { + "normalizer": "method=log10", + "feature_type": "id_feature", + "shared_name": "pic_ipv", + "hash_bucket_size": 10, + "need_prefix": false, + "embedding_dimension": 8, + "value_type": "String", + "feature_name": "pic_ipv", + "expression": "pic_table:ipv" + }, + { + "feature_type": "id_feature", + "shared_name": "bandit_level", + "hash_bucket_size": 100, + "need_prefix": false, + "embedding_dimension": 4, + "value_type": "String", + "feature_name": "bandit_level", + "expression": "pic_table:bandit_level" + }, + { + "feature_type": "id_feature", + "shared_name": "is_fake_long", + "hash_bucket_size": 100, + "need_prefix": false, + "embedding_dimension": 4, + "value_type": "String", + "feature_name": "is_fake_long", + "expression": "pic_table:is_fake_long" + } + ] + }] +} +``` + +## 离线 FG + +​ ⽬前使⽤ sequence feature 要求使⽤ 新新版 feature_generator_java , tensorflow 训练流程要求使⽤ rtp_fg.parse_genreated_fg。 +​ 离线阶段没有sequence表去查,⽽是通过`sequence_column` 读取本来应该去表⾥查的字段。因此,`sequence_column ,sequence_delim ,attribute_delim` 这三个字段只有在离线 fg 阶段有⽤。`sequence_column` 是数据源odps表⾥所有 sequence 特征输⼊的字段名,离线fg会根据这个字段⾥的值⽣成sequence feature,该字段内容是 kv 格式的。`sequence_delim` 是sequence 中⾏为之间的分隔符,`attribute_delim` 是实际字段名字和字段值的分隔符。 +​ sequence_length 是 sequence 的⻓度,⽤户需要保证字段内容⼀定是补⻬到这个⻓度的。以上⾯的配置为例,⽤户需要有⼀个名字叫 click_field 的字段。假设某条record⾥它的内容是: + +``` +1 item__nid:11#item__price:2.0\u001D3.0;item__nid:22#item__price:4.0\u001D5.0 +``` + +表示 `click_0` 和 `click_1` 中的字段分别是 `item__nid:11 item__price:2.0\u001D3.0` 和`item__nid:22 item__price:4.0\u001D5.0` 。fg 的结果会是: + +``` +"click_0_nid", "nid_11" +"click_0_price", "2.0\u001D3.0" +"click_0_seq_context", "0" +"click_1_nid", "nid_22" +"click_1_price", "4.0\u001D5.0" +"click_1_seq_context", "0" +``` + +`rtp_fg.parse_genreated_fg` 的结果中我们可以获得 `click_0_nid , click_0_price ,click_0_seq_context 
,click_1_nid , click_1_price , click_1_seq_context ,`分别对应 sequence 中两个 item 的结果。 \ No newline at end of file diff --git a/docs/source/feature/fg_docs/mutiValues.md b/docs/source/feature/fg_docs/mutiValues.md new file mode 100644 index 000000000..c71910073 --- /dev/null +++ b/docs/source/feature/fg_docs/mutiValues.md @@ -0,0 +1,25 @@ +# 多值类型及分隔符 + +## item: 维度 + +例如 v1^]v2^]v3 + +^]表示多值分隔符,注意这是⼀个符号,其ASCII编码是"\x1D",⽽不是两个符号。该字符在emacs 中的输⼊⽅式是C-q C-5, 在vi中的输⼊⽅式是 C-v C-5。 + +## context: 和 user: 维度 + +在线请求中,使⽤ json array 表示多值。 + +离线 FG 过程中,和 item: ⼀样使⽤多值分隔符。 + +## 注意事项 + +浮点型的特征,rtp只保证6位精度 + +## 训练模型时样本的分隔符 + +⽣成的训练样本的分隔符为 ^ B,^ C,^ D, ASCII编码分别是"0x02","0x03","0x04" 0x04⽤户多值的时候的值之间的分隔 + +例⼦如下: + +特征⼀<0x03>值<0x02>多值特征<0x03>值<0x04>值<0x04>值<0x02> \ No newline at end of file diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md index 29625a080..c1d40750c 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -35,7 +35,7 @@ - Feature配置说明: - - [IdFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/IdFeature.pdf) + - [IdFeature](./fg_docs/IdFeature.md) - is_multi: id_feature是否是多值属性 @@ -45,7 +45,7 @@ - 多值分隔符使用chr(29)\[ctrl+v ctrl+\]. - - [多值类型说明](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/%E5%A4%9A%E5%80%BC%E7%B1%BB%E5%9E%8B.pdf) + - [多值类型说明](./fg_docs/mutiValues.md) - vocab_file: 词典文件路径,根据词典将对应的输入映射成ID. @@ -61,7 +61,7 @@ - embedding_dimension/embedding_dim: 对应EasyRec feature_config.features里面的embedding_dim. - - [RawFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/RawFeature.pdf) + - [RawFeature](./fg_docs/RawFeature.md) - bucketize_boundaries: 会生成离散化的结果, 在生成EasyRec config的时候: @@ -93,10 +93,10 @@ - 该选项对生成数据有影响. - 该选项对生成EasyRec config也有影响, 对应到[feature_config.raw_input_dim](../proto.html#protos.FeatureConfig) - - [ComboFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/ComboFeature.pdf) + - [ComboFeature](./fg_docs/ComboFeature.md) - 需要设置embedding_dimension和hash_bucket_size. 
- 方法一:在fg中生成combo特征,见[ComboFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/ComboFeature.pdf) + 方法一:在fg中生成combo特征,见[ComboFeature](./fg_docs/ComboFeature.md) ``` {"expression": "user:user_id", "feature_name": "user_id", "feature_type":"id_feature", "value_type":"String", "combiner":"mean", "hash_bucket_size": 100000, "embedding_dim": 16, "group":"user"}, @@ -125,11 +125,11 @@ - feature_names: 除当前特征外,参与combo的特征,至少一项. - combiner, hash_bucket_size, embedding_dim 配置与上述一致. - - [LookupFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/LookupFeature.pdf) + - [LookupFeature](./fg_docs/LookupFeature.md) - 根据id查找对应的value. - - [MatchFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/MatchFeature.pdf) + - [MatchFeature](./fg_docs/MatchFeature.md) - 双层查找, 根据category和item_id查找value. @@ -141,7 +141,7 @@ - needWeighting: 生成特征权重,即kv格式, kv之间用\[ctrl+v ctrl+e\]分割, 转换成TagFeature. - - [SequenceFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/SequenceFeature.pdf) + - [SequenceFeature](./fg_docs/SequenceFeature.md) - 序列特征用于对用户行为建模, 通常应用于DIN和Transformer模型当中 @@ -159,7 +159,7 @@ - Note: item_seq(如item的图片列表)目前还不支持 - - [OverLapFeature](http://easyrec.oss-cn-beijing.aliyuncs.com/fg_docs/OverLapFeature.pdf) + - [OverLapFeature](./fg_docs/OverLapFeature.md) - 针对EasyRec的扩展字段: From 4dbd9361488588b6b5f8e5ea2effaf043ce5d639 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 18:48:40 +0800 Subject: [PATCH 02/22] change_feature_pdf_to_md --- .git_bin_path | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git_bin_path b/.git_bin_path index 9dc118a47..640d0441b 100644 --- a/.git_bin_path +++ b/.git_bin_path @@ -38,7 +38,7 @@ {"leaf_name": "data/test/movielens_1m", "leaf_file": ["data/test/movielens_1m/ml_test_data", "data/test/movielens_1m/ml_train_data"]} {"leaf_name": "data/test/mt_ckpt", "leaf_file": ["data/test/mt_ckpt/model.ckpt-100.data-00000-of-00001", "data/test/mt_ckpt/model.ckpt-100.index", 
"data/test/mt_ckpt/model.ckpt-100.meta"]} {"leaf_name": "data/test/rtp", "leaf_file": ["data/test/rtp/taobao_fg_pred.out", "data/test/rtp/taobao_test_bucketize_feature.txt", "data/test/rtp/taobao_test_feature.txt", "data/test/rtp/taobao_test_input.txt", "data/test/rtp/taobao_train_bucketize_feature.txt", "data/test/rtp/taobao_train_feature.txt", "data/test/rtp/taobao_train_input.txt", "data/test/rtp/taobao_valid.csv", "data/test/rtp/taobao_valid_feature.txt"]} -{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_pdn_fake_test_data", "data/test/tb_data/taobao_pdn_fake_train_data", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} +{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", 
"data/test/tb_data/taobao_user_profile_gl"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_item", "leaf_file": ["data/test/tb_data/hard_negative_sampler_item/taobao_ad_feature_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_user", "leaf_file": ["data/test/tb_data/hard_negative_sampler_user/taobao_user_profile_gl.csv"]} From de30f3b39099213e6ab8bb111e30af3f3c11a716 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:09:51 +0800 Subject: [PATCH 03/22] change_feature_pdf_to_md --- .git_bin_path | 3 +-- .git_bin_url | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.git_bin_path b/.git_bin_path index 640d0441b..5da41941d 100644 --- a/.git_bin_path +++ b/.git_bin_path @@ -38,8 +38,7 @@ {"leaf_name": "data/test/movielens_1m", "leaf_file": ["data/test/movielens_1m/ml_test_data", "data/test/movielens_1m/ml_train_data"]} {"leaf_name": "data/test/mt_ckpt", "leaf_file": ["data/test/mt_ckpt/model.ckpt-100.data-00000-of-00001", "data/test/mt_ckpt/model.ckpt-100.index", "data/test/mt_ckpt/model.ckpt-100.meta"]} {"leaf_name": "data/test/rtp", "leaf_file": ["data/test/rtp/taobao_fg_pred.out", "data/test/rtp/taobao_test_bucketize_feature.txt", "data/test/rtp/taobao_test_feature.txt", "data/test/rtp/taobao_test_input.txt", "data/test/rtp/taobao_train_bucketize_feature.txt", "data/test/rtp/taobao_train_feature.txt", "data/test/rtp/taobao_train_input.txt", "data/test/rtp/taobao_valid.csv", "data/test/rtp/taobao_valid_feature.txt"]} -{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_test_data", 
"data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} -{"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} +{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_pdn_fake_test_data", "data/test/tb_data/taobao_pdn_fake_train_data", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]}{"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_item", "leaf_file": ["data/test/tb_data/hard_negative_sampler_item/taobao_ad_feature_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_user", "leaf_file": ["data/test/tb_data/hard_negative_sampler_user/taobao_user_profile_gl.csv"]} {"leaf_name": "data/test/tb_data_with_time", "leaf_file": ["data/test/tb_data_with_time/taobao_test_data_with_time", 
"data/test/tb_data_with_time/taobao_train_data_with_time"]} diff --git a/.git_bin_url b/.git_bin_url index 22c8ef187..82822e93f 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -38,8 +38,6 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} -{"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} {"leaf_path": "data/test/tb_data_with_time", "sig": "1a7648f4ae55faf37855762bccbb70cc", "remote_path": "data/git_oss_sample_data/data_test_tb_data_with_time_1a7648f4ae55faf37855762bccbb70cc"} From 2443bbcaf26f2f1501ecf4bdb0b0fcb5ea9b768d Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:12:00 +0800 Subject: [PATCH 04/22] change_feature_pdf_to_md --- .git_bin_path | 3 ++- .git_bin_url | 2 ++ 2 files changed, 4 
insertions(+), 1 deletion(-) diff --git a/.git_bin_path b/.git_bin_path index 5da41941d..9dc118a47 100644 --- a/.git_bin_path +++ b/.git_bin_path @@ -38,7 +38,8 @@ {"leaf_name": "data/test/movielens_1m", "leaf_file": ["data/test/movielens_1m/ml_test_data", "data/test/movielens_1m/ml_train_data"]} {"leaf_name": "data/test/mt_ckpt", "leaf_file": ["data/test/mt_ckpt/model.ckpt-100.data-00000-of-00001", "data/test/mt_ckpt/model.ckpt-100.index", "data/test/mt_ckpt/model.ckpt-100.meta"]} {"leaf_name": "data/test/rtp", "leaf_file": ["data/test/rtp/taobao_fg_pred.out", "data/test/rtp/taobao_test_bucketize_feature.txt", "data/test/rtp/taobao_test_feature.txt", "data/test/rtp/taobao_test_input.txt", "data/test/rtp/taobao_train_bucketize_feature.txt", "data/test/rtp/taobao_train_feature.txt", "data/test/rtp/taobao_train_input.txt", "data/test/rtp/taobao_valid.csv", "data/test/rtp/taobao_valid_feature.txt"]} -{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_pdn_fake_test_data", "data/test/tb_data/taobao_pdn_fake_train_data", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]}{"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} +{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", 
"data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_pdn_fake_test_data", "data/test/tb_data/taobao_pdn_fake_train_data", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} +{"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_item", "leaf_file": ["data/test/tb_data/hard_negative_sampler_item/taobao_ad_feature_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_user", "leaf_file": ["data/test/tb_data/hard_negative_sampler_user/taobao_user_profile_gl.csv"]} {"leaf_name": "data/test/tb_data_with_time", "leaf_file": ["data/test/tb_data_with_time/taobao_test_data_with_time", "data/test/tb_data_with_time/taobao_train_data_with_time"]} diff --git a/.git_bin_url b/.git_bin_url index 82822e93f..22c8ef187 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -38,6 +38,8 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": 
"data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} +{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} +{"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} {"leaf_path": "data/test/tb_data_with_time", "sig": "1a7648f4ae55faf37855762bccbb70cc", "remote_path": "data/git_oss_sample_data/data_test_tb_data_with_time_1a7648f4ae55faf37855762bccbb70cc"} From 2f3712e71cfbccbcfb051a953fe8921c32dca332 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:14:19 +0800 Subject: [PATCH 05/22] change_feature_pdf_to_md --- .git_bin_path | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git_bin_path b/.git_bin_path index 9dc118a47..640d0441b 100644 --- a/.git_bin_path +++ b/.git_bin_path @@ -38,7 +38,7 @@ {"leaf_name": "data/test/movielens_1m", "leaf_file": ["data/test/movielens_1m/ml_test_data", "data/test/movielens_1m/ml_train_data"]} {"leaf_name": "data/test/mt_ckpt", "leaf_file": ["data/test/mt_ckpt/model.ckpt-100.data-00000-of-00001", "data/test/mt_ckpt/model.ckpt-100.index", "data/test/mt_ckpt/model.ckpt-100.meta"]} {"leaf_name": "data/test/rtp", "leaf_file": ["data/test/rtp/taobao_fg_pred.out", "data/test/rtp/taobao_test_bucketize_feature.txt", "data/test/rtp/taobao_test_feature.txt", 
"data/test/rtp/taobao_test_input.txt", "data/test/rtp/taobao_train_bucketize_feature.txt", "data/test/rtp/taobao_train_feature.txt", "data/test/rtp/taobao_train_input.txt", "data/test/rtp/taobao_valid.csv", "data/test/rtp/taobao_valid_feature.txt"]} -{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_pdn_fake_test_data", "data/test/tb_data/taobao_pdn_fake_train_data", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} +{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} {"leaf_name": 
"data/test/tb_data/hard_negative_sampler_item", "leaf_file": ["data/test/tb_data/hard_negative_sampler_item/taobao_ad_feature_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_user", "leaf_file": ["data/test/tb_data/hard_negative_sampler_user/taobao_user_profile_gl.csv"]} From d471204a0d5cd78bf2b8ba90e05843c436bf964e Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:20:24 +0800 Subject: [PATCH 06/22] change_feature_pdf_to_md --- .git_bin_path | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git_bin_path b/.git_bin_path index 640d0441b..9dc118a47 100644 --- a/.git_bin_path +++ b/.git_bin_path @@ -38,7 +38,7 @@ {"leaf_name": "data/test/movielens_1m", "leaf_file": ["data/test/movielens_1m/ml_test_data", "data/test/movielens_1m/ml_train_data"]} {"leaf_name": "data/test/mt_ckpt", "leaf_file": ["data/test/mt_ckpt/model.ckpt-100.data-00000-of-00001", "data/test/mt_ckpt/model.ckpt-100.index", "data/test/mt_ckpt/model.ckpt-100.meta"]} {"leaf_name": "data/test/rtp", "leaf_file": ["data/test/rtp/taobao_fg_pred.out", "data/test/rtp/taobao_test_bucketize_feature.txt", "data/test/rtp/taobao_test_feature.txt", "data/test/rtp/taobao_test_input.txt", "data/test/rtp/taobao_train_bucketize_feature.txt", "data/test/rtp/taobao_train_feature.txt", "data/test/rtp/taobao_train_input.txt", "data/test/rtp/taobao_valid.csv", "data/test/rtp/taobao_valid_feature.txt"]} -{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", 
"data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} +{"leaf_name": "data/test/tb_data", "leaf_file": ["data/test/tb_data/taobao_ad_feature_gl", "data/test/tb_data/taobao_clk_edge_gl", "data/test/tb_data/taobao_multi_seq_test_data", "data/test/tb_data/taobao_multi_seq_train_data", "data/test/tb_data/taobao_noclk_edge_gl", "data/test/tb_data/taobao_pdn_fake_test_data", "data/test/tb_data/taobao_pdn_fake_train_data", "data/test/tb_data/taobao_test_data", "data/test/tb_data/taobao_test_data_compress.gz", "data/test/tb_data/taobao_test_data_for_expr", "data/test/tb_data/taobao_test_data_kd", "data/test/tb_data/taobao_test_data_remap_label", "data/test/tb_data/taobao_train_data", "data/test/tb_data/taobao_train_data_for_expr", "data/test/tb_data/taobao_train_data_kd", "data/test/tb_data/taobao_train_data_remap_label", "data/test/tb_data/taobao_user_profile_gl"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_edge", "leaf_file": ["data/test/tb_data/hard_negative_sampler_edge/taobao_noclk_edge_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_item", "leaf_file": ["data/test/tb_data/hard_negative_sampler_item/taobao_ad_feature_gl.csv"]} {"leaf_name": "data/test/tb_data/hard_negative_sampler_user", "leaf_file": ["data/test/tb_data/hard_negative_sampler_user/taobao_user_profile_gl.csv"]} From 7aa9e0fc8f436baf159882f917858802ef354ada Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:23:02 +0800 Subject: [PATCH 07/22] change_feature_pdf_to_md From dddaed37a33e050d2e08abbc4f71fa837fd40f4e Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:31:42 +0800 Subject: [PATCH 08/22] change_feature_pdf_to_md From 9d74326e88503c74385fba2163abc0d08a64b5bc Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Mon, 11 Sep 2023 20:36:24 +0800 Subject: [PATCH 09/22] change_feature_pdf_to_md --- .git_bin_url | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.git_bin_url b/.git_bin_url index 22c8ef187..18eeef0d3 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -38,7 +38,7 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} +{"leaf_path": "data/test/tb_data", "sig": "b1579db090d72b3b70b59ba3c7692701", "remote_path": "data/git_oss_sample_data/data_test_tb_data_b1579db090d72b3b70b59ba3c7692701"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} From 0eda657da2269438ac89356a23e2fdff1d91cfbb Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 11:14:01 +0800 Subject: [PATCH 10/22] change_feature_pdf_to_md --- .git_bin_url | 2 +- docs/source/feature/fg_docs/ComboFeature.md | 18 +++---- docs/source/feature/fg_docs/IdFeature.md | 29 +++++------ 
docs/source/feature/fg_docs/LookupFeature.md | 35 +++++-------- docs/source/feature/fg_docs/MatchFeature.md | 32 +++++------- docs/source/feature/fg_docs/OverLapFeature.md | 52 ++++++++----------- docs/source/feature/fg_docs/RawFeature.md | 49 +++++++---------- .../source/feature/fg_docs/SequenceFeature.md | 4 +- docs/source/feature/fg_docs/mutiValues.md | 16 +++--- 9 files changed, 98 insertions(+), 139 deletions(-) diff --git a/.git_bin_url b/.git_bin_url index 18eeef0d3..22c8ef187 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -38,7 +38,7 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": "data/test/tb_data", "sig": "b1579db090d72b3b70b59ba3c7692701", "remote_path": "data/git_oss_sample_data/data_test_tb_data_b1579db090d72b3b70b59ba3c7692701"} +{"leaf_path": "data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": 
"data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md index 184148592..3f610279b 100644 --- a/docs/source/feature/fg_docs/ComboFeature.md +++ b/docs/source/feature/fg_docs/ComboFeature.md @@ -12,17 +12,15 @@ combo feature是多个字段(或表达式)的组合(即笛卡尔积),i } ``` - +## 例子 -## 例子 +^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号 -^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号 - -| user:age_class的取值 | item:item_id的取值 | 输出的feature | -| -------------------- | ------------------ | ------------------------------------------------------------ | -| 123 | 45678 | comb_u_age_item_123_45678 | -| abc, bcd | 45678 | comb_u_age_item_abc_45678, comb_u_age_item_bcd_45678 | -| abc, bcd | 12345^]45678 | comb_u_age_item_abc_12345, comb_u_age_item_abc_45678, comb_u_age_item_bcd_12345, comb_u_age_item_bcd_45678 | +| user:age_class的取值 | item:item_id的取值 | 输出的feature | +| ----------------- | --------------- | ---------------------------------------------------------------------------------------------------------- | +| 123 | 45678 | comb_u_age_item_123_45678 | +| abc, bcd | 45678 | comb_u_age_item_abc_45678, comb_u_age_item_bcd_45678 | +| abc, bcd | 12345^\]45678 | comb_u_age_item_abc_12345, comb_u_age_item_abc_45678, comb_u_age_item_bcd_12345, comb_u_age_item_bcd_45678 | 输出的feature个数等于 @@ -30,4 +28,4 @@ combo feature是多个字段(或表达式)的组合(即笛卡尔积),i |F1| * |F2| * ... 
* |Fn| ``` -其中Fn指依赖的第n个字段的值的个数。 \ No newline at end of file +其中Fn指依赖的第n个字段的值的个数。 diff --git a/docs/source/feature/fg_docs/IdFeature.md b/docs/source/feature/fg_docs/IdFeature.md index e3deec7a5..83279d1ad 100644 --- a/docs/source/feature/fg_docs/IdFeature.md +++ b/docs/source/feature/fg_docs/IdFeature.md @@ -14,22 +14,19 @@ id feature是一个sparse feature,是一种最简单的离散特征,只是 } ``` -| 字段名 | 含义 | -| -------------- | ------------------------------------------------------------ | -| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 | -| expression | 必选项,expression描述该feature所依赖的字段来源 | +| 字段名 | 含义 | +| -------------- | ----------------------------------------------------------------------------- | +| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 | +| expression | 必选项,expression描述该feature所依赖的字段来源 | | need_prefix | 可选项,true表示会拼上feature_name作为前缀,false表示不拼,默认为true,通常在shared_embedding的场景会用false | -| invalid_values | 可选项,表示这些values都会被输出成null。list string,例如[""],表示将所有的空字符串输出变成null。 | +| invalid_values | 可选项,表示这些values都会被输出成null。list string,例如\[""\],表示将所有的空字符串输出变成null。 | +例子 ( ^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号) - -例子 ( ^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号) - -| 类型 | item:is_main的取值 | 输出的feature | -| ---------- | ------------------ | ------------------------------------------- | -| int64_t | 100 | (item_is_main_100, 1) | -| double | 5.2 | (item_is_main_5, 1)(小数部分会被截取) | -| string | abc | (item_is_main_abc, 1) | -| 多值string | abc^]bcd | (item_is_main_abc, 1),(item_is_main_bcd, 1) | -| 多值int | 123^]456 | (item_is_main_123, 1),(item_is_main_456, 1) | - +| 类型 | item:is_main的取值 | 输出的feature | +| -------- | --------------- | ------------------------------------------- | +| int64_t | 100 | (item_is_main_100, 1) | +| double | 5.2 | (item_is_main_5, 1)(小数部分会被截取) | +| string | abc | (item_is_main_abc, 1) | +| 多值string | abc^\]bcd | (item_is_main_abc, 1),(item_is_main_bcd, 1) | +| 多值int | 123^\]456 | (item_is_main_123, 1),(item_is_main_456, 1) | diff --git 
a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md index 8f6feabde..34752eb3e 100644 --- a/docs/source/feature/fg_docs/LookupFeature.md +++ b/docs/source/feature/fg_docs/LookupFeature.md @@ -1,7 +1,5 @@ # 6.5 Lookup Feature - - ## 功能简介 如果离线生成不符合预期 请先使用最新的离线fg包 @@ -12,8 +10,6 @@ lookup feature 依赖 map 和 key 两个字段,map是一个多值string(MultiS map 和 key 源可以是 item,user,context 的任意组合。在线输入的时候item的多值用多值分隔符char(29)分隔,user和context的多值在tpp访问时用list表示。该特征仅支持json形式的配置方式。 - - ## 实例 ```json @@ -36,7 +32,7 @@ map 和 key 源可以是 item,user,context 的任意组合。在线输入的 item_attr : "k1:v1^]k2:v2^]k3:v3" ``` -^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号。该字符在emacs中的输入方式是C-q C-5, 在vi中的输入方式是C-v C-5。 这里item_attr是个多值string。需要切记,当map用来表征多个kv对时,是个多值string,而不是string! +^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号。该字符在emacs中的输入方式是C-q C-5, 在vi中的输入方式是C-v C-5。 这里item_attr是个多值string。需要切记,当map用来表征多个kv对时,是个多值string,而不是string! ``` item_value : "k2" @@ -44,45 +40,41 @@ item_value : "k2" 特征结果为 item_match_item_k2_v2。由于needDiscrete的值为true,所以特征结果为离散化后的结果。 - - -## 其它 +## 其它 match feature 和 lookup feature都是匹配类型的特征,即从kv对中匹配到相应的结果。两者的区别是: match feature的被匹配字段user 必须是qinfo中传入的字段,即一次查询中对所有的doc来说这个字段的值都是一致的。而 lookup feature 的 key 和 map 没有来源的限制。 - - -## 配置详解 +## 配置详解 默认情况的配置为 `needDiscrete == true, needWeighting = false, needKey = true, combiner = "sum"` -### 默认输出 +### 默认输出 ### needWeighting == true ``` feature_name:fg -map:{{"k1:123", "k2:234", "k3:3"}} -key:{"k1"} +map:{{"k1:123", "k2:234", "k3:3"}} +key:{"k1"} 结果:feature={"fg_k1", 123} ``` -此时会用 string 部分查 weight 表,然后乘对应 feature value 用于 LR 模型。 +此时会用 string 部分查 weight 表,然后乘对应 feature value 用于 LR 模型。 -### needDiscrete == true +### needDiscrete == true ``` feature_name:fg -map:{{"k1:123", "k2:234", "k3:3"}} -key:{"k1"} +map:{{"k1:123", "k2:234", "k3:3"}} +key:{"k1"} 结果:feature={"fg_123"} ``` -### needDiscrete == false +### needDiscrete == false ``` -map:{{"k1:123", "k2:234", "k3:3"}} -key:{"k1"} +map:{{"k1:123", "k2:234", "k3:3"}} +key:{"k1"} 
结果:feature={123} ``` @@ -118,4 +110,3 @@ key:{"k1"} } ] ``` - diff --git a/docs/source/feature/fg_docs/MatchFeature.md b/docs/source/feature/fg_docs/MatchFeature.md index 9410e9fda..607a750f7 100644 --- a/docs/source/feature/fg_docs/MatchFeature.md +++ b/docs/source/feature/fg_docs/MatchFeature.md @@ -1,11 +1,11 @@ # 6.4 Match Feature - -## Match feature使用说明 + +## Match feature使用说明 match feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。 -match feature支持两种类型,hit和multi hit。 +match feature支持两种类型,hit和multi hit。 match feature本质是是一个两层map的匹配,user字段使用string的方式描述了一个两层map,|为第一层map的item之间的分隔符,^为第一层map的key与value之间的分隔符。,为第二层map的item之间的分隔符,:第二层map的key与value之间的分隔符。例如对于50011740^50011740:0.2,36806676:0.3,122572685:0.5|50006842^16788:0.1这样的一个string,转化为二层map就是 ```json @@ -23,9 +23,9 @@ match feature本质是是一个两层map的匹配,user字段使用string的方 对于hit match 匹配的方式,就是用category的值在第一层map中查找,然后使用item的值在第二层map中查找,最终得到一个结果。 如果不需要使用两层匹配,只需要一层匹配,则可以在map的第一层key中填入ALL, 然后在fg配置的category一项中也填成"ALL"即可。具体见实例一。 - -## 配置方式 + +## 配置方式 json格式配置文件: @@ -41,39 +41,36 @@ json格式配置文件: } ``` -needDiscrete:true 时,模型使用 match feature 输出的特征名,忽略特征值。默认为 true。 +needDiscrete:true 时,模型使用 match feature 输出的特征名,忽略特征值。默认为 true。 needDiscrete:false 时,模型取 match feature 输出的特征值,而忽略特征名。 -matchType: +matchType: hit:输出命中的feature xml配置文件: -```xml +``` ``` -dependencie:需要做Match 的两个特征 +dependencie:需要做Match 的两个特征 category: 类目的feature 字段。category="ALL"不需要分类目匹配 - -## Normalizer -match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_feature](https://yuque.alibaba-inc.com/rtp/wtm2oh/chapter6-raw_feature#normalizer)。 +## Normalizer + +match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_feature](./RawFeature.md)。 ## 配置详解 - ### hit - - 对于下面的配置 ```json @@ -105,9 +102,8 @@ match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_fea 如果 needDiscrete=true,结果: 如果 needDiscrete=false,结果: - -### multihit -允许用户 category 和 item 两个值为 ALL(注意,不是配置的值,是传入的值),进行 wildcard 匹配,可以匹配出多个值。输出结果类似于 hit。 +### multihit +允许用户 category 和 item 两个值为 ALL(注意,不是配置的值,是传入的值),进行 wildcard 
匹配,可以匹配出多个值。输出结果类似于 hit。 diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md index 168a7ec2f..c27ef161e 100644 --- a/docs/source/feature/fg_docs/OverLapFeature.md +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -1,29 +1,23 @@ # 6.7 OverLap Feature - - ## 功能简介 -用来输出一些字符串字词匹配信息的feature +用来输出一些字符串字词匹配信息的feature 离线推荐使用1.3.56-SNAPSHOT这个版本,或者1.3.28(不支持参数need_prefix) ps: 写fg的时候注意维度,title的维度要大于或等于query的问题(简单来说就是如果title是user特征,那query也只能是user特征,user特征的batch size为1,商品特征的batch size为商品数) -| 方式 | 描述 | 备注 | -| ------------------- | ----------------------------------------------------------- | ------------------------------ | -| common_word | 计算query与title间重复term,并输出为fg_common1_common2 | 重复数不超过query term数 | -| diff_word | 计算query与title间不重复term,并输出为fg_diff1_diff2 | 不重复数不超过query term数 | -| query_common_ratio | 计算query与title间重复term数占query中term比例,乘以10取下整 | 取值为[0,10] | -| title_common_ratio | 计算query与title间重复term数占title中term比例,乘以100取下整 | 取值为[0,100] | -| is_contain | 计算query是否全部包含在title中,保持顺序 | 0表示未包含,1表示包含 | -| is_equal | 计算query是否与title完全相同 | 0表示不完全相同,1表示完全相同 | -| common_word_divided | 计算query与title间重复term,并输出为fg_common1, fg_common2 | 重复数不超过query term数 | -| diff_word_divided | 计算query与title间不重复term,并输出为fg_diff1, fg_diff2 | 重复数不超过query term数 | - - +| 方式 | 描述 | 备注 | +| ------------------- | ----------------------------------------------- | ------------------ | +| common_word | 计算query与title间重复term,并输出为fg_common1_common2 | 重复数不超过query term数 | +| diff_word | 计算query与title间不重复term,并输出为fg_diff1_diff2 | 不重复数不超过query term数 | +| query_common_ratio | 计算query与title间重复term数占query中term比例,乘以10取下整 | 取值为\[0,10\] | +| title_common_ratio | 计算query与title间重复term数占title中term比例,乘以100取下整 | 取值为\[0,100\] | +| is_contain | 计算query是否全部包含在title中,保持顺序 | 0表示未包含,1表示包含 | +| is_equal | 计算query是否与title完全相同 | 0表示不完全相同,1表示完全相同 | +| common_word_divided | 计算query与title间重复term,并输出为fg_common1, fg_common2 | 重复数不超过query term数 | +| diff_word_divided | 
计算query与title间不重复term,并输出为fg_diff1, fg_diff2 | 重复数不超过query term数 | - - -## 配置方法 +## 配置方法 ```json { @@ -36,20 +30,18 @@ } ``` -| 字段名 | 含义 | -| ------------ | ------------------------------------------------------------ | -| feature_type | 必选项,描述改feature的类型 | -| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 | -| query | 必选项,query依赖的表, attr1是一个多值string, 多值string的分隔符使用chr(29) | -| title | 必选项,title依赖的表, attr2是一个多值string | +| 字段名 | 含义 | +| ------------ | -------------------------------------------------------------------------------------- | +| feature_type | 必选项,描述改feature的类型 | +| feature_name | 必选项,feature_name会被当做最终输出的feature的前缀 | +| query | 必选项,query依赖的表, attr1是一个多值string, 多值string的分隔符使用chr(29) | +| title | 必选项,title依赖的表, attr2是一个多值string | | method | 可填common_word, diff_word, query_common_ratio, title_common_ratio, is_contain, 对应上图五种方式 | -| separator | 输出结果中的分割字符,不填写我们默认为_ ,但也可以用户自己定制,具体看例子 | - - +| separator | 输出结果中的分割字符,不填写我们默认为\_ ,但也可以用户自己定制,具体看例子 | -## 例子 +## 例子 -query为high,high2,fiberglass,abc +query为high,high2,fiberglass,abc title为high,quality,fiberglass,tube,for,golf,bag | method | separator | feature | @@ -61,4 +53,4 @@ title为high,quality,fiberglass,tube,for,golf,bag | is_contain | | name_0 | | is_equal | | name_0 | | common_word_divided | | name_high, name_fiberglass | -| diff_word_divided | | name_high2, name_abc | \ No newline at end of file +| diff_word_divided | | name_high2, name_abc | diff --git a/docs/source/feature/fg_docs/RawFeature.md b/docs/source/feature/fg_docs/RawFeature.md index 8775b0a81..3ff6215b9 100644 --- a/docs/source/feature/fg_docs/RawFeature.md +++ b/docs/source/feature/fg_docs/RawFeature.md @@ -1,12 +1,10 @@ # 6.2 Raw Feature - - -## 功能介绍 +## 功能介绍 raw feature是一种dense的feature,是直接引用原始feature的字段值作为feature的value。raw feature仅支持数值int、float、double等数值类型,对非数值类型的feature需使用id feature。 -## 配置方法 +## 配置方法 ```json { @@ -17,55 +15,44 @@ raw feature是一种dense的feature,是直接引用原始feature的字段值 } ``` - - -| 字段名 | 含义 | -| --------------- | 
------------------------------------------------------------ | +| 字段名 | 含义 | +| --------------- | ---------------------------------------------------------------------------------- | | feature_name | 必选项,在正常使用时该选项是没用处的,因为实际参与接下来运算的主要是feature value,但是在debug的情况下,可以看到对应feature name的值。 | -| expression | 必选项,expression描述该feature所依赖的字段来源 | -| value_dimension | 可选项,默认值为1,表示输出的字段的维度。 | -| normalizer | 可选项,归一化方法,详见后文 | - - - -## 例子 +| expression | 必选项,expression描述该feature所依赖的字段来源 | +| value_dimension | 可选项,默认值为1,表示输出的字段的维度。 | +| normalizer | 可选项,归一化方法,详见后文 | -^]表示多值分隔符,注意这是一个符号,其ASCII编码是"\x1D",而不是两个符号 +## 例子 -| 类型 | item:ctr的取值 | 输出的feature | -| ------- | -------------- | ------------------------------------------------------------ | -| int64_t | 100 | (ctr, 100) | -| double | 100.1 | (ctr, 100.1) | -| 多值int | 123^]456 | (ctr, (123,456)) (注意,输入字段必须与配置的dimension维度一致) | +^\]表示多值分隔符,注意这是一个符号,其ASCII编码是"\\x1D",而不是两个符号 +| 类型 | item:ctr的取值 | 输出的feature | +| ------- | ----------- | ---------------------------------------------- | +| int64_t | 100 | (ctr, 100) | +| double | 100.1 | (ctr, 100.1) | +| 多值int | 123^\]456 | (ctr, (123,456)) (注意,输入字段必须与配置的dimension维度一致) | - - - -## Normalizer +## Normalizer raw_feature 和 match_feature 支持 normalizer,共三种,`minmax,zscore,log10`。配置和计算方法如下: -### log10 +### log10 ``` 配置例子:method=log10,threshold=1e-10,default=-10 计算公式:x = x > threshold ? 
log10(x) : default; ``` -### zscore +### zscore ``` 配置例子:method=zscore,mean=0.0,standard_deviation=10.0 计算公式:x = (x - mean) / standard_deviation ``` -### minmax +### minmax ``` 配置例子:method=minmax,min=2.1,max=2.2 计算公式:x = (x - min) / (max - min) ``` - - - diff --git a/docs/source/feature/fg_docs/SequenceFeature.md b/docs/source/feature/fg_docs/SequenceFeature.md index febd2f92c..2ac6593ac 100644 --- a/docs/source/feature/fg_docs/SequenceFeature.md +++ b/docs/source/feature/fg_docs/SequenceFeature.md @@ -1,5 +1,3 @@ - - # 6.8 sequence 类 feature ## 基本场景 @@ -195,4 +193,4 @@ context seq特征与user seq类似,区别是每个context是batch size维度 "click_1_seq_context", "0" ``` -`rtp_fg.parse_genreated_fg` 的结果中我们可以获得 `click_0_nid , click_0_price ,click_0_seq_context ,click_1_nid , click_1_price , click_1_seq_context ,`分别对应 sequence 中两个 item 的结果。 \ No newline at end of file +`rtp_fg.parse_genreated_fg` 的结果中我们可以获得 `click_0_nid , click_0_price ,click_0_seq_context ,click_1_nid , click_1_price , click_1_seq_context ,`分别对应 sequence 中两个 item 的结果。 diff --git a/docs/source/feature/fg_docs/mutiValues.md b/docs/source/feature/fg_docs/mutiValues.md index c71910073..65a3b15a9 100644 --- a/docs/source/feature/fg_docs/mutiValues.md +++ b/docs/source/feature/fg_docs/mutiValues.md @@ -1,16 +1,16 @@ -# 多值类型及分隔符 +# 多值类型及分隔符 ## item: 维度 -例如 v1^]v2^]v3 +例如 v1^\]v2^\]v3 -^]表示多值分隔符,注意这是⼀个符号,其ASCII编码是"\x1D",⽽不是两个符号。该字符在emacs 中的输⼊⽅式是C-q C-5, 在vi中的输⼊⽅式是 C-v C-5。 +^\]表示多值分隔符,注意这是⼀个符号,其ASCII编码是"\\x1D",⽽不是两个符号。该字符在emacs 中的输⼊⽅式是C-q C-5, 在vi中的输⼊⽅式是 C-v C-5。 ## context: 和 user: 维度 -在线请求中,使⽤ json array 表示多值。 +在线请求中,使⽤ json array 表示多值。 -离线 FG 过程中,和 item: ⼀样使⽤多值分隔符。 +离线 FG 过程中,和 item: ⼀样使⽤多值分隔符。 ## 注意事项 @@ -18,8 +18,8 @@ ## 训练模型时样本的分隔符 -⽣成的训练样本的分隔符为 ^ B,^ C,^ D, ASCII编码分别是"0x02","0x03","0x04" 0x04⽤户多值的时候的值之间的分隔 +⽣成的训练样本的分隔符为 ^ B,^ C,^ D, ASCII编码分别是"0x02","0x03","0x04" 0x04⽤户多值的时候的值之间的分隔 -例⼦如下: +例⼦如下: -特征⼀<0x03>值<0x02>多值特征<0x03>值<0x04>值<0x04>值<0x02> \ No newline at end of file 
+特征⼀\<0x03>值\<0x02>多值特征\<0x03>值\<0x04>值\<0x04>值\<0x02> From 5d0a3ad34f930fd3e673d8d66755fc3de38aadfa Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 11:17:13 +0800 Subject: [PATCH 11/22] change_feature_pdf_to_md From f86a0712cd5e2c3e2a4bd888a1df027af236ffe9 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 13:24:55 +0800 Subject: [PATCH 12/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/MatchFeature.md | 25 ++++++------------- docs/source/feature/fg_docs/OverLapFeature.md | 2 +- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/docs/source/feature/fg_docs/MatchFeature.md b/docs/source/feature/fg_docs/MatchFeature.md index 607a750f7..afbc92f2d 100644 --- a/docs/source/feature/fg_docs/MatchFeature.md +++ b/docs/source/feature/fg_docs/MatchFeature.md @@ -1,7 +1,5 @@ # 6.4 Match Feature - - ## Match feature使用说明 match feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。 @@ -23,8 +21,6 @@ match feature本质是是一个两层map的匹配,user字段使用string的方 对于hit match 匹配的方式,就是用category的值在第一层map中查找,然后使用item的值在第二层map中查找,最终得到一个结果。 如果不需要使用两层匹配,只需要一层匹配,则可以在map的第一层key中填入ALL, 然后在fg配置的category一项中也填成"ALL"即可。具体见实例一。 - - ## 配置方式 json格式配置文件: @@ -49,7 +45,7 @@ hit:输出命中的feature xml配置文件: -``` +```xml @@ -60,15 +56,12 @@ dependencie:需要做Match 的两个特征 category: 类目的feature 字段。category="ALL"不需要分类目匹配 - - ## Normalizer match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_feature](./RawFeature.md)。 ## 配置详解 - ### hit 对于下面的配置 @@ -87,22 +80,20 @@ match_feature 支持和 raw_feature 一样的 normalizer,具体可见 [raw_fea 假设各字段的值如下: -| user_brand_tags_hit | 50011740^107287172:0.2,36806676:0.3,122572685:0.5\|50006842^16788816:0.1,10122:0.2,29889:0.3,30068:19 | -| --------------------- | ------------------------------------------------------------ | -| brand_id | 30068 | -| auction_root_category | 50006842 | +| user_brand_tags_hit | `50011740^107287172:0.2,36806676:0.3,122572685:0.5\|50006842^16788816:0.1,10122:0.2,29889:0.3,30068:19` | +| --------------------- | 
------------------------------------------------------------------------------------------------------- | +| brand_id | 30068 | +| auction_root_category | 50006842 | -如果 needDiscrete=true,结果为: -如果 needDiscrete=false,结果为: +如果 needDiscrete=true,结果为:\ +如果 needDiscrete=false,结果为:\ 如果只需要使用一层匹配,则需要将上面配置里的 category 的值改为 ALL。这种情况,用户也可以考虑使用 lookup_feature。 假设各字段的值如下 | user_brand_tags_hit | ALL^16788816:40,10122:40,29889:20,30068:20 | | ------------------- | ------------------------------------------ | | brand_id | 30068 | -如果 needDiscrete=true,结果: 如果 needDiscrete=false,结果: - - +如果 needDiscrete=true,结果:\ 如果 needDiscrete=false,结果:\ ### multihit diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md index c27ef161e..9ee293605 100644 --- a/docs/source/feature/fg_docs/OverLapFeature.md +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -22,7 +22,7 @@ ```json { "feature_type" : "overlap_feature", - "feature_name" : "is_contain", + "feature_name" : "is_contain", "query" : "user:attr1", "title" : "item:attr2", "method" : "is_contain", From 1fc5ea277f9fa298f6c37047ba0efa27a88f7bae Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 13:48:14 +0800 Subject: [PATCH 13/22] change_feature_pdf_to_md --- .git_bin_url | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git_bin_url b/.git_bin_url index 22c8ef187..18eeef0d3 100644 --- a/.git_bin_url +++ b/.git_bin_url @@ -38,7 +38,7 @@ {"leaf_path": "data/test/movielens_1m", "sig": "99badbeec64f2fcabe0dfa1d2bfd8fb5", "remote_path": "data/git_oss_sample_data/data_test_movielens_1m_99badbeec64f2fcabe0dfa1d2bfd8fb5"} {"leaf_path": "data/test/mt_ckpt", "sig": "803499f48e2df5e51ce5606e9649c6d4", "remote_path": "data/git_oss_sample_data/data_test_mt_ckpt_803499f48e2df5e51ce5606e9649c6d4"} {"leaf_path": "data/test/rtp", "sig": "76cda60582617ddbb7cd5a49eb68a4b9", "remote_path": "data/git_oss_sample_data/data_test_rtp_76cda60582617ddbb7cd5a49eb68a4b9"} -{"leaf_path": 
"data/test/tb_data", "sig": "f1279ca42de1734be321e88f85775d5f", "remote_path": "data/git_oss_sample_data/data_test_tb_data_f1279ca42de1734be321e88f85775d5f"} +{"leaf_path": "data/test/tb_data", "sig": "b1579db090d72b3b70b59ba3c7692701", "remote_path": "data/git_oss_sample_data/data_test_tb_data_b1579db090d72b3b70b59ba3c7692701"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_edge", "sig": "48f994681d719a2546ec4003fcbc638c", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_edge_48f994681d719a2546ec4003fcbc638c"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_item", "sig": "f23a9eb9457c14a8e57b455804b1f013", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_item_f23a9eb9457c14a8e57b455804b1f013"} {"leaf_path": "data/test/tb_data/hard_negative_sampler_user", "sig": "23514156eae5a4250ac1d0a118883430", "remote_path": "data/git_oss_sample_data/data_test_tb_data_hard_negative_sampler_user_23514156eae5a4250ac1d0a118883430"} From 953d94b70b0367b1639844791ddb6a719c7589b0 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 14:02:24 +0800 Subject: [PATCH 14/22] change_feature_pdf_to_md --- docs/source/feature/rtp_fg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md index c1d40750c..eadaf9ba5 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -96,7 +96,7 @@ - [ComboFeature](./fg_docs/ComboFeature.md) - 需要设置embedding_dimension和hash_bucket_size. 
- 方法一:在fg中生成combo特征,见[ComboFeature](./fg_docs/ComboFeature.pdf) + 方法一:在fg中生成combo特征,见[ComboFeature](./fg_docs/ComboFeature.md) ``` {"expression": "user:user_id", "feature_name": "user_id", "feature_type":"id_feature", "value_type":"String", "combiner":"mean", "hash_bucket_size": 100000, "embedding_dim": 16, "group":"user"}, From 56b550c3aa2907a7670f1cf6efe3cb92218e6595 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 14:24:43 +0800 Subject: [PATCH 15/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/ComboFeature.md | 2 +- docs/source/feature/fg_docs/IdFeature.md | 2 +- docs/source/feature/fg_docs/LookupFeature.md | 2 +- docs/source/feature/fg_docs/MatchFeature.md | 2 +- docs/source/feature/fg_docs/OverLapFeature.md | 2 +- docs/source/feature/fg_docs/RawFeature.md | 2 +- docs/source/feature/fg_docs/SequenceFeature.md | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md index 3f610279b..837ecb2bd 100644 --- a/docs/source/feature/fg_docs/ComboFeature.md +++ b/docs/source/feature/fg_docs/ComboFeature.md @@ -1,4 +1,4 @@ -# 6.3 Combo Feature +# Combo Feature combo feature是多个字段(或表达式)的组合(即笛卡尔积),id feature可以看成是一种特殊的combo feature,即参与交叉字段只有一个的combo feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 diff --git a/docs/source/feature/fg_docs/IdFeature.md b/docs/source/feature/fg_docs/IdFeature.md index 83279d1ad..4f5e6eed6 100644 --- a/docs/source/feature/fg_docs/IdFeature.md +++ b/docs/source/feature/fg_docs/IdFeature.md @@ -1,4 +1,4 @@ -# 6.1 Id Feature +# Id Feature 功能介绍 diff --git a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md index 34752eb3e..9b4eb6a14 100644 --- a/docs/source/feature/fg_docs/LookupFeature.md +++ b/docs/source/feature/fg_docs/LookupFeature.md @@ -1,4 +1,4 @@ -# 6.5 Lookup Feature +# Lookup Feature ## 功能简介 diff --git a/docs/source/feature/fg_docs/MatchFeature.md 
b/docs/source/feature/fg_docs/MatchFeature.md index afbc92f2d..3a29038c4 100644 --- a/docs/source/feature/fg_docs/MatchFeature.md +++ b/docs/source/feature/fg_docs/MatchFeature.md @@ -1,4 +1,4 @@ -# 6.4 Match Feature +# Match Feature ## Match feature使用说明 diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md index 9ee293605..9033efeb1 100644 --- a/docs/source/feature/fg_docs/OverLapFeature.md +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -1,4 +1,4 @@ -# 6.7 OverLap Feature +# OverLap Feature ## 功能简介 diff --git a/docs/source/feature/fg_docs/RawFeature.md b/docs/source/feature/fg_docs/RawFeature.md index 3ff6215b9..ca9e5580b 100644 --- a/docs/source/feature/fg_docs/RawFeature.md +++ b/docs/source/feature/fg_docs/RawFeature.md @@ -1,4 +1,4 @@ -# 6.2 Raw Feature +# Raw Feature ## 功能介绍 diff --git a/docs/source/feature/fg_docs/SequenceFeature.md b/docs/source/feature/fg_docs/SequenceFeature.md index 2ac6593ac..7e14a030a 100644 --- a/docs/source/feature/fg_docs/SequenceFeature.md +++ b/docs/source/feature/fg_docs/SequenceFeature.md @@ -1,4 +1,4 @@ -# 6.8 sequence 类 feature +# sequence 类 feature ## 基本场景 From db45619a4ab89ab8f486c7432aff41374a90b193 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 15:16:50 +0800 Subject: [PATCH 16/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/ComboFeature.md | 4 ++-- docs/source/feature/fg_docs/IdFeature.md | 4 ++-- docs/source/feature/fg_docs/LookupFeature.md | 6 +++--- docs/source/feature/fg_docs/MatchFeature.md | 14 +++++++------- docs/source/feature/fg_docs/OverLapFeature.md | 2 +- docs/source/feature/fg_docs/RawFeature.md | 4 ++-- docs/source/feature/fg_docs/SequenceFeature.md | 10 +++++----- docs/source/feature/rtp_fg.md | 16 ++++++++-------- 8 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md index 837ecb2bd..777a033d8 100644 --- 
a/docs/source/feature/fg_docs/ComboFeature.md +++ b/docs/source/feature/fg_docs/ComboFeature.md @@ -1,6 +1,6 @@ -# Combo Feature +# combo_feature -combo feature是多个字段(或表达式)的组合(即笛卡尔积),id feature可以看成是一种特殊的combo feature,即参与交叉字段只有一个的combo feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 +combo_feature是多个字段(或表达式)的组合(即笛卡尔积),id feature可以看成是一种特殊的combo feature,即参与交叉字段只有一个的combo feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 配置: diff --git a/docs/source/feature/fg_docs/IdFeature.md b/docs/source/feature/fg_docs/IdFeature.md index 4f5e6eed6..1e697a00c 100644 --- a/docs/source/feature/fg_docs/IdFeature.md +++ b/docs/source/feature/fg_docs/IdFeature.md @@ -1,8 +1,8 @@ -# Id Feature +# id_feature 功能介绍 -id feature是一个sparse feature,是一种最简单的离散特征,只是简单的将某个字段的值与用户配置的feature名字拼接。 +id_feature是一个sparse feature,是一种最简单的离散特征,只是简单的将某个字段的值与用户配置的feature名字拼接。 配置方法 diff --git a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md index 9b4eb6a14..2716669b0 100644 --- a/docs/source/feature/fg_docs/LookupFeature.md +++ b/docs/source/feature/fg_docs/LookupFeature.md @@ -1,12 +1,12 @@ -# Lookup Feature +# lookup_feature ## 功能简介 如果离线生成不符合预期 请先使用最新的离线fg包 -lookup feature 和 match feature类似,是从一组kv中匹配到自己需要的结果。 +lookup_feature 和 match_feature类似,是从一组kv中匹配到自己需要的结果。 -lookup feature 依赖 map 和 key 两个字段,map是一个多值string(MultiString)类型的字段,其中每一个string的样子如"k1:v2"。;key可以是一个任意类型的字段。生成特征时,先是取出key的值,将其转换成string类型,然后在map字段所持有的kv对中进行匹配,获取最终的特征。 +lookup_feature 依赖 map 和 key 两个字段,map是一个多值string(MultiString)类型的字段,其中每一个string的样子如"k1:v2"。;key可以是一个任意类型的字段。生成特征时,先是取出key的值,将其转换成string类型,然后在map字段所持有的kv对中进行匹配,获取最终的特征。 map 和 key 源可以是 item,user,context 的任意组合。在线输入的时候item的多值用多值分隔符char(29)分隔,user和context的多值在tpp访问时用list表示。该特征仅支持json形式的配置方式。 diff --git a/docs/source/feature/fg_docs/MatchFeature.md b/docs/source/feature/fg_docs/MatchFeature.md index 3a29038c4..892af7a2a 100644 --- a/docs/source/feature/fg_docs/MatchFeature.md +++ b/docs/source/feature/fg_docs/MatchFeature.md @@ -1,10 +1,10 @@ -# 
Match Feature +# match_feature -## Match feature使用说明 +## match_feature使用说明 -match feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。 -match feature支持两种类型,hit和multi hit。 -match feature本质是是一个两层map的匹配,user字段使用string的方式描述了一个两层map,|为第一层map的item之间的分隔符,^为第一层map的key与value之间的分隔符。,为第二层map的item之间的分隔符,:第二层map的key与value之间的分隔符。例如对于50011740^50011740:0.2,36806676:0.3,122572685:0.5|50006842^16788:0.1这样的一个string,转化为二层map就是 +match_feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。 +match_feature支持两种类型,hit和multi hit。 +match_feature本质上是一个两层map的匹配,user字段使用string的方式描述了一个两层map,|为第一层map的item之间的分隔符,^为第一层map的key与value之间的分隔符。,为第二层map的item之间的分隔符,:为第二层map的key与value之间的分隔符。例如对于50011740^50011740:0.2,36806676:0.3,122572685:0.5|50006842^16788:0.1这样的一个string,转化为二层map就是 ```json { @@ -37,8 +37,8 @@ json格式配置文件: } ``` -needDiscrete:true 时,模型使用 match feature 输出的特征名,忽略特征值。默认为 true。 -needDiscrete:false 时,模型取 match feature 输出的特征值,而忽略特征名。 +needDiscrete:true 时,模型使用 match_feature 输出的特征名,忽略特征值。默认为 true。 +needDiscrete:false 时,模型取 match_feature 输出的特征值,而忽略特征名。 matchType: hit:输出命中的feature diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md index 9033efeb1..b9aeb0f0a 100644 --- a/docs/source/feature/fg_docs/OverLapFeature.md +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -1,4 +1,4 @@ -# OverLap Feature +# overlap_feature ## 功能简介 diff --git a/docs/source/feature/fg_docs/RawFeature.md b/docs/source/feature/fg_docs/RawFeature.md index ca9e5580b..a7b0b772c 100644 --- a/docs/source/feature/fg_docs/RawFeature.md +++ b/docs/source/feature/fg_docs/RawFeature.md @@ -1,8 +1,8 @@ -# Raw Feature +# raw_feature ## 功能介绍 -raw feature是一种dense的feature,是直接引用原始feature的字段值作为feature的value。raw feature仅支持数值int、float、double等数值类型,对非数值类型的feature需使用id feature。 +raw_feature是一种dense的feature,是直接引用原始feature的字段值作为feature的value。raw_feature仅支持int、float、double等数值类型,对非数值类型的feature需使用id_feature。 ## 配置方法 diff --git a/docs/source/feature/fg_docs/SequenceFeature.md
b/docs/source/feature/fg_docs/SequenceFeature.md index 7e14a030a..4afa27e3b 100644 --- a/docs/source/feature/fg_docs/SequenceFeature.md +++ b/docs/source/feature/fg_docs/SequenceFeature.md @@ -1,4 +1,4 @@ -# sequence 类 feature +# sequence类feature ## 基本场景 @@ -61,7 +61,7 @@ qinfo 例⼦: ### qinfo 传递 sequence 字段 -第⼆种情况,sequence feature 也⽀持所有的序列内容都从 qinfo 中传递。例如这⾥的user:seq_context 数组,他的值分别对应 click_0 和 click_1 。这种情况下⽤户可以忽略sequence_table 和 sequence_pk 。 +第⼆种情况,sequence_feature 也⽀持所有的序列内容都从 qinfo 中传递。例如这⾥的user:seq_context 数组,他的值分别对应 click_0 和 click_1 。这种情况下⽤户可以忽略sequence_table 和 sequence_pk 。 qinfo 例⼦: ```json @@ -174,9 +174,9 @@ context seq特征与user seq类似,区别是每个context是batch size维度 ## 离线 FG -​ ⽬前使⽤ sequence feature 要求使⽤ 新新版 feature_generator_java , tensorflow 训练流程要求使⽤ rtp_fg.parse_genreated_fg。 -​ 离线阶段没有sequence表去查,⽽是通过`sequence_column` 读取本来应该去表⾥查的字段。因此,`sequence_column ,sequence_delim ,attribute_delim` 这三个字段只有在离线 fg 阶段有⽤。`sequence_column` 是数据源odps表⾥所有 sequence 特征输⼊的字段名,离线fg会根据这个字段⾥的值⽣成sequence feature,该字段内容是 kv 格式的。`sequence_delim` 是sequence 中⾏为之间的分隔符,`attribute_delim` 是实际字段名字和字段值的分隔符。 -​ sequence_length 是 sequence 的⻓度,⽤户需要保证字段内容⼀定是补⻬到这个⻓度的。以上⾯的配置为例,⽤户需要有⼀个名字叫 click_field 的字段。假设某条record⾥它的内容是: +​⽬前使⽤ sequence_feature 要求使⽤ 新新版 feature_generator_java , tensorflow 训练流程要求使⽤ rtp_fg.parse_genreated_fg。 +​离线阶段没有sequence表去查,⽽是通过`sequence_column` 读取本来应该去表⾥查的字段。因此,`sequence_column ,sequence_delim ,attribute_delim` 这三个字段只有在离线 fg 阶段有⽤。`sequence_column` 是数据源odps表⾥所有 sequence 特征输⼊的字段名,离线fg会根据这个字段⾥的值⽣成sequence feature,该字段内容是 kv 格式的。`sequence_delim` 是sequence 中⾏为之间的分隔符,`attribute_delim` 是实际字段名字和字段值的分隔符。 +​sequence_length 是 sequence 的⻓度,⽤户需要保证字段内容⼀定是补⻬到这个⻓度的。以上⾯的配置为例,⽤户需要有⼀个名字叫 click_field 的字段。假设某条record⾥它的内容是: ``` 1 item__nid:11#item__price:2.0\u001D3.0;item__nid:22#item__price:4.0\u001D5.0 diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md index 3d5565001..f08220720 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -36,7 +36,7 @@ - 
Feature配置说明: - - [IdFeature](./fg_docs/IdFeature.md) + - [id_Feature](./fg_docs/IdFeature.md) - is_multi: id_feature是否是多值属性 @@ -62,7 +62,7 @@ - embedding_dimension/embedding_dim: 对应EasyRec feature_config.features里面的embedding_dim. - - [RawFeature](./fg_docs/RawFeature.md) + - [raw_feature](./fg_docs/RawFeature.md) - bucketize_boundaries: 会生成离散化的结果, 在生成EasyRec config的时候: @@ -94,10 +94,10 @@ - 该选项对生成数据有影响. - 该选项对生成EasyRec config也有影响, 对应到[feature_config.raw_input_dim](../proto.html#protos.FeatureConfig) - - [ComboFeature](./fg_docs/ComboFeature.md) + - [combo_feature](./fg_docs/ComboFeature.md) - 需要设置embedding_dimension和hash_bucket_size. - 方法一:在fg中生成combo特征,见[ComboFeature](./fg_docs/ComboFeature.md) + 方法一:在fg中生成combo特征,见[combo_feature](./fg_docs/ComboFeature.md) ``` {"expression" : ["user:user_id", "user:occupation"], "feature_name" : "combo__occupation_age_level", "feature_type" : "combo_feature", "hash_bucket_size": 10, "embedding_dim": 16} @@ -124,11 +124,11 @@ - feature_names: 除当前特征外,参与combo的特征,至少一项. - combiner, hash_bucket_size, embedding_dim 配置与上述一致. - - [LookupFeature](./fg_docs/LookupFeature.md) + - [lookup_feature](./fg_docs/LookupFeature.md) - 单层查找, 根据id(如item_id, item_category_id等)查找对应的value. - - [MatchFeature](./fg_docs/MatchFeature.md) + - [match_feature](./fg_docs/MatchFeature.md) - 双层查找, 根据category和item_id查找value. @@ -140,7 +140,7 @@ - needWeighting: 生成特征权重,即kv格式, kv之间用\[ctrl+v ctrl+e\]分割, 转换成TagFeature. 
- - [SequenceFeature](./fg_docs/SequenceFeature.md) + - [sequence_feature](./fg_docs/SequenceFeature.md) - 序列特征用于对用户行为建模, 通常应用于DIN和Transformer模型当中 @@ -158,7 +158,7 @@ - Note: item_seq(如item的图片列表)目前还不支持 - - [OverLapFeature](./fg_docs/OverLapFeature.md) + - [overlap_feature](./fg_docs/OverLapFeature.md) - 针对EasyRec的扩展字段: From c6fb600c13ee64e2925664a30d47495b36794d15 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 15:46:37 +0800 Subject: [PATCH 17/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/LookupFeature.md | 4 ++-- docs/source/feature/rtp_fg.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md index 2716669b0..dc1052460 100644 --- a/docs/source/feature/fg_docs/LookupFeature.md +++ b/docs/source/feature/fg_docs/LookupFeature.md @@ -42,7 +42,7 @@ item_value : "k2" ## 其它 -match feature 和 lookup feature都是匹配类型的特征,即从kv对中匹配到相应的结果。两者的区别是: match feature的被匹配字段user 必须是qinfo中传入的字段,即一次查询中对所有的doc来说这个字段的值都是一致的。而 lookup feature 的 key 和 map 没有来源的限制。 +match_feature 和 lookup_feature都是匹配类型的特征,即从kv对中匹配到相应的结果。两者的区别是: match_feature的被匹配字段user 必须是qinfo中传入的字段,即一次查询中对所有的doc来说这个字段的值都是一致的。而 lookup_feature 的 key 和 map 没有来源的限制。 ## 配置详解 @@ -78,7 +78,7 @@ key:{"k1"} 结果:feature={123} ``` -如果存在多个 key 时,可以通过配置 combiner 来组合多个查到的值。可能的配置有 `sum, mean, max, min`。 ps:如果要使用combiner的话需要将needDiscrete设置为false,只有dense类才能做conbiner,生成的value会是数值类的 +如果存在多个 key 时,可以通过配置 combiner 来组合多个查到的值。可能的配置有 `sum, mean, max, min`。 ps:如果要使用combiner的话需要将needDiscrete设置为false,只有dense类才能做combiner,生成的value会是数值类的 一个配置样例 update on 2021.04.15 diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md index f08220720..5d9255ab6 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -36,7 +36,7 @@ - Feature配置说明: - - [id_Feature](./fg_docs/IdFeature.md) + - [id_feature](./fg_docs/IdFeature.md) - is_multi: id_feature是否是多值属性 From 
1a337712600cea07dfc30d036b858ac966d7825c Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 15:49:09 +0800 Subject: [PATCH 18/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/ComboFeature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md index 777a033d8..86db576d5 100644 --- a/docs/source/feature/fg_docs/ComboFeature.md +++ b/docs/source/feature/fg_docs/ComboFeature.md @@ -1,6 +1,6 @@ # combo_feature -combo_feature是多个字段(或表达式)的组合(即笛卡尔积),id feature可以看成是一种特殊的combo feature,即参与交叉字段只有一个的combo feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 +combo_feature是多个字段(或表达式)的组合(即笛卡尔积),id_feature可以看成是一种特殊的combo_feature,即参与交叉字段只有一个的combo_feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 配置: From d6664b132e8b947c0e9e2f6d7dc734fac33fe84a Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 17:09:22 +0800 Subject: [PATCH 19/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/ComboFeature.md | 4 +++- docs/source/feature/fg_docs/IdFeature.md | 4 ++-- docs/source/feature/fg_docs/LookupFeature.md | 4 ++-- docs/source/feature/fg_docs/MatchFeature.md | 2 +- docs/source/feature/fg_docs/OverLapFeature.md | 2 +- .../source/feature/fg_docs/SequenceFeature.md | 21 +++++++++++-------- docs/source/feature/rtp_fg.md | 20 ++++++++---------- 7 files changed, 30 insertions(+), 27 deletions(-) diff --git a/docs/source/feature/fg_docs/ComboFeature.md b/docs/source/feature/fg_docs/ComboFeature.md index 86db576d5..5e0495cdd 100644 --- a/docs/source/feature/fg_docs/ComboFeature.md +++ b/docs/source/feature/fg_docs/ComboFeature.md @@ -1,8 +1,10 @@ # combo_feature +## 功能介绍 + combo_feature是多个字段(或表达式)的组合(即笛卡尔积),id_feature可以看成是一种特殊的combo_feature,即参与交叉字段只有一个的combo_feature。一般来讲,参与交叉的各个字段来自不同的表(比如user特征和item特征进行交叉)。 -配置: +## 配置方法 ``` { diff --git a/docs/source/feature/fg_docs/IdFeature.md b/docs/source/feature/fg_docs/IdFeature.md index 
1e697a00c..88a25488c 100644 --- a/docs/source/feature/fg_docs/IdFeature.md +++ b/docs/source/feature/fg_docs/IdFeature.md @@ -1,10 +1,10 @@ # id_feature -功能介绍 +## 功能介绍 id_feature是一个sparse feature,是一种最简单的离散特征,只是简单的将某个字段的值与用户配置的feature名字拼接。 -配置方法 +## 配置方法 ```json { diff --git a/docs/source/feature/fg_docs/LookupFeature.md b/docs/source/feature/fg_docs/LookupFeature.md index dc1052460..be2f0b549 100644 --- a/docs/source/feature/fg_docs/LookupFeature.md +++ b/docs/source/feature/fg_docs/LookupFeature.md @@ -1,6 +1,6 @@ # lookup_feature -## 功能简介 +## 功能介绍 如果离线生成不符合预期 请先使用最新的离线fg包 @@ -10,7 +10,7 @@ lookup_feature 依赖 map 和 key 两个字段,map是一个多值string(MultiS map 和 key 源可以是 item,user,context 的任意组合。在线输入的时候item的多值用多值分隔符char(29)分隔,user和context的多值在tpp访问时用list表示。该特征仅支持json形式的配置方式。 -## 实例 +## 配置方法 ```json { diff --git a/docs/source/feature/fg_docs/MatchFeature.md b/docs/source/feature/fg_docs/MatchFeature.md index 892af7a2a..4da869694 100644 --- a/docs/source/feature/fg_docs/MatchFeature.md +++ b/docs/source/feature/fg_docs/MatchFeature.md @@ -1,6 +1,6 @@ # match_feature -## match_feature使用说明 +## 功能介绍 match_feature一般用来做特征之间的匹配关系,要用到user,item和category三个字段的值。 match_feature支持两种类型,hit和multi hit。 diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md index b9aeb0f0a..5faf09b68 100644 --- a/docs/source/feature/fg_docs/OverLapFeature.md +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -1,6 +1,6 @@ # overlap_feature -## 功能简介 +## 功能介绍 用来输出一些字符串字词匹配信息的feature diff --git a/docs/source/feature/fg_docs/SequenceFeature.md b/docs/source/feature/fg_docs/SequenceFeature.md index 4afa27e3b..f6333d2e0 100644 --- a/docs/source/feature/fg_docs/SequenceFeature.md +++ b/docs/source/feature/fg_docs/SequenceFeature.md @@ -1,8 +1,11 @@ # sequence类feature -## 基本场景 +## 功能介绍 ⽤户的历史⾏为也是⼀个很重要的 feature。历史⾏为通常是⼀个序列,例如点击序列、购买序列等,组成这个序列的实体可能是商品本身。 + +## 配置方法 + 例如我们需要对⽤户的点击序列进⾏ fg,序列⻓度为 30,每个序列提取 nid 和 price, seq_context 特征。正常 item 维度有⼀个 feat0 特征。配置如下: ```json @@ 
-44,11 +47,11 @@ } ``` -## 在线 FG +### 在线 FG -我们⽀持两种⽅式获取⾏为序列,⼀种如例⼦所示,我们以 sequence_pk 配置的字段为主键,RTP 会帮忙从 item 表中查到序列的对应字段值;另⼀种⽤户需要在 qinfo 中准备好所有的字段。 +我们⽀持两种⽅式获取⾏为序列,⼀种如例⼦所示,我们以 `sequence_pk` 配置的字段为主键,RTP 会帮忙从 item 表中查到序列的对应字段值;另⼀种⽤户需要在 `qinfo` 中准备好所有的字段。 -### RTP 取 sequence 字段 +#### RTP 取 sequence 字段 第⼀种情况,`sequence_pk` 的⻓度应该⼩于等于 `sequence_length` 。如果 `sequence_pk` 指定的值不⾜ `sequence_length` 个会补⻬到 `sequence_length` ⻓度,fg 的结果会出默认值(dense 类是 0,sparse 类为空)。 qinfo 例⼦: @@ -59,9 +62,9 @@ qinfo 例⼦: } ``` -### qinfo 传递 sequence 字段 +#### qinfo 传递 sequence 字段 -第⼆种情况,sequence_feature 也⽀持所有的序列内容都从 qinfo 中传递。例如这⾥的user:seq_context 数组,他的值分别对应 click_0 和 click_1 。这种情况下⽤户可以忽略sequence_table 和 sequence_pk 。 +第⼆种情况,sequence_feature 也⽀持所有的序列内容都从 qinfo 中传递。例如这⾥的`user:seq_context` 数组,他的值分别对应 `click_0` 和 `click_1` 。这种情况下⽤户可以忽略`sequence_table` 和 `sequence_pk` 。 qinfo 例⼦: ```json @@ -72,7 +75,7 @@ qinfo 例⼦: } ``` -### context seq使⽤ +#### context seq使⽤ ``` { @@ -110,7 +113,7 @@ context seq特征与user seq类似,区别是每个context是batch size维度 第⼀类特征:需要查context_table,如price特征,会根据context_seq_id查询context_table中的price,然后做fg, 第⼆类特征:不需要context_table,如seq_context特征,会直接取seq_context做fg, -### item seq使⽤ +#### item seq使⽤ 增加"is_item_seq": true配置,如下, @@ -172,7 +175,7 @@ context seq特征与user seq类似,区别是每个context是batch size维度 } ``` -## 离线 FG +### 离线 FG ​⽬前使⽤ sequence_feature 要求使⽤ 新新版 feature_generator_java , tensorflow 训练流程要求使⽤ rtp_fg.parse_genreated_fg。 ​离线阶段没有sequence表去查,⽽是通过`sequence_column` 读取本来应该去表⾥查的字段。因此,`sequence_column ,sequence_delim ,attribute_delim` 这三个字段只有在离线 fg 阶段有⽤。`sequence_column` 是数据源odps表⾥所有 sequence 特征输⼊的字段名,离线fg会根据这个字段⾥的值⽣成sequence feature,该字段内容是 kv 格式的。`sequence_delim` 是sequence 中⾏为之间的分隔符,`attribute_delim` 是实际字段名字和字段值的分隔符。 diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md index 5d9255ab6..4d4097130 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -223,16 +223,11 @@ | ----- | ------- | ------- | --------------- | 
--------------------------------------------------------------- | -------------------------------------------------- | | 0 | 122017 | 389957 | | tag_category_list:4589,new_user_class_level:,...,user_id:122017 | adgroup_id:539227,pid:430548_1007,...,cate_id:4281 | -```sql --- taobao_train_input.txt oss://easyrec/data/rtp/ --- wget http://easyrec.oss-cn-beijing.aliyuncs.com/data/rtp/taobao_train_input.txt --- wget http://easyrec.oss-cn-beijing.aliyuncs.com/data/rtp/taobao_test_input.txt -drop table if exists taobao_train_input; -create table if not exists taobao_train_input(`label` BIGINT,user_id STRING,item_id STRING,context_feature STRING,user_feature STRING,item_feature STRING); -tunnel upload taobao_train_input.txt taobao_train_input -fd=';'; -drop table if exists taobao_test_input; -create table if not exists taobao_test_input(`label` BIGINT,user_id STRING,item_id STRING,context_feature STRING,user_feature STRING,item_feature STRING); -tunnel upload taobao_test_input.txt taobao_test_input -fd=';'; +提供了两张在任何项目下都可以访问的样例表: + +``` +pai_online_project.taobao_train_input +pai_online_project.taobao_test_input ``` - 稠密格式的数据,每个特征是单独的一列,如: @@ -242,7 +237,7 @@ tunnel upload taobao_test_input.txt taobao_test_input -fd=';'; | 1 | 122017 | 389957 | 4589 | | 0 | ```sql - drop table if exists taobao_train_input; + drop table if exists taobao_train_input_dense; create table taobao_train_input_dense(label bigint, user_id string, item_id string, tag_category_list bigint, ...); ``` @@ -267,9 +262,11 @@ set odps.sql.counters.dynamic.limit=true; drop table if exists taobao_fg_train_out; create table taobao_fg_train_out(label bigint, user_id string, item_id string, features string); +#--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"} dataworks内运行要添加 jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i taobao_train_input -o taobao_fg_train_out -f fg.json; drop
table if exists taobao_fg_test_out; create table taobao_fg_test_out(label bigint, user_id string, item_id string, features string); +#--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"} dataworks内运行要添加 jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i taobao_test_input -o taobao_fg_test_out -f fg.json; --下载查看数据(可选) @@ -281,6 +278,7 @@ tunnel download taobao_fg_test_out taobao_fg_test_out.txt -fd=';'; - 支持分区表,分区表可以指定partition,也可以不指定partition,不指定partition时使用所有partition - **分区格式示例:** my_table/day=20201010,sex=male - 可以用多个-i指定**多个表的多个分区** + - 支持添加project,示例:pai_online_project.taobao_train_input - -o, 输出表,如果是分区表,一定要指定分区,只能指定一个输出表 - -f, fg.json - -m, mapper memory的大小,默认可以不设置 From 4d3cec2a173d505251f9ec012a5a2ad75dff9159 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Tue, 12 Sep 2023 17:19:58 +0800 Subject: [PATCH 20/22] change_feature_pdf_to_md --- docs/source/feature/rtp_fg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/feature/rtp_fg.md b/docs/source/feature/rtp_fg.md index 4d4097130..e8dd4155d 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -278,7 +278,7 @@ tunnel download taobao_fg_test_out taobao_fg_test_out.txt -fd=';'; - 支持分区表,分区表可以指定partition,也可以不指定partition,不指定partition时使用所有partition - **分区格式示例:** my_table/day=20201010,sex=male - 可以用多个-i指定**多个表的多个分区** - - 支持添加project,示例:pai_online_project.taobao_train_input + - 支持添加project,示例:project.table/ds=xxx - -o, 输出表,如果是分区表,一定要指定分区,只能指定一个输出表 - -f, fg.json - -m, mapper memory的大小,默认可以不设置 From fab082b866ef47b30b144818a078704a07137745 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Wed, 13 Sep 2023 10:19:18 +0800 Subject: [PATCH 21/22] change_feature_pdf_to_md --- docs/source/feature/rtp_fg.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/feature/rtp_fg.md 
b/docs/source/feature/rtp_fg.md index e8dd4155d..7fdc41bec 100644 --- a/docs/source/feature/rtp_fg.md +++ b/docs/source/feature/rtp_fg.md @@ -262,12 +262,14 @@ set odps.sql.counters.dynamic.limit=true; drop table if exists taobao_fg_train_out; create table taobao_fg_train_out(label bigint, user_id string, item_id string, features string); -#--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"} dataworks内运行要添加 -jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i taobao_train_input -o taobao_fg_train_out -f fg.json; +-- dataworks内运行,注意需要带有resource_reference这一行 +--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"} +jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i pai_online_project.taobao_train_input -o taobao_fg_train_out -f fg.json; drop table if exists taobao_fg_test_out; create table taobao_fg_test_out(label bigint, user_id string, item_id string, features string); -#--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"} dataworks内运行要添加 -jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i taobao_test_input -o taobao_fg_test_out -f fg.json; +-- dataworks内运行,注意需要带有resource_reference这一行 +--@resource_reference{"fg_on_odps-1.3.59-jar-with-dependencies.jar"} +jar -resources fg_on_odps-1.3.59-jar-with-dependencies.jar,fg.json -classpath fg_on_odps-1.3.59-jar-with-dependencies.jar com.taobao.fg_on_odps.EasyRecFGMapper -i pai_online_project.taobao_test_input -o taobao_fg_test_out -f fg.json; --下载查看数据(可选) tunnel download taobao_fg_test_out taobao_fg_test_out.txt -fd=';'; From 117cc411296f5b2aa375283343d49bfc9bd74be7 Mon Sep 17 00:00:00 2001 From: wb-lcl910122 Date: Wed, 13 Sep 2023 10:33:13 +0800 
Subject: [PATCH 22/22] change_feature_pdf_to_md --- docs/source/feature/fg_docs/OverLapFeature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/feature/fg_docs/OverLapFeature.md b/docs/source/feature/fg_docs/OverLapFeature.md index 5faf09b68..b6396db61 100644 --- a/docs/source/feature/fg_docs/OverLapFeature.md +++ b/docs/source/feature/fg_docs/OverLapFeature.md @@ -4,7 +4,7 @@ 用来输出一些字符串字词匹配信息的feature -离线推荐使用1.3.56-SNAPSHOT这个版本,或者1.3.28(不支持参数need_prefix) ps: 写fg的时候注意维度,title的维度要大于或等于query的问题(简单来说就是如果title是user特征,那query也只能是user特征,user特征的batch size为1,商品特征的batch size为商品数) +离线推荐使用1.3.56-SNAPSHOT这个版本。 ps: 写fg的时候注意维度问题:title的维度要大于或等于query的维度(简单来说就是如果title是user特征,那query也只能是user特征,user特征的batch size为1,商品特征的batch size为商品数) | 方式 | 描述 | 备注 | | ------------------- | ----------------------------------------------- | ------------------ |