This repository has been archived by the owner on Oct 7, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
convert_tabular_to_graph_data_train.py
123 lines (94 loc) · 4.47 KB
/
convert_tabular_to_graph_data_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: MIT
import pandas as pd
import glob as glob
import numpy as np
from tqdm import tqdm
import argparse


def _str2bool(value):
    """Parse textual booleans so `--normalize False` is actually falsy.

    argparse's `type=bool` calls `bool()` on the raw string, so any
    non-empty value — including "False" — evaluated truthy and silently
    enabled normalization.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() in ("1", "true", "yes", "y", "t")


parser = argparse.ArgumentParser()
# nargs='+' produces a list, so the defaults must be lists too —
# a bare string default would make role1.sort() below raise
# AttributeError when no CLI value is given.
parser.add_argument("--role1", "-r1", nargs='+', type=str, default=["f_15"])
parser.add_argument("--role2", "-r2", nargs='+', type=str, default=["f_15"])
# Single value (no nargs): with nargs='+' the parsed result was a list,
# and even [False] is truthy in the `if normalize:` check below.
parser.add_argument("--normalize", "-n", type=_str2bool, default=False)
args = parser.parse_args()
role1 = args.role1
role2 = args.role2
normalize = args.normalize
print("role1 = ", role1)
print("role2 = ", role2)
# Read Data
def read_data():
    """Load every training shard and concatenate them into one DataFrame.

    Globs ``../../data/train/*.csv`` and reads each file with
    ``pd.read_table`` (pyarrow engine). NOTE(review): ``read_table``
    defaults to a tab separator even though the files are named ``.csv``
    — presumably the dumps are tab-separated; confirm against the data.

    Returns:
        pd.DataFrame: all training rows with a fresh 0..N-1 index.

    Raises:
        ValueError: from ``pd.concat`` if no input files are found.
    """
    all_train_csvs = glob.glob("../../data/train/*.csv")
    frames = []
    for file in tqdm(all_train_csvs, total=len(all_train_csvs),
                     desc="Read Train files"):
        frames.append(pd.read_table(file, engine="pyarrow"))
    return pd.concat(frames, axis=0, ignore_index=True)
train_df = read_data()
print("processing...")
# Hold out day 66 (f_1 is the day index) as the validation split.
# .copy() materializes each slice so the mask-column assignments below
# don't hit pandas' SettingWithCopy chained-indexing trap.
val_df = train_df[train_df["f_1"] == 66].copy()
train_df = train_df[train_df["f_1"] < 66].copy()
train_df["train_mask"] = np.ones(len(train_df))
train_df["val_mask"] = np.zeros(len(train_df))
train_df["test_mask"] = np.zeros(len(train_df))
val_df["train_mask"] = np.zeros(len(val_df))
val_df["val_mask"] = np.ones(len(val_df))
val_df["test_mask"] = np.zeros(len(val_df))
fullds_df = pd.concat([train_df, val_df])
# Column layout as used below: f_1 date, f_2..f_41 categorical,
# f_42..f_79 numerical — TODO confirm against the dataset description.
date_feature_inds = np.arange(1)
date_features = np.char.add("f_", date_feature_inds.astype(str))
categorical_feature_inds = np.arange(2, 42)
categorical_features = np.char.add("f_", categorical_feature_inds.astype(str))
numerical_feature_inds = np.arange(42, 80)
numerical_features = np.char.add("f_", numerical_feature_inds.astype(str))
all_feature_inds = np.arange(1, 80)
all_features = np.char.add("f_", all_feature_inds.astype(str))
labels = ['is_clicked', 'is_installed']
# Empty strings come through the reader as '' rather than NaN.
fullds_df = fullds_df.replace('', np.nan)
# Missing categoricals and labels are encoded as -1.
for col in categorical_features:
    if fullds_df[col].isnull().values.any():
        fullds_df[col] = fullds_df[col].fillna(-1)
for col in labels:
    if fullds_df[col].isnull().values.any():
        fullds_df[col] = fullds_df[col].fillna(-1)
# Missing numericals are mean-imputed. Uses np.nan: the np.NaN alias
# was removed in NumPy 2.0.
for col in numerical_features:
    if fullds_df[col].isnull().values.any():
        fullds_df[col] = fullds_df[col].replace(np.nan, fullds_df[col].mean())
if normalize:
    print("WARNING: Using normalization of numerical features")
    fullds_df[numerical_features] = (
        fullds_df[numerical_features] - fullds_df[numerical_features].mean()
    ) / fullds_df[numerical_features].std()
labels = ['is_clicked', 'is_installed']
# The two roles define the graph's node types; every remaining column
# (role3) becomes an edge feature. Sort for a deterministic order.
role1.sort()
role2.sort()
role3 = list(set(fullds_df.columns) - set(role1) - set(role2) - set(labels))
role3.sort()
role_1_df = fullds_df.drop_duplicates(subset=role1).reset_index()[role1]
role_2_df = fullds_df.drop_duplicates(subset=role2).reset_index()[role2]
def get_group_idx(df, group_id, f=None):
    """Attach a dense group index to every row of *df*.

    Rows sharing the same values in columns *f* receive the same
    ``group_{group_id}`` id; ids are consecutive ints in first-appearance
    order (``sort=False`` in the groupby preserves encounter order).

    Args:
        df: input frame; must contain an ``f_0`` column (used only as a
            throwaway aggregation target) plus every column in *f*.
        group_id: suffix for the new column, e.g. 1 -> ``group_1``.
        f: list of column names to group by. ``None`` (the default)
            means no columns — kept as ``None`` instead of ``[]`` to
            avoid a shared mutable default argument.

    Returns:
        pd.DataFrame: *df* merged with the new ``group_{group_id}``
        column (inner merge on *f*).
    """
    f = [] if f is None else f
    # Count an arbitrary column (f_0) to collapse to one row per group,
    # then reuse that frame's positional index as the dense group id.
    # (The original also computed an unused "first non-group column".)
    groups = df.groupby(by=f, as_index=False, sort=False)['f_0'].count().drop('f_0', axis=1)
    groups[f'group_{group_id}'] = groups.index
    return df.merge(groups, on=f)
# Map each row to its role-1 group (source node) and role-2 group
# (destination node), yielding the bipartite edge list.
trial_df = get_group_idx(fullds_df, 1, role1)
trial_df = get_group_idx(trial_df, 2, role2)
trial_df.rename(columns={"group_1": "src_id", "group_2": "dst_id"}, inplace=True)
filename = './graph_data/sym_recsys_hetero_CSVDatasets/edges_train.csv.gz'
# Select the edge columns ONCE — the original re-sliced trial_df on
# every loop iteration, copying all columns 100 times.
edge_columns = ['src_id', 'dst_id'] + role1 + role2 + role3 + labels
edge_df = trial_df[edge_columns]
# Write in 100 chunks to bound peak memory; appending produces a valid
# multi-member gzip that pandas/gzip read back transparently.
chunks = np.array_split(trial_df.index, 100)
for chunk_no, subset in enumerate(tqdm(chunks, desc='Writing csv file')):
    if chunk_no == 0:
        # First chunk truncates any existing file and carries the header.
        edge_df.loc[subset].to_csv(filename, mode='w', compression='gzip', index=True)
    else:
        edge_df.loc[subset].to_csv(filename, header=None, mode='a', compression='gzip', index=True)
# NOTE(review): opened in append mode — repeated runs accumulate
# duplicate feature lists in the yaml; confirm this is intended.
with open('./yaml_store/recsys2graph.yaml', 'a') as f:
    f.write('edge_features: ' + str(list(set(role3) - set(['f_7', 'train_mask', 'test_mask', 'val_mask']))) + '\n')
    f.write('node_features:' + '\n')
    # f_7 is added to both roles only for the node-feature listing;
    # the edge file above was written before this mutation.
    role1.append('f_7')
    role2.append('f_7')
    f.write(' - ' + str(role1) + '\n')
    f.write(' - ' + str(role2) + '\n')