forked from maged918/tagaware-rec-sys
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stats.py
183 lines (165 loc) · 7.24 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import pandas as pd
import pickle
import config
from joiner import build_tags, create_df
import matplotlib as mpl
# mpl.use('pgf')
# mpl.use('pdf')
import numpy as np
import config
can_visualize = True
try:
import seaborn as sns
except ImportError:
can_visualize = False
from itertools import product
# print(df.groupby['string'].value_counts())
from scipy import stats
def figsize(scale, fig_width_pt=469.755):
    """Return a ``[width, height]`` figure size in inches.

    The width is a fraction of a LaTeX text width and the height follows
    from the golden ratio.

    Args:
        scale: Fraction of the text width the figure should span
            (e.g. ``0.9`` for 90%).
        fig_width_pt: Text width in TeX points. The default is the value
            the original code hard-coded (obtain it from LaTeX with
            ``\\the\\textwidth``).

    Returns:
        A two-element list ``[fig_width, fig_height]`` in inches.
    """
    inches_per_pt = 1.0 / 72.27                   # TeX points -> inches
    golden_mean = (np.sqrt(5.0) - 1.0) / 2.0      # aesthetic height/width ratio
    fig_width = fig_width_pt * inches_per_pt * scale
    fig_height = fig_width * golden_mean
    return [fig_width, fig_height]
# Matplotlib rcParams overrides for rendering figures through LaTeX/pgf.
# NOTE: these settings are not applied at the moment -- the
# mpl.rcParams.update(pgf_with_latex) call following this dict is commented out.
pgf_with_latex = {  # setup matplotlib to use latex for output
    "pgf.texsystem": "pdflatex",  # change this if using xetex or lualatex
    "text.usetex": True,  # use LaTeX to write all text
    "font.family": "serif",
    "font.serif": [],  # blank entries should cause plots to inherit fonts from the document
    "font.sans-serif": [],
    "font.monospace": [],
    "axes.labelsize": 14,  # LaTeX default is 10pt font.
    "font.size": 12,
    "legend.fontsize": 12,  # Make the legend/label fonts a little smaller
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "figure.figsize": figsize(0.9),  # default fig size of 0.9 textwidth
    "pgf.preamble": [
        r"\usepackage[utf8x]{inputenc}",  # use utf8 fonts because your computer can handle it :)
        r"\usepackage[T1]{fontenc}",  # plots will be generated using this preamble
    ]
}
# mpl.rcParams.update(pgf_with_latex)
import matplotlib.pyplot as plt
def newfig(width):
    """Clear the current figure and open a new single-axes figure.

    Args:
        width: Scale factor handed to ``figsize`` to size the new figure.

    Returns:
        A ``(figure, axes)`` tuple for the newly created figure.
    """
    # Wipe whatever pyplot currently considers the active figure first.
    plt.clf()
    dimensions = figsize(width)
    figure = plt.figure(figsize=dimensions)
    return figure, figure.add_subplot(111)
def savefig(filename):
    """Write the current pyplot figure to ``<filename>.png``.

    Args:
        filename: Output path without extension.
    """
    png_path = '{}.png'.format(filename)
    plt.savefig(png_path)
    # A PDF export ('{}.pdf') was used previously and is currently disabled.
def ema(y, a):
    """Compute the exponential moving average of a sequence.

    The recurrence is ``s[0] = y[0]`` and
    ``s[t] = a * y[t] + (1 - a) * s[t - 1]`` for ``t >= 1``.

    Args:
        y: Indexable sequence of numbers (supports ``len`` and ``y[t]``).
        a: Smoothing factor applied to the newest observation.

    Returns:
        A NumPy array with one smoothed value per element of ``y``;
        an empty array when ``y`` is empty.
    """
    # Guard the empty case explicitly: seeding from y[0] would raise IndexError.
    if len(y) == 0:
        return np.array([])

    smoothed = [y[0]]
    for t in range(1, len(y)):
        smoothed.append(a * y[t] + (1 - a) * smoothed[-1])
    return np.array(smoothed)
def visualize_totals(inst_feats):
    """Show side-by-side count plots of the 'minus' feature for two tags.

    Args:
        inst_feats: Grouped feature table exposing ``get_group(tag)``; the
            module-level caller passes the result of
            ``inst_feats.groupby('tags')``. Each group must have a
            ``'minus'`` column.

    Does nothing when seaborn could not be imported, matching the guard
    used by ``visualize``.
    """
    if not can_visualize:
        return

    fig, axes = plt.subplots(ncols=2, sharey=False)
    for ax, tag in zip(axes, ('dp', 'brute force')):
        ops = inst_feats.get_group(tag)['minus']
        # Only values below 30 are plotted. The values are passed as ``x=``
        # explicitly because newer seaborn releases treat the first
        # positional argument as ``data``.
        sns.countplot(x=list(ops[ops < 30]), ax=ax)
    plt.show()
def visualize(inst_feats):
    """Save one box plot per feature column, grouped by tag.

    For every column other than ``'id'``, ``'problem_id'`` and ``'tags'``,
    a box plot (outliers hidden) of that column per tag is written to
    ``dataset/<algo_mode>/<column>.png`` via ``savefig``.

    Args:
        inst_feats: DataFrame with a ``'tags'`` column plus feature columns.

    Reads the module-level ``algo_mode``. In ``'algos'`` mode only four tags
    are plotted, in a fixed order. Does nothing when seaborn is unavailable.
    """
    if not can_visualize:
        return

    # The tag selection does not depend on the column being plotted, so it
    # is resolved once here instead of on every loop iteration.
    if algo_mode == 'algos':
        order = ['matrices', 'combinatorics', 'shortest paths', 'flows']
        inst_feats = inst_feats.loc[inst_feats['tags'].isin(order)]
    else:
        order = inst_feats.loc[:, 'tags'].unique()

    feature_cols = inst_feats.columns.difference(['id', 'problem_id', 'tags'])
    for col in feature_cols:
        fig, ax = newfig(1.2)
        sns.boxplot(y=col, x='tags', data=inst_feats, orient='v',
                    showfliers=0, order=order, ax=ax)
        plt.ylabel(col)
        savefig('dataset/%s/%s' % (algo_mode, col))
        # The figure is already on disk; close it so figures do not pile up
        # (one per feature column) for the lifetime of the process.
        plt.close(fig)
def visualize_pairs(inst_feats):
    """Show a regression scatter plot of 'minus' against 'arrays' per tag.

    Args:
        inst_feats: DataFrame with ``'minus'``, ``'arrays'`` and ``'tags'``
            columns.

    The x axis is limited to [0, 20] and the y axis to [0, 100]. Does
    nothing when seaborn could not be imported, matching the guard used by
    ``visualize`` (this function is called unconditionally at module level).
    """
    if not can_visualize:
        return

    lm = sns.lmplot(y='minus', x='arrays', hue='tags', data=inst_feats,
                    fit_reg=True)
    # lmplot returns a grid of axes; there is a single facet here.
    first_ax = lm.axes[0][0]
    first_ax.set_xlim(0, 20)
    first_ax.set_ylim(0, 100)
    plt.show()
def get_properties():
    """Print the 'dp_bf' classifier results for each configured difficulty.

    Reads ``out-classifier.csv`` and, for every difficulty group returned by
    ``config.get_difficulties()`` (joined with ``':'``), prints a separator
    line followed by the matching rows.
    """
    results = pd.read_csv('out-classifier.csv', header=0, sep=',')
    labels = [':'.join(parts) for parts in config.get_difficulties()]
    separator = '--------------------------------------------'
    for label in labels:
        print(separator)
        same_difficulty = results['difficulties'] == label
        dp_bf_only = results['algo_mode'] == 'dp_bf'
        print(results.loc[same_difficulty & dp_bf_only])
def _single_acc(frame, feat_type):
    """Return the single 'acc' value in ``frame`` for ``feat_type`` as a float."""
    matches = frame.loc[frame['feat_types'] == feat_type, 'acc']
    if len(matches) != 1:
        raise ValueError(
            'expected exactly one accuracy for feat_types=%r, found %d'
            % (feat_type, len(matches)))
    return float(matches.iloc[0])


def load_output(path='out-classifier.csv', rows=range(302, 319)):
    """Print a LaTeX table body of classifier accuracies per feature set.

    Args:
        path: CSV file with at least ``classifier``, ``feat_types`` and
            ``acc`` columns. Defaults to the file the original code read.
        rows: Positional row indices of the experiment block to report.
            Defaults to the window the original code hard-coded.

    Raises:
        ValueError: If a classifier/feature-set pair does not match exactly
            one row inside the selected window.

    Example of the printed rows:
        all\\_feats & 0.85 & 0.89 & 0.88 \\\\ \\hline
        count\\_vars & 0.70 & 0.81 & 0.82 \\\\ \\hline
    """
    out = pd.read_csv(path, header=0, sep=',')
    algos_df = out.iloc[list(rows), :]

    # Backslashes are escaped explicitly; the emitted text is unchanged but
    # the literal no longer relies on the invalid '\h' escape sequence.
    print("& SVM & RFT & ADA \\\\ \\hline")

    classifiers = ('SVM', 'RFT', 'ADA')
    per_classifier = {
        name: algos_df.loc[algos_df['classifier'] == name, ['acc', 'feat_types']]
        for name in classifiers
    }
    print(per_classifier['SVM'])

    # (LaTeX row label, value stored in the 'feat_types' column)
    feature_sets = [
        ('all\\_feats', 'all_feats'),
        ('count\\_vars', 'count_vars'),
        ('count\\_vars + operations', 'count_vars:operations'),
        ('count\\_vars + operations + constructs',
         'count_vars:operations:constructs'),
        ('lines', 'lines'),
    ]
    for word, feat_type in feature_sets:
        # Pull plain floats out of the frames instead of formatting
        # one-element Series with '%.2f'.
        accs = tuple(_single_acc(per_classifier[name], feat_type)
                     for name in classifiers)
        print('%s & %.2f & %.2f & %.2f \\\\ \\hline' % ((word,) + accs))
def _within_three_sigma(column):
    """Flag values of one column lying less than 3 standard deviations from its mean."""
    spread = column.std()
    # A zero spread would divide by zero; use 1 so identical values pass.
    # NOTE(review): a single-row group has NaN spread, so its row is
    # rejected, as in the original expression -- confirm this is intended.
    if spread == 0:
        spread = 1
    return np.abs(column - column.mean()) / spread < 3


def problems_stats(inst_feats, out_path=None):
    """Drop per-problem outliers and pickle the remaining rows.

    Rows are grouped by ``'problem_id'``. Within each group a row is kept
    only if every feature column (all columns except ``'id'``, ``'tags'``
    and ``'problem_id'``) is less than three standard deviations from that
    group's mean for the column.

    Args:
        inst_feats: DataFrame with ``'id'``, ``'tags'``, ``'problem_id'``
            and numeric feature columns.
        out_path: Destination pickle file. Defaults to
            ``config.get_feat_prefix() + 'features-pandas-no-outliers.pickle'``.

    Prints a progress counter every 100 groups and the final result shape.
    """
    feature_cols = inst_feats.columns.difference(['id', 'tags', 'problem_id'])

    kept = []
    for i, (_, group) in enumerate(inst_feats.groupby('problem_id')):
        inliers = group[feature_cols].apply(_within_three_sigma).all(axis=1)
        kept.append(group[inliers])
        if i % 100 == 0:
            print(i)

    # Concatenate once at the end: DataFrame.append is gone from current
    # pandas and re-copied the accumulated frame on every iteration.
    if kept:
        result = pd.concat(kept)
    else:
        result = pd.DataFrame(columns=inst_feats.columns)
    print(result.shape)

    if out_path is None:
        out_path = config.get_feat_prefix() + 'features-pandas-no-outliers.pickle'
    # Context manager so the file is flushed and closed deterministically.
    with open(out_path, 'wb') as handle:
        pickle.dump(result, handle)
# ---------------------------------------------------------------------------
# Script driver: load the feature table, attach tags, and run the selected
# reports. Disabled steps are kept as commented-out toggles.
# ---------------------------------------------------------------------------
ds_dir = config.get_ds_dir()
in_dir = ds_dir + 'DivAll'
algo_mode = config.get_algorithm_modes()[0]  # also read by visualize()
tags_file = config.get_tags_file(in_dir, algo_mode)

# NOTE: pickle.load can execute arbitrary code; only load feature files that
# were produced by this project. The context manager closes the file handle,
# which the previous bare open() call leaked.
with open(config.get_feat_prefix() + 'features-pandas.pickle', 'rb') as feats_file:
    inst_feats = pickle.load(feats_file)

tags_list, delete_keys, inst_tags = build_tags(tags_file)
inst_feats, X, Y = create_df(inst_feats, inst_tags, delete_keys)
grouped = inst_feats.groupby('tags')
# print(grouped['operations'].describe())
# visualize(inst_feats)
visualize_pairs(inst_feats)
# problems_stats(inst_feats)
visualize_totals(grouped)
load_output()
# get_properties()