-
Notifications
You must be signed in to change notification settings - Fork 0
/
datapreprocess.cpp
105 lines (96 loc) · 2.95 KB
/
datapreprocess.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include<bits/stdc++.h>
using namespace std;
// Number of equal-width bins each continuous attribute is split into.
#define NUMBER_OF_INTERVALS 5
// Legend file: one "<item id> <description>" line per generated item.
std::fstream h("valtoattr.txt", std::ios::out);
// Next unused item id; advanced once for every interval label written to `h`.
int counter = 1;

// Discretizes a continuous attribute into `numintervals` equal-width bins.
//
// Side effects:
//   * writes `numintervals` legend lines "<id> <lo><=<attrname><<hi>" to `h`,
//     consuming ids counter .. counter+numintervals-1;
//   * appends one bin number in [1, numintervals] to `classes` for every
//     element of `v`, in order.
//
// Parameters:
//   v            - raw attribute values.
//   classes      - output; bin numbers are appended (existing content is kept).
//   numintervals - number of bins; nothing is done when it is <= 0.
//   attrname     - attribute name used in the legend lines.
//
// Edge cases:
//   * empty `v`: no legend lines are written (there is no range to describe),
//     but `counter` still advances by `numintervals` so that later attributes
//     keep the id offsets they would otherwise have had.
//   * all values equal (zero-width range): every value is placed in bin 1.
//
// NOTE(review): binning uses ceil(), so a value lying exactly on an interior
// boundary falls into the lower bin, i.e. bins are effectively (lo, hi] with
// the minimum folded into bin 1, while the legend text prints "lo<=x<hi".
// The legend format is kept as-is for compatibility with existing consumers.
void process_continuous(const std::vector<double> &v, std::vector<int> &classes, int numintervals, const std::string &attrname)
{
    if (numintervals <= 0)
        return;
    if (v.empty())
    {
        counter += numintervals;
        return;
    }

    const double lo = *std::min_element(v.begin(), v.end());
    const double hi = *std::max_element(v.begin(), v.end());
    const double intervalwidth = (hi - lo) / numintervals;

    // Legend: interval boundaries are multiples of the interval *width*
    // (not of the interval count) above the minimum.
    for (int i = 0; i < numintervals; ++i)
    {
        h << counter << " " << (lo + (i * intervalwidth)) << "<=" << attrname << "<" << (lo + ((i + 1) * intervalwidth)) << '\n';
        counter++;
    }

    classes.reserve(classes.size() + v.size());
    for (std::size_t i = 0; i < v.size(); ++i)
    {
        int cl = 1;
        // A zero width would mean dividing 0 by 0; such columns stay in bin 1.
        if (intervalwidth > 0)
        {
            double bin = std::ceil((v[i] - lo) / intervalwidth);
            // The minimum itself yields 0; rounding error at the maximum can
            // yield numintervals + 1. Clamp both into the valid range so ids
            // never spill into a neighbouring attribute's range.
            if (bin < 1)
                bin = 1;
            if (bin > numintervals)
                bin = numintervals;
            cl = static_cast<int>(bin);
        }
        classes.push_back(cl);
    }
}
// Reads one numeric field from `in` into `value`, then consumes a single ','
// if one immediately follows, so both whitespace-separated and
// comma-separated records are accepted. Returns false if extraction failed.
template <typename T>
static bool read_field(std::istream &in, T &value)
{
    if (!(in >> value))
        return false;
    // Only peek while the stream is still good: peeking after the extraction
    // already hit end-of-file would set failbit on an otherwise valid read.
    if (in.good() && in.peek() == ',')
        in.ignore();
    return true;
}

// Converts the diabetes data file into a transaction file of item ids.
//
// Reads `numdata` records of 8 continuous attributes plus a 0/1 class from
// "pima-indians-diabetes.data", discretizes every continuous attribute with
// process_continuous(), and writes:
//   * "transactions.txt": a header "<records> <number of distinct items>"
//     followed by one line per record: "<item count> <id> <id> ... ";
//   * the legend stream `h`: the id -> meaning lines for the two class items
//     (the interval items are written by process_continuous()).
// Returns 0 on success, 1 if a file cannot be opened, read or written.
int main()
{
    // Input columns, in file order:
    //   1. Number of times pregnant
    //   2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
    //   3. Diastolic blood pressure (mm Hg)
    //   4. Triceps skin fold thickness (mm)
    //   5. 2-Hour serum insulin (mu U/ml)
    //   6. Body mass index (weight in kg/(height in m)^2)
    //   7. Diabetes pedigree function
    //   8. Age (years)
    //   9. Class variable (0 or 1)
    const int numcontinuous = 8;
    const int numattributes = numcontinuous + 1;
    const int numdata = 768;
    const char *const attrnames[numcontinuous] = {
        "number-of-pregnancies",
        "plasma-glucose-concentration",
        "diastolic-blood-pressure",
        "triceps-skin-fold-thickness",
        "serum-insulin",
        "body-mass-index",
        "diabetes-pedigree-function",
        "age"
    };

    const char *const inputpath = "pima-indians-diabetes.data";
    std::fstream f(inputpath, std::ios::in);
    if (!f.is_open())
    {
        std::cerr << "error: cannot open " << inputpath << " for reading\n";
        return 1;
    }
    std::fstream g("transactions.txt", std::ios::out);
    if (!g.is_open() || !h.is_open())
    {
        std::cerr << "error: cannot open output files for writing\n";
        return 1;
    }

    // columns[j][i] holds continuous attribute j of record i.
    std::vector<std::vector<double> > columns(numcontinuous, std::vector<double>(numdata));
    std::vector<int> hasdm(numdata);
    for (int i = 0; i < numdata; ++i)
    {
        bool ok = true;
        for (int j = 0; ok && j < numcontinuous; ++j)
            ok = read_field(f, columns[j][i]);
        ok = ok && read_field(f, hasdm[i]);
        if (!ok)
        {
            std::cerr << "error: " << inputpath << ": could not read record " << (i + 1) << " of " << numdata << "\n";
            return 1;
        }
        // Shift the 0/1 class to 1/2 so it is 1-based like the bin numbers.
        hasdm[i]++;
    }

    // itemsets[j][i] is the 1-based local item number of attribute j in record i.
    std::vector<std::vector<int> > itemsets;
    itemsets.reserve(numattributes);
    for (int j = 0; j < numcontinuous; ++j)
    {
        std::vector<int> bins;
        process_continuous(columns[j], bins, NUMBER_OF_INTERVALS, attrnames[j]);
        itemsets.push_back(bins);
    }
    itemsets.push_back(hasdm);

    // Class items follow the interval items. Class value 0 maps to the first
    // id and class value 1 to the second.
    // NOTE(review): the labels assume class 1 means "tested positive for
    // diabetes", as in the published description of this dataset - confirm
    // against the data file actually used.
    const int firstclassid = (NUMBER_OF_INTERVALS * numcontinuous) + 1;
    h << firstclassid << " does-not-have-diabetes\n";
    h << firstclassid + 1 << " has-diabetes\n";

    g << numdata << " " << firstclassid + 1 << '\n';
    for (int i = 0; i < numdata; ++i)
    {
        g << numattributes << " ";
        for (int j = 0; j < numattributes; ++j)
        {
            // Offset each attribute's local item number into its own id range.
            g << itemsets[j][i] + (j * NUMBER_OF_INTERVALS) << " ";
        }
        g << '\n';
    }

    g.flush();
    h.flush();
    if (!g || !h)
    {
        std::cerr << "error: failed while writing output files\n";
        return 1;
    }
    g.close();
    f.close();
    return 0;
}