-
Notifications
You must be signed in to change notification settings - Fork 1
/
SparkDecisionTree.py
76 lines (64 loc) · 2.71 KB
/
SparkDecisionTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array
# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkDecisionTree")
sc = SparkContext(conf = conf)
# Some functions that convert our CSV input data into numerical
# features for each job candidate
def binary(YN):
if (YN == 'Y'):
return 1
else:
return 0
def mapEducation(degree):
if (degree == 'BS'):
return 1
elif (degree =='MS'):
return 2
elif (degree == 'PhD'):
return 3
else:
return 0
# Convert a list of raw fields from our CSV file to a
# LabeledPoint that MLLib can use. All data must be numerical...
def createLabeledPoints(fields):
yearsExperience = int(fields[0])
employed = binary(fields[1])
previousEmployers = int(fields[2])
educationLevel = mapEducation(fields[3])
topTier = binary(fields[4])
interned = binary(fields[5])
hired = binary(fields[6])
return LabeledPoint(hired, array([yearsExperience, employed,
previousEmployers, educationLevel, topTier, interned]))
#Load up our CSV file, and filter out the header line with the column names
rawData = sc.textFile("e:/sundog-consult/udemy/datascience/PastHires.csv")
header = rawData.first()
rawData = rawData.filter(lambda x:x != header)
# Split each line into a list based on the comma delimiters
csvData = rawData.map(lambda x: x.split(","))
# Convert these lists to LabeledPoints
trainingData = csvData.map(createLabeledPoints)
# Create a test candidate, with 10 years of experience, currently employed,
# 3 previous employers, a BS degree, but from a non-top-tier school where
# he or she did not do an internship. You could of course load up a whole
# huge RDD of test candidates from disk, too.
testCandidates = [ array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)
# Train our DecisionTree classifier using our data set
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
categoricalFeaturesInfo={1:2, 3:4, 4:2, 5:2},
impurity='gini', maxDepth=5, maxBins=32)
# Now get predictions for our unknown candidates. (Note, you could separate
# the source data into a training set and a test set while tuning
# parameters and measure accuracy as you go!)
predictions = model.predict(testData)
print('Hire prediction:')
results = predictions.collect()
for result in results:
print(result)
# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())