-
Notifications
You must be signed in to change notification settings - Fork 0
/
NbaStats.py
189 lines (136 loc) · 5.63 KB
/
NbaStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#### IMPORT LIBRARIES ####
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import mplcursors
#
##
###
#### CREATE THE DATAFRAME ####
""" 1. Create a variable where the url string is stored, the data that will be used are the Nba Totals
for 2019/2020 season , this dataset contains the total production of each NBA player that has played
in any game of the forementioned season"""
url = ("https://www.basketball-reference.com/leagues/NBA_2020_totals.html")
""" 2. Create the Dataframe using the url variable. The Pandas library contains the function read_html()
which finds all the html tables in a given webpage and creates a list of pandas dataframes """
df_totals = pd.read_html(url)
#
##
###
#### Data Cleaning & Validation ####
#------------------------------------------#
""" 1.In order to confirm that the variable contains 1 table and that it has taken the correct data type
the following if statement is used """
type_check = isinstance(df_totals[0], pd.DataFrame)
if len(df_totals) > 1:
print("Error: More than one html table inside the given url")
elif len(df_totals) < 1:
print("Error: No html table inside the given url")
elif len(df_totals) == 1:
if type_check == True:
df_totals = df_totals[0]
print("Type Check: OK")
else:
print("Error: The Dataframe is not of the correct type.")
#------------------------------------------#
""" 2. Check if there are duplicated values in the Dataset based on Player column and drop duplicated values"""
df_totals.drop_duplicates(subset="Player",keep="first",inplace=True)
if df_totals.Player.nunique() == len(df_totals.Player.unique()):
print("Duplicate Check: OK")
else:
print("Error: Your DataFrame contains duplicated values")
df_totals = df_totals[df_totals["Player"] != "Player"]
df_totals.to_csv("removeduplicates.csv")
#------------------------------------------#
""" 3. After removing duplicated player rows it appears that some columns contain only digits but their dtype is
object. In order to fix this the follow loop will be used """
## Create an empty list
cols = []
## Create a for loop that checks if all values of a column are numeric and save those columns to cols
df_totals.applymap(str)
for i in df_totals:
a = df_totals[i].str.isnumeric()
aun = a.unique()
if len(aun) == 1 and aun[0] == True:
cols.append(i)
## Convert the filtered columns to float
df_totals[cols]= df_totals[cols].astype("float")
colb = ["FG%","3P%","2P%","eFG%","FT%"]
df_totals[colb]= df_totals[colb].astype("float")
print("Dftype Check: OK")
#------------------------------------------#
#
##
###
#### SET INDEX ####
""" 1. In order to have better results in the data analysis it is necessary to set a meaningfull index , in this case the only unique identifier
is the Player's name """
df_totals.set_index("Player",inplace=True)
#------------------------------------------#
#
##
###
#### DROP COLUMNS ####
""" 2. The Rk column is uneccessary for the analysis therefore is can be dropped from the DataFrame"""
df_totals.drop(columns="Rk",inplace=True)
#------------------------------------------#
#
##
###
#### Data Analysis ` ####
""" For the Data Analysis a variety of functions will be defined that will be conducting a specific type of statistical analysis """
columns = []
for i in df_totals:
columns.append(i)
# 1. Descriptive Statistics
def describe(category):
df_totals[category].describe()
print("Descriptive Statistics for: " , category, "\n")
print("The Mean for ", category, " is: ", round(df_totals[category].mean(),3))
print("The Median for ", category, " is: ", round(df_totals[category].median(),3))
print("The Mean Absolute Deviation for ", category, " is: ", round(df_totals[category].mad(),3),"\n")
print("Top Ten Players in: " , category, "\n")
print(df_totals[category].nlargest(10))
# 2. Scatter Plots
## 2.1 2 Dimensional scatter plot
def scatter_2D(catA,catB):
if df_totals[catA].sum() > df_totals[catB].sum():
x= df_totals[catB]
y= df_totals[catA]
df_totals["Ratio"] = df_totals[catA]/df_totals[catB]
plt.xlabel(catB)
plt.ylabel(catA)
elif df_totals[catA].sum() < df_totals[catB].sum():
x= df_totals[catA]
y= df_totals[catB]
df_totals["Ratio"] = df_totals[catB]/df_totals[catA]
plt.xlabel(catA)
plt.ylabel(catB)
ratio= df_totals["Ratio"]
plt.style.use('seaborn')
a=plt.scatter(x,y,s=ratio*25,c=ratio,cmap="summer", edgecolor='black',linewidth=1,alpha=0.5)
cbar= plt.colorbar()
mplcursors.cursor(a)
cbar.set_label("Ratio")
plt.title("Scatter Plot")
plt.show()
"""
c= df_totals[catA].corr(df_totals[catB],method="spearman")
ax=df_totals.plot(kind="scatter",x=catA,y=catB,color="#94b8b8",grid=True)
ax.set_facecolor("#ffffe6")
plt.show()
"""
"""
# 2.2 Fantasy Correlograph
def scatter():
import seaborn as sns
sns.set(color_codes=True)
sns.pairplot(df_totals[["PTS","AST","STL","BLK","3P","TOV","FG%","FT%","TRB"]])
plt.show()
"""
# 3 Histogram
def histogram(cat):
import seaborn as sns
sns.set(color_codes=True)
sns.distplot(df_totals[cat])
plt.show()