-
Notifications
You must be signed in to change notification settings - Fork 0
/
webscraping.py
110 lines (99 loc) · 4.05 KB
/
webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from bs4 import BeautifulSoup
import urllib.request as urllib
import pandas as pd
import ssl
from random import randint
from time import sleep
def find_urls(initial_url, limit=5):
    """
    Collect match links from the score column of an fbref.com schedule page.

    The page is downloaded, its first ``<tbody>`` is scanned row by row, and
    every hyperlink found inside a ``<td class="center" data-stat="score">``
    cell is turned into an absolute ``https://fbref.com`` URL.

    :param initial_url: URL of the schedule page to scrape.
    :param limit: maximum number of URLs to return. Defaults to 5; pass
        ``None`` to return every URL found.
    :return: list of absolute URLs in page order; empty when the page has no
        ``<tbody>`` or no row carries a score link.
    """
    # NOTE(review): this switches off TLS certificate verification for every
    # later urllib HTTPS request in the process, not only the one below. It is
    # kept because other requests in this script rely on it, but it leaves the
    # scraper open to man-in-the-middle tampering -- prefer fixing the local
    # certificate store and deleting this line.
    ssl._create_default_https_context = ssl._create_unverified_context

    # The context manager closes the HTTP response once the body is read.
    with urllib.urlopen(initial_url) as response:
        # Naming the parser keeps results identical across machines instead
        # of depending on which optional parsers happen to be installed.
        soup = BeautifulSoup(response.read(), 'html.parser')

    table = soup.find('tbody')
    if table is None:
        return []

    urls = []
    for row in table.find_all('tr'):
        score_cell = row.find('td', {'class': 'center', 'data-stat': 'score'})
        if score_cell is None:
            # Rows without a score cell (e.g. separators) carry no link.
            continue
        anchor = score_cell.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is not None:
            urls.append('https://fbref.com' + str(href))
    # Slicing with None keeps the whole list.
    return urls[:limit]
def scrape_data(intialurl):
    """
    Scrape goal rows from fbref.com match pages and classify every goal.

    Each URL returned by ``find_urls(intialurl)`` is downloaded, and the rows
    of its ``shots_all`` table that carry the inline style
    ``background-color:rgb(210,240,210)`` are replayed in table order while a
    running score for that match is kept. A goal counts as a "clutch goal"
    when it puts the scoring team ahead or equalizes; every other goal is a
    "non-clutch goal".

    NOTE(review): this assumes the styled rows are the goals of the match and
    that the table lists them chronologically -- confirm against the page.

    :param intialurl: URL of the schedule page handed to ``find_urls``.
    :return: pandas DataFrame indexed by player name with the columns
        ``'clutch goals'`` and ``'non-clutch goals'``; empty when no goal rows
        were found.
    """
    playerstats = {}
    for url in find_urls(intialurl):
        print(url)
        # The context manager closes the HTTP response once the body is read.
        with urllib.urlopen(url) as response:
            soup = BeautifulSoup(response.read(), 'html.parser')

        table = soup.find('table', {'id': 'shots_all'})
        if table is not None:
            goal_rows = table.find_all(
                'tr', {'style': 'background-color:rgb(210,240,210)'})
            _tally_match_goals(_goal_events(goal_rows), playerstats)

        # Sleep a random number of seconds (between 1 and 5) between requests.
        sleep(randint(1, 5))

    print(playerstats)
    # One row per player, one column per goal category.
    df = pd.DataFrame.from_dict(playerstats, orient='index')
    return df.fillna(0)


def _goal_events(rows):
    """Yield a ``(player, team)`` pair for every row that names both."""
    for row in rows:
        player_cell = row.find('td', {'class': 'left', 'data-stat': 'player'})
        team_cell = row.find('td', {'class': 'left', 'data-stat': 'squad'})
        if player_cell is None or team_cell is None:
            continue
        anchor = player_cell.find('a')
        # Prefer the link text; fall back to the cell text so the goal still
        # counts toward the running score when the name is not a link.
        if anchor is not None:
            player = anchor.get_text()
        else:
            player = player_cell.get_text().strip()
        yield player, team_cell.get_text().strip()


def _tally_match_goals(events, playerstats):
    """Replay one match's goals in order and credit each scorer in place."""
    goals = {}  # team name -> goals scored so far in this match
    for player, team in events:
        own = goals.get(team, 0)
        conceded = sum(n for name, n in goals.items() if name != team)
        stats = playerstats.setdefault(
            player, {'clutch goals': 0, 'non-clutch goals': 0})
        # Level before the goal -> it puts the team ahead; one behind -> it
        # equalizes. Anything else leaves the match state on the same side.
        if conceded - own in (0, 1):
            stats['clutch goals'] += 1
        else:
            stats['non-clutch goals'] += 1
        goals[team] = own + 1
# cumulative score through one metric
def main():
    """Run the scraper against the Premier League scores-and-fixtures page."""
    schedule_page = "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures"
    scrape_data(schedule_page)
#
# Plan, in outline:
#   1. Collect every available link to Premier League stats from fbref.
#   2. Open each link and gather all the goal data (and the goal times?).
#      Build a metric, stored as a dictionary of stats, that counts
#      "purposeful" goals: a goal that either (a) puts the team ahead or
#      (b) equalizes the score. This could maybe go in a DataFrame?
#   3. Then conduct the analysis, possibly reporting a score per 90.
# Only start scraping when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()