-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraping.py
85 lines (79 loc) · 2.92 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_cbc_headlines(timeout=10):
    """
    Retrieves the latest headlines from the CBC News website.

    Downloads https://www.cbc.ca/news, then collects every <a> element that
    wraps an <h3> and carries a non-empty href. Anchors without an href or
    without headline text are skipped instead of aborting the whole scrape.

    Args:
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 10.

    Returns:
        list or None: A list of tuples, each containing the stripped headline
        text and its URL resolved against the CBC news page address.
        Returns None if the response status is not 200 or if an error
        occurs during retrieval (the error is printed).
    """
    url = "https://www.cbc.ca/news"
    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.content, "html.parser")
        cbc_headlines = []
        for link in soup.find_all("a"):
            headline = link.h3
            # .get() rather than ['href']: most pages contain anchors with
            # no href, and a KeyError here would discard every headline.
            href = (link.get("href") or "").strip()
            if headline is None or not href:
                continue
            text = headline.text.strip()
            if text:
                # urljoin leaves absolute hrefs intact and resolves relative
                # ones, unlike plain string concatenation with the site root.
                cbc_headlines.append((text, urljoin(url, href)))
        return cbc_headlines
    except Exception as e:
        # Best-effort scraper boundary: report the problem and signal
        # failure with None, as the documented contract requires.
        print(f"ERROR: {e}")
        return None
def get_ctv_headlines(timeout=10):
    """
    Fetches the latest headlines from the CTV News Canada section.

    Downloads https://www.ctvnews.ca/canada and reads the first <a> inside
    every <h3 class="c-list__item__title"> element. Entries whose anchor is
    missing, has no href, or has no text are skipped.

    Args:
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 10.

    Returns:
        list or None: A list of tuples, each containing the stripped headline
        text and its href exactly as found in the page (not resolved against
        the site address).
        Returns None if the response status is not 200 or if an error
        occurs during retrieval (the error is printed).
    """
    url = "https://www.ctvnews.ca/canada"
    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.content, "html.parser")
        titles = soup.find_all("h3", attrs={'class': 'c-list__item__title'})
        ctv_headlines = []
        for title in titles:
            anchor = title.a
            if anchor is None:
                continue
            # .get() rather than ['href']: one anchor lacking an href must
            # not raise KeyError and throw away every collected headline.
            href = (anchor.get("href") or "").strip()
            text = anchor.text.strip()
            if text and href:
                ctv_headlines.append((text, href))
        return ctv_headlines
    except Exception as e:
        # Best-effort scraper boundary: report the problem and signal
        # failure with None, as the documented contract requires.
        print(f"ERROR: {e}")
        return None
def get_global_headlines(timeout=10):
    """
    Retrieves the latest headlines from the Global News Canada section.

    Downloads https://globalnews.ca/canada/ and reads every
    <a class="c-posts__inner"> element, taking the headline text from the
    first <span> inside it. Entries without a span, without an href, or
    without text are skipped.

    Args:
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 10.

    Returns:
        list or None: A list of tuples, each containing the stripped headline
        text and its href exactly as found in the page (not resolved against
        the site address).
        Returns None if the response status is not 200 or if an error
        occurs during retrieval (the error is printed).
    """
    url = "https://globalnews.ca/canada/"
    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.content, "html.parser")
        posts = soup.find_all("a", attrs={'class': 'c-posts__inner'})
        global_headlines = []
        for post in posts:
            label = post.span
            if label is None:
                continue
            # .get() rather than ['href']: one anchor lacking an href must
            # not raise KeyError and throw away every collected headline.
            href = (post.get("href") or "").strip()
            text = label.text.strip()
            if text and href:
                global_headlines.append((text, href))
        return global_headlines
    except Exception as e:
        # Best-effort scraper boundary: report the problem and signal
        # failure with None, as the documented contract requires.
        print(f"ERROR: {e}")
        return None