-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordExtractor.py
72 lines (56 loc) · 2.19 KB
/
WordExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
from collections import Counter

import click
import requests
from bs4 import BeautifulSoup
def get_all_words_from(url):
    r"""Return every word found in the text of the page at *url*.

    The HTML comes from ``get_html_of``, is parsed with BeautifulSoup's
    ``html.parser``, and the extracted text is tokenized with the regex
    ``\w+``. The result is a list of strings as produced by ``re.findall``
    (duplicates included).
    """
    page_text = BeautifulSoup(get_html_of(url), 'html.parser').get_text()
    return re.findall(r'\w+', page_text)
def get_html_of(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body decoded as UTF-8, or an empty string when the request
        fails or the status code is not 200. A message is printed in
        both failure cases.
    """
    try:
        # Without an explicit timeout requests will wait indefinitely on a
        # server that accepts the connection but never answers.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Report network failures the same way as bad status codes instead
        # of letting a traceback escape to the command line.
        print(f'The request to {url} failed: {err}')
        return ''

    if res.status_code != 200:
        print(f'The status code of {res.status_code} is not acceptable')
        return ''

    # Still decoded as UTF-8, but undecodable bytes are substituted rather
    # than raising UnicodeDecodeError on pages served in another encoding.
    return res.content.decode('utf-8', errors='replace')
def get_top_words_from(all_words, length):
    """Rank the words in *all_words* from most to least frequent.

    Counting (and the minimum-length filter given by *length*) is delegated
    to ``word_counter``. Returns a list of ``(word, count)`` tuples sorted
    by count in descending order.
    """
    ranking = list(word_counter(all_words, length).items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking
def word_counter(all_words, length):
    """Count how often each sufficiently long word occurs.

    Args:
        all_words: Iterable of words to tally.
        length: Minimum number of characters a word must have to be
            counted; shorter words are skipped.

    Returns:
        A plain ``dict`` mapping each counted word to its number of
        occurrences, with keys in order of first appearance.
    """
    counts = Counter(word for word in all_words if len(word) >= length)
    # Hand back a plain dict so callers keep getting the same type as before.
    return dict(counts)
@click.command()
@click.option('--url', '-u', prompt='Please enter the url', help='Url of the webpage')
@click.option('--length', '-l', default=0, help='Minimum length of the word(default value is zero)')
@click.option('--deep', '-d', default=None, type=int, help='How many words to extract')
@click.option('--save', '-s', default=None, help='Specify the file name to save the output')
def main(url, length, deep, save):
    """Print the most frequent words of a web page, optionally saving them to a file."""
    # NOTE: click shows the docstring above as the command's --help text,
    # so it is kept to a single user-facing sentence.
    all_words = get_all_words_from(url)
    top_words = get_top_words_from(all_words, length)

    if deep is None:
        deep = len(top_words)
    elif deep > len(top_words):
        print(f"Warning: Requested {deep} words, but only {len(top_words)} are available.")
        deep = len(top_words)

    # A negative count selects nothing (as range() did before); clamp it so
    # the slice does not start counting from the end of the list.
    output = [word for word, _count in top_words[:max(deep, 0)]]

    # Print to console
    for word in output:
        print(word)

    # Save to file if specified
    if save:
        # Explicit UTF-8: scraped words are frequently non-ASCII and the
        # platform default encoding may be unable to represent them.
        with open(save, 'w', encoding='utf-8') as f:
            f.writelines(f'{word}\n' for word in output)
        print(f"___________________________________________________________________________OUTPUT SAVED TO {save}___________________________________________________________________________")
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()