-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.py
29 lines (26 loc) · 1.06 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import re
from collections import Counter
from unicodedata import normalize
# Upper bound, in characters, on the window an excerpt is cut from.
excerpt_max_size = 150
# Any run of whitespace; collapsed to a single space before indexing.
whitespace_regex = re.compile(r"\s+")
# A maximal run of word characters; this is what counts as a "word".
word_regex = re.compile(r"\w+")
# A span that begins right after whitespace and ends right before whitespace,
# i.e. one or more whole whitespace-delimited tokens.
excerpt_regex = re.compile(r"(?<=\s)\S((.*)\S)?(?=\s)", re.DOTALL)


def index_text(text):
    """Build a word index for *text*.

    The text is NFKC-normalized, every whitespace run is collapsed to one
    space, and words (``\\w+`` matches) are counted case-insensitively.

    Args:
        text: The string to index.

    Returns:
        A dict mapping each distinct lowercased word to a tuple
        ``(frequency, excerpt)``. ``frequency`` is the word's count divided
        by the total number of words. ``excerpt`` is a run of whole
        whitespace-delimited tokens, in original case, cut from a window of
        at most ``excerpt_max_size`` characters centred on the first
        occurrence of the word. It is ``""`` when the word is longer than
        ``excerpt_max_size`` or when no whole-token span fits in the window.
        Text without any word yields an empty dict.
    """
    text = normalize("NFKC", text)
    # Surround the text with spaces so the first and last tokens also satisfy
    # the whitespace look-around assertions of ``excerpt_regex``.
    text = " " + whitespace_regex.sub(" ", text).strip() + " "
    lowercase_text = text.lower()

    # Count the words and remember where each one FIRST occurs as a whole
    # word. A plain substring search would locate "cat" inside an earlier
    # "concatenate" and centre the excerpt on the wrong place.
    counts = Counter()
    first_start = {}
    for match in word_regex.finditer(lowercase_text):
        word = match.group()
        counts[word] += 1
        first_start.setdefault(word, match.start())
    word_count = sum(counts.values())

    def get_excerpt(word):
        """Return the excerpt around the first occurrence of *word*."""
        if len(word) > excerpt_max_size:
            return ""
        # NOTE: offsets are measured in ``lowercase_text`` but applied to
        # ``text``; this relies on lower() not changing the string length.
        word_start_index = first_start[word]
        word_end_index = word_start_index + len(word)
        # Split the remaining character budget evenly on both sides.
        padding = (excerpt_max_size - len(word)) // 2
        start_index = max(word_start_index - padding, 0)
        end_index = word_end_index + padding
        # Searching within [start_index, end_index) trims the window to
        # whole tokens instead of cutting through the middle of one.
        excerpt = excerpt_regex.search(text, start_index, end_index)
        return excerpt.group() if excerpt else ""

    return {
        word: (count / word_count, get_excerpt(word))
        for word, count in counts.items()
    }