-
Notifications
You must be signed in to change notification settings - Fork 0
/
datastore.py
140 lines (105 loc) · 3.93 KB
/
datastore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import mysql.connector
from mysql.connector import errorcode
from python_mysql_dbconfig import read_db_config
import threading
class DataStore:
def __init__(self, config_file):
'''
Example:
ds = DataStore('config.ini')
url = 'www.Thisisatest.com'
title = 'Test Title'
dict_word = {'test1': (50, "this test1 word is a test excerpt"),
'test2': (10, "this test2 : excerpt")}
ds.indexPage(url, title, dict_word)
word_list = ['test1', 'test2']
r = ds.search(word_list)
for t in r:
print(t)
'''
# holds configuration data (user, password, etc) for db connection
self.db_config = read_db_config(config_file)
# most connections allowed to Database and semaphore
self.max_connections = 5
# create counting semaphore
self.pool_sema = threading.BoundedSemaphore(value = self.max_connections)
# (url, title, Dict<word: (wordCount, excerpt)>)
def indexPage(self, url, title, words):
""" Inputs are a url (string), title (string), Dict<word (string), tuple(wordCount (int), excerpt(string)>
"""
# check lengths: URL <= 255 title <= 100
url = str(url[0:255])
title = str(title[0:100])
arg_url = (url, title, 0)
# with automatically calls acquire and release upon entering and exiting block
with self.pool_sema:
# use connection pool implicitly
conn = mysql.connector.connect(pool_name="thePool",
pool_size=self.max_connections, # 5 is the default
**self.db_config,
charset = "utf8mb4",
collation = "utf8mb4_bin")
# turn off autocommit
conn.autocommit = False
# create a cursor to call procedure
cursor_URL = conn.cursor()
try:
# results holds the args sent into callproc but contains an output with the uid
# that is needed in the next mysql procedure call
results = cursor_URL.callproc('PRC_STORE_URL_TTL', arg_url)
cursor_URL.close()
cursor_word = conn.cursor()
cursor_excerpt = conn.cursor()
# word, wordcount and excerpt to arg list
for word, word_data in words.items():
# check lengths: Word <= 45, excerpt <= 150
word = str(word[0:45])
excerpt = str(word_data[1][0:150])
# add word, UID, count, excerpt to Db
arg_word = (word, results[2], word_data[0], excerpt)
try:
cursor_word.callproc('PRC_STORE_WORD', arg_word)
except mysql.connector.Error as err:
print(err)
# close cursors
cursor_word.close()
cursor_excerpt.close()
except mysql.connector.Error as err:
print(err)
# return connection back to pool
conn.close()
# List<word> -> List<(url, title, excerpt, word count)>
def search(self, words):
""" Inputs are a list of words
Returns:
list of tuples if no problem (maybe empty if words arent in db...),
list containing '1' and the error if error storing info,
list containing '2' and 'Db not connected' if database is not connected
"""
# with automatically calls acquire and release upon entering and exiting block
with self.pool_sema:
# use connection pool implicitly
conn = mysql.connector.connect(pool_name="thePool",
pool_size=self.max_connections, # 5 is the default
**self.db_config)
try:
return_list = []
# create a cursor to call search procedure
cursor_retrieve = conn.cursor()
words = tuple(set(words))
format_strings = ','.join(['%s'] * len(words))
cursor_retrieve.execute("""SELECT SITE_URL, SITE_TITLE, ANY_VALUE(EXCERPT_PHRASE)
FROM WORD JOIN WORD_IDF USING (WORD_ID) JOIN EXCERPT USING (WORD_ID) JOIN SITE USING (SITE_ID)
WHERE WORD_WORD IN (%s)
GROUP BY SITE_ID
ORDER BY SUM(EXCERPT_WRD_FREQ_SITE * WORD_IDF) DESC
LIMIT 100;""" %format_strings, words)
for row in cursor_retrieve:
return_list.append(tuple(row))
# close cursor
cursor_retrieve.close()
except mysql.connector.Error as err:
print(err)
# return connection back to pool
conn.close()
return return_list