configs plus better svm
abrahamq committed Aug 22, 2019
1 parent b43ab34 commit 8cde989
Showing 6 changed files with 223 additions and 142 deletions.
85 changes: 85 additions & 0 deletions chennai_only_pics_config.py
@@ -0,0 +1,85 @@
# import this file and then overwrite whatever you need in
# the default_config object
import logging
import pandas as pd
from sqlalchemy import create_engine
DATABASE = "riskmap"
engine = create_engine(
    "postgresql://postgres:postgres@localhost:5432/"
    + DATABASE)

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

TEST_LOG_FILENAME = ".log_filename.log"
fh = logging.FileHandler(TEST_LOG_FILENAME)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
LOGGER.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

start_known_flood = "'2017-11-01 00:00:35.630000+05:30'"
end_known_flood = "'2017-11-07 00:00:35.630000+05:30'"


def __get_flood_pkeys(start_date, end_date, engine):
    # gets the pkeys of reports with images during the flood dates

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
            created_at > %(start_date)s::timestamptz
        AND
            created_at < %(end_date)s::timestamptz
        AND
            image_url IS NOT NULL
        ''',
        params={"start_date": start_date, "end_date": end_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


def __get_no_flood_pkeys(start_flood_date, end_flood_date, engine):
    # gets the pkeys of reports with images outside the flood dates
    # (parentheses around the OR are needed: AND binds tighter than OR
    # in SQL, so without them the image_url filter only applies to the
    # second branch)

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
            (created_at < %(start_date)s::timestamptz
        OR
            created_at > %(end_date)s::timestamptz)
        AND
            image_url IS NOT NULL
        ''',
        params={"start_date": start_flood_date, "end_date": end_flood_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


flood_pkeys = __get_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)

no_flood_pkeys = __get_no_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)


config = {
    "flood_pkeys": flood_pkeys,
    "no_flood_pkeys": no_flood_pkeys,
    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
    "database_engine": engine,
    "database_name": DATABASE,
    "location": "ch",
    "data_folder_prefix": "default_chennai_data",
    "logger": LOGGER
}
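
The header comment says to import this module and then override whatever you need. A minimal usage sketch under that pattern (the overridden value below is illustrative, not from this commit):

# Hypothetical usage: importing runs the pkey queries at import time;
# individual config fields can then be overridden before training.
import chennai_only_pics_config as chennai

cfg = chennai.config
cfg["data_folder_prefix"] = "my_chennai_run"  # illustrative override
print(len(cfg["flood_pkeys"]), "flood pkeys,",
      len(cfg["no_flood_pkeys"]), "no-flood pkeys")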
136 changes: 0 additions & 136 deletions flood_depth/indentity_labeler.py

This file was deleted.

105 changes: 105 additions & 0 deletions jakarta_only_pics_config.py
@@ -0,0 +1,105 @@
# Jakarta config: uses only those reports that include images
# import this file and then overwrite whatever you need in
# the default_config object
import logging
import pandas as pd
from sqlalchemy import create_engine
DATABASE = "cognicity"
engine = create_engine(
    "postgresql://postgres:postgres@localhost:5432/"
    + DATABASE)

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

LOG_FILENAME = ".default_jakarta.log"
fh = logging.FileHandler(LOG_FILENAME)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
LOGGER.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

start_period = "'2017-01-01 00:00:35.630000-05:00'"
end_period = "'2017-03-10 00:00:35.630000-05:00'"

start_known_flood = "'2017-02-20 00:00:35.630000-05:00'"
end_known_flood = "'2017-02-23 00:00:35.630000-05:00'"


def __get_flood_pkeys(start_date, end_date, engine):
    # gets the pkeys of reports with images during the flood dates

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
            created_at > %(start_date)s::timestamptz
        AND
            created_at < %(end_date)s::timestamptz
        AND
            image_url IS NOT NULL
        ''',
        params={"start_date": start_date, "end_date": end_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


def __get_no_flood_pkeys(start_period,
                         start_flood_date,
                         end_flood_date,
                         end_period,
                         engine):
    # gets the pkeys of reports with images outside the flood dates
    # (the outer parentheses are needed: AND binds tighter than OR in
    # SQL, so without them the image_url filter only applies to the
    # second date range)

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey,
               created_at
        FROM ''' + DATABASE + '''.all_reports
        WHERE ((
            created_at > %(start_period)s::timestamptz
            AND created_at < %(start_flood_date)s::timestamptz)
        OR (
            created_at > %(end_flood_date)s::timestamptz
            AND created_at < %(end_period)s::timestamptz))
        AND
            image_url IS NOT NULL
        ''',
        params={
            "start_period": start_period,
            "start_flood_date": start_flood_date,
            "end_flood_date": end_flood_date,
            "end_period": end_period
        },
        con=engine, index_col="pkey")
    return set(pkeys.index)


flood_pkeys = __get_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)

no_flood_pkeys = __get_no_flood_pkeys(
    start_period,
    start_known_flood,
    end_known_flood,
    end_period,
    engine)


config = {
    "flood_pkeys": flood_pkeys,
    "no_flood_pkeys": no_flood_pkeys,
    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
    "database_engine": engine,
    "database_name": DATABASE,
    "location": "id",
    "data_folder_prefix": "default_jakarta_data",
    "logger": LOGGER
}
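
Both configs build the pkey sets the same way, so a quick sanity check applies to either (a hypothetical snippet, not part of the commit): the flood and no-flood sets are disjoint by construction, and all_pkeys is exactly their union.

# Hypothetical sanity check on the generated pkey sets.
import jakarta_only_pics_config as jakarta

cfg = jakarta.config
assert cfg["flood_pkeys"].isdisjoint(cfg["no_flood_pkeys"])
assert cfg["all_pkeys"] == cfg["flood_pkeys"] | cfg["no_flood_pkeys"]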
10 changes: 9 additions & 1 deletion learners/svm_learner.py
@@ -64,7 +64,8 @@ def train(self, params, validation_keys):
         self.t_labels = t_labels
         t_data = t_data_w_pkey[1:, :]

-        self.clf = svm.SVC(gamma="scale", kernel="poly")
+        # found best hyper params by exhaustive grid search in svm_test.ipynb
+        self.clf = svm.SVC(gamma="scale", kernel="rbf", degree=3)
         # sklearn expects rows to be data points, we've gone with columns
         self.clf.fit(t_data.T, self.t_labels[0, :])

@@ -85,6 +86,13 @@ def train(self, params, validation_keys):
         self.logger.info("Val score: " + str(percent_correct))

         # get the signed distance for every train data point
+        # with the pkey as the first row
+        # self.t_sd = np.vstack((self.t_data_w_pkey[0, :],
+        #                        self.clf.decision_function(t_data.T)))
+        # # for every validation data point
+        # self.val_sd = np.vstack((self.val_data_w_pkey[0, :],
+        #                         self.clf.decision_function(val_data.T)))
+
         self.t_sd = self.clf.decision_function(t_data.T)
         # for every validation data point
         self.val_sd = self.clf.decision_function(val_data.T)
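
The new comment refers to an exhaustive grid search in svm_test.ipynb, which is not included in this commit; a minimal sketch of what such a search could look like with scikit-learn follows (the grid values are assumptions, not the author's actual search space). Note also that sklearn's SVC only uses degree with the poly kernel, so degree=3 is inert once kernel="rbf" is chosen.

# Hypothetical reconstruction of the grid search mentioned above;
# the parameter grid is an illustrative assumption.
from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {
    "kernel": ["linear", "poly", "rbf"],
    "C": [0.1, 1, 10, 100],
    "gamma": ["scale", "auto"],
}
# t_data stores features in columns (as in train()), hence the transpose.
search = GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(t_data.T, t_labels[0, :])
print(search.best_params_, search.best_score_)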
10 changes: 6 additions & 4 deletions nlp/bow_labeler.py
@@ -18,7 +18,8 @@ def load_labels_from_disk(self, filename='./bow_labels.p'):
     def dump_labels_to_disk(self, labels, filename='./bow_labels.p'):
         return super().dump_labels_to_disk(filename)

-    def make_feature_vectors(self, reports_dict, vocab):
+    def make_feature_vectors(self, reports_dict, vocab,
+                             include_zero_vects=True):
         """
         Args:
             reports_dict: dict of {pkey: list(str)}
@@ -43,9 +44,10 @@ def make_feature_vectors(self, reports_dict, vocab):
             feature_list = self.make_unary_feature_vector(vocab, word_list)
             feature_vect_dict[pkey] = feature_list

-        zero_list = [0]*len(vocab)
-        for pkey in remaining_pkeys:
-            feature_vect_dict[pkey] = zero_list
+        if include_zero_vects:
+            zero_list = [0]*len(vocab)
+            for pkey in remaining_pkeys:
+                feature_vect_dict[pkey] = zero_list

         return feature_vect_dict
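
make_unary_feature_vector is not shown in this diff; assuming it builds a presence/absence bag-of-words vector, a minimal sketch follows (vocab and word_list are invented for illustration). One caveat in the new branch: every pkey in remaining_pkeys is assigned the same zero_list object, so an in-place mutation of one vector later would show up in all of them; assigning list(zero_list) per key would avoid the aliasing.

# Sketch of a unary (presence/absence) bag-of-words vector, assuming
# make_unary_feature_vector works roughly like this; vocab and
# word_list are invented for illustration.
vocab = {"flood": 0, "banjir": 1, "rain": 2}
word_list = ["flood", "on", "my", "street"]

vect = [0] * len(vocab)
for word in word_list:
    if word in vocab:
        vect[vocab[word]] = 1
# vect == [1, 0, 0]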

