From 8cde9895484aca9d86f3602568d316d4b62b1e16 Mon Sep 17 00:00:00 2001
From: abrahamq
Date: Wed, 21 Aug 2019 21:26:26 -0400
Subject: [PATCH] Add Chennai and Jakarta picture-only configs; switch SVM kernel to rbf

---
 chennai_only_pics_config.py      |  85 +++++++++++++++++++
 flood_depth/indentity_labeler.py | 136 -------------------------------
 jakarta_only_pics_config.py      | 105 ++++++++++++++++++++++++
 learners/svm_learner.py          |  10 ++-
 nlp/bow_labeler.py               |  10 ++-
 simple_nn.py                     |  19 ++++-
 6 files changed, 223 insertions(+), 142 deletions(-)
 create mode 100644 chennai_only_pics_config.py
 delete mode 100644 flood_depth/indentity_labeler.py
 create mode 100644 jakarta_only_pics_config.py

diff --git a/chennai_only_pics_config.py b/chennai_only_pics_config.py
new file mode 100644
index 0000000..2ff6d99
--- /dev/null
+++ b/chennai_only_pics_config.py
@@ -0,0 +1,85 @@
+# import this file and then overwrite whatever you need in
+# the default_config object
+import logging
+import pandas as pd
+from sqlalchemy import create_engine
+DATABASE = "riskmap"
+engine = create_engine(
+        "postgresql://postgres:postgres@localhost:5432/"
+        + DATABASE)
+
+LOGGER = logging.getLogger()
+LOGGER.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+TEST_LOG_FILENAME = ".log_filename.log"
+fh = logging.FileHandler(TEST_LOG_FILENAME)
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(formatter)
+LOGGER.addHandler(fh)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(formatter)
+LOGGER.addHandler(ch)
+
+start_known_flood = "'2017-11-01 00:00:35.630000+05:30'"
+end_known_flood = "'2017-11-07 00:00:35.630000+05:30'"
+
+
+def __get_flood_pkeys(start_date, end_date, engine):
+    # gets the pkeys of reports with images during the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
+                created_at > %(start_date)s::timestamptz
+            AND
+                created_at < %(end_date)s::timestamptz
+            AND
+                image_url IS NOT NULL
+        ''',
+        params={"start_date": start_date, "end_date": end_date},
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+def __get_no_flood_pkeys(start_flood_date, end_flood_date, engine):
+    # gets the pkeys of reports with images outside the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
+                (created_at < %(start_date)s::timestamptz
+            OR
+                created_at > %(end_date)s::timestamptz)
+            AND
+                image_url IS NOT NULL
+        ''',
+        params={"start_date": start_flood_date, "end_date": end_flood_date},
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+flood_pkeys = __get_flood_pkeys(
+    start_known_flood,
+    end_known_flood,
+    engine)
+
+no_flood_pkeys = __get_no_flood_pkeys(
+    start_known_flood,
+    end_known_flood,
+    engine)
+
+
+config = {
+    "flood_pkeys": flood_pkeys,
+    "no_flood_pkeys": no_flood_pkeys,
+    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
+    "database_engine": engine,
+    "database_name": DATABASE,
+    "location": "ch",
+    "data_folder_prefix": "default_chennai_data",
+    "logger": LOGGER
+}
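The header comment above says to import the module and then overwrite whatever is needed. A minimal usage sketch, not part of the patch: the override value is hypothetical, and a running "riskmap" Postgres instance is assumed since the queries execute at import time.

import chennai_only_pics_config as chennai_config

config = dict(chennai_config.config)             # shallow copy of the module-level dict
config["data_folder_prefix"] = "my_chennai_run"  # hypothetical override
print(len(config["all_pkeys"]), "Chennai reports with images selected")
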
"https://github.com/urbanriskmap/timeseries-analysis" - -import os -import pickle -from image_recognition.abstract_labeler import AbstractLabeler - - -class IdentityLabeler(AbstractLabeler): - def __init__(self, configObj, loader): - self.loader = loader - self.config = configObj - self.database = configObj["database_engine"] - self.database_name = configObj["database_name"] - self.location = configObj["location"] - self.data_folder_prefix = configObj["data_folder_prefix"] - self.logger = configObj["logger"] - - super().__init__(configObj, loader) - self.logger.debug("IdentityLabeler constructed") - - def run_labeler(self, filename="iden_labels_default.p", rerun=False): - """ - loads labels from disk or uses labeler api to labeles images if - there are no labels on disk, then saves the labels to disk in - config.data_folder_prefix/filename - """ - label_path = os.path.join(self.config["data_folder_prefix"], filename) - if rerun or not os.path.exists(label_path): - depths = self.loader.get_flood_depths() - labels = self.get_labels(depths, self.dump_labels_to_disk) - else: - labels = self.load_labels_from_disk(filename) - return labels - - def load_labels_from_disk(self, filename="iden_labels_default.p"): - path = os.path.join(self.data_folder_prefix, filename) - return pickle.load(open(path, "rb")) - - def dump_labels_to_disk(self, labels, filename="iden_labels_default.p"): - path = os.path.join(self.data_folder_prefix, filename) - pickle.dump(labels, open(path, "wb")) - return - - def make_matrix(self, feat_vects): - return super().make_matrix(feat_vects) - - def make_feature_vectors(self, inp, allowed): - """ - Args: - inp: - Dictionary of (pkeys: int) - where for each pkey represents the flood depth for pkey - allowed: - Dictionary of allowed word to the index in the feature vector - example: allowed = { - "Flood":0, - "Flooding":1, - "Water":2, - "Puddle":3, - "Person":4 - } - would create feature vectors where the zeroth - feature is the confidence score of - Flood in picture, 1st element is Flooding and so on - Returns: - Dictionary{ string Pkey: list{float}} - where list is a vector defined by allowed - """ - # dict of pkeys to feature vectors - - flood = self.config["flood_pkeys"] - all_selected_pkeys = flood.union(self.config["no_flood_pkeys"]) - - features = dict([(key, [0]*len(allowed.keys())) for key in inp.keys() - if key in all_selected_pkeys]) - for pkey in features.keys(): - all_selected_pkeys.remove(pkey) - # fill in the label if it exists - if pkey in inp: - desc = "flood_depth" - if desc in allowed: - features[pkey][allowed[desc]] =\ - float(inp[pkey]) - # add in zero features that don't have flood heights - zero_list = [0]*len(allowed.keys()) - for pkey in all_selected_pkeys: - assert(pkey not in features) - features[pkey] = zero_list - - self.features = features - return features - - def get_labels(self, depths_df, hook=None): - """ - Returns - Args: - depths_df(pd.DataFrame): - pkeys to integer flood depths (in cm) - Returns: - Labels dict of (pkey, int)): - for each pkey in depths_df, the integer result - for that pkey - """ - labels = depths_df.to_dict()["flood_depth"] - if hook is not None: - hook(labels) - - return labels - - def make_label_to_index(self, inp): - """ - Args: - inp dict of (pkeys: int) - flood depth - Returns: - lab_to_index: dict(string: index) - constant of {"flood_depth": 0} - index_to_label: dict(index: string) - constant of {0: "flood_depth"} - """ - lab_to_index = {"flood_depth": 0} - index_to_label = {0: "flood_depth"} - return lab_to_index, 
diff --git a/jakarta_only_pics_config.py b/jakarta_only_pics_config.py
new file mode 100644
index 0000000..35fbf2c
--- /dev/null
+++ b/jakarta_only_pics_config.py
@@ -0,0 +1,105 @@
+# Jakarta config: only reports that include images
+# import this file and then overwrite whatever you need in
+# the default_config object
+import logging
+import pandas as pd
+from sqlalchemy import create_engine
+DATABASE = "cognicity"
+engine = create_engine(
+        "postgresql://postgres:postgres@localhost:5432/"
+        + DATABASE)
+
+LOGGER = logging.getLogger()
+LOGGER.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+LOG_FILENAME = ".default_jakarta.log"
+fh = logging.FileHandler(LOG_FILENAME)
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(formatter)
+LOGGER.addHandler(fh)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(formatter)
+LOGGER.addHandler(ch)
+
+start_period = "'2017-01-01 00:00:35.630000-05:00'"
+end_period = "'2017-03-10 00:00:35.630000-05:00'"
+
+start_known_flood = "'2017-02-20 00:00:35.630000-05:00'"
+end_known_flood = "'2017-02-23 00:00:35.630000-05:00'"
+
+
+def __get_flood_pkeys(start_date, end_date, engine):
+    # gets the pkeys of reports with images during the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
+                created_at > %(start_date)s::timestamptz
+            AND
+                created_at < %(end_date)s::timestamptz
+            AND
+                image_url IS NOT NULL
+        ''',
+        params={"start_date": start_date, "end_date": end_date},
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+def __get_no_flood_pkeys(start_period,
+                         start_flood_date,
+                         end_flood_date,
+                         end_period,
+                         engine):
+    # gets the pkeys of reports with images outside the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey,
+               created_at
+        FROM ''' + DATABASE + '''.all_reports
+        WHERE ((
+            created_at > %(start_period)s::timestamptz
+            AND created_at < %(start_flood_date)s::timestamptz)
+        OR (
+            created_at > %(end_flood_date)s::timestamptz
+            AND created_at < %(end_period)s::timestamptz))
+        AND
+            image_url IS NOT NULL
+        ''',
+        params={
+            "start_period": start_period,
+            "start_flood_date": start_flood_date,
+            "end_flood_date": end_flood_date,
+            "end_period": end_period
+        },
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+flood_pkeys = __get_flood_pkeys(
+    start_known_flood,
+    end_known_flood,
+    engine)
+
+no_flood_pkeys = __get_no_flood_pkeys(
+    start_period,
+    start_known_flood,
+    end_known_flood,
+    end_period,
+    engine)
+
+
+config = {
+    "flood_pkeys": flood_pkeys,
+    "no_flood_pkeys": no_flood_pkeys,
+    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
+    "database_engine": engine,
+    "database_name": DATABASE,
+    "location": "id",
+    "data_folder_prefix": "default_jakarta_data",
+    "logger": LOGGER
+}
diff --git a/learners/svm_learner.py b/learners/svm_learner.py
index b5a7202..270bb06 100644
--- a/learners/svm_learner.py
+++ b/learners/svm_learner.py
@@ -64,7 +64,8 @@ def train(self, params, validation_keys):
         self.t_labels = t_labels
 
         t_data = t_data_w_pkey[1:, :]
-        self.clf = svm.SVC(gamma="scale", kernel="poly")
+        # found best hyper params by exhaustive grid search in svm_test.ipynb
+        self.clf = svm.SVC(gamma="scale", kernel="rbf", degree=3)
 
         # sklearn expects rows to be data points, we've gone with columns
         self.clf.fit(t_data.T, self.t_labels[0, :])
@@ -85,6 +86,13 @@ def train(self, params, validation_keys):
         self.logger.info("Val score: " + str(percent_correct))
 
         # get the signed distance for every train data point
+        # with the pkey as the first row
+        # self.t_sd = np.vstack((self.t_data_w_pkey[0, :],
+        #                        self.clf.decision_function(t_data.T)))
+        # # for every validation data point
+        # self.val_sd = np.vstack((self.val_data_w_pkey[0, :],
+        #                          self.clf.decision_function(val_data.T)))
+
         self.t_sd = self.clf.decision_function(t_data.T)
         # for every validation data point
         self.val_sd = self.clf.decision_function(val_data.T)
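The hyper-parameter comment in the hunk above points to svm_test.ipynb, which is not included in this patch. Below is a sketch of the kind of exhaustive search it refers to, not the notebook itself: it uses scikit-learn's GridSearchCV on synthetic stand-ins for the learner's features-by-samples matrix and label row, and the grid values are assumptions. Note that degree only affects the poly kernel, so it is inert once kernel="rbf" is chosen.

import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
t_data = rng.normal(size=(5, 200))        # features x samples, matching the learner's layout
t_labels = rng.integers(0, 2, size=(1, 200))

param_grid = {
    "kernel": ["linear", "poly", "rbf"],
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto"],
}
search = GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(t_data.T, t_labels[0, :])      # sklearn wants samples as rows, hence .T
print(search.best_params_)
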
diff --git a/nlp/bow_labeler.py b/nlp/bow_labeler.py
index c08eff8..cbaad5c 100644
--- a/nlp/bow_labeler.py
+++ b/nlp/bow_labeler.py
@@ -18,7 +18,8 @@ def load_labels_from_disk(self, filename='./bow_labels.p'):
     def dump_labels_to_disk(self, labels, filename='./bow_labels.p'):
         return super().dump_labels_to_disk(filename)
 
-    def make_feature_vectors(self, reports_dict, vocab):
+    def make_feature_vectors(self, reports_dict, vocab,
+                             include_zero_vects=True):
         """
         Args:
             reports_dict: dict of {pkey: list(str)}
@@ -43,9 +44,10 @@ def make_feature_vectors(self, reports_dict, vocab):
             feature_list = self.make_unary_feature_vector(vocab, word_list)
             feature_vect_dict[pkey] = feature_list
 
-        zero_list = [0]*len(vocab)
-        for pkey in remaining_pkeys:
-            feature_vect_dict[pkey] = zero_list
+        if include_zero_vects:
+            zero_list = [0]*len(vocab)
+            for pkey in remaining_pkeys:
+                feature_vect_dict[pkey] = zero_list
 
         return feature_vect_dict
 
diff --git a/simple_nn.py b/simple_nn.py
index b82ec0e..d83947d 100644
--- a/simple_nn.py
+++ b/simple_nn.py
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 
 
 class Simple_nn(torch.nn.Module):
@@ -14,6 +15,22 @@ def forward(self, x):
         return self.output(y_pred)
 
 
+class Complex_nn(torch.nn.Module):
+    def __init__(self, dims_in, hidden):
+        super(Complex_nn, self).__init__()
+        self.fc1 = torch.nn.Linear(dims_in, hidden)
+        self.fc2 = torch.nn.Linear(hidden, hidden)
+        self.fc3 = torch.nn.Linear(hidden, 2)
+        self.fc4 = torch.nn.LogSoftmax(dim=1)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = F.relu(self.fc3(x))
+        x = self.fc4(x)
+        return x
+
+
 def run_training(model, x_data, y_data, num_epochs=10000):
     lossfn = torch.nn.NLLLoss()
     optimizer = torch.optim.SGD(model.parameters(), lr=0.0015)
@@ -32,7 +49,7 @@ def run_training(model, x_data, y_data, num_epochs=10000):
             last_n.pop(0)
             last_n.append(loss)
             diff = abs(last_n[-1] - last_n[0])
-            if diff < .0001:
+            if diff < .000001:
                 print("Early stopping at epoch: " + str(epoch))
                 break
 
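A quick way to exercise the new Complex_nn with run_training. This is a sketch only: the feature width, hidden size, and synthetic tensors are assumptions, and it presumes run_training feeds model(x_data) and the integer class targets to the NLLLoss set up above.

import torch
from simple_nn import Complex_nn, run_training

torch.manual_seed(0)
x_data = torch.randn(100, 8)              # 100 reports x 8 features (assumed width)
y_data = torch.randint(0, 2, (100,))      # class indices, the form NLLLoss expects
model = Complex_nn(dims_in=8, hidden=16)  # LogSoftmax output pairs with NLLLoss
run_training(model, x_data, y_data, num_epochs=5000)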