From 8cde9895484aca9d86f3602568d316d4b62b1e16 Mon Sep 17 00:00:00 2001
From: abrahamq
Date: Wed, 21 Aug 2019 21:26:26 -0400
Subject: [PATCH] Add Chennai and Jakarta picture-only configs; switch SVM kernel to rbf

---
 chennai_only_pics_config.py      |  85 +++++++++++++++++++
 flood_depth/indentity_labeler.py | 136 -------------------------------
 jakarta_only_pics_config.py      | 105 ++++++++++++++++++++++++
 learners/svm_learner.py          |  10 ++-
 nlp/bow_labeler.py               |  10 ++-
 simple_nn.py                     |  19 ++++-
 6 files changed, 223 insertions(+), 142 deletions(-)
 create mode 100644 chennai_only_pics_config.py
 delete mode 100644 flood_depth/indentity_labeler.py
 create mode 100644 jakarta_only_pics_config.py

diff --git a/chennai_only_pics_config.py b/chennai_only_pics_config.py
new file mode 100644
index 0000000..2ff6d99
--- /dev/null
+++ b/chennai_only_pics_config.py
@@ -0,0 +1,85 @@
+# import this file and then overwrite whatever you need in
+# the default_config object
+import logging
+import pandas as pd
+from sqlalchemy import create_engine
+DATABASE = "riskmap"
+engine = create_engine(
+        "postgresql://postgres:postgres@localhost:5432/"
+        + DATABASE)
+
+LOGGER = logging.getLogger()
+LOGGER.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+TEST_LOG_FILENAME = ".log_filename.log"
+fh = logging.FileHandler(TEST_LOG_FILENAME)
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(formatter)
+LOGGER.addHandler(fh)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(formatter)
+LOGGER.addHandler(ch)
+
+start_known_flood = "'2017-11-01 00:00:35.630000+05:30'"
+end_known_flood = "'2017-11-07 00:00:35.630000+05:30'"
+
+
+def __get_flood_pkeys(start_date, end_date, engine):
+    # gets the pkeys of reports with images during the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
+                created_at > %(start_date)s::timestamptz
+            AND
+                created_at < %(end_date)s::timestamptz
+            AND
+                image_url IS NOT NULL
+        ''',
+        params={"start_date": start_date, "end_date": end_date},
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+def __get_no_flood_pkeys(start_flood_date, end_flood_date, engine):
+    # gets the pkeys of reports with images outside the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
+                (created_at < %(start_date)s::timestamptz
+            OR
+                created_at > %(end_date)s::timestamptz)
+            AND
+                image_url IS NOT NULL
+        ''',
+        params={"start_date": start_flood_date, "end_date": end_flood_date},
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+flood_pkeys = __get_flood_pkeys(
+    start_known_flood,
+    end_known_flood,
+    engine)
+
+no_flood_pkeys = __get_no_flood_pkeys(
+    start_known_flood,
+    end_known_flood,
+    engine)
+
+
+config = {
+    "flood_pkeys": flood_pkeys,
+    "no_flood_pkeys": no_flood_pkeys,
+    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
+    "database_engine": engine,
+    "database_name": DATABASE,
+    "location": "ch",
+    "data_folder_prefix": "default_chennai_data",
+    "logger": LOGGER
+}
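The header comment above says to import the module and then overwrite whatever is needed. A minimal usage sketch, not part of the patch: the override value is hypothetical, and a running "riskmap" Postgres instance is assumed since the queries execute at import time.

import chennai_only_pics_config as chennai_config

config = dict(chennai_config.config)             # shallow copy of the module-level dict
config["data_folder_prefix"] = "my_chennai_run"  # hypothetical override
print(len(config["all_pkeys"]), "Chennai reports with images selected")
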
"https://github.com/urbanriskmap/timeseries-analysis" - -import os -import pickle -from image_recognition.abstract_labeler import AbstractLabeler - - -class IdentityLabeler(AbstractLabeler): - def __init__(self, configObj, loader): - self.loader = loader - self.config = configObj - self.database = configObj["database_engine"] - self.database_name = configObj["database_name"] - self.location = configObj["location"] - self.data_folder_prefix = configObj["data_folder_prefix"] - self.logger = configObj["logger"] - - super().__init__(configObj, loader) - self.logger.debug("IdentityLabeler constructed") - - def run_labeler(self, filename="iden_labels_default.p", rerun=False): - """ - loads labels from disk or uses labeler api to labeles images if - there are no labels on disk, then saves the labels to disk in - config.data_folder_prefix/filename - """ - label_path = os.path.join(self.config["data_folder_prefix"], filename) - if rerun or not os.path.exists(label_path): - depths = self.loader.get_flood_depths() - labels = self.get_labels(depths, self.dump_labels_to_disk) - else: - labels = self.load_labels_from_disk(filename) - return labels - - def load_labels_from_disk(self, filename="iden_labels_default.p"): - path = os.path.join(self.data_folder_prefix, filename) - return pickle.load(open(path, "rb")) - - def dump_labels_to_disk(self, labels, filename="iden_labels_default.p"): - path = os.path.join(self.data_folder_prefix, filename) - pickle.dump(labels, open(path, "wb")) - return - - def make_matrix(self, feat_vects): - return super().make_matrix(feat_vects) - - def make_feature_vectors(self, inp, allowed): - """ - Args: - inp: - Dictionary of (pkeys: int) - where for each pkey represents the flood depth for pkey - allowed: - Dictionary of allowed word to the index in the feature vector - example: allowed = { - "Flood":0, - "Flooding":1, - "Water":2, - "Puddle":3, - "Person":4 - } - would create feature vectors where the zeroth - feature is the confidence score of - Flood in picture, 1st element is Flooding and so on - Returns: - Dictionary{ string Pkey: list{float}} - where list is a vector defined by allowed - """ - # dict of pkeys to feature vectors - - flood = self.config["flood_pkeys"] - all_selected_pkeys = flood.union(self.config["no_flood_pkeys"]) - - features = dict([(key, [0]*len(allowed.keys())) for key in inp.keys() - if key in all_selected_pkeys]) - for pkey in features.keys(): - all_selected_pkeys.remove(pkey) - # fill in the label if it exists - if pkey in inp: - desc = "flood_depth" - if desc in allowed: - features[pkey][allowed[desc]] =\ - float(inp[pkey]) - # add in zero features that don't have flood heights - zero_list = [0]*len(allowed.keys()) - for pkey in all_selected_pkeys: - assert(pkey not in features) - features[pkey] = zero_list - - self.features = features - return features - - def get_labels(self, depths_df, hook=None): - """ - Returns - Args: - depths_df(pd.DataFrame): - pkeys to integer flood depths (in cm) - Returns: - Labels dict of (pkey, int)): - for each pkey in depths_df, the integer result - for that pkey - """ - labels = depths_df.to_dict()["flood_depth"] - if hook is not None: - hook(labels) - - return labels - - def make_label_to_index(self, inp): - """ - Args: - inp dict of (pkeys: int) - flood depth - Returns: - lab_to_index: dict(string: index) - constant of {"flood_depth": 0} - index_to_label: dict(index: string) - constant of {0: "flood_depth"} - """ - lab_to_index = {"flood_depth": 0} - index_to_label = {0: "flood_depth"} - return lab_to_index, 
diff --git a/jakarta_only_pics_config.py b/jakarta_only_pics_config.py
new file mode 100644
index 0000000..35fbf2c
--- /dev/null
+++ b/jakarta_only_pics_config.py
@@ -0,0 +1,105 @@
+# Jakarta config: only reports that include images
+# import this file and then overwrite whatever you need in
+# the default_config object
+import logging
+import pandas as pd
+from sqlalchemy import create_engine
+DATABASE = "cognicity"
+engine = create_engine(
+        "postgresql://postgres:postgres@localhost:5432/"
+        + DATABASE)
+
+LOGGER = logging.getLogger()
+LOGGER.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+LOG_FILENAME = ".default_jakarta.log"
+fh = logging.FileHandler(LOG_FILENAME)
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(formatter)
+LOGGER.addHandler(fh)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(formatter)
+LOGGER.addHandler(ch)
+
+start_period = "'2017-01-01 00:00:35.630000-05:00'"
+end_period = "'2017-03-10 00:00:35.630000-05:00'"
+
+start_known_flood = "'2017-02-20 00:00:35.630000-05:00'"
+end_known_flood = "'2017-02-23 00:00:35.630000-05:00'"
+
+
+def __get_flood_pkeys(start_date, end_date, engine):
+    # gets the pkeys of reports with images during the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
+                created_at > %(start_date)s::timestamptz
+            AND
+                created_at < %(end_date)s::timestamptz
+            AND
+                image_url IS NOT NULL
+        ''',
+        params={"start_date": start_date, "end_date": end_date},
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+def __get_no_flood_pkeys(start_period,
+                         start_flood_date,
+                         end_flood_date,
+                         end_period,
+                         engine):
+    # gets the pkeys of reports with images outside the flood dates
+
+    pkeys = pd.read_sql_query(
+        '''
+        SELECT pkey,
+               created_at
+        FROM ''' + DATABASE + '''.all_reports
+        WHERE ((
+            created_at > %(start_period)s::timestamptz
+            AND created_at < %(start_flood_date)s::timestamptz)
+        OR (
+            created_at > %(end_flood_date)s::timestamptz
+            AND created_at < %(end_period)s::timestamptz))
+        AND
+            image_url IS NOT NULL
+        ''',
+        params={
+            "start_period": start_period,
+            "start_flood_date": start_flood_date,
+            "end_flood_date": end_flood_date,
+            "end_period": end_period
+        },
+        con=engine, index_col="pkey")
+    return set(pkeys.index)
+
+
+flood_pkeys = __get_flood_pkeys(
+    start_known_flood,
+    end_known_flood,
+    engine)
+
+no_flood_pkeys = __get_no_flood_pkeys(
+    start_period,
+    start_known_flood,
+    end_known_flood,
+    end_period,
+    engine)
+
+
+config = {
+    "flood_pkeys": flood_pkeys,
+    "no_flood_pkeys": no_flood_pkeys,
+    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
+    "database_engine": engine,
+    "database_name": DATABASE,
+    "location": "id",
+    "data_folder_prefix": "default_jakarta_data",
+    "logger": LOGGER
+}
diff --git a/learners/svm_learner.py b/learners/svm_learner.py
index b5a7202..270bb06 100644
--- a/learners/svm_learner.py
+++ b/learners/svm_learner.py
@@ -64,7 +64,8 @@ def train(self, params, validation_keys):
         self.t_labels = t_labels
 
         t_data = t_data_w_pkey[1:, :]
-        self.clf = svm.SVC(gamma="scale", kernel="poly")
+        # found best hyper params by exhaustive grid search in svm_test.ipynb
+        self.clf = svm.SVC(gamma="scale", kernel="rbf", degree=3)
 
         # sklearn expects rows to be data points, we've gone with columns
         self.clf.fit(t_data.T, self.t_labels[0, :])
@@ -85,6 +86,13 @@ def train(self, params, validation_keys):
         self.logger.info("Val score: " + str(percent_correct))
 
         # get the signed distance for every train data point
+        # with the pkey as the first row
+        # self.t_sd = np.vstack((self.t_data_w_pkey[0, :],
+        #                        self.clf.decision_function(t_data.T)))
+        # # for every validation data point
+        # self.val_sd = np.vstack((self.val_data_w_pkey[0, :],
+        #                          self.clf.decision_function(val_data.T)))
+
         self.t_sd = self.clf.decision_function(t_data.T)
         # for every validation data point
         self.val_sd = self.clf.decision_function(val_data.T)
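The hyper-parameter comment in the hunk above points to svm_test.ipynb, which is not included in this patch. Below is a sketch of the kind of exhaustive search it refers to, not the notebook itself: it uses scikit-learn's GridSearchCV on synthetic stand-ins for the learner's features-by-samples matrix and label row, and the grid values are assumptions. Note that degree only affects the poly kernel, so it is inert once kernel="rbf" is chosen.

import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
t_data = rng.normal(size=(5, 200))        # features x samples, matching the learner's layout
t_labels = rng.integers(0, 2, size=(1, 200))

param_grid = {
    "kernel": ["linear", "poly", "rbf"],
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto"],
}
search = GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(t_data.T, t_labels[0, :])      # sklearn wants samples as rows, hence .T
print(search.best_params_)
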
diff --git a/nlp/bow_labeler.py b/nlp/bow_labeler.py
index c08eff8..cbaad5c 100644
--- a/nlp/bow_labeler.py
+++ b/nlp/bow_labeler.py
@@ -18,7 +18,8 @@ def load_labels_from_disk(self, filename='./bow_labels.p'):
     def dump_labels_to_disk(self, labels, filename='./bow_labels.p'):
         return super().dump_labels_to_disk(filename)
 
-    def make_feature_vectors(self, reports_dict, vocab):
+    def make_feature_vectors(self, reports_dict, vocab,
+                             include_zero_vects=True):
         """
         Args:
             reports_dict: dict of {pkey: list(str)}
@@ -43,9 +44,10 @@ def make_feature_vectors(self, reports_dict, vocab):
             feature_list = self.make_unary_feature_vector(vocab, word_list)
             feature_vect_dict[pkey] = feature_list
 
-        zero_list = [0]*len(vocab)
-        for pkey in remaining_pkeys:
-            feature_vect_dict[pkey] = zero_list
+        if include_zero_vects:
+            zero_list = [0]*len(vocab)
+            for pkey in remaining_pkeys:
+                feature_vect_dict[pkey] = zero_list
 
         return feature_vect_dict
 
diff --git a/simple_nn.py b/simple_nn.py
index b82ec0e..d83947d 100644
--- a/simple_nn.py
+++ b/simple_nn.py
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 
 
 class Simple_nn(torch.nn.Module):
@@ -14,6 +15,22 @@ def forward(self, x):
         return self.output(y_pred)
 
 
+class Complex_nn(torch.nn.Module):
+    def __init__(self, dims_in, hidden):
+        super(Complex_nn, self).__init__()
+        self.fc1 = torch.nn.Linear(dims_in, hidden)
+        self.fc2 = torch.nn.Linear(hidden, hidden)
+        self.fc3 = torch.nn.Linear(hidden, 2)
+        self.fc4 = torch.nn.LogSoftmax(dim=1)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = F.relu(self.fc3(x))
+        x = self.fc4(x)
+        return x
+
+
 def run_training(model, x_data, y_data, num_epochs=10000):
     lossfn = torch.nn.NLLLoss()
     optimizer = torch.optim.SGD(model.parameters(), lr=0.0015)
@@ -32,7 +49,7 @@ def run_training(model, x_data, y_data, num_epochs=10000):
             last_n.pop(0)
             last_n.append(loss)
             diff = abs(last_n[-1] - last_n[0])
-            if diff < .0001:
+            if diff < .000001:
                 print("Early stopping at epoch: " + str(epoch))
                 break
 
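A quick way to exercise the new Complex_nn with run_training. This is a sketch only: the feature width, hidden size, and synthetic tensors are assumptions, and it presumes run_training feeds model(x_data) and the integer class targets to the NLLLoss set up above.

import torch
from simple_nn import Complex_nn, run_training

torch.manual_seed(0)
x_data = torch.randn(100, 8)              # 100 reports x 8 features (assumed width)
y_data = torch.randint(0, 2, (100,))      # class indices, the form NLLLoss expects
model = Complex_nn(dims_in=8, hidden=16)  # LogSoftmax output pairs with NLLLoss
run_training(model, x_data, y_data, num_epochs=5000)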