configs plus better svm
abrahamq committed Aug 22, 2019
1 parent b43ab34 commit 8cde989
Showing 6 changed files with 223 additions and 142 deletions.
85 changes: 85 additions & 0 deletions chennai_only_pics_config.py
@@ -0,0 +1,85 @@
# import this file and then overwrite whatever you need in
# the default_config object
import logging
import pandas as pd
from sqlalchemy import create_engine
DATABASE = "riskmap"
engine = create_engine(
    "postgresql://postgres:postgres@localhost:5432/"
    + DATABASE)

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

TEST_LOG_FILENAME = ".log_filename.log"
fh = logging.FileHandler(TEST_LOG_FILENAME)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
LOGGER.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

start_known_flood = "'2017-11-01 00:00:35.630000+05:30'"
end_known_flood = "'2017-11-07 00:00:35.630000+05:30'"


def __get_flood_pkeys(start_date, end_date, engine):
    # gets the pkeys of reports with images during the flood dates

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
            created_at > %(start_date)s::timestamptz
        AND
            created_at < %(end_date)s::timestamptz
        AND
            image_url IS NOT NULL
        ''',
        params={"start_date": start_date, "end_date": end_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


def __get_no_flood_pkeys(start_flood_date, end_flood_date, engine):
    # gets the pkeys of reports with images outside the flood dates
    # (parentheses around the OR are needed: AND binds tighter than OR
    # in SQL, so without them the image_url filter only applies to the
    # second branch)

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
            (created_at < %(start_date)s::timestamptz
        OR
            created_at > %(end_date)s::timestamptz)
        AND
            image_url IS NOT NULL
        ''',
        params={"start_date": start_flood_date, "end_date": end_flood_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


flood_pkeys = __get_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)

no_flood_pkeys = __get_no_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)


config = {
    "flood_pkeys": flood_pkeys,
    "no_flood_pkeys": no_flood_pkeys,
    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
    "database_engine": engine,
    "database_name": DATABASE,
    "location": "ch",
    "data_folder_prefix": "default_chennai_data",
    "logger": LOGGER
}
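
The header comment says to import this module and then override whatever you need. A minimal usage sketch under that pattern (the overridden value below is illustrative, not from this commit):

# Hypothetical usage: importing runs the pkey queries at import time;
# individual config fields can then be overridden before training.
import chennai_only_pics_config as chennai

cfg = chennai.config
cfg["data_folder_prefix"] = "my_chennai_run"  # illustrative override
print(len(cfg["flood_pkeys"]), "flood pkeys,",
      len(cfg["no_flood_pkeys"]), "no-flood pkeys")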
136 changes: 0 additions & 136 deletions flood_depth/indentity_labeler.py

This file was deleted.

105 changes: 105 additions & 0 deletions jakarta_only_pics_config.py
@@ -0,0 +1,105 @@
# Jakarta config: uses only those reports that include images
# import this file and then overwrite whatever you need in
# the default_config object
import logging
import pandas as pd
from sqlalchemy import create_engine
DATABASE = "cognicity"
engine = create_engine(
    "postgresql://postgres:postgres@localhost:5432/"
    + DATABASE)

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

LOG_FILENAME = ".default_jakarta.log"
fh = logging.FileHandler(LOG_FILENAME)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
LOGGER.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

start_period = "'2017-01-01 00:00:35.630000-05:00'"
end_period = "'2017-03-10 00:00:35.630000-05:00'"

start_known_flood = "'2017-02-20 00:00:35.630000-05:00'"
end_known_flood = "'2017-02-23 00:00:35.630000-05:00'"


def __get_flood_pkeys(start_date, end_date, engine):
    # gets the pkeys of reports with images during the flood dates

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
            created_at > %(start_date)s::timestamptz
        AND
            created_at < %(end_date)s::timestamptz
        AND
            image_url IS NOT NULL
        ''',
        params={"start_date": start_date, "end_date": end_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


def __get_no_flood_pkeys(start_period,
                         start_flood_date,
                         end_flood_date,
                         end_period,
                         engine):
    # gets the pkeys of reports with images outside the flood dates
    # (the outer parentheses are needed: AND binds tighter than OR in
    # SQL, so without them the image_url filter only applies to the
    # second date range)

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey,
               created_at
        FROM ''' + DATABASE + '''.all_reports
        WHERE ((
            created_at > %(start_period)s::timestamptz
            AND created_at < %(start_flood_date)s::timestamptz)
        OR (
            created_at > %(end_flood_date)s::timestamptz
            AND created_at < %(end_period)s::timestamptz))
        AND
            image_url IS NOT NULL
        ''',
        params={
            "start_period": start_period,
            "start_flood_date": start_flood_date,
            "end_flood_date": end_flood_date,
            "end_period": end_period
        },
        con=engine, index_col="pkey")
    return set(pkeys.index)


flood_pkeys = __get_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)

no_flood_pkeys = __get_no_flood_pkeys(
    start_period,
    start_known_flood,
    end_known_flood,
    end_period,
    engine)


config = {
    "flood_pkeys": flood_pkeys,
    "no_flood_pkeys": no_flood_pkeys,
    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
    "database_engine": engine,
    "database_name": DATABASE,
    "location": "id",
    "data_folder_prefix": "default_jakarta_data",
    "logger": LOGGER
}
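
Both configs build the pkey sets the same way, so a quick sanity check applies to either (a hypothetical snippet, not part of the commit): the flood and no-flood sets are disjoint by construction, and all_pkeys is exactly their union.

# Hypothetical sanity check on the generated pkey sets.
import jakarta_only_pics_config as jakarta

cfg = jakarta.config
assert cfg["flood_pkeys"].isdisjoint(cfg["no_flood_pkeys"])
assert cfg["all_pkeys"] == cfg["flood_pkeys"] | cfg["no_flood_pkeys"]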
10 changes: 9 additions & 1 deletion learners/svm_learner.py
@@ -64,7 +64,8 @@ def train(self, params, validation_keys):
         self.t_labels = t_labels
         t_data = t_data_w_pkey[1:, :]

-        self.clf = svm.SVC(gamma="scale", kernel="poly")
+        # found best hyper params by exhaustive grid search in svm_test.ipynb
+        self.clf = svm.SVC(gamma="scale", kernel="rbf", degree=3)
         # sklearn expects rows to be data points, we've gone with columns
         self.clf.fit(t_data.T, self.t_labels[0, :])

@@ -85,6 +86,13 @@ def train(self, params, validation_keys):
         self.logger.info("Val score: " + str(percent_correct))

         # get the signed distance for every train data point
+        # with the pkey as the first row
+        # self.t_sd = np.vstack((self.t_data_w_pkey[0, :],
+        #                        self.clf.decision_function(t_data.T)))
+        # # for every validation data point
+        # self.val_sd = np.vstack((self.val_data_w_pkey[0, :],
+        #                         self.clf.decision_function(val_data.T)))
+
         self.t_sd = self.clf.decision_function(t_data.T)
         # for every validation data point
         self.val_sd = self.clf.decision_function(val_data.T)
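
The new comment refers to an exhaustive grid search in svm_test.ipynb, which is not included in this commit; a minimal sketch of what such a search could look like with scikit-learn follows (the grid values are assumptions, not the author's actual search space). Note also that sklearn's SVC only uses degree with the poly kernel, so degree=3 is inert once kernel="rbf" is chosen.

# Hypothetical reconstruction of the grid search mentioned above;
# the parameter grid is an illustrative assumption.
from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {
    "kernel": ["linear", "poly", "rbf"],
    "C": [0.1, 1, 10, 100],
    "gamma": ["scale", "auto"],
}
# t_data stores features in columns (as in train()), hence the transpose.
search = GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(t_data.T, t_labels[0, :])
print(search.best_params_, search.best_score_)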
10 changes: 6 additions & 4 deletions nlp/bow_labeler.py
@@ -18,7 +18,8 @@ def load_labels_from_disk(self, filename='./bow_labels.p'):
     def dump_labels_to_disk(self, labels, filename='./bow_labels.p'):
         return super().dump_labels_to_disk(filename)

-    def make_feature_vectors(self, reports_dict, vocab):
+    def make_feature_vectors(self, reports_dict, vocab,
+                             include_zero_vects=True):
         """
         Args:
             reports_dict: dict of {pkey: list(str)}
@@ -43,9 +44,10 @@ def make_feature_vectors(self, reports_dict, vocab):
             feature_list = self.make_unary_feature_vector(vocab, word_list)
             feature_vect_dict[pkey] = feature_list

-        zero_list = [0]*len(vocab)
-        for pkey in remaining_pkeys:
-            feature_vect_dict[pkey] = zero_list
+        if include_zero_vects:
+            zero_list = [0]*len(vocab)
+            for pkey in remaining_pkeys:
+                feature_vect_dict[pkey] = zero_list

         return feature_vect_dict
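
make_unary_feature_vector is not shown in this diff; assuming it builds a presence/absence bag-of-words vector, a minimal sketch follows (vocab and word_list are invented for illustration). One caveat in the new branch: every pkey in remaining_pkeys is assigned the same zero_list object, so an in-place mutation of one vector later would show up in all of them; assigning list(zero_list) per key would avoid the aliasing.

# Sketch of a unary (presence/absence) bag-of-words vector, assuming
# make_unary_feature_vector works roughly like this; vocab and
# word_list are invented for illustration.
vocab = {"flood": 0, "banjir": 1, "rain": 2}
word_list = ["flood", "on", "my", "street"]

vect = [0] * len(vocab)
for word in word_list:
    if word in vocab:
        vect[vocab[word]] = 1
# vect == [1, 0, 0]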

