multirow ensemble
abrahamq committed Aug 22, 2019
1 parent 8cde989 commit d889c80
Showing 2 changed files with 176 additions and 9 deletions.
113 changes: 113 additions & 0 deletions jakarta_pic_and_text_config.py
@@ -0,0 +1,113 @@
# Jakarta config: keeps only those reports that include both an image and text.
# Import this file and then overwrite whatever you need
# in the exported config object.
import logging
import pandas as pd
from sqlalchemy import create_engine
DATABASE = "cognicity"
engine = create_engine(
    "postgresql://postgres:postgres@localhost:5432/"
    + DATABASE)

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

LOG_FILENAME = ".default_jakarta.log"
fh = logging.FileHandler(LOG_FILENAME)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
LOGGER.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

start_period = "'2017-01-01 00:00:35.630000-05:00'"
end_period = "'2017-03-10 00:00:35.630000-05:00'"

start_known_flood = "'2017-02-20 00:00:35.630000-05:00'"
end_known_flood = "'2017-02-23 00:00:35.630000-05:00'"


def __get_flood_pkeys(start_date, end_date, engine):
    # gets the pkeys of reports (with an image and text) during flood dates

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey, created_at FROM ''' + DATABASE + '''.all_reports WHERE
        created_at > %(start_date)s::timestamptz
        AND
        created_at < %(end_date)s::timestamptz
        AND
        image_url IS NOT NULL
        AND
        text IS NOT NULL
        AND
        LENGTH(text) > 0
        ''',
        params={"start_date": start_date, "end_date": end_date},
        con=engine, index_col="pkey")
    return set(pkeys.index)


def __get_no_flood_pkeys(start_period,
                         start_flood_date,
                         end_flood_date,
                         end_period,
                         engine):
    # gets the pkeys of reports (with an image and text) outside the
    # known flood dates

    pkeys = pd.read_sql_query(
        '''
        SELECT pkey,
               created_at
        FROM ''' + DATABASE + '''.all_reports
        -- the outer parentheses are required: AND binds tighter than OR,
        -- so the image/text filters below must cover both time windows
        WHERE ((
            created_at > %(start_period)s::timestamptz
            AND created_at < %(start_flood_date)s::timestamptz)
        OR (
            created_at > %(end_flood_date)s::timestamptz
            AND created_at < %(end_period)s::timestamptz))
        AND
        image_url IS NOT NULL
        AND
        text IS NOT NULL
        AND
        LENGTH(text) > 0
        ''',
        params={
            "start_period": start_period,
            "start_flood_date": start_flood_date,
            "end_flood_date": end_flood_date,
            "end_period": end_period
        },
        con=engine, index_col="pkey")
    return set(pkeys.index)


flood_pkeys = __get_flood_pkeys(
    start_known_flood,
    end_known_flood,
    engine)

no_flood_pkeys = __get_no_flood_pkeys(
    start_period,
    start_known_flood,
    end_known_flood,
    end_period,
    engine)


config = {
    "flood_pkeys": flood_pkeys,
    "no_flood_pkeys": no_flood_pkeys,
    "all_pkeys": flood_pkeys.union(no_flood_pkeys),
    "database_engine": engine,
    "database_name": DATABASE,
    "location": "id",
    "data_folder_prefix": "default_jakarta_data",
    "logger": LOGGER
}
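
The header comment describes the intended usage: import this module, then override whatever fields you need on the exported config dict. A minimal sketch of that pattern (the override value and the importing script are hypothetical; note the pkey queries run against the local database at import time):

import jakarta_pic_and_text_config as jakarta_config

config = dict(jakarta_config.config)  # shallow copy of the defaults
config["data_folder_prefix"] = "my_experiment_data"  # hypothetical override
config["logger"].info("flood pkeys: %d", len(config["flood_pkeys"]))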
72 changes: 63 additions & 9 deletions learners/ensemble_learner.py
@@ -2,6 +2,7 @@
import numpy as np
import pickle
import os
+import pandas as pd

# import img_util as img_util

@@ -25,6 +26,39 @@ def __init__(self, config, names, learners):
        self.models = []
        pass

    def _fill_no_data_spots(self, data_list, label_list):
        """
        Places zeros where data is missing.

        Args:
            data_list: list of ndarrays; in each, the first row
                holds pkeys and the remaining rows hold float
                datapoints
            label_list: list of label ndarrays, one per learner,
                aligned with data_list
        Returns:
            pd.DataFrame: one row per pkey, one column per
            datapoint (zero-filled where a learner has no data
            for that pkey), plus a final "label" column
        """
        all_pkeys = set()
        for each in data_list:
            all_pkeys.update(each[0, :].tolist())
        # sorted() because pandas refuses to build a DataFrame from a set
        res = pd.DataFrame(sorted(all_pkeys),
                           columns=["pkey"]).set_index("pkey",
                                                       drop=True).sort_index()
        last_index = 0
        lab_df = pd.DataFrame(sorted(all_pkeys),
                              columns=["pkey"]).set_index("pkey",
                                                          drop=True).sort_index()
        lab_df["label"] = 0
        for data, labels in zip(data_list, label_list):
            vect_len = data.shape[0] - 1
            # pkeys should be the first row of data
            ind = pd.Index(data[0, :], name="pkey")
            add_df = pd.DataFrame(data=data[1:, :].T, index=ind,
                                  columns=range(last_index,
                                                last_index + vect_len))
            last_index += vect_len
            res = pd.concat([res, add_df], axis=1).fillna(0)
            labs = pd.DataFrame(data=labels.T, columns=["label"], index=ind)
            lab_df.update(labs)
        self.logger.debug(res)
        return res.join(lab_df)
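
    # A worked sketch of _fill_no_data_spots on hypothetical inputs:
    # learner A scored pkeys [1, 2] and learner B scored pkeys [2, 3].
    #   a = np.array([[1., 2.], [0.5, -0.3]])  # row 0: pkeys, row 1: scores
    #   b = np.array([[2., 3.], [0.9, 0.1]])
    #   la = np.array([[1., -1.]])             # labels as 1 x n row vectors
    #   lb = np.array([[-1., 1.]])
    #   self._fill_no_data_spots([a, b], [la, lb])
    # returns one row per pkey, zero-filled where a learner is missing:
    #   pkey 1 ->  0.5, 0.0, label  1
    #   pkey 2 -> -0.3, 0.9, label -1
    #   pkey 3 ->  0.0, 0.1, label  1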

    def train(self, params, validation_keys):
        """
        Runs all learners
@@ -39,32 +73,52 @@ def train(self, params, validation_keys):
        # run all the learners
        train_sd = []
        val_sd = []
-        t_labels = []
+        t_labels_list = []
+        val_labels_list = []
+        self.val_labels = []
        for name, each in zip(self.names, self.learners):
            model = each.run_learner(name,
-                                     rerun=True,
+                                     rerun=False,
                                     validation_keys=validation_keys,
                                     params=params)
            self.models.append(model)
-            train_sd.append(each.t_sd)
-            val_sd.append(each.val_sd)
-            t_labels = each.t_labels
-            self.val_labels = each.val_labels

+            t_sd_w_pkey = np.vstack((each.t_data_w_pkey[0, :],
+                                     each.t_sd))
+            train_sd.append(t_sd_w_pkey)
+            t_labels_list.append(each.t_labels)
+
+            val_sd_w_pkey = np.vstack((each.val_data_w_pkey[0, :],
+                                       each.val_sd))
+            val_sd.append(val_sd_w_pkey)
+            val_labels_list.append(each.val_labels)

        # arrange a matrix s.t. each column is the
        # result of predicting on this pkey,
        # e.g. the signed distance from the separator
-        train_matrix = np.vstack(train_sd)
-        val_matrix = np.vstack(val_sd)
+        train_matrix = self._fill_no_data_spots(train_sd,
+                                                t_labels_list).to_numpy().T
+        self.train_matrix = train_matrix
+        self.t_labels = train_matrix[-1, :]
+        train_matrix = train_matrix[:-1, :]  # remove label
+        self.logger.info("training size " + str(train_matrix.shape))
+        self.logger.info("training matrix " + str(self.train_matrix))
+
+        val_pd = self._fill_no_data_spots(val_sd, val_labels_list)
+        val_matrix = val_pd.loc[pd.Index(validation_keys)].to_numpy().T
+        self.val_matrix = val_matrix
+        self.val_labels = val_matrix[-1, :]
+        val_matrix = val_matrix[:-1, :]  # remove label
+        self.logger.info("validation size " + str(val_matrix.shape))

        t_full_matrix = torch.from_numpy(train_matrix.T).float()
        # no flood is zero class, flood is 1st class
-        into_zeros = np.where(t_labels < 0, 0, 1)[0, :]
+        into_zeros = np.where(self.t_labels < 0, 0, 1)
        t_full_labels = torch.from_numpy(into_zeros).long()

        hidden_layers = params["hidden"]
-        nn_model = nn.Simple_nn(len(self.models), hidden_layers)
+        # nn_model = nn.Simple_nn(len(self.models), hidden_layers)
        nn.run_training(nn_model, t_full_matrix, t_full_labels)

        torch_val_matrix = torch.from_numpy(val_matrix.T).float()
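
The training step above stacks each base learner's signed distances into a matrix (one column of meta-features per pkey) and fits a small neural network on top. nn.Simple_nn and nn.run_training are the project's own helpers and are not shown in this diff; below is a self-contained sketch of that stacking pattern with a hypothetical SimpleNN and training loop standing in for them, on random placeholder data:

import torch
import torch.nn as tnn

class SimpleNN(tnn.Module):
    # hypothetical stand-in for nn.Simple_nn(num_learners, hidden)
    def __init__(self, num_learners, hidden):
        super().__init__()
        self.net = tnn.Sequential(
            tnn.Linear(num_learners, hidden),
            tnn.ReLU(),
            tnn.Linear(hidden, 2))  # class 0 = no flood, class 1 = flood

    def forward(self, x):
        return self.net(x)

# one row per report (pkey), one column per base learner's signed distance
scores = torch.randn(100, 3)
labels = torch.randint(0, 2, (100,))
model = SimpleNN(num_learners=3, hidden=8)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
loss_fn = tnn.CrossEntropyLoss()
for _ in range(200):  # full-batch training on the stacked scores
    optimizer.zero_grad()
    loss = loss_fn(model(scores), labels)
    loss.backward()
    optimizer.step()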
